In [None]:
# Reproducible preprocessing of the MovieLens-1M (ML1M) dataset

In [None]:
import pandas as pd

In [None]:
# Read the ML1M ratings provided by GroupLens: https://grouplens.org/datasets/movielens/
# Column names are supplied explicitly through `names`.
# The separator '::' is longer than one character, which only the Python parsing
# engine supports; requesting it explicitly avoids the ParserWarning that pandas
# emits when it has to fall back from the C engine on its own.
ratings = pd.read_csv(
    'original_data/ratings.dat',
    sep='::',
    names=['user', 'item', 'rating', 'timestamp'],
    engine='python',
)
print(ratings.head(5))

   user  item  rating  timestamp
0     1  1193       5  978300760
1     1   661       3  978302109
2     1   914       3  978301968
3     1  3408       4  978300275
4     1  2355       5  978824291


In [None]:
# Read the movie metadata (id, title, pipe-separated genres).
# As with the ratings file, the multi-character separator '::' needs the Python
# parsing engine; we request it explicitly to avoid the fallback ParserWarning.
movies = pd.read_csv(
    'original_data/movies.dat',
    sep='::',
    names=['id', 'name', 'genres'],
    encoding='ISO-8859-1',
    engine='python',
)
print(movies.head(5))

   id                                name                        genres
0   1                    Toy Story (1995)   Animation|Children's|Comedy
1   2                      Jumanji (1995)  Adventure|Children's|Fantasy
2   3             Grumpier Old Men (1995)                Comedy|Romance
3   4            Waiting to Exhale (1995)                  Comedy|Drama
4   5  Father of the Bride Part II (1995)                        Comedy


In [None]:
# Size of the raw dataset: interactions, distinct users, distinct items
raw_users = ratings["user"].unique()
raw_items = ratings["item"].unique()
print(f'Ratings: {len(ratings)}')
print(f'Users: {len(raw_users)}')
print(f'Items: {len(raw_items)}')

Ratings: 1000209
Users: 6040
Items: 3706


In [None]:
# apply core-5 filtering to interactions

def core_k_filtering(interactions, k):
    """
    Perform core-k filtering on a user-item interaction DataFrame.

    Rows whose user or item has fewer than ``k`` interactions are dropped.
    Because removing rows lowers the counts of the remaining users and items,
    the filter is re-applied until a full pass removes nothing.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Must contain the columns 'user' and 'item'. Any other column is
        carried through unchanged.
    k : int
        Minimum number of interactions every surviving user and item must have.

    Returns
    -------
    pandas.DataFrame
        The surviving rows (original index labels are kept). May be empty when
        no user/item group satisfies the threshold.
    """
    while True:
        user_counts = interactions['user'].value_counts()
        item_counts = interactions['item'].value_counts()

        # The threshold comes from `k`, not from a hard-coded constant
        valid_users = user_counts[user_counts >= k].index
        valid_items = item_counts[item_counts >= k].index

        core_k = interactions[interactions['user'].isin(valid_users) & interactions['item'].isin(valid_items)]

        # Fixed point reached: the last pass did not remove any row
        if len(core_k) == len(interactions):
            return core_k

        interactions = core_k

# keep only users and items with at least 5 interactions
ratings_core5 = core_k_filtering(ratings, k=5)

In [None]:
# Size of the dataset after the core-5 filtering
core5_users = ratings_core5["user"].unique()
core5_items = ratings_core5["item"].unique()
print(f'Ratings: {len(ratings_core5)}')
print(f'Users: {len(core5_users)}')
print(f'Items: {len(core5_items)}')

Ratings: 999611
Users: 6040
Items: 3416


In [None]:
# Load our extended mapping (tab-separated file, one row per mapped movie)
multimodal_mapping = pd.read_csv(
    'ml1m_full_extended_mapping.tsv',
    delimiter='\t',
)
print(multimodal_mapping.head())

   movie_id  ...    youtubeId
0         2  ...  3LPANjHlPxo
1         3  ...  rEnOoWs3FuA
2         4  ...  j9xml1CxgXI
3         5  ...  BbvnDlu_Zjc
4         6  ...  2GfZl4kuVNI

[5 rows x 6 columns]


In [None]:
# Set of movie ids covered by the extended mapping
mapped_items = set(multimodal_mapping['movie_id'].tolist())
n_mapped = len(mapped_items)
print(f'Out extended mapping has {n_mapped} items')

Out extended mapping has 3197 items


In [None]:
# Items that survived the core-5 filtering but are not covered by the extended mapping
unmapped_items = set(ratings_core5['item']).difference(mapped_items)
missing = len(unmapped_items)
print(f'In total, there are {missing} items for which we do not have the movie poster, the movie trailer, or the movie plot')

In total, there are 365 items for which we do not have the movie poster, the movie trailer, or the movie plot


In [None]:
# Keep only the interactions whose item is covered by the extended mapping
has_mapping = ratings_core5['item'].isin(mapped_items)
multimodal_ratings = ratings_core5.loc[has_mapping]

# Report the statistics after the filtering
mapped_users = multimodal_ratings["user"].unique()
mapped_rated_items = multimodal_ratings["item"].unique()
print(f'Ratings: {len(multimodal_ratings)}')
print(f'Users: {len(mapped_users)}')
print(f'Items: {len(mapped_rated_items)}')

Ratings: 946780
Users: 6040
Items: 3051


In [1]:
# Now we need to convert each .pkl file produced during the extraction process into an .npy file, stored as an np.array whose rows are indexed from 0 to k-1, where k is the total number of items
# Moreover, as required by MMRec, all the .npy files must contain the same number of items

# For this reason, we need to filter out all the items that are missing at least one of the multimodal features.
# Note that this has been done only in our setting to perform the experiments, but in your case a few modalities might be enough,
# so you might need to filter out fewer items or no items at all.

# Moreover, this is needed for the MMRec framework, but if you are using another framework - or your own one - this might not be necessary.
# This is also the reason we chose to provide the .pkl dict files, so that any user can use them the way they need and want to.

In [None]:
# Collect the items for which we have all the multimodal features,
# i.e. the intersection of the keys of every feature dictionary.
# This is required by MMRec, but is not mandatory if you focus only on a single modality or a subset of them

import pickle as pkl
import os
import numpy as np

item_keys = None

for pkl_file in os.listdir('multimodal_features/dict/'):

    # match on the extension only, so that names merely containing '.pkl' are skipped
    if pkl_file.endswith('.pkl'):
        print(pkl_file)
        # NOTE: unpickling can execute arbitrary code - only load feature files you trust.
        # The context manager guarantees the file handle is closed after loading.
        with open(f'multimodal_features/dict/{pkl_file}', 'rb') as f:
            data_dict = pkl.load(f)

        if item_keys is None:
            # first dictionary: start from all of its keys
            item_keys = set(data_dict.keys())
        else:
            # following dictionaries: keep only the keys seen in every file so far
            item_keys &= set(data_dict.keys())

if item_keys is None:
    # fail with an explicit message instead of the opaque TypeError raised by len(None)
    raise FileNotFoundError("no .pkl feature file found in 'multimodal_features/dict/'")
print(len(item_keys))



whisper.pkl
all-mpnet-base-v2.pkl
r2p1d.pkl
vit_cls.pkl
all-MiniLM-L6-v2.pkl
vggish.pkl
resnet152.pkl
vit_avg.pkl
vgg.pkl
3096


In [None]:
# Keep only the interactions whose item appears in item_keys
has_all_features = multimodal_ratings['item'].isin(item_keys)
multimodal_ratings = multimodal_ratings.loc[has_all_features]

3096


In [None]:
# Size of the dataset after the latest filtering step
final_users = multimodal_ratings["user"].unique()
final_items = multimodal_ratings["item"].unique()
print(f'Ratings: {len(multimodal_ratings)}')
print(f'Users: {len(final_users)}')
print(f'Items: {len(final_items)}')

Ratings: 942799
Users: 6040
Items: 2981


In [None]:
# split into train, valid, test - in this case, we use the same format required by MMRec, but any strategy can be applied
# to avoid any error in the training of the models, we split each user's interactions 80-10-10 and keep in validation/test only users and items that also appear in the training set

import numpy as np

def split_data(df, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, seed=42):
    """
    Split an interaction DataFrame into train, validation and test sets, per user.

    Each user's interactions are shuffled (deterministically, given ``seed``)
    and cut into three consecutive slices: the first ``int(n * train_ratio)``
    rows go to train, the next ``int(n * valid_ratio)`` rows to validation and
    all remaining rows to test.

    Afterwards, validation and test rows whose user or item does not occur in
    the training set are DROPPED, so validation/test never introduce users or
    items unseen in training. Note that this does not move such rows into the
    training set: a user or item that never lands in train disappears from the
    output altogether.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'user' and 'item'.
    train_ratio : float
        Fraction of each user's interactions assigned to train.
    valid_ratio : float
        Fraction of each user's interactions assigned to validation.
    test_ratio : float
        Accepted for interface symmetry but not used: the test set always
        receives whatever is left after the train and validation slices.
    seed : int
        Seed used for the per-user shuffle.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, valid_df, test_df)``, each with a fresh 0..n-1 index.
        Three empty frames (same columns as ``df``) when ``df`` has no rows.
    """
    # Kept from the original implementation; the shuffle below is driven by
    # `random_state=seed`, not by numpy's global generator.
    np.random.seed(seed)

    train_list, valid_list, test_list = [], [], []

    for _, user_df in df.groupby("user"):
        # shuffle this user's rows reproducibly
        user_df = user_df.sample(frac=1, random_state=seed)

        num_interactions = len(user_df)
        train_end = int(num_interactions * train_ratio)
        valid_end = train_end + int(num_interactions * valid_ratio)

        train_list.append(user_df.iloc[:train_end])
        valid_list.append(user_df.iloc[train_end:valid_end])
        test_list.append(user_df.iloc[valid_end:])

    # No user groups at all (empty input): pd.concat([]) would raise, so
    # return three independent empty frames that keep the input columns.
    if not train_list:
        return tuple(df.iloc[0:0].reset_index(drop=True) for _ in range(3))

    train_df = pd.concat(train_list).reset_index(drop=True)
    valid_df = pd.concat(valid_list).reset_index(drop=True)
    test_df = pd.concat(test_list).reset_index(drop=True)

    # Keep in valid & test only the rows whose user and item both occur in training
    train_users, train_items = set(train_df['user']), set(train_df['item'])

    valid_df = valid_df[valid_df['user'].isin(train_users) & valid_df['item'].isin(train_items)].reset_index(drop=True)
    test_df = test_df[test_df['user'].isin(train_users) & test_df['item'].isin(train_items)].reset_index(drop=True)

    return train_df, valid_df, test_df

# Run the split on the filtered interactions, using the function's default ratios and seed
train, valid, test = split_data(df=multimodal_ratings)

n_train, n_valid, n_test = len(train), len(valid), len(test)
print(f'Len training: {n_train}')
print(f'Len validation: {n_valid}')
print(f'Len testing: {n_test}')

Len training: 751879
Len validation: 91603
Len testing: 99317


In [None]:
# Format the data as required by MMRec.
# MMRec requires a single .inter file, with a column named 'x_label' that can assume 3 possible values:
# - 0: the interaction is in the training set
# - 1: the interaction is in the validation set
# - 2: the interaction is in the test set
train['x_label'] = 0
valid['x_label'] = 1
test['x_label'] = 2

# Concatenate the three dataframes into the single table saved as the .inter file required by MMRec.
# NOTE(review): this variable reuses the name of the split_data() function and shadows it,
# so the splitting cell cannot be re-run afterwards without re-defining the function.
# The name is kept because the following cells refer to it.
split_data = pd.concat([train, valid, test])

# Binarize the rating (vectorized comparison instead of a row-wise apply):
# rating <= 3   --> 0
# rating >= 4   --> 1
split_data['rating'] = (split_data['rating'] > 3).astype(int)

# Rename the id columns by name rather than overwriting all labels positionally,
# so a different column order cannot silently mislabel the data
split_data = split_data.rename(columns={'user': 'userID', 'item': 'itemID'})

# This file still carries the original user/item ids
split_data.to_csv('movielens_1m.inter', sep='\t', index=False)

In [None]:
# Build the maps for users and items, so that their new IDs go from 0 to x-1
# (new IDs follow the order returned by .unique())
unique_users = split_data['userID'].unique()
unique_items = split_data['itemID'].unique()

# original id -> new id
map_user = dict(zip(unique_users, range(len(unique_users))))
map_item = dict(zip(unique_items, range(len(unique_items))))

# new item id -> original item id
inverse_map_item = dict(enumerate(unique_items))

In [None]:
# For each dict of embeddings:
# 1. get the items that must be kept
# 2. order them by their new IDs (row i holds the embedding of the item with new id i)
# 3. store them as np.array

for pkl_file in os.listdir('multimodal_features/dict/'):

    # match on the extension only, so that names merely containing '.pkl' are skipped
    if pkl_file.endswith('.pkl'):

        # strip only the final extension, so names containing other dots stay intact
        name_mod = os.path.splitext(pkl_file)[0]
        print(name_mod)

        # NOTE: unpickling can execute arbitrary code - only load feature files you trust.
        with open(f'multimodal_features/dict/{pkl_file}', 'rb') as f:
            data_dict = pkl.load(f)

        # look up every embedding through the original item id, in new-id order
        emb_list = [data_dict[inverse_map_item[index]] for index in range(len(inverse_map_item))]

        emb_array = np.array(emb_list)
        # the context manager guarantees the array is flushed and the handle closed
        with open(f'multimodal_features/mmrec_npy/{name_mod}.npy', 'wb') as f:
            np.save(f, emb_array)
        

whisper
all-mpnet-base-v2
r2p1d
vit_cls
all-MiniLM-L6-v2
vggish
resnet152
vit_avg
vgg


In [None]:
# Replace the original user/item ids with the new dense ids and save the remapped dataset.
# NOTE: the id columns are overwritten in place, so this cell must be run only once after
# the maps are built - running it again would look up already-remapped ids in the maps.
for id_column, id_map in (('userID', map_user), ('itemID', map_item)):
    split_data[id_column] = split_data[id_column].map(id_map)

split_data.to_csv('processed_data/movielens_1m.inter', sep='\t', index=False)

In [None]:
# To make sure that a numpy library version mismatch does not affect the readability of our embeddings,
# we save the multimodal features as json files as well

import pickle as pkl
import os
import numpy as np
import json

for pkl_file in os.listdir('multimodal_features/dict/'):

    # match on the extension only, so that names merely containing '.pkl' are skipped
    if pkl_file.endswith('.pkl'):
        print(pkl_file)
        # NOTE: unpickling can execute arbitrary code - only load feature files you trust.
        with open(f'multimodal_features/dict/{pkl_file}', 'rb') as f:
            data_dict = pkl.load(f)

        # json cannot serialize arrays: convert every value to a plain list
        # (assumes each value exposes .tolist(), e.g. a numpy array - TODO confirm)
        data_dict_serializable = {key: value.tolist() for key, value in data_dict.items()}
        with open(f'multimodal_features/json/{os.path.splitext(pkl_file)[0]}.json', 'w') as f:
            json.dump(data_dict_serializable, f, indent=4)