In [None]:
# repro processing lastfm2k

In [None]:
import pandas as pd

In [None]:
# Load the LFM2k user-artist interactions distributed by GroupLens (HetRec 2011):
# https://grouplens.org/datasets/hetrec-2011/
ratings = pd.read_csv(
    'original_data/user_artists.dat',
    sep='\t',
)
# Show the first five rows as a quick sanity check.
print(ratings.head(5))

   userID  artistID  weight
0       2        51   13883
1       2        52   11690
2       2        53   11351
3       2        54   10300
4       2        55    8983


In [None]:
# Report the size of the raw dataset: interactions, distinct users, distinct artists.
n_ratings = len(ratings)
n_users = len(ratings["userID"].unique())
n_items = len(ratings["artistID"].unique())
print(f'Ratings: {n_ratings}')
print(f'Users: {n_users}')
print(f'Items: {n_items}')

Ratings: 92834
Users: 1892
Items: 17632


In [None]:
# apply core-5 filtering to interactions

def core_k_filtering(interactions, k):
    """
    Perform core-k filtering on a user-item interaction DataFrame.

    Rows are removed repeatedly until every remaining user and every
    remaining item has at least ``k`` interactions. Removing a user can push
    an item below the threshold (and vice versa), hence the fixed-point loop.

    Args:
        interactions: DataFrame with at least the columns 'userID' and 'artistID'.
        k: minimum number of interactions each user and each item must keep.

    Returns:
        A filtered DataFrame (original index labels preserved) in which every
        user and item occurs at least ``k`` times; it may be empty.
    """
    while True:
        user_counts = interactions['userID'].value_counts()
        item_counts = interactions['artistID'].value_counts()

        # use the requested threshold instead of a hard-coded 5
        valid_users = user_counts[user_counts >= k].index
        valid_items = item_counts[item_counts >= k].index

        core_k = interactions[interactions['userID'].isin(valid_users) & interactions['artistID'].isin(valid_items)]

        # nothing was removed in this pass: the k-core has been reached
        if len(core_k) == len(interactions):
            break

        interactions = core_k

    return core_k

# Keep only users and artists that survive the core filtering with threshold 5.
ratings_core5 = core_k_filtering(interactions=ratings, k=5)

In [None]:
# Report the size of the core-5 dataset: interactions, distinct users, distinct artists.
n_ratings_core5 = len(ratings_core5)
n_users_core5 = len(ratings_core5["userID"].unique())
n_items_core5 = len(ratings_core5["artistID"].unique())
print(f'Ratings: {n_ratings_core5}')
print(f'Users: {n_users_core5}')
print(f'Items: {n_items_core5}')

Ratings: 71355
Users: 1859
Items: 2823


In [None]:
# Load the artist metadata table (tab-separated) and print it.
artist_info = pd.read_csv(
    'original_data/artists.dat',
    sep='\t',
)
print(artist_info)

          id  ...                                         pictureURL
0          1  ...    http://userserve-ak.last.fm/serve/252/10808.jpg
1          2  ...  http://userserve-ak.last.fm/serve/252/3052066.jpg
2          3  ...  http://userserve-ak.last.fm/serve/252/40222717...
3          4  ...  http://userserve-ak.last.fm/serve/252/54697835...
4          5  ...  http://userserve-ak.last.fm/serve/252/14789013...
...      ...  ...                                                ...
17627  18741  ...  http://userserve-ak.last.fm/serve/252/16352971...
17628  18742  ...   http://userserve-ak.last.fm/serve/252/207445.jpg
17629  18743  ...   http://userserve-ak.last.fm/serve/252/344868.jpg
17630  18744  ...  http://userserve-ak.last.fm/serve/252/29297695...
17631  18745  ...  http://userserve-ak.last.fm/serve/252/59486303...

[17632 rows x 4 columns]


In [None]:
# Keep only the metadata rows whose artist id still occurs in the core-5 interactions.
core5_artist_ids = set(ratings_core5['artistID'])
in_core5 = artist_info['id'].isin(core5_artist_ids)
artist_info_core5 = artist_info[in_core5]
print(f'Len artist name after core5: {len(artist_info_core5)}')

Len artist name after core5: 2823


In [None]:
# Load the three tab-separated per-modality tables: audio links, cover links and texts.
audio_links, cover_links, texts = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        'lfm2k_song_extended_mapping.tsv',
        'lfm2k_covers_extended_mapping.tsv',
        'lfm2k_text.tsv',
    )
)

In [None]:
# Collect the distinct artist ids covered by each modality and print how many there are.
audio_ids, cover_ids, text_ids = (
    set(modality_frame['artistID'])
    for modality_frame in (audio_links, cover_links, texts)
)
for modality_ids in (audio_ids, cover_ids, text_ids):
    print(len(modality_ids))

2825
2748
12523


In [None]:
# For each modality (audio, covers, text), count the core-5 artists it covers.
core5_ids = set(artist_info_core5['id'])
for covered_ids in (audio_ids, cover_ids, text_ids):
    print(len(core5_ids & covered_ids))

2820
2743
2813


In [None]:
# Items to keep: artists present in every modality table and in the core-5 artist list.
keep_items = audio_ids & cover_ids & text_ids & set(artist_info_core5['id'])
print(len(keep_items))

2731


In [None]:
# Restrict the core-5 interactions to the kept items and report the resulting sizes.
has_all_modalities = ratings_core5['artistID'].isin(keep_items)
multimodal_ratings = ratings_core5[has_all_modalities]
n_mm_ratings = len(multimodal_ratings)
n_mm_users = len(multimodal_ratings["userID"].unique())
n_mm_items = len(multimodal_ratings["artistID"].unique())
print(f'Ratings: {n_mm_ratings}')
print(f'Users: {n_mm_users}')
print(f'Items: {n_mm_items}')

Ratings: 70030
Users: 1859
Items: 2731


In [None]:
# Now we need to convert each .pkl file produced during the extraction process into an .npy file, stored as an np.array indexed from 0 to k-1, where k is the total number of items.
# Moreover, as required by MMRec, all the .npy files we provide must contain the same number of items.

# For this reason, we need to filter out every item that lacks any of the multimodal features.
# Note that this has been done only in our setting to perform the experiments; in your case a few modalities might be enough,
# so you might need to filter out fewer items, or none at all.

# Moreover, this is needed for the MMRec framework; if you are using another framework - or your own - this might not be necessary.
# This is also the reason we chose to provide the .pkl dict files, so that any user can use them however they need and want to.

In [None]:
# Read all the items for which we have all the multimodal features.
# This is required by MMRec, but is not mandatory if you focus only on a single modality or a subset of them

import pickle as pkl
import os 
import numpy as np

# Running intersection of the item ids found in every feature dictionary
# (None until the first .pkl file has been read).
item_keys = None

for pkl_file in os.listdir('multimodal_features/dict/'):

    # match on the file extension rather than on any '.pkl' substring
    if pkl_file.endswith('.pkl'):
        print(pkl_file)
        # NOTE: unpickling can execute arbitrary code; only load feature files you trust.
        # The context manager makes sure the file handle is closed after loading.
        with open(f'multimodal_features/dict/{pkl_file}', 'rb') as pkl_in:
            data_dict = pkl.load(pkl_in)

        # String keys contribute the integer before their first '_';
        # any other key is used as-is.
        data_keys = {
            int(x.split('_')[0]) if isinstance(x, str) else x
            for x in data_dict.keys()
        }

        if item_keys is None:
            item_keys = data_keys
        else:
            item_keys &= data_keys

print(len(item_keys))

whisper.pkl
all-mpnet-base-v2.pkl
vit_cls.pkl
all-MiniLM-L6-v2.pkl
vggish.pkl
resnet152.pkl
vit_avg.pkl
vgg.pkl


In [None]:
# Keep only the interactions whose artist has an entry in every feature dictionary.
has_all_features = multimodal_ratings['artistID'].isin(item_keys)
multimodal_ratings = multimodal_ratings[has_all_features]

In [None]:
# Report the sizes again after the feature-availability filter.
n_final_ratings = len(multimodal_ratings)
n_final_users = len(multimodal_ratings["userID"].unique())
n_final_items = len(multimodal_ratings["artistID"].unique())
print(f'Ratings: {n_final_ratings}')
print(f'Users: {n_final_users}')
print(f'Items: {n_final_items}')

Ratings: 56346
Users: 1859
Items: 1425


In [None]:
# split into train, valid, test - in this case, we use the same format required by MMRec, but any strategy can be applied
# to avoid any error in the training of the models, we split into 80-10-10 and ensure that all users and items appear in the training

import numpy as np

def split_data(df, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, seed=42):
    """
    Split interactions into train, validation and test sets on a per-user basis.

    Each user's rows are shuffled with ``random_state=seed``; the first
    ``int(n * train_ratio)`` rows go to train, the next ``int(n * valid_ratio)``
    rows go to validation, and all remaining rows go to test. Afterwards,
    validation/test rows whose user or item does not occur in the training
    set are dropped, so the held-out sets never contain unseen users or items.

    Args:
        df: DataFrame with at least the columns 'userID' and 'artistID'.
        train_ratio: fraction of each user's interactions assigned to train.
        valid_ratio: fraction of each user's interactions assigned to validation.
        test_ratio: accepted for interface symmetry but not used in the
            computation; test receives whatever is left after train and validation.
        seed: seed used for the per-user shuffle (and for numpy's global RNG).

    Returns:
        Tuple ``(train_df, valid_df, test_df)``, each with a fresh 0-based index.
        For an empty ``df`` three empty frames with the same columns are returned.

    Note:
        Rows removed by the final filter are discarded, not moved to train, so
        the three sets can hold fewer rows than ``df``. In particular, a user
        for whom ``int(n * train_ratio)`` is 0 gets no training rows and
        therefore disappears from all three sets.
    """
    # Kept from the original implementation: seeds numpy's global RNG. The
    # shuffle below is driven by random_state=seed.
    np.random.seed(seed)

    train_list, valid_list, test_list = [], [], []

    for _, user_df in df.groupby("userID"):
        user_df = user_df.sample(frac=1, random_state=seed)

        num_interactions = len(user_df)
        train_end = int(num_interactions * train_ratio)
        valid_end = train_end + int(num_interactions * valid_ratio)

        train_list.append(user_df.iloc[:train_end])
        valid_list.append(user_df.iloc[train_end:valid_end])
        test_list.append(user_df.iloc[valid_end:])

    if not train_list:
        # No user groups at all (empty input): pd.concat would raise on an
        # empty list, so return three independent empty frames instead.
        empty = df.iloc[0:0].reset_index(drop=True)
        return empty, empty.copy(), empty.copy()

    train_df = pd.concat(train_list).reset_index(drop=True)
    valid_df = pd.concat(valid_list).reset_index(drop=True)
    test_df = pd.concat(test_list).reset_index(drop=True)

    # Ensure valid & test sets contain only users & items from training
    train_users, train_items = set(train_df['userID']), set(train_df['artistID'])

    valid_df = valid_df[valid_df['userID'].isin(train_users) & valid_df['artistID'].isin(train_items)].reset_index(drop=True)
    test_df = test_df[test_df['userID'].isin(train_users) & test_df['artistID'].isin(train_items)].reset_index(drop=True)

    return train_df, valid_df, test_df

# split data into train, valid, test
train, valid, test = split_data(multimodal_ratings)

# report the size of each split under its own label
print(f'Len training: {len(train)}')
print(f'Len validation: {len(valid)}')
print(f'Len test: {len(test)}')

Len training: 44327
Len training: 4799
Len training: 7219


In [None]:
# Stack the three splits into a single frame and display it.
# NOTE(review): this rebinds the name `split_data`, which until this cell referred to the
# splitting function; after this cell runs, the function is reachable under that name
# only once its defining cell has been executed again.
split_data = pd.concat([train, valid, test])
split_data

Unnamed: 0,userID,artistID,weight
0,2,95,1363
1,2,98,1332
2,2,55,8983
3,2,67,3301
4,2,90,1471
...,...,...,...
8127,2099,1943,410
8128,2099,2605,397
8129,2100,1281,573
8130,2100,3806,389


In [None]:
# Rebuild the combined frame from the three splits (same concatenation as in the cell above, without displaying it).
split_data = pd.concat([train, valid, test])

In [None]:
# Build the single .inter file expected by MMRec.
# Every interaction carries an 'x_label' column telling which split it belongs to:
# - 0: training set
# - 1: validation set
# - 2: test set
for split_label, split_frame in enumerate((train, valid, test)):
    split_frame['x_label'] = split_label

# Stack the three labelled splits into one frame.
labelled_splits = pd.concat([train, valid, test])

# Consider all the interactions as positive.
labelled_splits['rating'] = 1

# Selecting the needed columns drops the weight column; artistID is then renamed to itemID.
split_data = labelled_splits[['userID', 'artistID', 'rating', 'x_label']]
split_data.columns = ['userID', 'itemID', 'rating', 'x_label']

# Save this version of the dataset, which still uses the original IDs.
split_data.to_csv('processed_data/lfm2k_og_ids.inter', sep='\t', index=False)




In [None]:
# Remap user and item IDs to contiguous indices 0..n-1, numbered in order of
# first appearance in split_data.
unique_users = split_data['userID'].unique()
unique_items = split_data['itemID'].unique()

map_users = dict(zip(unique_users, range(len(unique_users))))
map_items = dict(zip(unique_items, range(len(unique_items))))

# Reverse lookup: new item index -> original item ID.
inverse_map_item = dict(zip(map_items.values(), map_items.keys()))

In [None]:
import numpy as np
from collections import defaultdict

def average_grouped_arrays(data_dict):
    """
    Collapse a dictionary of arrays into one averaged array per group.

    String keys such as '6347_1' and '6347_2' are assigned to the group given
    by the integer before their first '_' (6347); keys that are not strings
    form their own group unchanged. All arrays of a group are averaged
    element-wise.

    Args:
        data_dict (dict): maps keys (strings like '6347_1', or non-string
                          keys) to NumPy arrays.

    Returns:
        dict: maps each group key (an int for string keys, the original key
              otherwise) to the element-wise mean of that group's arrays.
    """
    buckets = defaultdict(list)

    # Assign every array to the bucket of its group key
    for raw_key, vector in data_dict.items():
        group_id = int(raw_key.split('_')[0]) if isinstance(raw_key, str) else raw_key
        buckets[group_id].append(vector)

    # Average the arrays collected in each bucket
    result = {}
    for group_id, vectors in buckets.items():
        result[group_id] = np.mean(vectors, axis=0)

    return result


In [None]:
# for each dict of embedding:
# 1. get the items that must be kept
# 2. compute the centroid related to the same artists
# 3. map with the new IDs
# 4. store them as np.array

for pkl_file in os.listdir('multimodal_features/dict/'):

    # match on the file extension rather than on any '.pkl' substring
    if pkl_file.endswith('.pkl'):

        # strip only the final extension, so names containing dots stay intact
        name_mod = os.path.splitext(pkl_file)[0]
        print(name_mod)

        # NOTE: unpickling can execute arbitrary code; only load feature files you trust.
        with open(f'multimodal_features/dict/{pkl_file}', 'rb') as pkl_in:
            data_dict = pkl.load(pkl_in)
        avg_data_dict = average_grouped_arrays(data_dict)

        print(len(avg_data_dict))

        # Row i of the output holds the embedding of the item whose new index is i;
        # a KeyError here means an item of the dataset has no embedding in this file.
        emb_list = []
        for index in range(len(inverse_map_item)):
            old_id = inverse_map_item[index]
            emb_list.append(avg_data_dict[old_id])

        emb_array = np.array(emb_list)
        # the context manager closes (and flushes) the output file deterministically
        with open(f'multimodal_features/mmrec_npy/{name_mod}.npy', 'wb') as npy_out:
            np.save(npy_out, emb_array)
        

whisper
2364
all-mpnet-base-v2
12523
vit_cls
2314
all-MiniLM-L6-v2
12523
vggish
2364
resnet152
2314
vit_avg
2314
vgg
2314


In [None]:
# Replace the original IDs with their contiguous indices and write the final interaction file.
# Note: split_data itself ends up holding the remapped IDs, so executing this cell a second
# time would apply the mappings to already-remapped values.
split_data = split_data.assign(
    userID=split_data['userID'].map(map_users),
    itemID=split_data['itemID'].map(map_items),
)

split_data.to_csv('processed_data/lfm2k.inter', sep='\t', index=False)


In [None]:
# just to be sure that a numpy library version mismatch does not affect the readability of our embeddings,
# we also save the multimodal features as json files

import pickle as pkl
import os 
import numpy as np
import json

# Dump every feature dictionary to JSON, converting each array to a plain list.
# (The leftover `item_keys = None` reset was removed: nothing in this cell reads it,
# and it discarded the item-key set computed earlier in the notebook.)
for pkl_file in os.listdir('multimodal_features/dict/'):

    # match on the file extension rather than on any '.pkl' substring
    if pkl_file.endswith('.pkl'):
        print(pkl_file)
        # NOTE: unpickling can execute arbitrary code; only load feature files you trust.
        with open(f'multimodal_features/dict/{pkl_file}', 'rb') as pkl_in:
            data_dict = pkl.load(pkl_in)

        # Keys are written as-is; json.dump turns non-string keys it supports into strings.
        data_dict_serializable = {key: value.tolist() for key, value in data_dict.items()}
        with open(f'multimodal_features/json/{os.path.splitext(pkl_file)[0]}.json', 'w') as f:
            json.dump(data_dict_serializable, f, indent=4)




whisper.pkl
all-mpnet-base-v2.pkl
vit_cls.pkl
all-MiniLM-L6-v2.pkl
vggish.pkl
resnet152.pkl
vit_avg.pkl
vgg.pkl
