In [7]:
import pandas as pd
import os

In [14]:
# Load the interaction log (ratings) and the movie metadata table
ratings, movies = (pd.read_csv(csv_name) for csv_name in ('ratings.csv', 'movies.csv'))

In [23]:
# Peek at the first five interactions
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [15]:
# Collect the distinct movie IDs present in the movies metadata table
movie_ids = set(movies.loc[:, 'movieId'])

In [None]:
# gather movie IDs for which all multimodal features are available
import os
from pathlib import Path

# Directories holding the per-item .json feature files, one directory per encoder,
# grouped by modality.
base_paths = [
    # text
    '../../2_encode_features/text_feat/minilm/',
    '../../2_encode_features/text_feat/mpnet/',
    '../../2_encode_features/text_feat/clip-text/',
    # image
    '../../2_encode_features/image_feat/vgg/',
    '../../2_encode_features/image_feat/vit/',
    '../../2_encode_features/image_feat/clip-image/',
    # video
    '../../2_encode_features/video_feat/r2p1d/',
    '../../2_encode_features/video_feat/mvit/',
    '../../2_encode_features/video_feat/slowfast/',
    # audio
    '../../2_encode_features/audio_feat/ast/',
    '../../2_encode_features/audio_feat/vggish/',
    '../../2_encode_features/audio_feat/whisper',
]

def get_all_ids(base_path):
    """Return the set of integer IDs of the ``<id>.json`` files under ``base_path``.

    The directory is searched recursively (``Path.rglob``), so files in
    sub-directories are included as well.

    Args:
        base_path: directory (str or path-like) to scan.

    Returns:
        set[int]: one entry per ``*.json`` file whose name (without the
        extension) parses as an integer.
    """
    ids = set()

    # finds all .json files in all subdirectories
    for file_path in Path(base_path).rglob('*.json'):
        # .stem returns the filename without the extension (e.g., '1' from '1.json')
        try:
            ids.add(int(file_path.stem))
        except ValueError:
            # Not an item file (e.g. 'metadata.json'): ignore it instead of
            # aborting the whole scan.
            continue
    return ids

# Step 1: scan every encoder folder and collect the IDs it contains
sets_of_ids = list(map(get_all_ids, base_paths))

# Step 2: keep only the IDs that appear in every folder
common_ids = set.intersection(*sets_of_ids)

# Report how many items have features for all encoders
n_common = len(common_ids)
print(f"Total common IDs found: {n_common}")

Total common IDs found: 19227
Sample IDs: [131072, 1, 131074, 65538, 32770]


In [None]:
# check that the common IDs have been correctly computed
# (`base_paths` is the list defined in the previous cell; it is reused here
# rather than repeated so the two cells can never drift apart)
for path in base_paths:
    print(path)

    # IDs of the .json files sitting directly in this folder. A set makes the
    # comparison below a single set difference instead of one linear scan per ID.
    ids_on_disk = set()
    for file_name in os.listdir(path):
        stem, ext = os.path.splitext(file_name)
        if ext != '.json':
            continue
        try:
            ids_on_disk.add(int(stem))
        except ValueError:
            # a .json file that is not named after an item ID
            continue

    # Every common ID must be present in every folder; report the ones that are not
    for missing_id in sorted(common_ids - ids_on_disk):
        print(missing_id, 'not found')
            

../../text_feat/minilm/
../../text_feat/mpnet/
../../text_feat/clip-text/
../../image_feat/vgg/
../../image_feat/vit/
../../image_feat/clip-image/
../../video_feat/r2p1d/
../../video_feat/mvit/
../../video_feat/slowfast/
../../audio_feat/ast/
../../audio_feat/vggish/
../../audio_feat/whisper


In [22]:
# Keep only the interactions whose movie has features available for every modality
has_all_modalities = ratings['movieId'].isin(common_ids)
ratings_mm = ratings[has_all_modalities]

n_original, n_filtered = len(ratings), len(ratings_mm)
print(f'Original interactions: {n_original}')
print(f'Filtered interactions: {n_filtered}')

Original interactions: 20000263
Filtered interactions: 18777965


In [35]:
# Sanity check: rows with a missing movie ID (an empty frame means there are none)
ratings_mm.loc[ratings_mm['movieId'].isna()]

Unnamed: 0,userId,movieId,rating,timestamp


In [36]:
# Sanity check: rows with a missing user ID (an empty frame means there are none)
ratings_mm.loc[ratings_mm['userId'].isna()]

Unnamed: 0,userId,movieId,rating,timestamp


In [43]:
# Count the distinct users and items left after filtering
n_users = len(set(ratings_mm['userId']))
n_items = len(set(ratings_mm['movieId']))
print(f"Number of users: {n_users}")
print(f"Number of items: {n_items}")

Number of users: 138493
Number of items: 19009


In [71]:
# Assign contiguous 0-based indices to users (0..n-1) and items (0..m-1),
# following the ascending order of the original IDs -> useful for MMRec
sorted_user_ids = sorted(set(ratings_mm['userId']))
sorted_item_ids = sorted(set(ratings_mm['movieId']))
user_map = dict(zip(sorted_user_ids, range(len(sorted_user_ids))))
item_map = dict(zip(sorted_item_ids, range(len(sorted_item_ids))))

print(f'User map len: {len(user_map)}')
print(f'Item map len: {len(item_map)}')

# Inverse item lookup (new index -> original movie ID); used later when the
# multimodal features are remapped
reverse_item_map = dict(enumerate(sorted_item_ids))

User map len: 138493
Item map len: 19009


In [65]:
import copy 

# Work on an independent copy so that `ratings_mm` keeps the original IDs,
# then replace user and movie IDs with their new contiguous indices
remapped_mm_ratings = copy.deepcopy(ratings_mm)
for id_column, id_lookup in (('userId', user_map), ('movieId', item_map)):
    remapped_mm_ratings[id_column] = ratings_mm[id_column].map(id_lookup)

In [66]:
# Binarize the feedback: a rating of 3.5 or more becomes 1 (positive), anything
# lower becomes 0 (negative). The timestamp column is dropped as well.
remapped_mm_ratings = (
    remapped_mm_ratings
    .assign(rating=lambda frame: (frame['rating'] >= 3.5).astype(int))
    .drop(columns=['timestamp'])
)

In [67]:
# Preview the remapped, binarized interactions
remapped_mm_ratings.head(n=5)

Unnamed: 0,userId,movieId,rating
1,0,25,1
2,0,28,1
3,0,42,1
4,0,45,1
5,0,94,1


In [None]:
# remap the multimodal features of each encoder:
# 1. iterate over the remapped item IDs in sorted order
# 2. recover the original movie ID of each item
# 3. load that item's embedding from its JSON file
# 4. stack the embeddings into a numpy array of shape [n_items x k]

import numpy as np
import json

def gather_mm_feat(folder, ids, id_map=None):
    """Stack the per-item embeddings stored in ``folder`` into one array.

    For each remapped item ID in ``ids`` (visited in ascending order) the
    original ID is looked up in ``id_map``, the file ``<original_id>.json``
    inside ``folder`` is read, and the value stored under the key
    ``str(original_id)`` becomes one row of the result.

    Args:
        folder: directory containing one ``<original_id>.json`` file per item;
            a trailing path separator is optional.
        ids: iterable of remapped item IDs.
        id_map: mapping from remapped ID to original ID. When omitted, the
            notebook-level ``reverse_item_map`` is used.

    Returns:
        np.ndarray: the stacked embeddings, one row per ID, ordered by
        ascending remapped ID. Its shape is also printed.

    Raises:
        FileNotFoundError: if the JSON file of an item is missing.
        KeyError: if an ID is not in ``id_map`` or a file lacks the
            ``str(original_id)`` key.
    """
    if id_map is None:
        id_map = reverse_item_map

    mm_feat = []
    for new_id in sorted(ids):
        old_id = id_map[new_id]
        # os.path.join yields the right path with or without a trailing slash
        json_path = os.path.join(folder, f'{old_id}.json')
        # the context manager closes each file right away; this loop opens
        # one file per item
        with open(json_path, 'r') as json_file:
            json_dict = json.load(json_file)
        mm_feat.append(np.array(json_dict[str(old_id)]))

    mm_array = np.array(mm_feat)
    print(mm_array.shape)
    return mm_array

# 1. Define all paths for the different modalities.
# NOTE(review): these point at the same `2_encode_features` tree that `base_paths`
# scans when `common_ids` is computed, so the files checked there are the files
# loaded here. Confirm this matches the repository layout.
FEATURES_ROOT = '../../2_encode_features'

paths = {
    # Text Encoders
    'minilm':      f'{FEATURES_ROOT}/text_feat/minilm/',
    'mpnet':       f'{FEATURES_ROOT}/text_feat/mpnet/',
    'clip_text':   f'{FEATURES_ROOT}/text_feat/clip-text/',

    # Image Encoders
    'vgg':         f'{FEATURES_ROOT}/image_feat/vgg/',
    'vit':         f'{FEATURES_ROOT}/image_feat/vit/',
    'clip_image':  f'{FEATURES_ROOT}/image_feat/clip-image/',

    # Video Encoders
    'r2p1d':       f'{FEATURES_ROOT}/video_feat/r2p1d/',
    'mvit':        f'{FEATURES_ROOT}/video_feat/mvit/',
    'slowfast':    f'{FEATURES_ROOT}/video_feat/slowfast/',

    # Audio Encoders
    'ast':         f'{FEATURES_ROOT}/audio_feat/ast/',
    'vggish':      f'{FEATURES_ROOT}/audio_feat/vggish/',
    'whisper':     f'{FEATURES_ROOT}/audio_feat/whisper/'
}

# 2. Extract the set of (remapped) movie IDs once
movie_ids = set(remapped_mm_ratings['movieId'])

# 3. Initialize storage dictionary
all_feats = {}

# 4. Loop through each encoder and gather features.
# A failing encoder is reported and skipped so the others can still be loaded.
print("Starting feature gathering...")
for name, path in paths.items():
    print(f"Processing {name}...")
    try:
        all_feats[name] = gather_mm_feat(path, movie_ids)
        print(f"Successfully loaded {name}. Shape: {all_feats[name].shape}")
    except FileNotFoundError as e:
        # e.filename names the first missing file, which is what one needs to debug
        print(f"Error: Could not find files for {name} at {path} (missing: {e.filename})")
    except Exception as e:
        print(f"An error occurred while processing {name}: {e}")

print("\nAll processing complete.")

# Make skipped encoders impossible to overlook: `all_feats` only holds the
# encoders that loaded successfully.
failed_encoders = [name for name in paths if name not in all_feats]
if failed_encoders:
    print(f"WARNING: no features were loaded for: {failed_encoders}")

Starting feature gathering...
Processing minilm...
(19009, 384)
Successfully loaded minilm. Shape: (19009, 384)
Processing mpnet...
(19009, 768)
Successfully loaded mpnet. Shape: (19009, 768)
Processing clip_text...
(19009, 512)
Successfully loaded clip_text. Shape: (19009, 512)
Processing vgg...
(19009, 4096)
Successfully loaded vgg. Shape: (19009, 4096)
Processing vit...
(19009, 768)
Successfully loaded vit. Shape: (19009, 768)
Processing clip_image...
(19009, 512)
Successfully loaded clip_image. Shape: (19009, 512)
Processing r2p1d...
(19009, 512)
Successfully loaded r2p1d. Shape: (19009, 512)
Processing mvit...
(19009, 768)
Successfully loaded mvit. Shape: (19009, 768)
Processing slowfast...
(19009, 2304)
Successfully loaded slowfast. Shape: (19009, 2304)
Processing ast...
(19009, 768)
Successfully loaded ast. Shape: (19009, 768)
Processing vggish...
(19009, 128)
Successfully loaded vggish. Shape: (19009, 128)
Processing whisper...
(19009, 512)
Successfully loaded whisper. Shape: (

In [None]:
import os

# Root folder for the exported feature arrays
base_output = 'data'

# Modality sub-folder that each encoder's features are written to
modality_map = dict(
    minilm='text',
    mpnet='text',
    clip_text='text',
    vgg='image',
    vit='image',
    clip_image='image',
    r2p1d='video',
    mvit='video',
    slowfast='video',
    ast='audio',
    vggish='audio',
    whisper='audio',
)

# Write one <encoder>.npy file per loaded encoder under <base_output>/<modality>/
for name, array in all_feats.items():
    modality = modality_map.get(name)
    if not modality:
        # encoder without a known modality: nothing is written for it
        continue

    target_dir = os.path.join(base_output, modality)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    file_path = os.path.join(target_dir, f"{name}.npy")
    np.save(file_path, array)
    print(f"Saved: {file_path}")

Saved: text/minilm.npy
Saved: text/mpnet.npy
Saved: text/clip_text.npy
Saved: image/vgg.npy
Saved: image/vit.npy
Saved: image/clip_image.npy
Saved: video/r2p1d.npy
Saved: video/mvit.npy
Saved: video/slowfast.npy
Saved: audio/ast.npy
Saved: audio/vggish.npy
Saved: audio/whisper.npy


In [83]:
# finally, split the interaction data into train, valid, test as required by MMRec and save the interaction file
from sklearn.model_selection import train_test_split

# First cut: 80% of the interactions go to train, 20% are held out
train_df, temp_df = train_test_split(remapped_mm_ratings, test_size=0.2, random_state=42)

# Second cut: the held-out part is halved into validation and test
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Tag every row with its split: 0 = train, 1 = valid, 2 = test
for split_label, split_df in enumerate((train_df, valid_df, test_df)):
    split_df['x_label'] = split_label

# Put the three parts back together, restoring the original row order
labelled_parts = [train_df, valid_df, test_df]
remapped_mm_ratings = pd.concat(labelled_parts).sort_index()

# Show the share of rows assigned to each split
split_shares = remapped_mm_ratings['x_label'].value_counts(normalize=True)
print(split_shares)

0    0.8
2    0.1
1    0.1
Name: x_label, dtype: float64


In [None]:
# Rename the ID columns by name rather than by position, so a different column
# order can never silently mislabel the data.
remapped_mm_ratings = remapped_mm_ratings.rename(columns={'userId': 'userID', 'movieId': 'itemID'})

# Make sure the output folder exists before writing into it
os.makedirs('data', exist_ok=True)

# Write the tab-separated interaction file with the same columns, in the same order, as before
remapped_mm_ratings[['userID', 'itemID', 'rating', 'x_label']].to_csv(
    'data/ml20m.inter', sep='\t', index=False, header=True
)