In [1]:
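# Reproducible preprocessing of the DBbook dataset: core-5 filtering, selection of items with multimodal data, and train/valid/test split in MMRec format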

In [2]:
import pandas as pd

In [3]:
# load the train/test split published at https://github.com/swapUniba/Deep_CBRS_Amar/tree/master/datasets/dbbook
# column names are supplied explicitly, so the first line of each file is read as data
train = pd.read_csv(
    'original_data/train.tsv',
    sep='\t',
    names=['userID', 'itemID', 'rating'],
)
test = pd.read_csv(
    'original_data/test.tsv',
    sep='\t',
    names=['userID', 'itemID', 'rating'],
)

In [4]:
# stack both splits into one interaction table (the original row labels are kept as-is)
ratings = pd.concat(objs=(train, test), axis=0)
ratings.head(n=5)

Unnamed: 0,userID,itemID,rating
0,6873,5950,1
1,6873,8010,1
2,6873,5232,1
3,6873,7538,1
4,6873,5231,0


In [5]:
# summary of the full, unfiltered interaction table
print(f'Ratings: {ratings.shape[0]}')
print(f'Users: {ratings["userID"].nunique(dropna=False)}')
print(f'Items: {ratings["itemID"].nunique(dropna=False)}')

Ratings: 140360
Users: 6181
Items: 7672


In [6]:
# apply core-5 filtering to interactions

def core_k_filtering(interactions, k):
    """
    Perform k-core filtering on a user-item-rating DataFrame.

    Rows are removed repeatedly until every remaining user and every
    remaining item has at least ``k`` interactions. Removing an item can push
    a user below the threshold (and vice versa), hence the loop.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Interaction table with at least the columns ``userID`` and ``itemID``.
        The frame passed in is not modified.
    k : int
        Minimum number of interactions required for a user or an item to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``interactions`` that survive the filtering (possibly empty),
        with their original index labels.
    """
    while True:
        user_counts = interactions['userID'].value_counts()
        item_counts = interactions['itemID'].value_counts()

        # use the requested threshold instead of a hard-coded 5
        valid_users = user_counts[user_counts >= k].index
        valid_items = item_counts[item_counts >= k].index

        core_k = interactions[interactions['userID'].isin(valid_users) & interactions['itemID'].isin(valid_items)]

        # nothing was dropped in this pass: the k-core is stable
        if len(core_k) == len(interactions):
            break

        interactions = core_k

    return core_k

# keep only users and items with at least 5 interactions
ratings_core5 = core_k_filtering(interactions=ratings, k=5)

In [7]:
# summary of the interaction table after core-5 filtering
print(f'Ratings: {ratings_core5.shape[0]}')
print(f'Users: {ratings_core5["userID"].nunique(dropna=False)}')
print(f'Items: {ratings_core5["itemID"].nunique(dropna=False)}')

Ratings: 132837
Users: 6179
Items: 4622


In [8]:
# read the extended mapping file and the texts file
extended_mapping = pd.read_csv('full_extended_dbbook_img_links.tsv', sep='\t')
texts = pd.read_csv('dbbook_texts.tsv', sep='\t')

# gather the item IDs known to each source
# NOTE: `texts` is rebound here from the loaded DataFrame to the set of its 'id' values
mapping_ids = set(extended_mapping['id'])
texts = set(texts['id'])
core5_ids = set(ratings_core5['itemID'])

# number of items present in both the mapping and the texts
print(len(mapping_ids & texts))

# of those, the items that also survived core-5 filtering: these are the ones we keep
keep_items = mapping_ids & texts & core5_ids
print(len(keep_items))

6170
4197


In [9]:
# restrict the core-5 interactions to the items selected in keep_items
multimodal_ratings = ratings_core5.loc[ratings_core5['itemID'].isin(keep_items)]
# summary of the resulting (item-restricted) interaction table
print(f'Ratings: {multimodal_ratings.shape[0]}')
print(f'Users: {multimodal_ratings["userID"].nunique(dropna=False)}')
print(f'Items: {multimodal_ratings["itemID"].nunique(dropna=False)}')

Ratings: 121924
Users: 6179
Items: 4197


In [10]:
# split into train, valid, test - here we use the format required by MMRec, but any splitting strategy can be applied
# to avoid errors when training the models, each user's interactions are split roughly 80-10-10 (the test part takes the remainder)
# and only users and items that appear in the training set are kept in the validation and test sets

import numpy as np

def split_data(df, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, seed=42):
    """
    Per-user random split of an interaction table into train / validation / test.

    Each user's rows are shuffled with ``random_state=seed``; the first
    ``int(n * train_ratio)`` rows go to train, the next ``int(n * valid_ratio)``
    rows go to validation, and whatever is left goes to test. Afterwards,
    validation and test rows whose user or item never occurs in the training
    part are dropped, so neither set introduces unseen users or items.

    ``test_ratio`` is accepted but never read: the test part is always the
    remainder. The global NumPy seed is also set to ``seed`` as a side effect.

    Returns a ``(train_df, valid_df, test_df)`` tuple of DataFrames with
    fresh 0..n-1 indexes.
    """
    np.random.seed(seed)

    parts = {'train': [], 'valid': [], 'test': []}

    for _, group in df.groupby("userID"):
        shuffled = group.sample(frac=1, random_state=seed)

        total = len(shuffled)
        first_cut = int(total * train_ratio)
        second_cut = first_cut + int(total * valid_ratio)

        parts['train'].append(shuffled.iloc[:first_cut])
        parts['valid'].append(shuffled.iloc[first_cut:second_cut])
        parts['test'].append(shuffled.iloc[second_cut:])

    train_df, valid_df, test_df = (
        pd.concat(parts[name]).reset_index(drop=True)
        for name in ('train', 'valid', 'test')
    )

    # users and items the models will have seen during training
    known_users = set(train_df['userID'])
    known_items = set(train_df['itemID'])

    def _only_known(frame):
        # drop rows that refer to a user or an item absent from the training part
        seen = frame['userID'].isin(known_users) & frame['itemID'].isin(known_items)
        return frame[seen].reset_index(drop=True)

    return train_df, _only_known(valid_df), _only_known(test_df)

# run the split with the default ratios and seed
# NOTE: this rebinds `train` and `test`, which until now held the raw split read from original_data/
train, valid, test = split_data(df=multimodal_ratings)

print(f'Len training: {train.shape[0]}')
print(f'Len valid: {valid.shape[0]}')
print(f'Len test: {test.shape[0]}')

Len training: 95075
Len valid: 9489
Len test: 17350


In [11]:
# format data into MMRec format
# MMRec requires a single .inter file, with a column named 'x_label' that can assume 3 possible values:
# - 0: the interaction is in the training set
# - 1: the interaction is in the validation set
# - 2: the interaction is in the test set
# NOTE: the label column is added in place to the train/valid/test frames produced by the split
train['x_label'] = 0
valid['x_label'] = 1
test['x_label'] = 2

# now, we concatenate the three dataframes into the single table that is saved as the .inter file required by MMRec
# WARNING: from here on the name `split_data` refers to this DataFrame and no longer to the
# split_data() function defined earlier; re-running the split cell after this one fails unless
# the function definition is executed again first
split_data = pd.concat([train, valid, test])

# set the column names explicitly (user, item, rating, split label)
split_data.columns = ['userID', 'itemID', 'rating', 'x_label']

# we save this dataset with the original IDs (tab-separated, without the index column)
split_data.to_csv('processed_data/dbbook_og_ids.inter', sep='\t', index=False)

In [12]:
# build dense 0..n-1 IDs for users and items, numbered in order of first appearance in split_data

map_users = {original: dense for dense, original in enumerate(pd.unique(split_data['userID']))}
map_items = {original: dense for dense, original in enumerate(pd.unique(split_data['itemID']))}

In [13]:
# replace the original user/item IDs with their dense 0..n-1 IDs, in place
# WARNING: this cell is not idempotent - running it a second time looks up the already
# remapped IDs in map_users/map_items (whose keys are the original IDs); re-run the
# cell that builds split_data first
split_data['userID'] = split_data['userID'].map(map_users)
split_data['itemID'] = split_data['itemID'].map(map_items)
# save the remapped interactions (tab-separated, without the index column)
split_data.to_csv('processed_data/dbbook.inter', sep='\t', index=False)


In [14]:
# finally, use these mappings to remap the multimodal features and save them as a np.array