In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import pickle

# Seed NumPy's legacy global RNG; the random train/test splits further down
# draw from this same global stream.
np.random.seed(42)

# Sanity check: with seed 42 the first draw from 0..999 is expected to be 102.
# NOTE: this check itself consumes one draw from the global stream, so it is
# part of the reproducible sequence and must not be removed or duplicated.
_first_draw = np.random.choice(np.arange(1000))
if _first_draw != 102:
    raise ValueError("Random seed is not set correctly.")

In [5]:
# 1) Load and preprocess the MovieLens 1M dataset
DATASET = 'ml-1m'
# The artifacts folder is resolved relative to the current working directory
# (its third ancestor), so this cell depends on where the kernel was started.
base_artifacts = Path.cwd().resolve().parents[2] / 'CausalI2I_artifacts'

ratings_columns = ['user_id', 'item_id', 'interaction', 'timestamp']
ratings_path = base_artifacts / 'Datasets' / 'Raw' / DATASET / 'ratings.dat'

# The file uses the multi-character separator '::', which requires the
# (slower) Python parsing engine.
data = (
    pd.read_csv(ratings_path, sep='::', names=ratings_columns, engine='python')
    # Whatever the third column holds in the file is discarded: every observed
    # row becomes a binary interaction flag of 1.
    .assign(interaction=1)
    # Chronological order across all users.
    .sort_values(by='timestamp')
)

# Load the item metadata (id, title, genre) that accompanies the ratings file.
item_data = pd.read_csv(
    base_artifacts / 'Datasets' / 'Raw' / DATASET / 'movies.dat',
    sep='::',
    names=['item_id', 'title', 'genre'],
    engine='python',
    encoding='iso-8859-1')

# Lookup table: original item id -> title.
# Built in a single pass instead of re-filtering the whole frame once per item
# (which was quadratic in the number of items). Should an id ever appear more
# than once, the first title listed for it wins, exactly as before.
_first_title_rows = item_data.drop_duplicates(subset='item_id', keep='first')
get_item_name = dict(zip(_first_title_rows['item_id'], _first_title_rows['title']))

In [6]:
# 2) Get unique users, items, and counts
# pd.unique keeps ids in order of first appearance in `data`
# (which is sorted by timestamp at this point).
unique_users = pd.unique(data['user_id'])
unique_items = pd.unique(data['item_id'])

n_users, n_items = len(unique_users), len(unique_items)

In [7]:
# 3) Divide users and items into train and test sets
# Both draws come from the global NumPy RNG: users are sampled first, items
# second. Keep this order, otherwise the resulting split changes.
test_users = np.random.choice(unique_users, size=int(0.5 * n_users), replace=False)
test_items = np.random.choice(unique_items, size=int(0.2 * n_items), replace=False)

# Hand-picked item ids that must always be part of the test items.
# NOTE(review): presumably specific titles of interest for the evaluation;
# confirm their identity against movies.dat.
forced_test_items = np.array([858, 1221, 2023, 1270, 2011, 2012, 1954, 2409, 2410, 2411, 2412])
not_yet_selected = ~np.isin(forced_test_items, test_items)
test_items = np.concatenate([test_items, forced_test_items[not_yet_selected]])

# Everything that is not a test user / test item is a train user / train item.
train_users = np.setdiff1d(unique_users, test_users)
train_items = np.setdiff1d(unique_items, test_items)

In [8]:
# 4) Create train and test sets
# Enumerate every (user, item) combination so that pairs without an observed
# rating can be imputed as non-interactions.
users_frame = pd.DataFrame({'user_id': unique_users})
items_frame = pd.DataFrame({'item_id': unique_items})
all_pairs = users_frame.merge(items_frame, how='cross')

# Left-join the observed rows: pairs absent from `data` get NaN for both
# 'interaction' and 'timestamp'; the former is then filled with 0.
data_imp = (
    all_pairs
    .merge(data, on=['user_id', 'item_id'], how='left')
    .loc[:, ['user_id', 'item_id', 'interaction', 'timestamp']]
    .assign(interaction=lambda frame: frame['interaction'].fillna(0).astype(int))
)

# Test = (test user AND test item); train = (train user OR train item).
is_train_user = data_imp['user_id'].isin(train_users)
is_train_item = data_imp['item_id'].isin(train_items)
is_test_user = data_imp['user_id'].isin(test_users)
is_test_item = data_imp['item_id'].isin(test_items)

train = data_imp[is_train_user | is_train_item].copy()
test = data_imp[is_test_user & is_test_item].copy()

train_rate = train['interaction'].mean()
test_rate = test['interaction'].mean()
print("Proportion of 1's in imputed sets:")
print(f"    Train:  {train_rate:.2%}")
print(f"    Test:   {test_rate:.2%}")

Proportion of 1's in imputed sets:
    Train:  4.48%
    Test:   4.38%


In [9]:
# 5) Reindex users and items to start from 0
# New ids follow the order of first appearance in unique_users / unique_items.
old2new_users = dict(zip(unique_users, range(len(unique_users))))
old2new_items = dict(zip(unique_items, range(len(unique_items))))

# Title lookup keyed by the new, zero-based item id.
item_dict = {new: get_item_name[old] for old, new in old2new_items.items()}

# Remap ids in place on the imputed splits and on the raw interaction log.
# WARNING: this cell is not idempotent -- running it a second time would feed
# already-remapped ids through the old->new mapping again.
for frame in (train, test, data):
    frame['user_id'] = frame['user_id'].map(old2new_users)
    frame['item_id'] = frame['item_id'].map(old2new_items)

# Data for SASRec
# Per-user chronological order; item_id is the final tie-breaker.
data = data.sort_values(by=['user_id', 'timestamp', 'item_id']).reset_index(drop=True)

In [10]:
# 6) Save to disk
print("Saving files to disk...\n")

processed_dir = base_artifacts / 'Datasets' / 'Processed' / DATASET
# Neither DataFrame.to_csv nor open() creates missing parent directories, so
# make sure the output folder exists before writing (no-op if it already does).
processed_dir.mkdir(parents=True, exist_ok=True)

# Imputed train/test splits plus the re-indexed interaction log for SASRec.
train.to_csv(processed_dir / 'train.csv', index=False)
test.to_csv(processed_dir / 'test.csv', index=False)
data.to_csv(processed_dir / 'data_sasrec.csv', index=False)

# Mapping from new item id to title.
with open(processed_dir / 'item_dict.pkl', 'wb') as f:
    pickle.dump(item_dict, f)

print('Files saved to disk.')

Saving files to disk...

Files saved to disk.


In [11]:
# 7) Choose 10K pairs
# Rebuild the full user x item interaction matrix from both splits.
full_data = pd.concat([train, test], axis=0)
big_pivot = full_data.pivot(index='user_id', columns='item_id', values='interaction')
# NOTE: from here on `test_items` holds the re-indexed ids found in `test`,
# replacing the original-id array created during the split step.
test_items = test['item_id'].unique()

# Column-wise standardisation followed by M^T M / (n - 1) yields the sample
# correlation between the interaction columns of the test items.
X = big_pivot[test_items].values
mean = X.mean(axis=0)
# Floor the standard deviation so constant columns do not divide by zero.
std = np.maximum(X.std(axis=0, ddof=1), 1e-8)
M = (X - mean) / std
corr_mat = (M.T @ M) / (M.shape[0] - 1)
# Zero the diagonal so an item is never selected as its own best partner.
np.fill_diagonal(corr_mat, 0)

# Flat positions of the 10,000 largest entries, in ascending correlation order.
# NOTE(review): corr_mat is built as M.T @ M, so mirrored entries (i, j) and
# (j, i) carry the same correlation up to floating-point rounding and are both
# eligible here -- confirm that ordered pairs are what downstream code expects.
best_flat_pair_idx = corr_mat.ravel().argsort()[-10000:]

# Decode flat positions; each pair is stored as (remainder, quotient).
quotients, remainders = np.divmod(best_flat_pair_idx, len(test_items))
best_pairs_idx = list(zip(remainders, quotients))

# Translate positions within test_items into titles.
test_titles = [item_dict[item] for item in test_items]
best_pairs_titles = [(test_titles[i], test_titles[j]) for i, j in best_pairs_idx]

In [12]:
# Persist the selected (title, title) pairs.
chosen_pairs_dir = base_artifacts / 'Chosen_Pairs'
# open() does not create missing directories; ensure the target folder exists.
chosen_pairs_dir.mkdir(parents=True, exist_ok=True)
with open(chosen_pairs_dir / f'{DATASET}_chosen_pairs.pkl', "wb") as f:
    pickle.dump(best_pairs_titles, f)

In [None]:
# with open(base_artifacts / 'Chosen_Pairs' / f'{DATASET}_chosen_pairs.pkl', "rb") as f:
#     up = pickle.load(f)