In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# Seed the legacy global NumPy RNG, then confirm the first draw is the value
# this seed is known to produce. The check itself consumes one draw, so every
# later np.random call in the notebook starts from the state *after* it.
np.random.seed(42)
_sanity_draw = np.random.choice(np.arange(1000))
if _sanity_draw != 102:
    raise ValueError("Random seed is not set correctly.")

In [2]:
# 1) Load and preprocess the Synthetic MovieLens 1M dataset
# The artifacts folder is resolved two levels above the current working
# directory, so this cell depends on where the kernel was started.
base_artifacts = Path.cwd().resolve().parents[1] / 'CausalI2I_artifacts'
data_path = base_artifacts.joinpath('Datasets', 'Simulation')

# Read the interaction log and order its rows by ascending timestamp.
data = pd.read_csv(data_path / 'synthetic.csv', engine='python').sort_values(by='timestamp')

In [3]:
# 2) Divide users and items into train and test sets
# Half of the distinct users and a fifth of the distinct items are drawn
# without replacement as test IDs; whatever is not drawn forms the train pools.
# NOTE: both draws use the global NumPy RNG, users first and items second.
# Swapping the two np.random.choice calls would change the resulting split.

unique_users = data['user_id'].unique()
unique_items = data['item_id'].unique()
n_users, n_items = len(unique_users), len(unique_items)

test_users = np.random.choice(unique_users, size=int(0.5 * n_users), replace=False)
test_items = np.random.choice(unique_items, size=int(0.2 * n_items), replace=False)

# np.setdiff1d returns the sorted, de-duplicated IDs that were not sampled.
train_users = np.setdiff1d(unique_users, test_users)
train_items = np.setdiff1d(unique_items, test_items)

In [4]:
# 4) Create train and test sets
# (No separate validation split is produced in this cell.)

# Full user x item grid, so that pairs missing from the log become explicit rows.
all_pairs = pd.merge(
    left=pd.DataFrame({'user_id': unique_users}),
    right=pd.DataFrame({'item_id': unique_items}),
    how='cross')

# Left-join the logged interactions onto the grid. Pairs with no logged
# interaction receive watched = 0; their timestamp is left missing.
# The column selection and the `watched` fill are chained so that no column is
# assigned into a frame that was itself produced by column selection (the
# pattern that can raise pandas' SettingWithCopyWarning).
data_imp = (
    pd.merge(left=all_pairs, right=data, on=['user_id', 'item_id'], how='left')
    .loc[:, ['user_id', 'item_id', 'watched', 'timestamp']]
    .assign(watched=lambda frame: frame['watched'].fillna(0).astype(int))
)

# Test  = pairs whose user AND item were both held out.
# Train = pairs that involve a train user OR a train item.
train = data_imp[
    data_imp['user_id'].isin(train_users) | data_imp['item_id'].isin(train_items)
].copy()
test = data_imp[
    data_imp['user_id'].isin(test_users) & data_imp['item_id'].isin(test_items)
].copy()

In [5]:
# 5) Get ground truth causal effects
# The header read from the file is replaced with fixed names for its three
# columns; set_axis raises if the file does not have exactly three columns.
gt = pd.read_csv(data_path / 'ground_truth.csv', engine='python').set_axis(
    ['cause_id', 'effect_id', 'causal_effect'], axis=1)

In [6]:
# 6) Re-index user and item IDs
# Every ID column is shifted down by one (presumably turning 1-based IDs into
# 0-based ones -- TODO confirm against the raw files).
# Plain Series arithmetic is vectorized; it replaces a per-element
# .apply(lambda x: x - 1) and produces the same values.
# NOTE: this cell overwrites train/test/gt and is not idempotent. Re-running
# it without re-running the cells that build those frames shifts the IDs again.
train['user_id'] = train['user_id'] - 1
train['item_id'] = train['item_id'] - 1
test['user_id'] = test['user_id'] - 1
test['item_id'] = test['item_id'] - 1
gt['cause_id'] = gt['cause_id'] - 1
gt['effect_id'] = gt['effect_id'] - 1

In [7]:
# 7) Save to disk
# Each frame is written as CSV (without the index) next to the input files,
# in the order train, test, ground truth.
print("Saving files to disk...")
for _frame, _filename in (
    (train, 'train.csv'),
    (test, 'test.csv'),
    (gt, 'ground_truth_processed.csv'),
):
    _frame.to_csv(data_path / _filename, index=False)
print('Files saved to disk.')

Saving files to disk...
Files saved to disk.
