In [18]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch

import pickle
from pathlib import Path

import sys
sys.path.insert(0, str(Path.cwd().resolve().parents[0] / '2_Propensities'))
import MF_class as MF

# 1 Choosing Dataset

In [19]:
# Datasets this notebook can evaluate; edit the index on the DATASET line to switch.
datasets = [
    'ml-1m',
    'steam',
    'goodreads',
]
DATASET = datasets[0]
print(f"Using dataset: {DATASET}")

Using dataset: ml-1m


# 2 Loading Dataset and Propensities Model

In [20]:
# Artifact root sits two directory levels above the current working directory.
base_artifacts = Path.cwd().resolve().parents[1] / 'CausalI2I_artifacts'
path = base_artifacts / 'Datasets' / 'Processed' / DATASET

train = pd.read_csv(path / 'train.csv')
test = pd.read_csv(path / 'test.csv')
full_data = pd.concat([train, test], ignore_index=True)

# NOTE(review): pickle can execute arbitrary code while loading; only read artifacts
# produced by this project.
item_dict = pickle.loads((path / 'item_dict.pkl').read_bytes())
# Inverse lookup of item_dict (presumably item id -> title; verify against the
# preprocessing notebook). Duplicate values would collapse to a single key here.
title2id = {title: item_id for item_id, title in item_dict.items()}

# User count is taken from the train split only; item count from the dictionary.
n_users = train['user_id'].unique().size
n_items = len(item_dict)

# Chosen (title_A, title_B) pairs, translated to id pairs through title2id.
chosen_pairs = pickle.loads(
    (base_artifacts / 'Chosen_Pairs' / f'{DATASET}_chosen_pairs.pkl').read_bytes()
)
chosen_pairs_ids = [(title2id[a], title2id[b]) for a, b in chosen_pairs]

In [21]:
model_path = base_artifacts / 'Propensity_Models'
# iterdir() yields entries in arbitrary order; sorting makes the pick reproducible
# when several model files match the dataset name.
all_model_names = sorted(p.name for p in model_path.iterdir() if p.is_file())
matching_model_names = [name for name in all_model_names if DATASET in name]
if not matching_model_names:
    raise FileNotFoundError(
        f"No propensity model whose file name contains '{DATASET}' was found in {model_path}"
    )
model_name = matching_model_names[0]
# The factor count is parsed from the first '_'-separated token with its first two
# characters dropped.
# NOTE(review): presumably file names look like 'MF<k>_<dataset>...'; confirm
# against the notebook that saves the models.
n_factors = int(model_name.split('_')[0][2:])

model = MF.MatrixFactorizationTorch(n_users=n_users, n_items=n_items, n_factors=n_factors)
model.load(path=model_path / model_name)

Loaded model summary:
Model:                      MatrixFactorizationTorch
Number of users:            6040
Number of items:            3706
Number of factors:          20
Learning rate:              0.005
Weight decay:               1e-07
Positive weight:            1
Batch size:                 32768
Number of epochs:           20
Device:                     cuda:0
Use AMP:                    True
Timestamp:                  2025-12-19 16:50:42


# 3 Processing ChatGPT Response

In [22]:
folder = base_artifacts / 'API_Results' / DATASET
# iterdir() order is arbitrary; sort so the same CSV is chosen on every run.
csvs_in_folder = sorted(p.name for p in folder.iterdir() if p.is_file() and p.suffix == '.csv')
if not csvs_in_folder:
    raise FileNotFoundError(f"No oracle CSV file found in {folder}")
# NOTE(review): only the first CSV (alphabetically) is used; any other CSVs in the
# folder are ignored.
oracle_file_name = csvs_in_folder[0]
oracle = pd.read_csv(folder / oracle_file_name)

In [23]:
def clean_string(s):
    """Strip one pair of enclosing single quotes from `s`; return any other value unchanged."""
    # Non-strings (e.g. NaN from an empty CSV field) and strings shorter than two
    # characters cannot be quote-wrapped, and indexing them blindly would raise.
    if isinstance(s, str) and len(s) >= 2 and s[0] == "'" and s[-1] == "'":
        return s[1:-1]
    return s


oracle['title_A'] = oracle['title_A'].apply(clean_string)
oracle['title_B'] = oracle['title_B'].apply(clean_string)

# A pair is "known" only when both of its titles are keys of title2id.
known_titles = title2id.keys()
known_title_set = set(known_titles)
oracle['titles_known'] = (
    oracle['title_A'].isin(known_title_set) & oracle['title_B'].isin(known_title_set)
)
unknown_mask = ~oracle['titles_known']
print(f"Filling {unknown_mask.sum()} pairs with unknown titles.")

# Overwrite unrecognised titles with the titles originally requested for that row.
# NOTE(review): this uses oracle's index labels as positions into chosen_pairs, so
# it assumes the CSV rows are in the same order as chosen_pairs; confirm.
filling_idx = oracle[unknown_mask].index
for idx in filling_idx:
    oracle.loc[idx, 'title_A'] = chosen_pairs[idx][0]
    oracle.loc[idx, 'title_B'] = chosen_pairs[idx][1]

pd.DataFrame(oracle['causal_effect'].value_counts())

Filling 2 pairs with unknown titles.


Unnamed: 0_level_0,count
causal_effect,Unnamed: 1_level_1
0,9862
1,84
2,34
3,20


# 4 Defining Baselines

In [24]:
# Dense user x item table of the `interaction` column over train + test;
# (user, item) combinations that have no row are filled with 0.
pivot_real = (
    full_data
    .pivot(index='user_id', columns='item_id', values='interaction')
    .fillna(0)
)
pivot_real_np = pivot_real.to_numpy()
# item_id -> column position inside pivot_real_np.
itemid_to_colidx_pivot_real = {
    item_id: position for position, item_id in enumerate(pivot_real.columns)
}

# Each row of model.Q divided by its own norm, so a dot product of two rows is
# their cosine similarity.
# NOTE(review): presumably model.Q holds one embedding row per item id; confirm in MF_class.
Q_normalized = (model.Q / model.Q.norm(dim=1, keepdim=True)).detach().cpu().numpy()

def cosimilarity(idx1, idx2):
    """Return the dot product of rows `idx1` and `idx2` of Q_normalized.

    Because those rows were divided by their norm when Q_normalized was built,
    the result is the cosine similarity of the two items' factor vectors.
    """
    first_vec = Q_normalized[idx1]
    second_vec = Q_normalized[idx2]
    return first_vec @ second_vec

def correlation(idx1, idx2):
    """Return the Pearson correlation between the interaction columns of two items.

    `idx1` and `idx2` are item ids; they are mapped to columns of pivot_real_np
    through itemid_to_colidx_pivot_real. Returns 0 when either column is constant.
    """
    treatment_col, outcome_col = (
        pivot_real_np[:, itemid_to_colidx_pivot_real[item_id]] for item_id in (idx1, idx2)
    )
    # A constant column has zero variance, which would make the coefficient NaN.
    if treatment_col.std() == 0 or outcome_col.std() == 0:
        return 0
    corr_matrix = np.corrcoef(treatment_col, outcome_col)
    return corr_matrix[0, 1]

def diff_of_conditionals(idx1, idx2):
    """Return P(Y|T) - P(Y|~T), with T the column of item `idx1` and Y that of item `idx2`.

    Probabilities are column means of pivot_real_np. P(T) is clipped to
    [1e-6, 1 - 1e-6] so that neither denominator can be zero.
    """
    t_col = pivot_real_np[:, itemid_to_colidx_pivot_real[idx1]]
    y_col = pivot_real_np[:, itemid_to_colidx_pivot_real[idx2]]

    p_treated = np.clip(t_col.mean(), 1e-6, 1 - 1e-6)
    p_joint = (t_col * y_col).mean()

    p_y_given_t = p_joint / p_treated
    # P(Y, ~T) = P(Y) - P(Y, T)
    p_y_given_not_t = (y_col.mean() - p_joint) / (1 - p_treated)
    return p_y_given_t - p_y_given_not_t

def jacard_index(idx1, idx2):
    """Return the Jaccard index of the user sets that interacted with each item.

    A user counts as having interacted when the pivot_real_np entry is > 0.
    Returns 0 when no user interacted with either item.
    """
    has_first = pivot_real_np[:, itemid_to_colidx_pivot_real[idx1]] > 0
    has_second = pivot_real_np[:, itemid_to_colidx_pivot_real[idx2]] > 0

    n_either = np.sum(has_first | has_second)
    if n_either == 0:
        return 0
    n_both = np.sum(has_first & has_second)
    return n_both / n_either

## 4.1 SASRec

In [25]:
# Make the sibling '4_SASRec' folder importable before importing the project's SASRec class.
sys.path.insert(0, str(Path.cwd().resolve().parents[0] / '4_SASRec'))
import SASRec_class as sasrec

# model_path is rebound here; it pointed at the propensity-model folder before this cell.
model_path = base_artifacts / 'SASRec_Models'
# NOTE(review): pickle can execute arbitrary code while loading; only read trusted artifacts.
init_dict_loaded = pickle.loads(
    (model_path / f'sasrec_{DATASET}_init_dict.pkl').read_bytes()
)
# Rebuild the model from its saved constructor arguments, then load the saved .pt file.
sasrec_model = sasrec.SASRecTorch(**init_dict_loaded)
sasrec_model.load(model_path / f'sasrec_{DATASET}.pt')

Model loaded from /home/gouni/CausalI2I_artifacts/SASRec_Models/sasrec_ml-1m.pt.
num_items:     3706
max_seq_len:   200
device:        cuda
batch_size:    128
lr:            0.001
weight_decay:  0.0
num_epochs:    10
saved_at:      2025-12-22 18:23:04
note:          None




In [26]:
def make_sequence(cause_id):
    """Build a (1, max_seq_len) long tensor of padding with `cause_id` in the last slot.

    The padding value is sasrec_model.num_items, and the tensor is created on
    sasrec_model.device.
    """
    pad_token = sasrec_model.num_items
    seq = torch.full(
        size=(1, sasrec_model.max_seq_len),
        fill_value=pad_token,
        dtype=torch.long,
        device=sasrec_model.device,
    )
    # Only the final position carries a real item.
    seq[:, -1] = cause_id
    return seq

In [27]:
sasrec_model.eval()

sasrec_scores = {}
sasrec_ranks = {}
candidates = torch.arange(0, sasrec_model.num_items, device=sasrec_model.device)
# Inference only: disable autograd so the forward passes do not record gradient state.
with torch.no_grad():
    for pair in tqdm(chosen_pairs_ids):
        cause_id, effect_id = pair
        seq = make_sequence(cause_id)
        # Row 0 of the score matrix: one score per candidate item for this sequence.
        candidates_scores = sasrec_model.predict_scores(seq, candidates).detach().cpu().numpy()[0]
        # Record Score
        # NOTE(review): indexing by effect_id assumes item ids are the 0-based
        # positions of `candidates`; confirm.
        sasrec_scores[pair] = candidates_scores[effect_id]
        # Record Rank: position of the effect item when scores are sorted descending.
        rank = np.where(candidates_scores.argsort()[::-1] == effect_id)[0][0]   # 0-based rank
        # Best-ranked item gets 1.0, worst-ranked gets 1 / num_items.
        percentile = 1 - rank / sasrec_model.num_items
        # NOTE(review): sasrec_ranks is filled here, but process_pair only reads sasrec_scores.
        sasrec_ranks[pair] = percentile

  0%|          | 0/10000 [00:00<?, ?it/s]

# 5 Defining ATE

In [28]:
# Propensity-model output for every (user_id, item_id) row of the test split.
test_probs = model.predict_prob(
    torch.tensor(test['user_id'].values, dtype=torch.long),
    torch.tensor(test['item_id'].values, dtype=torch.long),
)

test_copy = test.copy()
if test_copy['timestamp'].dtype == 'O':
    # Object-typed timestamps are parsed to datetimes, cast to int64 and
    # integer-divided by 10**9 (presumably nanoseconds -> seconds; confirm).
    # Non-positive results are replaced with +inf.
    test_copy['timestamp'] = (
        (pd.to_datetime(test_copy['timestamp'], errors='coerce').astype(np.int64) // 10**9)
        .apply(lambda x: x if x > 0 else np.inf)
    )
test_copy['probability'] = test_probs.detach().cpu().numpy()

# user x item timestamp table; +inf marks (user, item) cells that have no test row.
pivot_test_timestamp = (
    test_copy
    .pivot(index='user_id', columns='item_id', values='timestamp')
    .fillna(np.inf)
)
pivot_test_timestamp_np = pivot_test_timestamp.to_numpy()
itemid_to_colidx = {
    item_id: position for position, item_id in enumerate(pivot_test_timestamp.columns)
}

# Same layout for the predicted probabilities; cells without a test row stay NaN.
pivot_test_pred = test_copy.pivot(index='user_id', columns='item_id', values='probability')
pivot_test_pred_np = pivot_test_pred.to_numpy()

In [29]:
# Per-item column slices of the pivot matrices, keyed by item id, so the per-pair
# code can fetch a column with a single dictionary lookup.
test_interaction_time_cols = {
    item_id: pivot_test_timestamp_np[:, col] for item_id, col in itemid_to_colidx.items()
}
pred_cols = {
    item_id: pivot_test_pred_np[:, col] for item_id, col in itemid_to_colidx.items()
}
all_interaction_cols = {
    item_id: pivot_real_np[:, col] for item_id, col in itemid_to_colidx_pivot_real.items()
}

In [30]:
def get_ATE(cause_times, effect_times, pi, clip=0, drop_inverted=True, stabilized=True):
    """
    Inverse-propensity-weighted estimate of the effect of a cause item on an effect item.

    A user is treated (T) when their cause time is finite and has the outcome (Y)
    when their effect time is finite; np.inf marks "no interaction".

    If stabilized=True  -> Hájek ratio IPW: E[Y(1)] ≈ mean(N1)/mean(D1), E[Y(0)] ≈ mean(N0)/mean(D0)
    If stabilized=False -> Horvitz–Thompson IPW:   E[Y(1)] ≈ mean(N1),    E[Y(0)] ≈ mean(N0)

    Parameters
    ----------
    cause_times, effect_times : array-like, one interaction time per user.
    pi : array-like of treatment propensities aligned with the time arrays.
    clip : propensities are clipped to [clip, 1 - clip] before weighting.
    drop_inverted : if True, drop users whose effect interaction precedes their
        cause interaction (users without a cause interaction are always kept).
    stabilized : selects the estimator, as described above.

    Returns
    -------
    dict with keys "ATE" (point estimate) and "STD" (sqrt(g' S g / n), where S is
    the sample covariance of the IPW terms and g the gradient of the estimator).
    Both are NaN when no user is left after filtering; "STD" is NaN when only
    one user is left, because the sample covariance is then undefined.
    """
    cause_times = np.asarray(cause_times)
    effect_times = np.asarray(effect_times)
    pi = np.asarray(pi)

    if drop_inverted:
        keep = (cause_times <= effect_times) | (cause_times == np.inf)
        cause_times, effect_times, pi = cause_times[keep], effect_times[keep], pi[keep]

    T = cause_times < np.inf
    Y = effect_times < np.inf
    n = len(T)

    # Nothing to average: return NaN explicitly instead of relying on warnings
    # from the mean of an empty array.
    if n == 0:
        return {"ATE": float("nan"), "STD": float("nan")}

    pi = np.clip(pi, clip, 1 - clip)

    # IPW pieces
    D_1 = T / pi                # weight of treated users
    D_0 = (1 - T) / (1 - pi)    # weight of untreated users
    N_1 = Y * D_1               # weighted outcome among treated
    N_0 = Y * D_0               # weighted outcome among untreated
    mN_1, mN_0 = N_1.mean(), N_0.mean()

    if stabilized:
        # Hájek (ratio) form; an arm with zero total weight contributes 0.
        mD_1, mD_0 = D_1.mean(), D_0.mean()
        EY_1 = mN_1 / mD_1 if mD_1 != 0 else 0.0
        EY_0 = mN_0 / mD_0 if mD_0 != 0 else 0.0
        Z = np.column_stack([N_1, D_1, N_0, D_0])
        # Gradient of N1/D1 - N0/D0 with respect to (N1, D1, N0, D0), taken at the means.
        g = np.array([
            1.0 / mD_1 if mD_1 != 0 else 0.0,
            -mN_1 / (mD_1 ** 2) if mD_1 != 0 else 0.0,
            -1.0 / mD_0 if mD_0 != 0 else 0.0,
            mN_0 / (mD_0 ** 2) if mD_0 != 0 else 0.0
        ])
    else:
        # Horvitz–Thompson (mean) form
        EY_1 = mN_1
        EY_0 = mN_0
        Z = np.column_stack([N_1, N_0])
        g = np.array([1.0, -1.0])

    ATE = EY_1 - EY_0

    if n < 2:
        # A sample covariance with ddof=1 needs at least two observations.
        STD = float("nan")
    else:
        S = np.cov(Z, rowvar=False, ddof=1)
        var_hat = (g @ S @ g) / n
        STD = float(np.sqrt(max(var_hat, 0.0)))     # numerical safety

    return {
        "ATE": ATE,
        "STD": STD,
    }

# 6 Generate Results

In [31]:
# Propensity clipping threshold used for the main ATE estimate, per dataset.
clip_dict = {'ml-1m': 0.1, 'steam': 0.01, 'goodreads': 0.01}
clip = clip_dict[DATASET]

def process_pair(pair):
    """Compute the ATE estimate, its ablation and every baseline score for one pair.

    `pair` is a (cause_id, effect_id) tuple of item ids. Returns a flat dict that
    becomes one row of the results table.
    """
    cause_id, effect_id = pair

    # Arguments shared by both estimator calls; only the clipping level differs.
    shared_kwargs = dict(
        cause_times=test_interaction_time_cols[cause_id],
        effect_times=test_interaction_time_cols[effect_id],
        pi=pred_cols[cause_id],
        drop_inverted=True,
        stabilized=True,
    )
    ate_dict = get_ATE(clip=clip, **shared_kwargs)
    # Ablation: clip=0.5 makes the clipping interval [0.5, 0.5], so every
    # propensity becomes 0.5 and the propensity model has no influence.
    abl_dict = get_ATE(clip=0.5, **shared_kwargs)

    return {
        "cause_id": cause_id,
        "effect_id": effect_id,
        "ATE": ate_dict["ATE"],
        "STD": ate_dict["STD"],
        "ABLT": abl_dict["ATE"],
        "STD_ABLT": abl_dict["STD"],
        "cosine_similarity": cosimilarity(cause_id, effect_id),
        "correlation": correlation(cause_id, effect_id),
        "diff_of_conditionals": diff_of_conditionals(cause_id, effect_id),
        "jacard_index": jacard_index(cause_id, effect_id),
        "sasrec_score": sasrec_scores[pair],
    }

In [32]:
# One result row per chosen pair, in the same order as chosen_pairs_ids.
all_results = [process_pair(pair) for pair in tqdm(chosen_pairs_ids)]
raw_results = pd.DataFrame(all_results)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [33]:
# Join oracle labels with the computed metrics by matching index labels.
# NOTE(review): this relies on oracle rows being in the same order as
# chosen_pairs_ids; index labels present on only one side are dropped.
merged = oracle.merge(raw_results, left_index=True, right_index=True)

In [35]:
# to_csv does not create missing folders, so make sure the output directory exists first.
evaluated_dir = base_artifacts / 'Datasets' / 'Evaluated'
evaluated_dir.mkdir(parents=True, exist_ok=True)
merged.to_csv(evaluated_dir / f'{DATASET}_evaluated.csv', index=False)