In [1]:
import sys
import os

# Project root = two directory levels above the kernel's current working directory.
# It is appended to sys.path only once, presumably so that the project-local
# imports that follow (config, scripts.*) can be resolved.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
print(f"Project root set to: {PROJECT_ROOT}")

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

# Project-local modules; these must stay below the sys.path setup above.
import config
from scripts.postprocessing.batchinference import run_batch_inference
from scripts.preprocessing.prep_utils import (
    load_data,
    load_data_without_bad_lines
)
from scripts.training.two_tower_utils import load_data as load_tt_data

Project root set to: /Users/guptayas/1898483/INS_E2E_RecSys


In [2]:
# Number of candidates requested per user from the batch-inference step.
topk = 50

# User table, addressed relative to the project root.
users = load_data(f"{PROJECT_ROOT}/{config.users}")

# The loader returns a pair; only the first element (the frame) is used here.
# The second element is bound to a throwaway name so IPython's `_` is not shadowed.
train_df, _unused = load_data_without_bad_lines(f"{PROJECT_ROOT}/{config.training}")

# Fixed-seed 80/20 row-level split so the train/validation partition is reproducible.
train_X, val_X = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)

In [3]:


ARTIFACTS_DIR = os.path.join(PROJECT_ROOT, "artifacts")
train_output_path = os.path.join(ARTIFACTS_DIR, "two_tower_topk_train.csv")
val_output_path = os.path.join(ARTIFACTS_DIR, "two_tower_topk_val.csv")


def _load_or_run_topk(split_name, content, output_path, users, topk):
    """Return the top-k inference frame for one split, using a CSV file as cache.

    If ``output_path`` exists it is read with ``pd.read_csv`` and returned.
    Otherwise ``run_batch_inference`` is called for ``content`` and its result is
    written to ``output_path`` (without the index) before being returned.

    Args:
        split_name: Label used only in the progress messages ("Train", "Validation").
        content: Content rows passed to ``run_batch_inference`` as ``test_content``.
        output_path: CSV path used as the cache location.
        users: User table forwarded to ``run_batch_inference``.
        topk: Number of candidates forwarded to ``run_batch_inference``.

    Returns:
        The frame read from the cache, or the freshly computed inference result.
    """
    # NOTE: the cache is keyed on the file path only. If topk, the MLflow run id
    # or the split changes, delete the CSV to force a fresh inference run.
    if os.path.exists(output_path):
        print(f"{split_name} inference file found. Reading existing file...")
        return pd.read_csv(output_path)

    print(f"{split_name} inference file not found. Running inference...")
    # NOTE(review): the recorded logs show run_batch_inference saving
    # user_embeddings.pt / content_embeddings.pt into output_dir, so a later run
    # (validation) replaces the content embeddings written by an earlier one (train).
    topk_frame = run_batch_inference(
        mlruns_dir=os.path.join(PROJECT_ROOT, config.MODEL_CONFIG['mlruns_dir']),
        best_run_id=config.two_tower_best_id,
        users=users,
        test_content=content,
        output_dir=ARTIFACTS_DIR,
        topk=topk,
    )
    topk_frame.to_csv(output_path, index=False)
    return topk_frame


# --- Train inference ---
topk_train = _load_or_run_topk("Train", train_X, train_output_path, users=users, topk=topk)

# --- Validation inference ---
topk_val = _load_or_run_topk("Validation", val_X, val_output_path, users=users, topk=topk)


  from .autonotebook import tqdm as notebook_tqdm


Train inference file not found. Running inference...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00, 4897.98it/s] 

📦 Loaded model from MLflow run b0caa2b2421e4d4e80179cb9b29faff6





✅ Saved 10400 user embeddings → /Users/guptayas/1898483/INS_E2E_RecSys/artifacts/user_embeddings.pt
✅ Saved 6534 content embeddings → /Users/guptayas/1898483/INS_E2E_RecSys/artifacts/content_embeddings.pt
📊 Loaded 10400 user embeddings, 6534 content embeddings


Scoring users: 100%|██████████| 10400/10400 [10:48<00:00, 16.03it/s]


Validation inference file not found. Running inference...


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 6/6 [00:00<00:00, 4195.70it/s] 

📦 Loaded model from MLflow run b0caa2b2421e4d4e80179cb9b29faff6





✅ Saved 10400 user embeddings → /Users/guptayas/1898483/INS_E2E_RecSys/artifacts/user_embeddings.pt
✅ Saved 1634 content embeddings → /Users/guptayas/1898483/INS_E2E_RecSys/artifacts/content_embeddings.pt
📊 Loaded 10400 user embeddings, 1634 content embeddings


Scoring users: 100%|██████████| 10400/10400 [02:40<00:00, 64.92it/s]


# Generating True Labels

In [None]:
# Load the two-tower dataset bundle; it is indexed by string keys below.
data = load_tt_data("/".join([PROJECT_ROOT, config.MODEL_CONFIG["data_path"]]))

# "t*" entries and "v*" entries are unpacked separately
# (presumably the train and validation splits — confirm against load_tt_data).
tX_user, tX_content, ty = (data[key] for key in ("tX_user", "tX_content", "ty"))
vX_user, vX_content, vy = (data[key] for key in ("vX_user", "vX_content", "vy"))

In [7]:
# Display tX_user (taken from data["tX_user"]) to inspect its feature columns.
tX_user

Unnamed: 0,platform,os_version,model,networkType,district_user,language_selected,days_since_last_active,days_since_signup
0,0,3,258,4,20,0,820,925
1,0,3,256,4,20,0,827,909
2,0,1,282,4,20,0,819,909
3,0,3,256,4,20,0,827,909
4,0,1,282,4,20,0,819,909
...,...,...,...,...,...,...,...,...
44314,0,14,736,2,20,0,820,920
44315,0,1,282,4,20,0,819,909
44316,0,1,618,4,20,0,819,923
44317,0,3,510,4,20,0,819,920


# Check ranking metrics for different values of k

In [None]:
def evaluate_ranking_metrics(model, X_user, X_content, y_true, topk=50):
    """Score every (user, content) row and compute top-K ranking metrics.

    Each row is scored with
    ``model.output_layer(cat([user_tower(X_user), content_tower(X_content)], dim=1))``.
    All rows are then ranked together as ONE list by descending score; the metrics
    are computed on that single list. No per-user grouping happens here, so
    callers wanting per-user metrics must call this once per user and average.

    Args:
        model: Object exposing ``eval()``, ``user_tower``, ``content_tower`` and
            ``output_layer``. It is switched to eval mode and left in that mode.
        X_user: Input for ``model.user_tower`` (presumably a tensor with one row
            per candidate pair — confirm against the caller).
        X_content: Input for ``model.content_tower``, row-aligned with ``X_user``.
        y_true: Tensor of relevance labels, one per row; a label equal to 1 counts
            as a positive for MRR, any non-zero label counts for the hit rate.
        topk: Cut-off K. If fewer than K rows are supplied, all rows are used.

    Returns:
        dict with Python floats under the keys ``"HitRate@K"``, ``"MRR@K"`` and
        ``"NDCG@K"``. MRR@K is 0.0 when no positive appears within the top K.

    Raises:
        ValueError: If ``topk`` is negative or the number of scores does not match
            the number of labels.
    """
    if topk < 0:
        raise ValueError(f"topk must be non-negative, got {topk}")

    model.eval()
    with torch.no_grad():
        # Compute user and content embeddings, then the pairwise score
        user_embs = model.user_tower(X_user)
        content_embs = model.content_tower(X_content)
        scores = model.output_layer(torch.cat([user_embs, content_embs], dim=1)).squeeze(-1)

    # Flatten both arrays: guards against (n, 1)-shaped labels and against a
    # single-row batch whose score collapses to a 0-d value.
    scores = scores.cpu().numpy().reshape(-1)
    labels = y_true.cpu().numpy().reshape(-1)
    if scores.shape[0] != labels.shape[0]:
        raise ValueError(
            f"got {scores.shape[0]} scores but {labels.shape[0]} labels"
        )

    # Rank by descending score; a stable sort keeps tie order deterministic.
    order = np.argsort(-scores, kind="stable")
    ranked_labels = labels[order]

    # Effective cut-off: never longer than the list itself, so the discount
    # vector always has the same length as the slice it divides.
    k = min(int(topk), ranked_labels.shape[0])
    top_labels = ranked_labels[:k]
    discounts = np.log2(np.arange(2, k + 2))

    # Hit rate: is there any relevant item within the top K?
    hit_rate = float(np.any(top_labels))

    # Reciprocal rank of the first positive, restricted to the top K
    positive_ranks = np.flatnonzero(top_labels == 1)
    mrr = 1.0 / (int(positive_ranks[0]) + 1) if positive_ranks.size else 0.0

    # NDCG@K: DCG of the model ranking over DCG of the ideal (label-sorted) ranking
    dcg = float(np.sum(top_labels / discounts))
    ideal_labels = np.sort(labels)[::-1][:k]
    ideal_dcg = float(np.sum(ideal_labels / discounts))
    ndcg = dcg / ideal_dcg if ideal_dcg > 0 else 0.0

    return {"HitRate@K": hit_rate, "MRR@K": float(mrr), "NDCG@K": float(ndcg)}


Unnamed: 0,platform,os_version,model,networkType,district_user,language_selected,days_since_last_active,days_since_signup
0,0,3,258,4,20,0,820,925
1,0,3,256,4,20,0,827,909
2,0,1,282,4,20,0,819,909
3,0,3,256,4,20,0,827,909
4,0,1,282,4,20,0,819,909
...,...,...,...,...,...,...,...,...
44314,0,14,736,2,20,0,820,920
44315,0,1,282,4,20,0,819,909
44316,0,1,618,4,20,0,819,923
44317,0,3,510,4,20,0,819,920


In [None]:
# Show the kernel's working directory; PROJECT_ROOT in the setup cell is derived
# from it. An explicit call is used instead of the `pwd` automagic so the cell
# does not depend on IPython's automagic setting.
os.getcwd()

'/Users/guptayas/1898483/INS_E2E_RecSys/experiments/evaluation'

In [None]:
# NOTE(review): this cell repeats the preceding working-directory check and can
# be removed. Written as an explicit call so it does not rely on IPython automagic.
os.getcwd()

'/Users/guptayas/1898483/INS_E2E_RecSys/experiments/evaluation'