In [2]:
import sys
import os

# Resolve the project root as TWO directories above the current working
# directory (not the notebook file itself), e.g. when the kernel is started in
# <root>/experiments/evaluation.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Register the root on sys.path once so the project-local imports below
# resolve; the membership check keeps cell re-runs from adding duplicates.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

print(f"Project root set to: {PROJECT_ROOT}")

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

import config
from scripts.preprocessing.prep_utils import (
    load_data,
    load_data_without_bad_lines
)
from scripts.postprocessing.batchinference import run_batch_inference
from scripts.training.two_tower_utils import load_data as load_tt_data

Project root set to: /Users/rakeshgupta/Documents/INS_E2E_RecSys


In [3]:
# Load the user table and the training interactions; the second value returned
# by load_data_without_bad_lines is discarded here.
users = load_data(f"{PROJECT_ROOT}/{config.users}")
train_df, _ = load_data_without_bad_lines(f"{PROJECT_ROOT}/{config.training}")

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across kernel restarts.
train_X, val_X = train_test_split(train_df, random_state=42, test_size=0.2)

# Cut-off forwarded as `topk` to run_batch_inference.
topk = 50

In [4]:


artifacts_dir = f"{PROJECT_ROOT}/artifacts"
train_output_path = f"{artifacts_dir}/two_tower_topk_train.csv"
val_output_path = f"{artifacts_dir}/two_tower_topk_val.csv"


def load_or_run_inference(content, output_path, label):
    """Return top-k inference results for ``content``, cached as a CSV.

    If ``output_path`` already exists it is read back with ``pd.read_csv``;
    otherwise ``run_batch_inference`` is executed and its result is written to
    ``output_path`` (without the index) before being returned.

    NOTE(review): the cache is keyed on the file path only. If ``topk``,
    ``config.two_tower_best_id`` or the train/validation split changes, delete
    the CSV so the inference is recomputed.

    Args:
        content: Interaction frame forwarded as ``test_content``.
        output_path: CSV path used as the cache location.
        label: Human-readable split name used in the progress messages.

    Returns:
        The object returned by ``pd.read_csv`` (cache hit) or by
        ``run_batch_inference`` (cache miss).
    """
    if os.path.exists(output_path):
        print(f"{label} inference file found. Reading existing file...")
        return pd.read_csv(output_path)

    print(f"{label} inference file not found. Running inference...")
    topk_df = run_batch_inference(
        mlruns_dir=os.path.join(PROJECT_ROOT, config.MODEL_CONFIG['mlruns_dir']),
        best_run_id=config.two_tower_best_id,
        users=users,
        test_content=content,
        output_dir=artifacts_dir,
        topk=topk,
    )
    topk_df.to_csv(output_path, index=False)
    return topk_df


# --- Train inference ---
topk_train = load_or_run_inference(train_X, train_output_path, "Train")

# --- Validation inference ---
topk_val = load_or_run_inference(val_X, val_output_path, "Validation")


Train inference file found. Reading existing file...
Validation inference file found. Reading existing file...


# Generating True Labels

In [5]:
# Load the two-tower dataset bundle; it is indexed by string keys below.
data = load_tt_data(f"{PROJECT_ROOT}/{config.MODEL_CONFIG['data_path']}")

# "t"-prefixed entries: user inputs, content inputs and labels.
# presumably the training split — confirm against load_tt_data.
tX_user, tX_content, ty = (data[key] for key in ("tX_user", "tX_content", "ty"))

# "v"-prefixed entries: same layout (presumably the validation split).
vX_user, vX_content, vy = (data[key] for key in ("vX_user", "vX_content", "vy"))

In [20]:
# Build one (deviceId, hashId, engagement_score) row per training example.
# Each piece gets a fresh 0..n-1 index first so the column-wise concat lines
# rows up by position rather than by whatever index each input carried.
df = pd.concat(
    [
        tX_user[['deviceId']].reset_index(drop=True),
        tX_content[['hashId']].reset_index(drop=True),
        ty.reset_index(drop=True).rename('engagement_score'),
    ],
    axis="columns",
)

In [29]:
# Order rows by user, then item, with the highest engagement first among
# repeated (deviceId, hashId) pairs. This mutates `df` in place.
df.sort_values(
    by=['deviceId', 'hashId', 'engagement_score'],
    ascending=[True, True, False],
    inplace=True,
)

In [30]:
# Preview the first 20 rows of the sorted label frame.
df.head(20)

Unnamed: 0,deviceId,hashId,engagement_score
30471,00be93c4-ba45-4f34-bf99-169a899277f2,0bwleglu-1,0.3
222,00be93c4-ba45-4f34-bf99-169a899277f2,0u1ws9mg-1,0.3
6497,00be93c4-ba45-4f34-bf99-169a899277f2,0u1ws9mg-1,0.3
19097,00be93c4-ba45-4f34-bf99-169a899277f2,0u1ws9mg-1,0.3
27694,00be93c4-ba45-4f34-bf99-169a899277f2,0u1ws9mg-1,0.3
40186,00be93c4-ba45-4f34-bf99-169a899277f2,0u1ws9mg-1,0.3
16073,00be93c4-ba45-4f34-bf99-169a899277f2,2eq4chf5-1,0.3
22252,00be93c4-ba45-4f34-bf99-169a899277f2,2ftirsih-1,0.3
1453,00be93c4-ba45-4f34-bf99-169a899277f2,2g5ohepr-1,0.3
17224,00be93c4-ba45-4f34-bf99-169a899277f2,2g5ohepr-1,0.3


deviceId
00be93c4-ba45-4f34-bf99-169a899277f2    [j1dyn4tb-1, t7ynntpy-1, 0u1ws9mg-1, ovhlc6my-...
011a0d8c-f8ae-4a00-98e8-7db00ee42908    [a7qe6hux-1, ac4c5vpp-1, uragb4pi-2, aspn0mia-...
011c3b48-0d00-4069-89f0-1fbdb3ac0f08    [ycbcpqs8-1, a2epwodw-1, 2mjwafid-1, dbsibvjk-...
012bb4a5-1065-43ef-a778-ed67a48fc626    [2cnx8yfa-1, lwusqbc0-1, ezodxrek-1, 2g5ohepr-...
0147204e-d38b-411c-996a-f5eabb18cb35    [wudddmlz-1, rdroogm7-1, bx9jwxtu-1, e0ea5f8u-...
                                                              ...                        
fe8b9478-9719-426a-ba57-75eea74eb474                 [4hvxw5is-1, 4hvxw5is-1, j0ijdmfb-1]
feefdd3a-4d60-40c9-9332-4325d255eaed    [gluoi1gs-1, u3t2xymm-1, yz5vbk1e-1, u3t2xymm-...
ff60b3ce-d80c-46b8-9b70-b3d4b51e815c                                         [nw2cgvra-1]
ff674b76-9ece-4b2d-b745-a76509654abb    [i5pbrfzo-1, c3qst5bp-1, sj61uwvi-1, uxe2co9q-...
ff69e894-0f1f-4eea-9e80-24cb8a2a89b6    [osjythnx-1, 6ymdozs9-1, r3lfmv7e-1, osjythnx-...
L

In [17]:
# Display the label object loaded from data["ty"] to inspect its values.
ty

0        0.3
1        0.3
2        0.3
3        0.3
4        0.3
        ... 
44314    0.3
44315    0.3
44316    0.3
44317    0.3
44318    0.3
Name: engagement_score, Length: 44319, dtype: float64

# Check ranking metrics for different values of k

In [None]:
def evaluate_ranking_metrics(model, X_user, X_content, y_true, topk=50):
    """Score every (user, content) row with ``model`` and compute top-K metrics.

    All rows are ranked together as ONE list by predicted score (highest
    first); nothing is grouped per user. A label counts as "positive" when it
    is greater than zero, so both binary and graded labels are supported.

    Args:
        model: Object exposing ``eval()`` plus ``user_tower``, ``content_tower``
            and ``output_layer`` callables. It is switched to eval mode and
            left there.
        X_user: Input passed unchanged to ``model.user_tower``.
        X_content: Input passed unchanged to ``model.content_tower``.
        y_true: One relevance label per row; a torch tensor or any array-like
            (NumPy array, pandas Series, list).
        topk: Cut-off K. Values larger than the number of rows are clamped to
            the number of rows; values <= 0 yield all-zero metrics.

    Returns:
        dict with float values under the keys ``"HitRate@K"`` (1.0 if any
        positive is in the top K), ``"MRR@K"`` (reciprocal rank of the first
        positive within the top K, else 0.0) and ``"NDCG@K"`` (linear-gain
        NDCG, 0.0 when the ideal DCG is zero).

    Raises:
        ValueError: If the number of labels differs from the number of scores.
    """
    model.eval()
    with torch.no_grad():
        # Tower outputs are concatenated along dim=1, so both are assumed to be
        # 2-D (rows, features) tensors — TODO confirm against the model class.
        user_embs = model.user_tower(X_user)
        content_embs = model.content_tower(X_content)
        scores = model.output_layer(torch.cat([user_embs, content_embs], dim=1))

    # Flatten to 1-D so an (n, 1) output or label column cannot trigger
    # accidental broadcasting in the metric arithmetic below.
    scores = scores.detach().cpu().numpy().reshape(-1)
    if isinstance(y_true, torch.Tensor):
        y_true = y_true.detach().cpu().numpy()
    y_true = np.asarray(y_true, dtype=float).reshape(-1)

    if y_true.shape[0] != scores.shape[0]:
        raise ValueError(
            f"y_true has {y_true.shape[0]} labels but the model produced "
            f"{scores.shape[0]} scores"
        )

    # Clamp K to the number of rows; otherwise the label slice and the
    # discount vector would have different lengths.
    k = max(0, min(int(topk), y_true.shape[0]))

    # Stable sort keeps tied scores in input order, making results deterministic.
    order = np.argsort(-scores, kind="stable")
    top_labels = y_true[order][:k]

    # --- Metrics ---
    positive_ranks = np.flatnonzero(top_labels > 0)
    hit_rate = 1.0 if positive_ranks.size else 0.0

    # Mean Reciprocal Rank, restricted to the top K like the other metrics
    mrr = float(1.0 / (positive_ranks[0] + 1)) if positive_ranks.size else 0.0

    # NDCG@K with linear gain and log2 position discount
    discounts = np.log2(np.arange(2, k + 2))
    dcg = float(np.sum(top_labels / discounts))
    ideal_labels = np.sort(y_true)[::-1][:k]
    ideal_dcg = float(np.sum(ideal_labels / discounts))
    ndcg = dcg / ideal_dcg if ideal_dcg > 0 else 0.0

    return {"HitRate@K": hit_rate, "MRR@K": mrr, "NDCG@K": ndcg}


Unnamed: 0,platform,os_version,model,networkType,district_user,language_selected,days_since_last_active,days_since_signup
0,0,3,258,4,20,0,820,925
1,0,3,256,4,20,0,827,909
2,0,1,282,4,20,0,819,909
3,0,3,256,4,20,0,827,909
4,0,1,282,4,20,0,819,909
...,...,...,...,...,...,...,...,...
44314,0,14,736,2,20,0,820,920
44315,0,1,282,4,20,0,819,909
44316,0,1,618,4,20,0,819,923
44317,0,3,510,4,20,0,819,920


In [None]:
# Debugging aid: show the kernel's current working directory (IPython automagic for %pwd).
pwd

'/Users/guptayas/1898483/INS_E2E_RecSys/experiments/evaluation'

In [None]:
# Debugging aid: show the kernel's current working directory (IPython automagic for %pwd).
# NOTE(review): this repeats the preceding cell; one of the two can likely be removed.
pwd

'/Users/guptayas/1898483/INS_E2E_RecSys/experiments/evaluation'