In [123]:
import torch
import pandas as pd
import mlflow
import mlflow.pytorch
import os
import pickle
import numpy as np

# ---------------------
# Parameters
# ---------------------
best_run_id = "8408fd846c784f55a402b3be6bace2aa"
custom_mlflow_path = "../training/mlruns"
user_emb_path = "user_embeddings.pt"
content_emb_path = "content_embeddings.pt"

# Set MLflow tracking URI
mlflow.set_tracking_uri(f"file://{os.path.abspath(custom_mlflow_path)}")

# ---------------------
# Load model
# ---------------------
model_uri = f"runs:/{best_run_id}/model"
model = mlflow.pytorch.load_model(model_uri)
model.eval()  # eval mode


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

TwoTowerModel(
  (user_tower): Sequential(
    (0): Linear(in_features=8, out_features=32, bias=True)
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.28872460017670404, inplace=False)
    (4): Linear(in_features=32, out_features=32, bias=True)
    (5): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
  )
  (content_tower): Sequential(
    (0): Linear(in_features=132, out_features=32, bias=True)
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.28872460017670404, inplace=False)
    (4): Linear(in_features=32, out_features=32, bias=True)
    (5): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
  )
  (output_layer): Linear(in_features=64, out_features=1, bias=True)
)

In [108]:
# Load training and validation data using pickle

with open("../preprocessing/train_val_data.pkl", "rb") as f:
    data = pickle.load(f)

tX_user = data["tX_user"]
tX_content = data["tX_content"]
ty = data["ty"]
vX_user = data["vX_user"]
vX_content = data["vX_content"]
vy = data["vy"]
processed_users = data["processed_users"]
test_content = data["test_content"]

print("✅ Data loaded successfully from pickle")

✅ Data loaded successfully from pickle


In [109]:
def to_tensor(x):
    if torch.is_tensor(x):
        return x.float()
    elif hasattr(x, "values"):
        return torch.tensor(x.values, dtype=torch.float32)
    else:
        return torch.tensor(x, dtype=torch.float32)

# ---------------------
# Compute User Embeddings
# ---------------------

In [110]:
def compute_and_save_user_embeddings(user_df, save_path=user_emb_path):
    # Split IDs and features
    user_tensor = user_df.values
    device_ids = user_tensor[:, 0].astype(str)
    features = to_tensor(user_tensor[:, 1:].astype('float'))

    # Compute embeddings
    model.eval()
    with torch.no_grad():
        user_emb = model.user_tower(features)

    # Create mapping {device_id: embedding_tensor}
    user_emb_dict = {did: emb for did, emb in zip(device_ids, user_emb)}

    # Save as dictionary
    torch.save(user_emb_dict, save_path)
    print(f"✅ Saved {len(user_emb_dict)} user embeddings to {save_path}")

In [111]:
compute_and_save_user_embeddings(processed_users)

✅ Saved 10400 user embeddings to user_embeddings.pt


# ---------------------
# Compute Content Embeddings
# ---------------------

In [112]:
def compute_and_save_content_embeddings(content_df, save_path=content_emb_path):
    # Split IDs and features
    content_tensor = content_df.values
    hash_ids = content_tensor[:, 0]#.astype(str)
    features = to_tensor(content_tensor[:, 1:].astype('float'))

    # Compute embeddings
    model.eval()
    with torch.no_grad():
        content_emb = model.content_tower(features)

    # Create mapping {hash_id: embedding_tensor}
    content_emb_dict = {hid: emb for hid, emb in zip(hash_ids, content_emb)}

    # Save as dictionary
    torch.save(content_emb_dict, save_path)
    print(f"✅ Saved {len(content_emb_dict)} content embeddings to {save_path}")


In [113]:
compute_and_save_content_embeddings(test_content)

✅ Saved 970 content embeddings to content_embeddings.pt


# ---------------------
# Predict from Embeddings
# ---------------------

In [None]:
def predict_from_embeddings(user_emb, content_emb):
    with torch.no_grad():
        combined = torch.cat([user_emb, content_emb], dim=1)
        preds = model.output_layer(combined).squeeze(-1)
    return preds

# ---------------------
# Load Embeddings
# ---------------------

In [131]:
user_emb_dict = torch.load(user_emb_path, weights_only=False)
content_emb_dict = torch.load(content_emb_path, weights_only=False)

In [133]:
from tqdm import tqdm

In [None]:
# --- Compute top 50 for each user ---
results = []

for user_id, user_emb in tqdm(user_emb_dict.items(), desc="Users", total=len(user_emb_dict)):
    scores = []
    for hashid, content_emb in content_emb_dict.items():
        # Compute prediction score
        score = predict_from_embeddings(user_emb.reshape(1, -1), content_emb.reshape(1, -1))
        scores.append((hashid, score))

    # Sort by score (descending) and take top 50
    top_50 = sorted(scores, key=lambda x: x[1], reverse=True)[:50]

    for rank, (hashid, score) in enumerate(top_50, 1):
        results.append({
            "user_id": user_id,
            "hashid": hashid,
            "rank": rank,
            "score": score
        })

Users: 100%|██████████| 10400/10400 [02:33<00:00, 67.79it/s]


In [145]:
# --- Convert to DataFrame ---
df_top50 = pd.DataFrame(results)
df_top50.to_csv("user_top50_recommendations.csv", index=False)

print("✅ Saved top 50 recommendations for each user to user_top50_recommendations.csv")

✅ Saved top 50 recommendations for each user to user_top50_recommendations.csv


In [152]:
df_top50

Unnamed: 0,user_id,hashid,rank,score
0,197b123e-eb9e-4fc1-a32d-aa86aaea425e,z4xtyyjz-1,1,[tensor(0.3298)]
1,197b123e-eb9e-4fc1-a32d-aa86aaea425e,s3nxsntx-1,2,[tensor(0.3298)]
2,197b123e-eb9e-4fc1-a32d-aa86aaea425e,p18jiu5f-1,3,[tensor(0.3298)]
3,197b123e-eb9e-4fc1-a32d-aa86aaea425e,s98imoxc-1,4,[tensor(0.3298)]
4,197b123e-eb9e-4fc1-a32d-aa86aaea425e,kc3pd7h5-1,5,[tensor(0.3297)]
...,...,...,...,...
519995,017120da-5a11-4139-a438-97906a941a46,xy3xvzre-1,46,[tensor(0.3222)]
519996,017120da-5a11-4139-a438-97906a941a46,xq2dhwal-1,47,[tensor(0.3222)]
519997,017120da-5a11-4139-a438-97906a941a46,0zesqner-1,48,[tensor(0.3222)]
519998,017120da-5a11-4139-a438-97906a941a46,4v32qsjt-1,49,[tensor(0.3221)]
