In [None]:
import gc
import pandas as pd
import os
import pickle
import time
from tqdm.notebook import tqdm
import numpy as np
import json
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from copy import deepcopy
import matplotlib.pyplot as plt

In [None]:
import pyarrow.parquet as pq

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Load the feature-name lists stored in features_info.json.
with open("features_info.json", "r") as file:
    data = json.load(file)

# Feature names that are removed from the 651-feature list.
x = {
    'requestTime',
    'live-popularity-embeddings-affinity-v1',
    'live-popularity-embeddings-affinity-v2',
    'user_creator_followed',
    'time_since_livestream_started_ms',
    'cohost_ranker_score',
    'share_ranker_score',
    'qscan_ranker_score',
    'ts_ranker_score',
    'cheer_ranker_score',
    'comment_ranker_score',
    'log_ts_ranker_score',
    'like_ranker_score',
    'request_ranker_score',
    'method_latency_seconds',
    'live-popularity-embeddings-affinity-v3',
    'user_latLong_1_Lifetime',
    'user_phoneModel_1_Lifetime',
}

# Filter while keeping the order in which the names appear in the JSON file.
input_cols = list(filter(lambda col: col not in x, data["input_features_651f"]))
f176 = data["input_features_176f"]
len(input_cols)

In [None]:
# Switch the model inputs to the 176-feature list; this replaces whatever
# `input_cols` held before this cell ran.
input_cols = f176

In [None]:
# Target columns; the order here is the order used everywhere labels are iterated.
label_cols = [
    "label_like", "label_share", "label_cmt",
    "label_cheer", "label_cohost", "label_req",
    "label_ts", "label_qscan", "label_logts",
]

In [None]:
# Relative directory (trailing slash included) that is prefixed to the
# parquet file names when the train / validation splits are read.
data_folder = "v2_data_176f/"

In [None]:
%%time
# Read each split fully into memory and convert it to a pandas DataFrame.
train_df = pq.ParquetFile(data_folder + 'train_data_scaled.parquet').read().to_pandas()
val_df = pq.ParquetFile(data_folder + 'val_data_scaled.parquet').read().to_pandas()

In [None]:
# Every row of the loaded training frame starts out flagged as "not sampled".
for flag_col in ("sampled_user", "sampled_host"):
    train_df[flag_col] = False

In [None]:
# data stored in gs://deep-ctr/deeksha/ranker_nn/xgb_19_26_07/

## torch model ##

In [None]:
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch
import torch.optim as optim
import random

In [None]:
# Seed Python's, NumPy's and PyTorch's random generators with the same value.
_RNG_SEED = 42
random.seed(_RNG_SEED)
np.random.seed(_RNG_SEED)
torch.manual_seed(_RNG_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(_RNG_SEED)

In [None]:
# Distinct user / host ids present in each split.
users_tr, hosts_tr = train_df["userId"].unique(), train_df["hostId"].unique()
users_val, hosts_val = val_df["userId"].unique(), val_df["hostId"].unique()

#### Sampling 10% of userIds and hostIds for OOV initialization

In [None]:
def _draw_tenth(ids):
    """Pick 10% of `ids` (rounded down) without replacement using NumPy's global RNG."""
    return np.random.choice(ids, size=int(0.1 * len(ids)), replace=False)


# Users are drawn before hosts; both draws consume the same global RNG stream,
# so the order of these two calls determines which ids are selected.
sampled_user_ids = _draw_tenth(users_tr)
sampled_host_ids = _draw_tenth(hosts_tr)

In [None]:
# Rows that involve at least one sampled id are copied into `sampled_df`.
picked_rows = train_df["userId"].isin(sampled_user_ids) | train_df["hostId"].isin(sampled_host_ids)
sampled_df = train_df[picked_rows].reset_index(drop=True)

# Mark, per copied row, which side(s) belong to the sampled id sets.
user_is_sampled = sampled_df["userId"].isin(sampled_user_ids)
host_is_sampled = sampled_df["hostId"].isin(sampled_host_ids)
sampled_df.loc[user_is_sampled, "sampled_user"] = True
sampled_df.loc[host_is_sampled, "sampled_host"] = True
len(sampled_df)

In [None]:
seed = 42
# Append the flagged copies to the training frame, then shuffle all rows.
# Note: re-running this cell appends the copies again.
train_df = (
    pd.concat([train_df, sampled_df], ignore_index=True)
    .sample(frac=1, random_state=seed)
)

In [None]:
# Each distinct training id is mapped to its position in the unique-id array.
user_id_mapping = dict(zip(users_tr, range(len(users_tr))))
host_id_mapping = dict(zip(hosts_tr, range(len(hosts_tr))))

In [None]:
# The OOV index is the first integer after the last regular index of each mapping.
oov_index_user, oov_index_host = len(user_id_mapping), len(host_id_mapping)

In [None]:
# For users and then hosts: rows flagged as sampled get the OOV index,
# all other rows get the index of their id from the training mapping.
for flag_col, id_col, index_col, id_to_index, oov_index in (
    ("sampled_user", "userId", "userIndex", user_id_mapping, oov_index_user),
    ("sampled_host", "hostId", "hostIndex", host_id_mapping, oov_index_host),
):
    regular_rows = train_df[flag_col] == False
    oov_rows = train_df[flag_col] == True
    train_df.loc[regular_rows, index_col] = train_df.loc[regular_rows, id_col].map(id_to_index)
    train_df.loc[oov_rows, index_col] = oov_index

In [None]:
# Ids absent from the training mappings map to NaN and are replaced by the OOV index.
val_df['userIndex'] = val_df['userId'].map(user_id_mapping).fillna(oov_index_user)
val_df['hostIndex'] = val_df['hostId'].map(host_id_mapping).fillna(oov_index_host)

In [None]:
# Cast both index columns to integers in the training and validation frames.
for frame in (train_df, val_df):
    for index_col in ('userIndex', 'hostIndex'):
        frame[index_col] = frame[index_col].astype(int)

In [None]:
# Display how many input feature columns are currently selected in `input_cols`.
f"counter features used:{len(input_cols)}"

In [None]:
class rankerDataset(Dataset):
    """Map-style dataset over a DataFrame of features, labels and id indices.

    The frame must contain every name in `input_cols` and `label_cols` as well
    as the columns "userIndex" and "hostIndex". All columns are pulled out as
    NumPy arrays once, at construction time.
    """

    def __init__(self, df, input_cols, label_cols):
        # A fresh 0..n-1 index keeps positional lookups aligned with the arrays.
        frame = df.reset_index(drop=True)
        self.df = frame
        self.label_cols = label_cols
        self.input = frame[input_cols].values
        self.labels = frame[label_cols].values
        self.userIds = frame["userIndex"].values
        self.hostIds = frame["hostIndex"].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return one row as a dict: "input", one entry per label, "userIndex", "hostIndex"."""
        label_row = self.labels[idx]
        sample = {"input": self.input[idx]}
        sample.update(zip(self.label_cols, label_row))
        sample["userIndex"] = self.userIds[idx]
        sample["hostIndex"] = self.hostIds[idx]
        return sample

In [None]:
bs = 4096
# Training batches are drawn in a new random order on every pass over the loader.
train_dataset = rankerDataset(df=train_df, input_cols=input_cols, label_cols=label_cols)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=bs, shuffle=True)

In [None]:
# Validation batches keep the row order of `val_df`.
val_dataset = rankerDataset(df=val_df, input_cols=input_cols, label_cols=label_cols)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=bs, shuffle=False)

In [None]:
class RankerV0(nn.Module):
    """Multi-task ranker: a dense-feature tower fused with user/host id embeddings.

    The dense features go through a shared MLP. The user and host embeddings are
    concatenated and projected to `embedding_dim`. Both representations are then
    concatenated and fed to one single-unit linear head per label.

    Args:
        input_features: Width of the dense feature vector.
        num_users: Number of rows in the user embedding table. The last row
            (index ``num_users - 1``) is treated as the OOV row.
        num_hosts: Number of rows in the host embedding table. The last row
            (index ``num_hosts - 1``) is treated as the OOV row.
        embedding_dim: Size of each id embedding and of the fused id vector.
        oov_embedding_value: Constant written into every component of both
            OOV rows at construction time (the rows remain trainable).
        labels: Names of the heads to create. When omitted, the module-level
            ``label_cols`` list is used, as before.
    """

    # Output width of `shared_tower`; head inputs are this plus `embedding_dim`.
    SHARED_OUT_DIM = 32

    def __init__(self, input_features, num_users, num_hosts, embedding_dim, oov_embedding_value, labels=None):
        super(RankerV0, self).__init__()
        if labels is None:
            labels = label_cols

        self.shared_tower = nn.Sequential(
            nn.Linear(input_features, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, self.SHARED_OUT_DIM)
        )
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.host_embedding = nn.Embedding(num_hosts, embedding_dim)

        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.host_embedding.weight)

        # Overwrite the OOV rows after the Xavier init. no_grad() keeps the
        # in-place write out of autograd without going through `.data`.
        with torch.no_grad():
            self.user_embedding.weight[num_users - 1].fill_(oov_embedding_value)
            self.host_embedding.weight[num_hosts - 1].fill_(oov_embedding_value)

        self.fc1 = nn.Linear(2 * embedding_dim, embedding_dim)
        self.drop = nn.Dropout(0.3)

        # The heads consume [shared tower output | fused id vector]. Deriving the
        # width (instead of hard-coding 64) keeps the model valid for any
        # embedding_dim, not only 32.
        head_in_features = self.SHARED_OUT_DIM + embedding_dim
        self.alone_towers = nn.ModuleDict(
            {lb: nn.Linear(head_in_features, 1) for lb in labels}
        )

    def forward(self, x, uid, hid):
        """Compute one prediction per label.

        Args:
            x: 2-D batch of dense features, ``(batch, input_features)``.
            uid: 1-D batch of user embedding indices.
            hid: 1-D batch of host embedding indices.

        Returns:
            Dict mapping each label name to a ``(batch, 1)`` tensor. The
            ``"label_logts"`` head returns the raw linear output; every other
            head is passed through a sigmoid.
        """
        dense_repr = self.shared_tower(x)
        id_pair = torch.cat([self.user_embedding(uid), self.host_embedding(hid)], dim=1)
        id_repr = self.drop(self.fc1(id_pair))

        fused = torch.cat([dense_repr, id_repr], dim=1)
        out = {}
        for label, alone_nw in self.alone_towers.items():
            raw = alone_nw(fused)
            out[label] = raw if label == "label_logts" else torch.sigmoid(raw)
        return out

In [None]:
# NOTE(review): EPOCHS is not read by the training loop in this notebook,
# which iterates over range(10) - confirm which value is intended.
EPOCHS = 3
embedding_dim = 32
oov_embedding_value = 0.0
# Use the GPU when one is visible; otherwise fall back to the CPU instead of
# failing when the model is moved to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_users = len(user_id_mapping)
num_hosts = len(host_id_mapping)
# One extra row per embedding table: the OOV index equals the mapping size.
modelV0 = RankerV0(len(input_cols), num_users+1, num_hosts+1, embedding_dim, oov_embedding_value)
# Python `float` selects float64 parameters; the training loop casts the
# targets with `.to(float)` as well.
modelV0.to(float)
modelV0.to(device)

In [None]:
# Grab the freshly initialised embedding tables and print them.
user_embedding_weights = modelV0.user_embedding.weight.data
host_embedding_weights = modelV0.host_embedding.weight.data
for heading, table in (
    ("User Embedding Weights:", user_embedding_weights),
    ("Host Embedding Weights:", host_embedding_weights),
):
    print(heading)
    print(table)

In [None]:
learning_rate = 2e-5
optimizer = optim.Adam(modelV0.parameters(), lr=learning_rate, weight_decay=1e-4)
total_samples = len(train_dataset.df)
# "label_logts" is trained with squared error; every other label with binary
# cross-entropy on the sigmoid outputs of the model.
loss_fn = {
    lb: (nn.MSELoss() if lb == "label_logts" else nn.BCELoss())
    for lb in label_cols
}

In [None]:
import math
def calculate_rmse(list1, list2):
    """Return the root-mean-square error between two equal-length sequences.

    Args:
        list1: Sequence of numbers.
        list2: Sequence of numbers with the same length as `list1`.

    Returns:
        sqrt(mean((a - b) ** 2)) over the paired elements.

    Raises:
        ValueError: If the sequences differ in length, or if they are empty
            (the mean of zero elements is undefined).
    """
    if len(list1) != len(list2):
        raise ValueError("Lists must have the same length.")
    if len(list1) == 0:
        # Previously this surfaced as an unexplained ZeroDivisionError.
        raise ValueError("Lists must not be empty.")
    # Generator instead of a temporary list: same summation order, no extra copy.
    squared_error_sum = sum((a - b) ** 2 for a, b in zip(list1, list2))
    return math.sqrt(squared_error_sum / len(list1))

In [None]:
# Per-label multipliers applied when the individual losses are summed.
w = dict(
    label_like=1.0,
    label_share=1.0,
    label_cmt=1.0,
    label_cheer=1.0,
    label_cohost=0.8,
    label_req=0.8,
    label_ts=1.2,
    label_qscan=1.2,
    label_logts=1.2,
)

In [None]:
# Override: give every label the same weight of 1 (updates `w` in place).
for label_name in list(w):
    w[label_name] = 1
w

In [None]:
# ---- history containers ----------------------------------------------------
loss_lis_train = []  # combined weighted loss of every training batch
loss_lis_train_lb = {lb: [] for lb in label_cols}  # per-label loss of every training batch
ep_train_loss_lb = {lb: [] for lb in label_cols}   # per-label mean training loss, one entry per epoch
ep_val_loss_lb = {lb: [] for lb in label_cols}     # per-label mean validation loss, one entry per epoch

ep_train_loss = []
ep_val_loss = []
best_val_loss = 1000000.0
best_ep = 0
params_ = sum(p.numel() for p in modelV0.parameters() if p.requires_grad)
best_metrics = {"params": params_}

for ep in range(10):
    print(f"\n ========== EPOCH: {ep} ==========")

    # ---- training pass ------------------------------------------------------
    modelV0.train()
    total_loss = 0.0
    step = 0
    for batch in tqdm(train_dataloader):
        modelV0.zero_grad()
        input_176 = batch["input"].to(device)
        userIndex = batch["userIndex"].to(device)
        hostIndex = batch["hostIndex"].to(device)
        output = modelV0(input_176, userIndex, hostIndex)

        # Weighted sum of the per-label losses is the quantity that is optimised.
        losses = {}
        bloss = 0.0
        for lb in label_cols:
            target = batch[lb].to(device).unsqueeze(1).to(float)
            label_loss = loss_fn[lb](output[lb], target)
            losses[lb] = label_loss
            loss_lis_train_lb[lb].append(label_loss.item())
            bloss += w[lb] * label_loss
        bloss.backward()
        optimizer.step()

        bloss_item = bloss.item()
        loss_lis_train.append(bloss_item)
        total_loss += bloss_item
        step += 1

    mean_train_loss = total_loss / step
    ep_train_loss.append(mean_train_loss)
    print("train loss:", mean_train_loss)

    for lb in label_cols:
        # The last `step` entries are exactly this epoch's batches.
        this_epoch = loss_lis_train_lb[lb][-step:]
        ep_train_loss_lb[lb].append(sum(this_epoch) / step)
        print(f"epoch train loss [{lb}]: {ep_train_loss_lb[lb][-1]}")

    print()

    # ---- validation pass ----------------------------------------------------
    true = {lb: [] for lb in label_cols}
    pred = {lb: [] for lb in label_cols}
    temp_vloss = {lb: [] for lb in label_cols}
    modelV0.eval()
    total_val_loss = 0
    val_step = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_176 = batch["input"].to(device)
            userIndex = batch["userIndex"].to(device)
            hostIndex = batch["hostIndex"].to(device)
            output = modelV0(input_176, userIndex, hostIndex)

            bloss = 0.0
            for lb in label_cols:
                target = batch[lb].to(device).unsqueeze(1).to(float)
                label_loss = loss_fn[lb](output[lb], target)
                bloss += w[lb] * label_loss
                true[lb].extend(batch[lb].cpu().tolist())
                pred[lb].extend(output[lb].squeeze(1).cpu().tolist())
                temp_vloss[lb].append(label_loss.item())
            val_step += 1
            total_val_loss += bloss.item()

    # Mean over validation batches (each batch counts equally).
    val_loss = total_val_loss / val_step
    ep_val_loss.append(val_loss)

    print("val loss:", val_loss)
    for lb in label_cols:
        ep_val_loss_lb[lb].append(sum(temp_vloss[lb]) / val_step)
        print(f"epoch val loss [{lb}]: {ep_val_loss_lb[lb][-1]}")
    print()

    # ---- validation metrics -------------------------------------------------
    pr_auc, roc = {}, {}
    rmse = 0.0
    for lb in label_cols:
        if lb == "label_logts":
            # Regression target: report RMSE / MSE instead of ranking metrics.
            rmse = calculate_rmse(true[lb], pred[lb])
            print(f"{lb} - rmse: {rmse} mse:{rmse*rmse}")
        else:
            precision, recall, thresholds = precision_recall_curve(true[lb], pred[lb])
            pr_auc[lb] = auc(recall, precision)
            roc[lb] = roc_auc_score(true[lb], pred[lb])
            print(f"[{lb}] roc auc : {roc[lb]}, pr auc: {pr_auc[lb]}")

    # ---- keep the best epoch (lowest validation loss) -----------------------
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_ep = ep
        best_model = deepcopy(modelV0)
        best_metrics.update(
            {
                "epoch": ep,
                "train_loss": ep_train_loss[-1],
                "val_loss": val_loss,
                "pr_auc": pr_auc,
                "roc": roc,
                "rmse": rmse,
            }
        )

    # NOTE(review): with range(10) the largest `ep` is 9, so `ep - best_ep > 10`
    # can never hold and this early stop never fires as configured.
    if ep >= 2 and ep - best_ep > 10:
        print("========== no improvement over last 10 epochs ==========")
        break

In [None]:
# Display the parameter count plus the metrics recorded at the epoch with the
# lowest validation loss.
best_metrics

In [None]:
# Print the gradient norm of every parameter that currently holds a gradient.
for name, param in modelV0.named_parameters():
    if param.grad is None:
        continue
    print(name, param.grad.norm())

In [None]:
# Mean and standard deviation of the per-epoch losses, per label.
for lb in label_cols:
    train_history, val_history = ep_train_loss_lb[lb], ep_val_loss_lb[lb]
    x, y = np.mean(train_history), np.std(train_history)
    x1, y1 = np.mean(val_history), np.std(val_history)

    print(f"[{lb}] train: mean:{round(x,4)} std dev:{round(y,4)}; val: mean:{round(x1,4)} std dev:{round(y1,4)}")

### Plotting train loss vs. validation loss

In [None]:
def plot_both(train_loss, validation_loss, which_ep="", lb = ""):
    """Plot training (blue) and validation (red) loss per epoch and show the figure.

    Args:
        train_loss: Sequence of per-epoch training losses; its length sets the x axis.
        validation_loss: Sequence of per-epoch validation losses.
        which_ep: Text inserted before "Epochs" in the title.
        lb: Text placed at the start of the title (e.g. a label name).
    """
    epochs = range(1, len(train_loss) + 1)
    curves = (
        (train_loss, 'b', 'Training Loss'),
        (validation_loss, 'r', 'Validation Loss'),
    )
    for series, fmt, legend_name in curves:
        plt.plot(epochs, series, fmt, label=legend_name)

    title_text = lb +' Loss over '+ which_ep +' Epochs'
    plt.title(title_text)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
# Overall (all labels combined) loss curves; copies of the history lists are passed.
plot_both(train_loss=list(ep_train_loss), validation_loss=list(ep_val_loss))

In [None]:
num_rows, num_cols = 3, 3
fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 12))
fig.suptitle('Train and Validation Losses for Each Label', fontsize=10)

# Labels fill the grid row by row, one panel per label.
for i, lb in enumerate(label_cols):
    row, col = divmod(i, num_cols)
    panel = axes[row, col]

    train_loss = ep_train_loss_lb[lb]
    valid_loss = ep_val_loss_lb[lb]
    epoch_axis = range(1, len(train_loss)+1)
    panel.plot(epoch_axis, train_loss, label='Train Loss', color='blue')
    panel.plot(epoch_axis, valid_loss, label='Validation Loss', color='red')

    panel.set(title=lb, xlabel='Epochs', ylabel='Loss')
    panel.legend()

plt.tight_layout()
plt.show()