<a href="https://colab.research.google.com/github/goelnikhils-lgtm/languagemodels/blob/main/moviepredictionusingDCNv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install pandas numpy matplotlib seaborn tqdm scikit-learn imbalanced-learn torch

In [None]:
#code for calculating CTR for the movie lens dataset
""" core libraries"""
import os , zipfile , requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

""" machine learning libraries"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset , DataLoader
from sklearn.preprocessing import StandardScaler , OneHotEncoder , LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score , accuracy_score , classification_report , confusion_matrix
from sklearn.manifold import TSNE
from imblearn.over_sampling import RandomOverSampler


# Environment setup: configure the OpenMP duplicate-library flag and choose
# the compute device used by the rest of the script.
# NOTE(review): "0" presumably leaves the duplicate-runtime workaround turned
# off; the value usually seen for this variable is "TRUE" - confirm intent.
os.environ["KMP_DUPLICATE_LIB_OK"] = "0"

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"using device : {device}")

#data preparation
print("----Step 1 : Preparing Movielens 1M Data----")


def _ensure_movielens(data_dir, archive_path, url):
    """Make sure the extracted dataset directory exists, downloading if needed.

    Nothing happens when ``data_dir`` is already present. Otherwise the
    archive is fetched (unless ``archive_path`` already exists) and unpacked
    into the current working directory. The archive is expected to unpack
    into a folder named ``data_dir``, which is what the subsequent reads
    rely on.

    Raises:
        urllib.error.URLError: if the download fails.
        zipfile.BadZipFile: if the archive on disk is not a valid zip file.
    """
    import urllib.request  # local import: only needed when the data is missing

    if os.path.isdir(data_dir):
        return
    if not os.path.exists(archive_path):
        print("Downloading movielens 1M dataset...")
        # Download to a temporary name so an interrupted transfer never
        # leaves a truncated file under the final archive name.
        partial_path = archive_path + ".part"
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, archive_path)
        print("Download complete.")
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall()


data_dir = "ml-1m"
_ensure_movielens(
    data_dir,
    "ml-1m.zip",
    "https://files.grouplens.org/datasets/movielens/ml-1m.zip",
)

#load data
print("Loading data...")
# The .dat files use "::" as separator, which requires the python parser engine.
ratings = pd.read_csv(os.path.join(data_dir, "ratings.dat"), sep="::", engine="python", names=["UserID", "MovieID", "Rating", "Timestamp"])
users = pd.read_csv(os.path.join(data_dir, "users.dat"), sep="::", engine="python", names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])
movies = pd.read_csv(os.path.join(data_dir, "movies.dat"), sep="::", engine="python", names=["MovieID", "Title", "Genres"], encoding='latin-1')

#merge data
print("Merging data...")
# One row per rating, enriched with the user's and the movie's attributes.
df = ratings.merge(users, on="UserID").merge(movies, on="MovieID")

#create binary target variable (1 if rating >= 4, else 0)
df['Target'] = (df['Rating'] >= 4).astype(int)

#encode categorical features as contiguous integer ids (0..n_classes-1)
print("Encoding features...")
encoders = {}
categorical_features = ["UserID", "MovieID", "Gender", "Occupation", "Age"]
for feature in categorical_features:
    # Keep each fitted encoder so the ids can be mapped back to raw values.
    encoders[feature] = LabelEncoder()
    df[feature] = encoders[feature].fit_transform(df[feature])

#dataset shape
print(f"Dataset shape: {df.shape}")
print(f"Target Distribution:{df['Target'].value_counts(normalize=True)}")

#--- DCNV2 model preparation ---
class CrossLayerV2(nn.Module):
    """One matrix-based cross layer.

    Computes ``x0 * (xl @ W + b) + xl`` where ``W`` is a learnable
    ``(input_dim, input_dim)`` matrix and ``b`` a learnable ``(input_dim,)``
    vector. The multiplication by ``x0`` is element-wise and ``xl`` is added
    back as a residual term.

    Args:
        input_dim: size of the last dimension of ``x0`` and ``xl``.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.input_dim = input_dim
        # Xavier-scaled initialisation keeps the magnitude of ``xl @ W``
        # comparable to ``xl``; unscaled N(0, 1) entries would amplify the
        # activations by roughly sqrt(input_dim) at every stacked layer.
        self.weight = nn.Parameter(torch.empty(input_dim, input_dim))
        nn.init.xavier_uniform_(self.weight)
        self.bias = nn.Parameter(torch.zeros(input_dim))  # one bias per feature

    def forward(self, x0, xl):
        """Return the next cross-layer state.

        Args:
            x0: the original input to the cross network.
            xl: the output of the previous cross layer (``x0`` for the first).

        Returns:
            Tensor with the same shape as ``xl``.
        """
        # x_{l+1} = x0 * (xl @ W + b) + xl
        return x0 * (torch.matmul(xl, self.weight) + self.bias) + xl


class DCNv2(nn.Module):
    """Parallel cross network + deep network over concatenated embeddings.

    Every categorical feature gets its own embedding table. The concatenated
    embeddings feed both a stack of ``CrossLayerV2`` layers and an MLP; the two
    outputs are concatenated and mapped to a single sigmoid probability.

    Args:
        feature_dims: mapping ``feature name -> number of distinct ids``.
            Its iteration order fixes the order in which embeddings are
            concatenated.
        embedding_dim: width of every embedding vector.
        num_cross_layers: number of stacked cross layers.
        deep_layers: hidden sizes of the MLP. May be empty, in which case the
            deep branch passes the embeddings through unchanged.
    """

    def __init__(self, feature_dims, embedding_dim=16, num_cross_layers=3, deep_layers=(512, 256, 128)):
        super().__init__()

        # Embedding layers for each feature
        self.embeddings = nn.ModuleDict({
            feature: nn.Embedding(dim, embedding_dim) for feature, dim in feature_dims.items()
        })
        # Input width shared by the cross and the deep networks
        input_dim = len(feature_dims) * embedding_dim

        # Cross network (matrix-based layers)
        self.cross_layers = nn.ModuleList(
            [CrossLayerV2(input_dim) for _ in range(num_cross_layers)]
        )

        # Deep network: a chain of Linear layers, each fed by the previous one
        deep_input_dim = input_dim
        self.deep_layers = nn.ModuleList()
        for hidden_dim in deep_layers:
            self.deep_layers.append(nn.Linear(deep_input_dim, hidden_dim))
            deep_input_dim = hidden_dim

        # Final output layer over [cross output, deep output].
        # deep_input_dim now holds the width of the deep branch's output; it
        # equals input_dim when no deep layers were requested, so this also
        # covers an empty ``deep_layers``.
        final_dim = input_dim + deep_input_dim
        self.output_layer = nn.Linear(final_dim, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the predicted probability for each sample.

        Args:
            x: mapping ``feature name -> index tensor`` containing an entry
                for every key of ``feature_dims``. The concatenation along
                ``dim=1`` assumes each index tensor is 1-D (one id per
                sample).

        Returns:
            Tensor whose last dimension has size 1, with values in (0, 1).
        """
        # Look up and concatenate all embeddings
        x0 = torch.cat(
            [emb_layer(x[feature]) for feature, emb_layer in self.embeddings.items()],
            dim=1,
        )

        # Cross network: every layer sees both the original input and the
        # running state
        xl = x0
        for layer in self.cross_layers:
            xl = layer(x0, xl)

        # Deep network: Linear -> ReLU -> Dropout per hidden layer
        deep_out = x0
        for layer in self.deep_layers:
            deep_out = F.relu(layer(deep_out))
            deep_out = self.dropout(deep_out)

        # Concatenate cross and deep outputs
        combined = torch.cat([xl, deep_out], dim=1)

        # Final output layer
        return torch.sigmoid(self.output_layer(combined))


#data set class
class MovieLensDataset(Dataset):
    """Tensor-backed dataset of categorical feature ids and a binary target.

    Each item is a pair ``(features, target)`` where ``features`` maps every
    requested feature name to that row's id and ``target`` is the row's value
    from the ``'Target'`` column.
    """

    def __init__(self, dataframe, categorical_features):
        # One integer tensor per feature column, keyed by the feature name.
        self.data = {
            name: torch.LongTensor(dataframe[name].values)
            for name in categorical_features
        }
        # Targets are stored as floats.
        self.target = torch.FloatTensor(dataframe['Target'].values)

    def __len__(self):
        return self.target.size(0)

    def __getitem__(self, idx):
        features = {}
        for name, column in self.data.items():
            features[name] = column[idx]
        return features, self.target[idx]

# Step 4: training setup
#split data
print("Splitting data...")
# Stratify on the target so both splits keep the same positive/negative ratio.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['Target'], random_state=42)

#feature dimensions: number of distinct ids per feature, used as the size of
#each embedding table. Computed on the full frame so ids that only occur in
#the test split still have a row.
feature_dims = {
    name: df[name].nunique()
    for name in ("UserID", "MovieID", "Gender", "Age", "Occupation")
}

#create datasets and dataloaders
train_dataset = MovieLensDataset(train_df, categorical_features)
test_dataset = MovieLensDataset(test_df, categorical_features)

train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

#initialize model, loss function and optimizer
model = DCNv2(feature_dims, embedding_dim=16, num_cross_layers=3, deep_layers=[512, 256, 128]).to(device)
criterion = nn.BCELoss()  # the model already applies a sigmoid
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# f-string so the parameter count is actually interpolated into the message.
print(f"Model intialized with {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters")

#training loop: one optimisation pass and one test-set evaluation per epoch
num_epochs = 25
train_loss = []  # mean training loss, one entry per epoch
test_auc = []    # test ROC-AUC, one entry per epoch

for epoch in range(num_epochs):
    #training
    model.train()
    epoch_loss = 0.0
    num_batches = 0
    for batch_data, batch_targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        #move to device
        batch_data = {k: v.to(device) for k, v in batch_data.items()}
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        # squeeze(-1) drops only the trailing size-1 dimension, so a final
        # batch holding a single sample keeps its batch dimension.
        outputs = model(batch_data).squeeze(-1)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1
    avg_train_loss = epoch_loss / num_batches
    train_loss.append(avg_train_loss)

    #evaluation: runs inside the epoch loop so test_auc gets one value per
    #epoch, matching train_loss
    model.eval()
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for batch_data, batch_targets in tqdm(test_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Evaluating"):
            batch_data = {k: v.to(device) for k, v in batch_data.items()}
            outputs = model(batch_data).squeeze(-1)
            all_predictions.extend(outputs.cpu().numpy())
            # Targets were never moved to the device, so no .cpu() is needed.
            all_targets.extend(batch_targets.numpy())
    test_auc_score = roc_auc_score(all_targets, all_predictions)
    test_auc.append(test_auc_score)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Test AUC: {test_auc_score:.4f}")

#step 5 visualizing
print("----Step 5 : Visualizing Training Progress----")

#plot training loss and test AUC side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# The x-axis is derived from each series' own length so the plot never fails
# on an x/y size mismatch; markers keep a single-point series visible.

#training loss
ax1.plot(range(1, len(train_loss) + 1), train_loss, label="Train Loss", color='blue', marker='o')
ax1.set_title("Training Loss over Epochs")
ax1.set_xlabel("Epochs")
ax1.set_ylabel("Binary Cross Entropy Loss")
ax1.grid(True, alpha=0.3)
ax1.legend()  # display the label passed to plot()

#test AUC
ax2.plot(range(1, len(test_auc) + 1), test_auc, label="Test AUC", color='green', marker='o')
ax2.set_title("Test AUC over Epochs")
ax2.set_xlabel("Epochs")
ax2.set_ylabel("AUC Score")
ax2.grid(True, alpha=0.3)
ax2.legend()

plt.tight_layout()
plt.show()

#visualizing embeddings using t-SNE
model.eval()
# detach() is required before numpy(): the embedding weights are parameters
# that require grad, and on a CPU device .cpu() hands back the same tensor,
# so numpy() would raise without it.
user_embeddings = model.embeddings['UserID'].weight.detach().cpu().numpy()
movie_embeddings = model.embeddings['MovieID'].weight.detach().cpu().numpy()

#sample a subset of rows for visualization; each table gets its own cap so a
#table with fewer than 500 rows cannot make the no-replacement draw fail
n_user_samples = min(500, len(user_embeddings))
n_movie_samples = min(500, len(movie_embeddings))
user_sample = np.random.choice(len(user_embeddings), n_user_samples, replace=False)
movie_sample = np.random.choice(len(movie_embeddings), n_movie_samples, replace=False)

#apply t-SNE to project the sampled embeddings down to two dimensions
user_tsne = TSNE(n_components=2, random_state=42).fit_transform(user_embeddings[user_sample])
movie_tsne = TSNE(n_components=2, random_state=42).fit_transform(movie_embeddings[movie_sample])

#plot embeddings
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

#user embeddings, coloured by the sampled (label-encoded) row index
scattter_1 = ax1.scatter(user_tsne[:, 0], user_tsne[:, 1], c=user_sample, cmap="viridis", alpha=0.6)
ax1.set_title("User Embeddings t-SNE")
ax1.set_xlabel("t-SNE Component 1")
ax1.set_ylabel("t-SNE Component 2")
plt.colorbar(scattter_1, ax=ax1, label="User ID")

#movie embeddings, coloured by the sampled (label-encoded) row index
scattter_2 = ax2.scatter(movie_tsne[:, 0], movie_tsne[:, 1], c=movie_sample, cmap="plasma", alpha=0.6)
ax2.set_title("Movie Embeddings t-SNE")
ax2.set_xlabel("t-SNE Component 1")
ax2.set_ylabel("t-SNE Component 2")
plt.colorbar(scattter_2, ax=ax2, label="Movie ID")

plt.tight_layout()
plt.show()

# Final result summary: report the last and the best recorded test AUC along
# with the shapes of the extracted embedding matrices.
print("----Step 6 : Final Evaluation on Test Set----")
final_auc = test_auc[-1]
best_auc = max(test_auc)
user_shape = user_embeddings.shape
movie_shape = movie_embeddings.shape
print(f"Final Test AUC: {final_auc:.4f}")
print(f"Best Test AUC: {best_auc:.4f}")
print(f"User Embedding Shape: {user_shape}")
print(f"Movie Embedding Shape: {movie_shape}")