In [None]:
!pip install torch  # Ensure latest PyTorch (includes CUDA support for GPU)
!pip install pandas numpy  # For data processing

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5



In [1]:
from google.colab import drive
drive.mount('/content/drive')

# One-time per session to bring it to fast local storage

!cp "/content/drive/MyDrive/Electronics_5.json.gz" /content/

Mounted at /content/drive


In [2]:
!ls -lh /content/Electronics_5.json.gz

-rw------- 1 root root 1.2G May 21 20:46 /content/Electronics_5.json.gz


In [2]:
import gzip

file_path = '/content/Electronics_5.json.gz'

line_count = 0
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    for _ in f:
        line_count += 1

print(f"📦 Total number of reviews (rows): {line_count}")

📦 Total number of reviews (rows): 6738382


In [4]:
import pandas as pd
file_path = '/content/Electronics_5.json.gz'

# Step 1: Read in chunks, only needed columns
chunks = []
chunk_size = 250_000

reader = pd.read_json(file_path, lines=True, compression='gzip', chunksize=chunk_size)

for chunk in reader:
    chunk = chunk[['reviewerID', 'asin', 'overall']]
    chunk.columns = ['user_id', 'item_id', 'rating']
    chunks.append(chunk)

# Step 2: Concatenate into single DataFrame
df = pd.concat(chunks, ignore_index=True)
del chunks  # Free memory

# Step 3: Sort and assign interaction_status
df = df.sort_values(by='rating', ascending=False).reset_index(drop=True)
total_rows = len(df)

df.loc[:int(0.025 * total_rows), 'interaction_status'] = 1.0
df.loc[int(0.025 * total_rows):int(0.10 * total_rows), 'interaction_status'] = 0.6
df.loc[int(0.10 * total_rows):int(0.35 * total_rows), 'interaction_status'] = 0.3
df.loc[int(0.35 * total_rows):, 'interaction_status'] = 0.0

# Step 4: Final save
final_df = df[['user_id', 'item_id', 'interaction_status']]
final_df.to_csv('/content/processed_interactions_realistic.csv', index=False)

print("✅ Saved to /content/processed_interactions_realistic.csv")

✅ Saved to /content/processed_interactions_realistic.csv


In [2]:
# Load the saved dataset
df_sample = pd.read_csv('/content/processed_interactions_realistic.csv')

# Show first 10 rows
print(df_sample.head(10))

          user_id     item_id  interaction_status
0  A1FGCIRPRNZWD5  B01HJF704M                 1.0
1   AAP7PPBU72QFM  0151004714                 1.0
2   AJJ7VX2L91X2W  B01HJH40WU                 1.0
3  A1ER5AYS3FQ9O3  0151004714                 1.0
4  A1T17LMQABMBN5  0151004714                 1.0
5  A2HUZO7MQAY5I2  B01HJH40WU                 1.0
6   AG3DXG002QSXP  B01HJA3OUG                 1.0
7   AE50B0MLAS1B9  B01HJA3OUG                 1.0
8  A2L12USPGEMCTM  B01HIZEW1C                 1.0
9  A2RU0H9MD4IH5M  B01HIZEW1C                 1.0


In [5]:
#1 means brought
#.6 means added to wishlist/cart
#.3 means intereacted
#0 means ignored
interaction_counts = df['interaction_status'].value_counts().sort_index()
interaction_percentages = df['interaction_status'].value_counts(normalize=True).sort_index() * 100

# Combine into one table
summary = pd.DataFrame({
    'count': interaction_counts,
    'percentage': interaction_percentages.round(2)
})

print(summary)

                      count  percentage
interaction_status                     
0.0                 4379949        65.0
0.3                 1684595        25.0
0.6                  505379         7.5
1.0                  168459         2.5


In [10]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Set random seed for reproducibility
torch.manual_seed(42)

# Step 1:  Encode user_id and item_id
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
df['user_id'] = user_encoder.fit_transform(df['user_id'])
df['item_id'] = item_encoder.fit_transform(df['item_id'])

# Number of unique users and items
num_users = len(user_encoder.classes_)
num_items = len(item_encoder.classes_)
print(f"Number of users: {num_users}, Number of items: {num_items}")

# Step 2: Split into train, validation, and test sets (~88.5-10-1.5)
train_df, temp_df = train_test_split(df, train_size=0.885, random_state=42)
val_df, test_df = train_test_split(temp_df, train_size=(0.10 / 0.115), random_state=42)

print(f"Train size: {len(train_df)}, Val size: {len(val_df)}, Test size: {len(test_df)}")

# Step 3: Create a custom Dataset
class InteractionDataset(Dataset):
    def __init__(self, df):
        self.user_ids = torch.tensor(df['user_id'].values, dtype=torch.long)
        self.item_ids = torch.tensor(df['item_id'].values, dtype=torch.long)
        self.labels = torch.tensor(df['interaction_status'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.user_ids[idx], self.item_ids[idx], self.labels[idx]

train_dataset = InteractionDataset(train_df)
val_dataset = InteractionDataset(val_df)
test_dataset = InteractionDataset(test_df)

# DataLoaders
batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

# Step 4: Define the Enhanced Two-Tower Model
class TwoTowerModel(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=128):
        super(TwoTowerModel, self).__init__()
        # User tower
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.user_fc1 = nn.Linear(embedding_dim, embedding_dim // 2)
        self.user_bn1 = nn.BatchNorm1d(embedding_dim // 2)
        # Item tower
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.item_fc1 = nn.Linear(embedding_dim, embedding_dim // 2)
        self.item_bn1 = nn.BatchNorm1d(embedding_dim // 2)
        self.relu = nn.ReLU()

    def forward(self, user_ids, item_ids):
        # User tower
        user_emb = self.user_embedding(user_ids)
        user_out = self.user_fc1(user_emb)
        user_out = self.user_bn1(user_out)
        user_out = self.relu(user_out)
        # Item tower
        item_emb = self.item_embedding(item_ids)
        item_out = self.item_fc1(item_emb)
        item_out = self.item_bn1(item_out)
        item_out = self.relu(item_out)
        # Dot product
        prediction = (user_out * item_out).sum(dim=1)
        return torch.sigmoid(prediction)

# Step 5: Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

embedding_dim = 128
model = TwoTowerModel(num_users, num_items, embedding_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 6: Training Loop with Logging
num_epochs = 3
print("Starting training...")
metrics = []
best_val_loss = float('inf')
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for user_ids, item_ids, labels in train_loader:
        user_ids, item_ids, labels = user_ids.to(device), item_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(user_ids, item_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        # Debug gradient
        # for name, param in model.named_parameters():
        #     if param.grad is not None:
        #         print(f"{name} gradient norm: {param.grad.norm()}")
        optimizer.step()

        train_loss += loss.item() * user_ids.size(0)

    train_loss /= len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss = 0.0
    val_mae = 0.0
    with torch.no_grad():
        for user_ids, item_ids, labels in val_loader:
            user_ids, item_ids, labels = user_ids.to(device), item_ids.to(device), labels.to(device)
            outputs = model(user_ids, item_ids)
            loss = criterion(outputs, labels)
            mae = torch.mean(torch.abs(outputs - labels))
            val_loss += loss.item() * user_ids.size(0)
            val_mae += mae.item() * user_ids.size(0)

    val_loss /= len(val_loader.dataset)
    val_mae /= len(val_loader.dataset)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val MAE: {val_mae:.4f}")

    metrics.append({
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'val_loss': val_loss,
        'val_mae': val_mae
    })

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'model_state_dict': model.state_dict(),
            'user_encoder': user_encoder,
            'item_encoder': item_encoder,
            'embedding_dim': embedding_dim
        }, '/content/best_two_tower_model.pth')

# Step 7: Test Performance with Precision@10
model.eval()
test_loss = 0.0
test_mae = 0.0
# For Precision@10: Predict scores for test set, rank, and check top-10
test_predictions = []
test_labels = []
with torch.no_grad():
    for user_ids, item_ids, labels in test_loader:
        user_ids, item_ids, labels = user_ids.to(device), item_ids.to(device), labels.to(device)
        outputs = model(user_ids, item_ids)
        loss = criterion(outputs, labels)
        mae = torch.mean(torch.abs(outputs - labels))
        test_loss += loss.item() * user_ids.size(0)
        test_mae += mae.item() * user_ids.size(0)
        test_predictions.extend(outputs.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_loss /= len(test_loader.dataset)
test_mae /= len(test_loader.dataset)
print(f"Test Loss: {test_loss:.4f}, Test MAE: {test_mae:.4f}")

# Precision@10: For each user, rank items and check top-10
test_df_with_preds = test_df.copy()
test_df_with_preds['prediction'] = test_predictions
test_df_with_preds['label'] = test_labels
# Group by user and get top-10 predictions
top_n = 10
precision_at_n = 0
user_groups = test_df_with_preds.groupby('user_id')
for user_id, group in user_groups:
    group = group.sort_values('prediction', ascending=False)
    top_n_items = group.head(top_n)
    # Consider "bought" (1.0) or "added to cart" (0.6) as relevant
    relevant_items = len(top_n_items[top_n_items['label'] >= 0.6])
    precision_at_n += relevant_items / top_n
precision_at_n /= len(user_groups)
print(f"Precision@10: {precision_at_n:.4f}")

# Log test metrics
metrics.append({
    'epoch': 'final',
    'test_loss': test_loss,
    'test_mae': test_mae,
    'precision_at_10': precision_at_n
})

# Save metrics to CSV
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv('/content/training_metrics.csv', index=False)
print("✅ Metrics saved to /content/training_metrics.csv")

# Step 8: Save the final model
torch.save({
    'model_state_dict': model.state_dict(),
    'user_encoder': user_encoder,
    'item_encoder': item_encoder,
    'embedding_dim': embedding_dim
}, '/content/two_tower_model.pth')
print("✅ Model saved to /content/two_tower_model.pth")

Number of users: 728678, Number of items: 159748
Train size: 5963468, Val size: 673838, Test size: 101076
Using device: cuda
Starting training...
Epoch 1/3, Train Loss: 0.1801, Val Loss: 0.1795, Val MAE: 0.3950
Epoch 2/3, Train Loss: 0.1795, Val Loss: 0.1795, Val MAE: 0.3950
Epoch 3/3, Train Loss: 0.1795, Val Loss: 0.1795, Val MAE: 0.3950
Test Loss: 0.1792, Test MAE: 0.3946
Precision@10: 0.0110
✅ Metrics saved to /content/training_metrics.csv
✅ Model saved to /content/two_tower_model.pth
