In [None]:
# Colab only: mount Google Drive so the /content/drive/... paths used by the
# data-loading and model-saving cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import ast
import gzip
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

In [None]:
# Gzipped reviews file on the mounted Drive; parse() reads it one record per line.
file_path = "/content/drive/MyDrive/bt4222data/Raw Data/reviews_Electronics_5.json.gz"

def parse(path):
  """Yield one parsed record per non-blank line of a gzipped text file.

  Each line is decoded as UTF-8 and parsed as JSON. Lines that are not strict
  JSON (for example single-quoted Python dict literals) fall back to
  ``ast.literal_eval``. Unlike ``eval``, neither parser can execute code that
  happens to be embedded in the data file.

  Args:
    path: Path to the gzip-compressed file.

  Yields:
    The parsed object (a dict for review records) for each non-blank line.

  Raises:
    ValueError, SyntaxError: If a line is neither valid JSON nor a Python
      literal.
  """
  # The context manager closes the file once the generator is exhausted or
  # discarded; the original handle was never closed.
  with gzip.open(path, 'rt', encoding='utf-8') as lines:
    for line in lines:
      text = line.strip()
      if not text:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      try:
        yield json.loads(text)
      except json.JSONDecodeError:
        yield ast.literal_eval(text)

def getDF(path):
  """Load every record produced by ``parse(path)`` into a DataFrame.

  Args:
    path: Path passed straight through to ``parse``.

  Returns:
    A DataFrame with one row per record, in file order, indexed 0..n-1, and
    one column per record key.
  """
  # Materialise the records once and let pandas build the frame directly,
  # instead of hand-numbering a dict of dicts and going through from_dict.
  records = list(parse(path))
  return pd.DataFrame(records)

# Read the whole reviews file into memory as a DataFrame.
reviews = getDF(file_path)

In [None]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so the
# columns assigned to `reviews_df` in later cells also appear on `reviews`.
reviews_df = reviews

In [None]:
# Null counts for the three columns the model consumes (user, item, rating)
null_counts = reviews_df[['reviewerID', 'asin', 'overall']].isna().sum()
print(f"Missing values in reviewerID: {null_counts['reviewerID']}")
print(f"Missing values in asin: {null_counts['asin']}")
print(f"Missing values in overall: {null_counts['overall']}")

Missing values in reviewerID: 0
Missing values in asin: 0
Missing values in overall: 0


In [None]:
# Map raw reviewer IDs to contiguous integer indices (stored as `user_idx`)
user_encoder = LabelEncoder()
reviews_df['user_idx'] = user_encoder.fit_transform(reviews_df['reviewerID'])

# Same for product IDs (stored as `item_idx`)
item_encoder = LabelEncoder()
reviews_df['item_idx'] = item_encoder.fit_transform(reviews_df['asin'])


In [None]:

# Size summary: distinct users/items and how sparsely the user-item grid is filled
n_users, n_items = (reviews_df[col].nunique() for col in ('user_idx', 'item_idx'))
n_ratings = len(reviews_df)
print(f"Number of users: {n_users}")
print(f"Number of items: {n_items}")
print(f"Number of ratings: {n_ratings}")
print(f"Average ratings per user: {n_ratings / n_users:.2f}")
print(f"Rating density: {n_ratings / (n_users * n_items) * 100:.4f}%")

Number of users: 192403
Number of items: 63001
Number of ratings: 1689188
Average ratings per user: 8.78
Rating density: 0.0139%


In [None]:
class AmazonReviewsDataset(Dataset):
    """Map-style dataset of (user index, item index, rating) triples.

    The three inputs are copied into tensors once, at construction, so that
    ``__getitem__`` is a plain index lookup instead of building three new
    tensors from scalars on every call.

    Args:
        user_indices: Integer user indices, one per rating (array-like).
        item_indices: Integer item indices, one per rating (array-like).
        ratings: Rating values, one per rating (array-like).

    Raises:
        ValueError: If the three inputs do not all have the same length.
    """

    def __init__(self, user_indices, item_indices, ratings):
        # torch.tensor copies, so later changes to the source arrays or
        # DataFrame columns cannot leak into the dataset.
        self.user_indices = torch.tensor(np.asarray(user_indices), dtype=torch.long)
        self.item_indices = torch.tensor(np.asarray(item_indices), dtype=torch.long)
        self.ratings = torch.tensor(np.asarray(ratings), dtype=torch.float)

        if not (len(self.user_indices) == len(self.item_indices) == len(self.ratings)):
            raise ValueError(
                "user_indices, item_indices and ratings must have the same length, got "
                f"{len(self.user_indices)}, {len(self.item_indices)} and {len(self.ratings)}"
            )

    def __len__(self):
        """Return the number of ratings held."""
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return ``(user, item, rating)`` at ``idx`` as long, long and float tensors."""
        return (
            self.user_indices[idx],
            self.item_indices[idx],
            self.ratings[idx],
        )

# Split the data into train and test sets: a random 80/20 split over rating rows,
# seeded so the same partition is produced on every run.
# NOTE(review): rows are split without regard to user or item, so a user or item
# that appears in the test set is not guaranteed to have any training rows.
train_df, test_df = train_test_split(reviews_df, test_size=0.2, random_state=42)

In [None]:
# Create datasets
def _frame_to_dataset(frame):
    """Wrap the index and rating columns of ``frame`` in an AmazonReviewsDataset."""
    columns = ('user_idx', 'item_idx', 'overall')
    return AmazonReviewsDataset(*(frame[name].values for name in columns))


train_dataset = _frame_to_dataset(train_df)
test_dataset = _frame_to_dataset(test_df)

In [None]:
# Create data loaders
# Only the training loader shuffles; the test loader iterates in dataset order.
batch_size = 1024  # samples per mini-batch, shared by both loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
class NCF(nn.Module):
    """Two-branch neural collaborative filtering model for rating regression.

    * GMF branch: element-wise product of a user and an item embedding.
    * MLP branch: the concatenation of a second pair of user/item embeddings
      passed through a stack of ``Linear`` + ``ReLU`` layers.

    The two branch outputs are concatenated and mapped to a single score by a
    final linear layer (no output activation).

    Args:
        n_users: Number of distinct user indices (embedding table rows).
        n_items: Number of distinct item indices (embedding table rows).
        embed_dim: Width of each of the four embedding tables.
        layers: Output width of each hidden MLP layer, in order. May be empty,
            in which case the MLP branch is just the concatenated embeddings.
    """

    def __init__(self, n_users, n_items, embed_dim=64, layers=(128, 64, 32, 16)):
        super().__init__()

        # Separate embedding tables per branch so GMF and MLP learn independently.
        self.user_embedding_gmf = nn.Embedding(n_users, embed_dim)
        self.item_embedding_gmf = nn.Embedding(n_items, embed_dim)
        self.user_embedding_mlp = nn.Embedding(n_users, embed_dim)
        self.item_embedding_mlp = nn.Embedding(n_items, embed_dim)

        # MLP tower. Kept as a ModuleList with alternating Linear/ReLU entries so
        # parameter names (mlp_layers.0.weight, ...) stay stable for checkpoints.
        self.mlp_layers = nn.ModuleList()
        input_dim = 2 * embed_dim
        for next_dim in layers:
            self.mlp_layers.append(nn.Linear(input_dim, next_dim))
            self.mlp_layers.append(nn.ReLU())
            input_dim = next_dim

        # `input_dim` is now the MLP branch's output width. Using it instead of
        # layers[-1] also covers an empty `layers`.
        self.output = nn.Linear(input_dim + embed_dim, 1)

        self._init_weights_()

    def _init_weights_(self):
        """Initialise embeddings with N(0, 0.01) and linears with Xavier-uniform weights and zero bias."""
        for m in self.modules():
            if isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, mean=0, std=0.01)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, user_indices, item_indices):
        """Score each (user, item) pair.

        Args:
            user_indices: Long tensor of user indices.
            item_indices: Long tensor of item indices, same shape as ``user_indices``.

        Returns:
            Tensor of predicted scores with the same shape as ``user_indices``.
        """
        # GMF path
        gmf_output = self.user_embedding_gmf(user_indices) * self.item_embedding_gmf(item_indices)

        # MLP path
        mlp_output = torch.cat(
            [self.user_embedding_mlp(user_indices), self.item_embedding_mlp(item_indices)],
            dim=-1,
        )
        for layer in self.mlp_layers:
            mlp_output = layer(mlp_output)

        # Fuse both branches and project to one score per pair.
        prediction = self.output(torch.cat([gmf_output, mlp_output], dim=-1))

        # Drop only the trailing size-1 feature axis. A bare squeeze() would also
        # drop the batch axis for a batch of one, yielding a 0-d tensor that
        # downstream metric code cannot iterate over.
        return prediction.squeeze(-1)

In [None]:
def train(model, train_loader, optimizer, criterion, device):
    """Run one training epoch and return the mean loss per sample.

    Args:
        model: Module called as ``model(user_indices, item_indices)``.
        train_loader: Iterable of ``(user_indices, item_indices, ratings)`` batches.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss called as ``criterion(predictions, ratings)``.
            Assumed to return the mean over the batch (the default for
            ``nn.MSELoss``) — the per-sample weighting below relies on that.
        device: Device the batch tensors are moved to.

    Returns:
        The loss averaged over every sample seen this epoch, or NaN if the
        loader yielded no samples.
    """
    model.train()
    loss_sum = 0.0
    n_samples = 0
    for user_indices, item_indices, ratings in train_loader:
        user_indices = user_indices.to(device)
        item_indices = item_indices.to(device)
        ratings = ratings.to(device)

        # Forward pass
        predictions = model(user_indices, item_indices)
        loss = criterion(predictions, ratings)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a short final batch does not count
        # as much as a full one.
        batch_len = ratings.size(0)
        loss_sum += loss.item() * batch_len
        n_samples += batch_len

    if n_samples == 0:
        return float('nan')
    return loss_sum / n_samples

In [None]:
def evaluate(model, data_loader, criterion, device):
    """Compute the mean loss per sample over a data loader without updating the model.

    Args:
        model: Module called as ``model(user_indices, item_indices)``.
        data_loader: Iterable of ``(user_indices, item_indices, ratings)`` batches.
        criterion: Loss called as ``criterion(predictions, ratings)``.
            Assumed to return the mean over the batch (the default for
            ``nn.MSELoss``) — the per-sample weighting below relies on that.
        device: Device the batch tensors are moved to.

    Returns:
        The loss averaged over every sample in the loader, or NaN if the
        loader yielded no samples.
    """
    model.eval()
    loss_sum = 0.0
    n_samples = 0
    with torch.no_grad():
        for user_indices, item_indices, ratings in data_loader:
            user_indices = user_indices.to(device)
            item_indices = item_indices.to(device)
            ratings = ratings.to(device)

            predictions = model(user_indices, item_indices)
            loss = criterion(predictions, ratings)

            # Weight each batch by its size so a short final batch does not
            # count as much as a full one.
            batch_len = ratings.size(0)
            loss_sum += loss.item() * batch_len
            n_samples += batch_len

    if n_samples == 0:
        return float('nan')
    return loss_sum / n_samples

In [None]:
# 5. Model Training

# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
# Model parameters
embed_dim = 64
mlp_layers = [128, 64, 32, 16]
learning_rate = 0.001
n_epochs = 10

# Seed PyTorch's global RNG so weight initialisation and the training loader's
# shuffling repeat from run to run.
torch.manual_seed(42)

# Initialize model, loss, and optimizer
model = NCF(n_users, n_items, embed_dim=embed_dim, layers=mlp_layers).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Per-epoch history, filled by the loop below and plotted afterwards.
train_losses, val_losses = [], []
train_rmses, val_rmses = [], []

# Training loop
# NOTE: the held-out test split doubles as the validation set here; there is no
# separate validation split in this notebook.
for epoch in range(n_epochs):
    # Train for one epoch, then score the test split
    train_loss = train(model, train_loader, optimizer, criterion, device)
    test_loss = evaluate(model, test_loader, criterion, device)

    # RMSE is the square root of the MSE that evaluate() reports. evaluate() is
    # used rather than calculate_rmse() because the latter is only defined in a
    # later cell, so calling it here fails on a fresh top-to-bottom run.
    # The training set is re-scored in eval mode after the epoch's updates, so
    # train_rmse reflects the end-of-epoch weights, whereas train_loss is
    # accumulated while the weights are still changing.
    train_rmse = float(np.sqrt(evaluate(model, train_loader, criterion, device)))
    val_rmse = float(np.sqrt(test_loss))

    # Store losses and RMSEs
    train_losses.append(train_loss)
    val_losses.append(test_loss)
    train_rmses.append(train_rmse)
    val_rmses.append(val_rmse)

    print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")
    print(f"Train RMSE: {train_rmse:.4f}, Val RMSE: {val_rmse:.4f}")

# Visualize training progress (runs once, after the last epoch)
fig, (ax_loss, ax_rmse) = plt.subplots(1, 2, figsize=(12, 5))

ax_loss.plot(train_losses, label='Train Loss')
ax_loss.plot(val_losses, label='Validation Loss')
ax_loss.set_title('NCF Baseline Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()
ax_loss.grid(True)

ax_rmse.plot(train_rmses, label='Train RMSE')
ax_rmse.plot(val_rmses, label='Validation RMSE')
ax_rmse.set_title('NCF Baseline RMSE')
ax_rmse.set_xlabel('Epoch')
ax_rmse.set_ylabel('RMSE')
ax_rmse.legend()
ax_rmse.grid(True)

fig.tight_layout()
fig.savefig('ncf_baseline_training_history.png')
plt.show()

Epoch 1/10, Train Loss: 1.7658, Test Loss: 1.2042
Epoch 2/10, Train Loss: 0.8723, Test Loss: 1.2554
Epoch 3/10, Train Loss: 0.2675, Test Loss: 1.3190
Epoch 4/10, Train Loss: 0.0805, Test Loss: 1.3409
Epoch 5/10, Train Loss: 0.0504, Test Loss: 1.3525
Epoch 6/10, Train Loss: 0.0580, Test Loss: 1.3500
Epoch 7/10, Train Loss: 0.0637, Test Loss: 1.3366
Epoch 8/10, Train Loss: 0.0522, Test Loss: 1.3240
Epoch 9/10, Train Loss: 0.0421, Test Loss: 1.3216
Epoch 10/10, Train Loss: 0.0399, Test Loss: 1.3265


In [None]:
def get_top_n_recommendations(model, user_id, n=10):
    """Return the ``n`` highest-scoring item IDs for one user.

    Every item index in ``range(n_items)`` is scored, including items the user
    has already rated; no filtering is applied.

    Relies on the notebook-level ``user_encoder``, ``item_encoder``,
    ``n_items`` and ``device``.

    Args:
        model: Trained module called as ``model(user_indices, item_indices)``.
        user_id: Raw user ID as seen by ``user_encoder`` when it was fitted.
        n: Number of recommendations wanted. Capped at ``n_items``.

    Returns:
        A list of raw item IDs, best first.

    Raises:
        ValueError: From ``user_encoder.transform`` if ``user_id`` was not
            seen when the encoder was fitted.
    """
    model.eval()
    user = int(user_encoder.transform([user_id])[0])

    # topk raises if asked for more entries than exist, so cap the request.
    k = min(n, n_items)

    # Build both index tensors directly on the target device instead of
    # materialising n_items-long Python lists first.
    item_tensor = torch.arange(n_items, dtype=torch.long, device=device)
    user_tensor = torch.full_like(item_tensor, user)

    # Get predictions; reshape guards against a model that returns a 0-d
    # tensor when there is only one item.
    with torch.no_grad():
        predictions = model(user_tensor, item_tensor).reshape(-1)

    # Get top N item indices
    _, indices = torch.topk(predictions, k)

    # Convert back to original item IDs with one vectorised call rather than
    # one inverse_transform call per recommended item.
    return item_encoder.inverse_transform(indices.cpu().numpy()).tolist()

In [None]:
# Example: top-10 recommendations for the reviewer in the first row of reviews_df
sample_user = reviews_df['reviewerID'].iloc[0]
recommendations = get_top_n_recommendations(model, sample_user, n=10)
print(f"Top 10 recommendations for user {sample_user}:")
for rank, asin in enumerate(recommendations, start=1):
    print(f"{rank}. {asin}")


Top 10 recommendations for user AO94DHGC771SJ:
1. B001TQSFXS
2. B0023ZK2TY
3. B005UNFRU0
4. B0002Y5WZC
5. B009NB8WR0
6. B0046TJG1U
7. B001ULBP8E
8. B004G7D0EG
9. B004071ZXA
10. B007U5MGDC


In [None]:
def calculate_rmse(model, data_loader, device):
    """Compute the root-mean-squared error of ``model`` over a data loader.

    Errors are accumulated batch by batch in float64, so no per-sample Python
    list is built and a batch whose prediction comes back as a 0-d tensor
    (e.g. a final batch of one) is handled.

    Args:
        model: Module called as ``model(user_indices, item_indices)``.
        data_loader: Iterable of ``(user_indices, item_indices, ratings)`` batches.
        device: Device the index tensors are moved to.

    Returns:
        The RMSE as a Python float, or NaN if the loader yielded no samples.
    """
    model.eval()
    squared_error_sum = 0.0
    n_samples = 0

    with torch.no_grad():
        for user_indices, item_indices, ratings in data_loader:
            preds = model(user_indices.to(device), item_indices.to(device))

            # Flatten both sides so a 0-d prediction lines up with its rating.
            errors = preds.reshape(-1).double().cpu() - ratings.reshape(-1).double().cpu()
            squared_error_sum += errors.pow(2).sum().item()
            n_samples += errors.numel()

    if n_samples == 0:
        return float('nan')
    return float(np.sqrt(squared_error_sum / n_samples))


In [None]:
def calculate_mae(model, data_loader, device):
    """Compute the mean absolute error of ``model`` over a data loader.

    Errors are accumulated batch by batch in float64, so no per-sample Python
    list is built and a batch whose prediction comes back as a 0-d tensor
    (e.g. a final batch of one) is handled.

    Args:
        model: Module called as ``model(user_indices, item_indices)``.
        data_loader: Iterable of ``(user_indices, item_indices, ratings)`` batches.
        device: Device the index tensors are moved to.

    Returns:
        The MAE as a Python float, or NaN if the loader yielded no samples.
    """
    model.eval()
    absolute_error_sum = 0.0
    n_samples = 0

    with torch.no_grad():
        for user_indices, item_indices, ratings in data_loader:
            preds = model(user_indices.to(device), item_indices.to(device))

            # Flatten both sides so a 0-d prediction lines up with its rating.
            errors = preds.reshape(-1).double().cpu() - ratings.reshape(-1).double().cpu()
            absolute_error_sum += errors.abs().sum().item()
            n_samples += errors.numel()

    if n_samples == 0:
        return float('nan')
    return float(absolute_error_sum / n_samples)


Calculating the RMSE and MAE for the model on the test set

In [None]:
# Score the trained model on the test loader with both error metrics.
# `rmse` is reused by the cold-start report further down.
rmse = calculate_rmse(model, test_loader, device)
mae = calculate_mae(model, test_loader, device)

print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")


Test RMSE: 1.1517
Test MAE: 0.9039


Saving the model to Google Drive

In [None]:
save_path = "/content/drive/MyDrive/bt4222data/models/ncf_baseline_model.pth"

# torch.save does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# The checkpoint bundles the weights with everything needed to rebuild the model
# (sizes and layer widths) and to translate raw IDs (the two fitted encoders).
# NOTE: the LabelEncoder objects are stored via pickle. Recent PyTorch versions
# default torch.load to weights_only=True, so reading this file back needs
# torch.load(save_path, weights_only=False) — only do that with a file you trust.
torch.save({
    'model_state_dict': model.state_dict(),
    'user_encoder': user_encoder,
    'item_encoder': item_encoder,
    'n_users': n_users,
    'n_items': n_items,
    'embed_dim': embed_dim,
    'mlp_layers': mlp_layers
}, save_path)

print("Model saved successfully!")

Model saved successfully!


Cold Start Analysis

In [None]:
# Items with fewer than 6 ratings across all of reviews_df count as cold-start
item_counts = reviews_df['asin'].value_counts()
cold_start_items = item_counts.loc[item_counts.lt(6)].index.tolist()
print(f"\nNumber of cold-start items (fewer than 6 interactions): {len(cold_start_items)}")

# Keep only the test rows whose item is on the cold-start list
cold_start_mask = test_df['asin'].isin(cold_start_items)
cold_start_test_df = test_df.loc[cold_start_mask]
print(f"Cold-start test instances: {len(cold_start_test_df)}")

# Evaluate on cold-start items
def calculate_rmse_subset(model, user_indices, item_indices, ratings, device, batch_size=1024):
    """Compute RMSE for an explicit set of (user, item, rating) rows.

    The rows are scored in slices of ``batch_size`` to bound memory use, and
    squared errors are accumulated in float64 rather than collected in a
    Python list.

    Args:
        model: Module called as ``model(user_indices, item_indices)``.
        user_indices: Array-like of integer user indices.
        item_indices: Array-like of integer item indices.
        ratings: Array-like of true ratings.
        device: Device the index tensors are moved to.
        batch_size: Rows scored per forward pass.

    Returns:
        The RMSE as a Python float, or NaN when there are no rows.

    Raises:
        ValueError: If the three inputs do not all have the same length.
    """
    n_rows = len(user_indices)
    if not (n_rows == len(item_indices) == len(ratings)):
        raise ValueError(
            "user_indices, item_indices and ratings must have the same length, got "
            f"{n_rows}, {len(item_indices)} and {len(ratings)}"
        )
    if n_rows == 0:
        return float('nan')

    model.eval()
    squared_error_sum = 0.0

    with torch.no_grad():
        for start in range(0, n_rows, batch_size):
            stop = start + batch_size
            batch_users = torch.tensor(np.asarray(user_indices[start:stop]), dtype=torch.long).to(device)
            batch_items = torch.tensor(np.asarray(item_indices[start:stop]), dtype=torch.long).to(device)
            batch_ratings = torch.tensor(np.asarray(ratings[start:stop]), dtype=torch.float64)

            # reshape(-1) keeps a final slice of one row from collapsing to 0-d.
            preds = model(batch_users, batch_items).reshape(-1).double().cpu()
            squared_error_sum += (preds - batch_ratings).pow(2).sum().item()

    return float(np.sqrt(squared_error_sum / n_rows))

# RMSE restricted to the test rows whose item was flagged as cold-start above.
cold_start_rmse = calculate_rmse_subset(
    model,
    cold_start_test_df['user_idx'].values,
    cold_start_test_df['item_idx'].values,
    cold_start_test_df['overall'].values,
    device
)

# Print the cold-start figure next to the whole-test-set RMSE (`rmse`, computed
# in the earlier metrics cell) for comparison.
print("\n=== Cold-Start Performance Report ===")
print(f"Overall Test RMSE: {rmse:.4f}")
print(f"Cold-Start Items RMSE: {cold_start_rmse:.4f}")

Number of cold-start items (fewer than 6 interactions): 8796
Cold-start test instances: 8847

=== Cold-Start Performance Report ===
Overall Test RMSE: 1.1517
Cold-Start Items RMSE: 1.2329
