In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from tqdm import tqdm
import matplotlib.pyplot as plt
import gc
import os
import pickle

print("\n" + "="*80)
print("STEP 1: SETUP AND DATA LOADING")
print("="*80)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)

# Load data
reviews_df = pd.read_csv("/content/drive/My Drive/bt4222data/Reviews Data Cleaned/cleaned_reviews.csv", keep_default_na=False)
metadata_df = pd.read_csv("/content/drive/My Drive/bt4222data/Meta Data Cleaned/final_metadata_cleaned.csv", keep_default_na=False)


# Print basic info about the datasets
print(f"Reviews shape: {reviews_df.shape}")
print(f"Metadata shape: {metadata_df.shape}")

print("\n" + "="*80)
print("STEP 2: DATA PREPROCESSING")
print("="*80)

# Filter to common ASINs if needed
common_asins = set(reviews_df['asin']).intersection(set(metadata_df['asin']))
print(f"Common ASINs: {len(common_asins)}")

reviews_df = reviews_df[reviews_df['asin'].isin(common_asins)]
metadata_df = metadata_df[metadata_df['asin'].isin(common_asins)]

# Convert ratings to binary (1 if rating >= 3, else 0)
reviews_df['interaction'] = (reviews_df['overall'] >= 3).astype(int)

# Display binary interaction distribution
print(f"Interaction distribution (binary):\n{reviews_df['interaction'].value_counts()}")

# Encode user and item IDs
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

reviews_df['user_idx'] = user_encoder.fit_transform(reviews_df['reviewerID'])
reviews_df['item_idx'] = item_encoder.fit_transform(reviews_df['asin'])

# Create a mapping from asin to item_idx for later use
asin_to_idx = dict(zip(reviews_df['asin'], reviews_df['item_idx']))

# Add item_idx to metadata_df
metadata_df['item_idx'] = metadata_df['asin'].map(asin_to_idx)

# Fill any NaN in description with empty string
metadata_df['description'] = metadata_df['description'].fillna('')
metadata_df['title'] = metadata_df['title'].fillna('')

print(f"Unique users: {reviews_df['reviewerID'].nunique()}")
print(f"Unique items: {reviews_df['asin'].nunique()}")

print("\n" + "="*80)
print("STEP 3: LOAD PRECOMPUTED BERT EMBEDDINGS")
print("="*80)

# Load the precomputed BERT embeddings
bert_embeddings_path = "/content/drive/My Drive/bt4222data/embeddings/metadata_embeddings.npz"
print(f"Loading precomputed BERT embeddings from {bert_embeddings_path}")

try:
    loaded_data = np.load(bert_embeddings_path)
    # Depending on how the embeddings were saved, you may need to adjust this
    if 'embeddings' in loaded_data:
        bert_embeddings = loaded_data['embeddings']
    else:
        bert_embeddings = loaded_data['arr_0']  # Default key when saving with np.savez

    print(f"Loaded BERT embeddings with shape: {bert_embeddings.shape}")
except Exception as e:
    print(f"Error loading BERT embeddings: {e}")
    raise

# Ensure metadata is sorted by item_idx
metadata_df = metadata_df.sort_values('item_idx').reset_index(drop=True)

# If necessary, verify that the embedding order matches the sorted metadata
print(f"Number of items with embeddings: {len(bert_embeddings)}")
print(f"Number of items in metadata: {len(metadata_df)}")

if len(bert_embeddings) != len(metadata_df):
    print("Warning: BERT embeddings count does not match metadata count. Adjusting...")
    # This might need more sophisticated handling depending on your data

# Get the embedding dimension
bert_embedding_dim = bert_embeddings.shape[1]
print(f"BERT embedding dimension: {bert_embedding_dim}")

print("\n" + "="*80)
print("STEP 4: AUTOENCODER FOR FEATURE EXTRACTION")
print("="*80)

class StackedDenoisingAutoencoder(nn.Module):
    """
    Stacked Denoising Autoencoder (SDAE) for extracting latent features from BERT embeddings
    """
    def __init__(self, input_dim, hidden_dims, dropout_rate=0.2):
        super(StackedDenoisingAutoencoder, self).__init__()

        # Ensure hidden_dims is a list
        if not isinstance(hidden_dims, list):
            hidden_dims = [hidden_dims]

        # Create encoder layers
        encoder_layers = []
        prev_dim = input_dim
        for i, dim in enumerate(hidden_dims):
            encoder_layers.append(nn.Linear(prev_dim, dim))
            if i < len(hidden_dims) - 1:  # No activation on bottleneck layer
                encoder_layers.append(nn.BatchNorm1d(dim))
                encoder_layers.append(nn.ReLU())
                encoder_layers.append(nn.Dropout(dropout_rate))
            prev_dim = dim
        self.encoder = nn.Sequential(*encoder_layers)

        # Create decoder layers (in reverse)
        decoder_layers = []
        prev_dim = hidden_dims[-1]  # Start from bottleneck
        for i, dim in enumerate(reversed(hidden_dims[:-1])):  # Skip bottleneck
            decoder_layers.append(nn.Linear(prev_dim, dim))
            decoder_layers.append(nn.BatchNorm1d(dim))
            decoder_layers.append(nn.ReLU())
            decoder_layers.append(nn.Dropout(dropout_rate))
            prev_dim = dim

        # Final reconstruction layer
        decoder_layers.append(nn.Linear(prev_dim, input_dim))
        self.decoder = nn.Sequential(*decoder_layers)

    def add_noise(self, x, noise_factor=0.2):
        """Add Gaussian noise to the input"""
        noisy_x = x + noise_factor * torch.randn_like(x)
        return noisy_x

    def forward(self, x, add_noise=True):
        # Add noise for denoising effect during training
        if add_noise and self.training:
            x = self.add_noise(x)

        # Encode
        encoded = self.encoder(x)
        # Decode
        decoded = self.decoder(encoded)

        return decoded, encoded

    def encode(self, x):
        """Get the latent representation"""
        return self.encoder(x)


# Define SDAE for GMF and MLP
GMF_HIDDEN_DIMS = [128, 64, 32, 16, 8]  # Latent dim is 8
MLP_HIDDEN_DIMS = [256, 128, 64, 32, 16, 8]  # Latent dim is 8

# Define and train SDAEs
def train_autoencoder(autoencoder, input_data, batch_size=64, epochs=10, lr=0.001, weight_decay=1e-5):
    """Train the autoencoder"""
    autoencoder.to(device)
    autoencoder.train()

    # Create dataset and dataloader
    tensor_data = torch.tensor(input_data, dtype=torch.float).to(device)
    dataset = torch.utils.data.TensorDataset(tensor_data)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters(), lr=lr, weight_decay=weight_decay)

    # Training loop
    losses = []
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            x = batch[0]

            # Forward
            reconstructed, _ = autoencoder(x)
            loss = criterion(reconstructed, x)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * len(x)

        avg_loss = total_loss / len(tensor_data)
        losses.append(avg_loss)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.6f}")

    return losses

# Initialize GMF and MLP SDAEs
gmf_sdae = StackedDenoisingAutoencoder(bert_embedding_dim, GMF_HIDDEN_DIMS)
mlp_sdae = StackedDenoisingAutoencoder(bert_embedding_dim, MLP_HIDDEN_DIMS)

# Check if saved models exist
MODELS_DIR = "models/"
os.makedirs(MODELS_DIR, exist_ok=True)
GMF_SDAE_PATH = os.path.join(MODELS_DIR, "gmf_sdae.pth")
MLP_SDAE_PATH = os.path.join(MODELS_DIR, "mlp_sdae.pth")

# Train or load GMF SDAE
if os.path.exists(GMF_SDAE_PATH):
    print(f"Loading pretrained GMF SDAE from {GMF_SDAE_PATH}")
    gmf_sdae.load_state_dict(torch.load(GMF_SDAE_PATH, map_location=device))
else:
    print("Training GMF SDAE...")
    gmf_losses = train_autoencoder(gmf_sdae, bert_embeddings, batch_size=128, epochs=10)
    torch.save(gmf_sdae.state_dict(), GMF_SDAE_PATH)

# Train or load MLP SDAE
if os.path.exists(MLP_SDAE_PATH):
    print(f"Loading pretrained MLP SDAE from {MLP_SDAE_PATH}")
    mlp_sdae.load_state_dict(torch.load(MLP_SDAE_PATH, map_location=device))
else:
    print("Training MLP SDAE...")
    mlp_losses = train_autoencoder(mlp_sdae, bert_embeddings, batch_size=128, epochs=10)
    torch.save(mlp_sdae.state_dict(), MLP_SDAE_PATH)

# Move to CPU for feature extraction
gmf_sdae.to('cpu')
mlp_sdae.to('cpu')
gmf_sdae.eval()
mlp_sdae.eval()

# Extract latent features
with torch.no_grad():
    bert_tensor = torch.tensor(bert_embeddings, dtype=torch.float)
    gmf_item_features = gmf_sdae.encode(bert_tensor).numpy()
    mlp_item_features = mlp_sdae.encode(bert_tensor).numpy()

print(f"GMF item features shape: {gmf_item_features.shape}")
print(f"MLP item features shape: {mlp_item_features.shape}")

print("\n" + "="*80)
print("STEP 5: DATASET AND DATALOADER CREATION")
print("="*80)

class NeuMFPlusPlusDataset(Dataset):
    def __init__(self, interactions_df, gmf_item_features, mlp_item_features):
        self.users = torch.tensor(interactions_df['user_idx'].values, dtype=torch.long)
        self.items = torch.tensor(interactions_df['item_idx'].values, dtype=torch.long)
        self.labels = torch.tensor(interactions_df['interaction'].values, dtype=torch.float)
        self.gmf_item_features = torch.tensor(gmf_item_features, dtype=torch.float)
        self.mlp_item_features = torch.tensor(mlp_item_features, dtype=torch.float)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        user_id = self.users[idx]
        item_id = self.items[idx]
        label = self.labels[idx]
        gmf_item_feature = self.gmf_item_features[item_id]
        mlp_item_feature = self.mlp_item_features[item_id]

        return user_id, item_id, gmf_item_feature, mlp_item_feature, label

# Train/validation/test split using stratified sampling
train_df, temp_df = train_test_split(
    reviews_df, test_size=0.3, random_state=seed,
    stratify=reviews_df['interaction']
)

val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=seed,
    stratify=temp_df['interaction']
)

print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")

# Create datasets and loaders
BATCH_SIZE = 1024

train_dataset = NeuMFPlusPlusDataset(train_df, gmf_item_features, mlp_item_features)
val_dataset = NeuMFPlusPlusDataset(val_df, gmf_item_features, mlp_item_features)
test_dataset = NeuMFPlusPlusDataset(test_df, gmf_item_features, mlp_item_features)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

print("\n" + "="*80)
print("STEP 6: MODEL DEFINITION")
print("="*80)

class NeuMFPlusPlus(nn.Module):
    """
    Neural Matrix Factorization Plus Plus (NeuMF++) model with separate feature extraction
    for GMF and MLP components, incorporating the learned latent features from BERT embeddings.
    """
    def __init__(self, num_users, num_items,
                 embedding_dim=8,
                 gmf_feature_dim=8,
                 mlp_feature_dim=8,
                 mlp_dims=[32, 16, 8],
                 dropout_rate=0.2):
        super(NeuMFPlusPlus, self).__init__()

        # GMF part
        self.user_gmf_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_gmf_embedding = nn.Embedding(num_items, embedding_dim)

        # MLP part
        self.user_mlp_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_mlp_embedding = nn.Embedding(num_items, embedding_dim)

        # Batch normalization for feature embeddings
        self.gmf_feature_bn = nn.BatchNorm1d(gmf_feature_dim)
        self.mlp_feature_bn = nn.BatchNorm1d(mlp_feature_dim)

        # MLP layers
        mlp_input_dim = embedding_dim * 2 + mlp_feature_dim  # user + item + feature
        self.mlp_layers = nn.ModuleList()
        self.mlp_batch_norms = nn.ModuleList()

        # First layer
        self.mlp_layers.append(nn.Linear(mlp_input_dim, mlp_dims[0]))
        self.mlp_batch_norms.append(nn.BatchNorm1d(mlp_dims[0]))

        # Hidden layers
        for i in range(len(mlp_dims)-1):
            self.mlp_layers.append(nn.Linear(mlp_dims[i], mlp_dims[i+1]))
            self.mlp_batch_norms.append(nn.BatchNorm1d(mlp_dims[i+1]))

        # Output layer
        # GMF part: embedding_dim + feature_dim
        # MLP part: mlp_dims[-1]
        self.output_layer = nn.Linear(embedding_dim + gmf_feature_dim + mlp_dims[-1], 1)

        # Activation and dropout
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.sigmoid = nn.Sigmoid()

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        # Initialize embeddings with normal distribution
        nn.init.normal_(self.user_gmf_embedding.weight, std=0.01)
        nn.init.normal_(self.item_gmf_embedding.weight, std=0.01)
        nn.init.normal_(self.user_mlp_embedding.weight, std=0.01)
        nn.init.normal_(self.item_mlp_embedding.weight, std=0.01)

        # Initialize linear layers with Xavier/Glorot
        for layer in self.mlp_layers:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

        nn.init.xavier_uniform_(self.output_layer.weight)
        nn.init.zeros_(self.output_layer.bias)

    def forward(self, user_indices, item_indices, gmf_item_features, mlp_item_features):
        # GMF part
        user_gmf_emb = self.user_gmf_embedding(user_indices)
        item_gmf_emb = self.item_gmf_embedding(item_indices)

        # Process GMF item features
        gmf_item_features = self.gmf_feature_bn(gmf_item_features)

        # Element-wise product for GMF
        # Combine embedding with features
        gmf_vector = torch.cat([user_gmf_emb * item_gmf_emb, gmf_item_features], dim=1)

        # MLP part
        user_mlp_emb = self.user_mlp_embedding(user_indices)
        item_mlp_emb = self.item_mlp_embedding(item_indices)

        # Process MLP item features
        mlp_item_features = self.mlp_feature_bn(mlp_item_features)

        # Concatenate for MLP
        mlp_input = torch.cat([user_mlp_emb, item_mlp_emb, mlp_item_features], dim=1)

        # Apply MLP layers
        for i, layer in enumerate(self.mlp_layers):
            mlp_input = layer(mlp_input)
            mlp_input = self.mlp_batch_norms[i](mlp_input)
            mlp_input = self.relu(mlp_input)
            mlp_input = self.dropout(mlp_input)

        # Concatenate GMF and MLP parts
        concat_output = torch.cat([gmf_vector, mlp_input], dim=1)

        # Final prediction
        prediction = self.sigmoid(self.output_layer(concat_output))

        return prediction.squeeze()

print("\n" + "="*80)
print("STEP 7: TRAINING AND EVALUATION FUNCTIONS")
print("="*80)

def train_epoch(model, train_loader, optimizer, criterion, device):
    """
    Train model for one epoch
    """
    model.train()
    total_loss = 0

    train_bar = tqdm(train_loader, desc="Training")
    for user_ids, item_ids, gmf_item_features, mlp_item_features, labels in train_bar:
        # Move tensors to device
        user_ids = user_ids.to(device)
        item_ids = item_ids.to(device)
        gmf_item_features = gmf_item_features.to(device)
        mlp_item_features = mlp_item_features.to(device)
        labels = labels.to(device)

        # Forward pass
        predictions = model(user_ids, item_ids, gmf_item_features, mlp_item_features)
        loss = criterion(predictions, labels)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * len(labels)
        train_bar.set_postfix({'loss': loss.item()})

    return total_loss / len(train_loader.dataset)

def evaluate(model, data_loader, criterion, device):
    """
    Evaluate model on a dataset
    """
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0

    with torch.no_grad():
        for user_ids, item_ids, gmf_item_features, mlp_item_features, labels in tqdm(data_loader, desc="Evaluating"):
            # Move tensors to device
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            gmf_item_features = gmf_item_features.to(device)
            mlp_item_features = mlp_item_features.to(device)
            labels = labels.to(device)

            # Get predictions
            predictions = model(user_ids, item_ids, gmf_item_features, mlp_item_features)
            loss = criterion(predictions, labels)
            total_loss += loss.item() * len(labels)

            # Store predictions and labels
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Convert to numpy arrays
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    # Convert predictions to binary (threshold = 0.5)
    binary_preds = (all_preds >= 0.5).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(all_labels, binary_preds)
    precision = precision_score(all_labels, binary_preds)
    recall = recall_score(all_labels, binary_preds)
    f1 = f1_score(all_labels, binary_preds)
    auc = roc_auc_score(all_labels, all_preds)

    avg_loss = total_loss / len(data_loader.dataset)

    return {
        'loss': avg_loss,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
        'predictions': all_preds,
        'labels': all_labels
    }

def evaluate_cold_start_items(model, all_df, test_df, gmf_item_features, mlp_item_features, min_interactions=5):
    """
    Evaluate model specifically on cold start items from the test set.
    Cold start items are defined as items with minimal user interactions.
    """
    print("\nEvaluating model on cold start items...")

    # Count interactions per item in the full dataset
    item_counts = all_df['asin'].value_counts()

    # Find cold start items that appear in the test set
    cold_start_items = set(item_counts[item_counts <= min_interactions].index) & set(test_df['asin'].unique())

    if not cold_start_items:
        print(f"No cold start items found with <= {min_interactions} interactions in the test set.")
        return None

    print(f"Found {len(cold_start_items)} cold start items in the test set.")

    # Filter test data to only include cold start items
    cold_start_test_df = test_df[test_df['asin'].isin(cold_start_items)]

    if len(cold_start_test_df) == 0:
        print("No data available for cold start evaluation.")
        return None

    print(f"Number of interactions with cold start items: {len(cold_start_test_df)}")

    # Create dataset and dataloader
    cold_start_dataset = NeuMFPlusPlusDataset(cold_start_test_df, gmf_item_features, mlp_item_features)
    cold_start_loader = DataLoader(cold_start_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Evaluate using the same criterion
    criterion = nn.BCELoss()
    cold_start_metrics = evaluate(model, cold_start_loader, criterion, device)

    print("Cold Start Items Metrics:")
    print(f"Loss: {cold_start_metrics['loss']:.4f}")
    print(f"Accuracy: {cold_start_metrics['accuracy']:.4f}")
    print(f"Precision: {cold_start_metrics['precision']:.4f}")
    print(f"Recall: {cold_start_metrics['recall']:.4f}")
    print(f"F1 Score: {cold_start_metrics['f1']:.4f}")
    print(f"AUC: {cold_start_metrics['auc']:.4f}")

    return cold_start_metrics

def generate_recommendations(model, user_encoder, item_encoder, all_df, test_df, gmf_item_features, mlp_item_features, n_recommendations=5):
    """
    Generate n recommendations for users in the test set.
    """
    model.eval()

    # Get unique users from test set
    test_users = test_df['reviewerID'].unique()

    # Get all items
    all_items = all_df['asin'].unique()

    # Create tensors of all item features
    gmf_item_features_tensor = torch.tensor(gmf_item_features, dtype=torch.float)
    mlp_item_features_tensor = torch.tensor(mlp_item_features, dtype=torch.float)

    recommendations = {}

    for user in tqdm(test_users[:10], desc="Generating recommendations"):  # Just do 10 users for demo
        user_idx = user_encoder.transform([user])[0]

        # Get items the user hasn't interacted with yet
        user_items = all_df[all_df['reviewerID'] == user]['asin'].values
        unseen_items = np.setdiff1d(all_items, user_items)

        if len(unseen_items) == 0:
            print(f"User {user} has interacted with all items!")
            continue

        # Convert unseen items to indices
        unseen_item_indices = torch.tensor([asin_to_idx[asin] for asin in unseen_items if asin in asin_to_idx], dtype=torch.long)

        if len(unseen_item_indices) == 0:
            print(f"No valid unseen items for user {user}")
            continue

        # Get features for unseen items
        gmf_item_feat = gmf_item_features_tensor[unseen_item_indices]
        mlp_item_feat = mlp_item_features_tensor[unseen_item_indices]

        # Process in batches to avoid memory issues
        batch_size = 1024
        all_scores = []

        for i in range(0, len(unseen_item_indices), batch_size):
            batch_indices = unseen_item_indices[i:i+batch_size]
            batch_gmf_features = gmf_item_feat[i:i+batch_size]
            batch_mlp_features = mlp_item_feat[i:i+batch_size]

            user_tensor = torch.tensor([user_idx] * len(batch_indices), dtype=torch.long).to(device)
            item_tensor = batch_indices.to(device)
            gmf_item_tensor = batch_gmf_features.to(device)
            mlp_item_tensor = batch_mlp_features.to(device)

            with torch.no_grad():
                scores = model(user_tensor, item_tensor, gmf_item_tensor, mlp_item_tensor)
                all_scores.append(scores.cpu().numpy())

        if all_scores:
            all_scores = np.concatenate(all_scores)

            # Get the indices of the top N scores
            if len(all_scores) >= n_recommendations:
                top_n_indices = np.argsort(all_scores)[-n_recommendations:][::-1]
                recommended_items = [unseen_items[i] for i in top_n_indices]
            else:
                # If we have fewer items than requested recommendations
                recommended_items = [unseen_items[i] for i in np.argsort(all_scores)[::-1]]

            recommendations[user] = recommended_items
        else:
            print(f"No scores computed for user {user}")

    return recommendations

print("\n" + "="*80)
print("STEP 8: MODEL TRAINING WITH EARLY STOPPING")
print("="*80)

# Model parameters
num_users = len(user_encoder.classes_)
num_items = len(item_encoder.classes_)
gmf_feature_dim = gmf_item_features.shape[1]
mlp_feature_dim = mlp_item_features.shape[1]

print(f"Number of users: {num_users}")
print(f"Number of items: {num_items}")
print(f"GMF feature dimension: {gmf_feature_dim}")
print(f"MLP feature dimension: {mlp_feature_dim}")

# Model hyperparameters
EMBEDDING_DIM = 8
MLP_DIMS = [32, 16, 8]
LEARNING_RATE = 0.001
WEIGHT_DECAY = 1e-5
EPOCHS = 20
EARLY_STOPPING_PATIENCE = 3

# Initialize model
model = NeuMFPlusPlus(
    num_users=num_users,
    num_items=num_items,
    embedding_dim=EMBEDDING_DIM,
    gmf_feature_dim=gmf_feature_dim,
    mlp_feature_dim=mlp_feature_dim,
    mlp_dims=MLP_DIMS
).to(device)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Training loop with early stopping
train_losses = []
val_losses = []
val_aucs = []
best_val_auc = 0
patience_counter = 0
MODEL_SAVE_DIR = "models/"
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
best_model_path = os.path.join(MODEL_SAVE_DIR, 'neumf_plus_plus_best.pth')

for epoch in range(EPOCHS):
    # Train
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    train_losses.append(train_loss)

    # Validate
    val_metrics = evaluate(model, val_loader, criterion, device)
    val_losses.append(val_metrics['loss'])
    val_aucs.append(val_metrics['auc'])

    # Print metrics
    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_metrics['loss']:.4f}, Val Accuracy: {val_metrics['accuracy']:.4f}")
    print(f"Val Precision: {val_metrics['precision']:.4f}, Val Recall: {val_metrics['recall']:.4f}")
    print(f"Val F1 Score: {val_metrics['f1']:.4f}, Val AUC: {val_metrics['auc']:.4f}")

    # Early stopping based on validation AUC
    if val_metrics['auc'] > best_val_auc:
        best_val_auc = val_metrics['auc']
        patience_counter = 0

        # Save the best model
        torch.save({
            'model_state_dict': model.state_dict(),
            'num_users': num_users,
            'num_items': num_items,
            'gmf_feature_dim': gmf_feature_dim,
            'mlp_feature_dim': mlp_feature_dim,
            'embedding_dim': EMBEDDING_DIM,
            'mlp_dims': MLP_DIMS,
            'best_val_auc': best_val_auc
        }, best_model_path)
        print(f"Saved best model with Val AUC: {best_val_auc:.4f}")
    else:
        patience_counter += 1
        print(f"Early stopping patience: {patience_counter}/{EARLY_STOPPING_PATIENCE}")

    if patience_counter >= EARLY_STOPPING_PATIENCE:
        print("Early stopping triggered!")
        break

    print("-" * 50)

print("\n" + "="*80)
print("STEP 9: MODEL EVALUATION")
print("="*80)

# Try to load the best model, but continue with the current model if loading fails
try:
    checkpoint = torch.load(best_model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded best model with validation AUC: {checkpoint['best_val_auc']:.4f}")
except (FileNotFoundError, RuntimeError, pickle.UnpicklingError) as e:
    print(f"Could not load saved model: {e}")
    print("Continuing with current model state.")

# Evaluate on test set
test_metrics = evaluate(model, test_loader, criterion, device)
print("\nTest Results:")
print(f"Test Loss: {test_metrics['loss']:.4f}")
print(f"Test Accuracy: {test_metrics['accuracy']:.4f}")
print(f"Test Precision: {test_metrics['precision']:.4f}")
print(f"Test Recall: {test_metrics['recall']:.4f}")
print(f"Test F1 Score: {test_metrics['f1']:.4f}")
print(f"Test AUC: {test_metrics['auc']:.4f}")

# Evaluate on cold start items
all_df = pd.concat([train_df, val_df, test_df])
cold_start_metrics = evaluate_cold_start_items(
    model=model,
    all_df=all_df,
    test_df=test_df,
    gmf_item_features=gmf_item_features,
    mlp_item_features=mlp_item_features,
    min_interactions=5  # Define cold start items as those with <= 5 interactions
)

print("\n" + "="*80)
print("STEP 10: GENERATE RECOMMENDATIONS")
print("="*80)

# Generate recommendations
recommendations = generate_recommendations(
    model=model,
    user_encoder=user_encoder,
    item_encoder=item_encoder,
    all_df=all_df,
    test_df=test_df,
    gmf_item_features=gmf_item_features,
    mlp_item_features=mlp_item_features,
    n_recommendations=5
)

# Display sample recommendations
print("\nSample Recommendations:")
for i, (user, items) in enumerate(recommendations.items()):
    if i >= 3:  # Just show 3 users for brevity
        break
    print(f"\nRecommendations for user {user}:")
    for item in items:
        print(f"Item: {item}")

print("\n" + "="*80)
print("STEP 11: PLOT TRAINING CURVES")
print("="*80)

# Plot training curves
plt.figure(figsize=(15, 5))

# Loss curves
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

# AUC curve
plt.subplot(1, 2, 2)
plt.plot(val_aucs, label='Validation AUC')
plt.xlabel('Epoch')
plt.ylabel('AUC')
plt.title('Validation AUC')
plt.legend()

plt.tight_layout()
plt.savefig("training_curves.png")
plt.close()
print("Training curves saved to training_curves.png")

print("\nNeuMF++ with BERT and Autoencoders implementation complete!")

# Memory cleanup
del train_dataset, val_dataset, test_dataset
del train_loader, val_loader, test_loader
gc.collect()
torch.cuda.empty_cache()


STEP 1: SETUP AND DATA LOADING
Using device: cuda
Reviews shape: (1689188, 18)
Metadata shape: (492009, 19)

STEP 2: DATA PREPROCESSING
Common ASINs: 61709


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df['interaction'] = (reviews_df['overall'] >= 3).astype(int)


Interaction distribution (binary):
interaction
1    1436504
0     183627
Name: count, dtype: int64
Unique users: 192395
Unique items: 61709

STEP 3: LOAD PRECOMPUTED BERT EMBEDDINGS
Loading precomputed BERT embeddings from /content/drive/My Drive/bt4222data/embeddings/metadata_embeddings.npz
Loaded BERT embeddings with shape: (498191, 384)
Number of items with embeddings: 498191
Number of items in metadata: 61709
BERT embedding dimension: 384

STEP 4: AUTOENCODER FOR FEATURE EXTRACTION
Training GMF SDAE...
Epoch [1/10], Loss: 0.003245
Epoch [2/10], Loss: 0.002022
Epoch [3/10], Loss: 0.001950
Epoch [4/10], Loss: 0.001949
Epoch [5/10], Loss: 0.001948
Epoch [6/10], Loss: 0.001947
Epoch [7/10], Loss: 0.001947
Epoch [8/10], Loss: 0.001947
Epoch [9/10], Loss: 0.001948
Epoch [10/10], Loss: 0.001947
Training MLP SDAE...
Epoch [1/10], Loss: 0.002981
Epoch [2/10], Loss: 0.002008
Epoch [3/10], Loss: 0.001967
Epoch [4/10], Loss: 0.001967
Epoch [5/10], Loss: 0.001967
Epoch [6/10], Loss: 0.001967
Ep

Training: 100%|██████████| 1108/1108 [00:11<00:00, 93.43it/s, loss=0.309]
Evaluating: 100%|██████████| 238/238 [00:02<00:00, 95.28it/s] 


Epoch 1/20
Train Loss: 0.3608
Val Loss: 0.3280, Val Accuracy: 0.8867
Val Precision: 0.8867, Val Recall: 1.0000
Val F1 Score: 0.9399, Val AUC: 0.7005
Saved best model with Val AUC: 0.7005
--------------------------------------------------


Training: 100%|██████████| 1108/1108 [00:11<00:00, 92.47it/s, loss=0.295]
Evaluating: 100%|██████████| 238/238 [00:02<00:00, 87.82it/s] 


Epoch 2/20
Train Loss: 0.2980
Val Loss: 0.3285, Val Accuracy: 0.8866
Val Precision: 0.8868, Val Recall: 0.9996
Val F1 Score: 0.9399, Val AUC: 0.7146
Saved best model with Val AUC: 0.7146
--------------------------------------------------


Training: 100%|██████████| 1108/1108 [00:12<00:00, 91.46it/s, loss=0.324]
Evaluating: 100%|██████████| 238/238 [00:02<00:00, 95.19it/s] 


Epoch 3/20
Train Loss: 0.2577
Val Loss: 0.3536, Val Accuracy: 0.8854
Val Precision: 0.8873, Val Recall: 0.9974
Val F1 Score: 0.9391, Val AUC: 0.7080
Early stopping patience: 1/3
--------------------------------------------------


Training: 100%|██████████| 1108/1108 [00:11<00:00, 92.83it/s, loss=0.241] 
Evaluating: 100%|██████████| 238/238 [00:02<00:00, 94.63it/s] 


Epoch 4/20
Train Loss: 0.2257
Val Loss: 0.3976, Val Accuracy: 0.8825
Val Precision: 0.8888, Val Recall: 0.9915
Val F1 Score: 0.9373, Val AUC: 0.7005
Early stopping patience: 2/3
--------------------------------------------------


Training: 100%|██████████| 1108/1108 [00:11<00:00, 92.38it/s, loss=0.206]
Evaluating: 100%|██████████| 238/238 [00:02<00:00, 88.22it/s] 


Epoch 5/20
Train Loss: 0.2037
Val Loss: 0.4495, Val Accuracy: 0.8537
Val Precision: 0.9025, Val Recall: 0.9360
Val F1 Score: 0.9190, Val AUC: 0.6970
Early stopping patience: 3/3
Early stopping triggered!

STEP 9: MODEL EVALUATION
Could not load saved model: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL numpy._core.multiarray.scalar was not an allowed global by default. Please use `torch.serialization.add_safe

Evaluating: 100%|██████████| 238/238 [00:02<00:00, 90.68it/s] 



Test Results:
Test Loss: 0.4429
Test Accuracy: 0.8536
Test Precision: 0.9027
Test Recall: 0.9358
Test F1 Score: 0.9189
Test AUC: 0.7005

Evaluating model on cold start items...
Found 4857 cold start items in the test set.
Number of interactions with cold start items: 6583


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 63.37it/s]

Cold Start Items Metrics:
Loss: 0.5111
Accuracy: 0.7957
Precision: 0.8788
Recall: 0.8845
F1 Score: 0.8817
AUC: 0.6485

STEP 10: GENERATE RECOMMENDATIONS



Generating recommendations: 100%|██████████| 10/10 [00:06<00:00,  1.53it/s]



Sample Recommendations:

Recommendations for user A15PUGYZ6C2IPU:
Item: B008JCVF0U
Item: B0044YPN0A
Item: B000V5P90K
Item: B000OG6I6A
Item: B000053HC5

Recommendations for user AKXT3E60ZZQCY:
Item: B000UZH7P6
Item: B00004TX71
Item: B007HSKSP0
Item: B00003G1RG
Item: B000RZNI4S

Recommendations for user ABHM4V3BH2C2T:
Item: B0044YPN0A
Item: B001BTCSI6
Item: B000OG6I6A
Item: B000V5P90K
Item: B000053HC5

STEP 11: PLOT TRAINING CURVES
Training curves saved to training_curves.png

NeuMF++ with BERT and Autoencoders implementation complete!
