In [7]:
from typing import Tuple, Callable, Dict, List, Set

import pandas as pd
import numpy as np
# torch imports are not needed for this NumPy SVD++
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import os
import time
import copy

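Fix the random seed so that the train/validation split, the parameter initialisation, and the SGD shuffling order are reproducible when the notebook is run top to bottom on a fresh kernel.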

In [8]:
# Single seed shared by the whole notebook: it is passed as `random_state` to
# `train_test_split` and seeds NumPy's legacy global RNG, which the SVD++ trainer
# uses for `np.random.normal` (parameter init) and `np.random.permutation` (SGD order).
# NOTE(review): the global RNG is consumed sequentially, so re-running a single
# training cell without re-running this one will not reproduce the same numbers.
SEED = 42
np.random.seed(SEED)

## Helper functions & Configuration

In [None]:
# Directory containing train_ratings.csv, train_tbr.csv and sample_submission.csv.
DATA_DIR = "/cluster/courses/cil/collaborative_filtering/data" # Adjust to your path
# Alternative for a local run: set DATA_DIR = "" so the CSV names are resolved
# relative to the current working directory.

# Hyperparameters for SVD++ with Wishlist Integration
SVD_N_FACTORS = 50      # Dimensionality of the latent factor vectors p, q and y
SVD_LR = 0.007       # SGD learning rate (shared by biases and factors), may need tuning
SVD_REG = 0.04         # L2 regularization strength (shared by biases and factors), may need tuning
SVD_N_EPOCHS_VALID = 20 # Maximum number of epochs for the run that uses a validation set
SVD_VALID_PATIENCE = 3  # Early stopping patience for the validation run
# Maximum number of epochs for the final training on all data.
# NOTE(review): the original comment says this value was the best epoch count found
# on the validation set, but the validation run is capped at SVD_N_EPOCHS_VALID = 20
# epochs - confirm where 45 was tuned.
SVD_N_EPOCHS_FULL = 45
SVD_FULL_PATIENCE = 5   # Early stopping patience for full training (no validation data, so it watches train RMSE only)


In [None]:

def read_data_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the rating triples and split them into training and validation sets.

    The CSV parsing (splitting ``sid_pid`` into integer ``sid`` / ``pid`` columns)
    is delegated to ``read_full_training_data`` so the two loaders cannot drift
    apart.

    Returns:
        ``(train_df, valid_df)``: a random 75% / 25% split of the ratings,
        made reproducible by ``random_state=SEED``.
    """
    ratings = read_full_training_data()
    train_df, valid_df = train_test_split(ratings, test_size=0.25, random_state=SEED)
    return train_df, valid_df

def read_full_training_data() -> pd.DataFrame:
    """Load every rating from ``train_ratings.csv`` without splitting.

    The combined ``sid_pid`` key is split on ``_`` into two integer columns,
    ``sid`` and ``pid``, and the original key column is removed.
    """
    ratings_path = os.path.join(DATA_DIR, "train_ratings.csv")
    ratings = pd.read_csv(ratings_path)

    id_parts = ratings["sid_pid"].str.split("_", expand=True)
    ratings[["sid", "pid"]] = id_parts

    ratings = ratings.drop(columns="sid_pid")
    return ratings.astype({"sid": int, "pid": int})

def evaluate_model_predictions(true_ratings: np.ndarray, pred_ratings: np.ndarray) -> float:
    """Return the RMSE between the true ratings and the predictions clipped to [1, 5]."""
    lowest_rating, highest_rating = 1.0, 5.0
    bounded_predictions = np.clip(pred_ratings, lowest_rating, highest_rating)
    rmse = root_mean_squared_error(true_ratings, bounded_predictions)
    return rmse

def evaluate_with_model(model_dict: Dict, eval_df: pd.DataFrame, 
                          pred_function: Callable[[Dict, np.ndarray, np.ndarray], np.ndarray]) -> float:
    """Score ``model_dict`` on ``eval_df`` and return the clipped RMSE.

    ``pred_function`` is called as ``pred_function(model_dict, sids, pids)`` with
    the ``sid`` and ``pid`` columns of ``eval_df``; its output is compared against
    the ``rating`` column. A missing or empty frame yields ``np.nan``.
    """
    nothing_to_score = eval_df is None or eval_df.empty
    if nothing_to_score:
        return np.nan

    user_ids = eval_df["sid"].values
    item_ids = eval_df["pid"].values
    predicted = pred_function(model_dict, user_ids, item_ids)

    targets = eval_df["rating"].values
    return evaluate_model_predictions(targets, predicted)

def make_submission(pred_fn_callable: Callable[[np.ndarray, np.ndarray], np.ndarray], filename: os.PathLike):
    """Write a submission CSV for the pairs listed in ``sample_submission.csv``.

    The ``sid_pid`` keys of the sample file are split into integer sid / pid
    arrays, ``pred_fn_callable(sids, pids)`` supplies the ratings, and the
    predictions are clipped to [1, 5] before the frame is saved to ``filename``.
    """
    sample_path = os.path.join(DATA_DIR, "sample_submission.csv")
    submission = pd.read_csv(sample_path)

    id_parts = submission["sid_pid"].str.split("_", expand=True)
    sids_sub_vals, pids_sub_vals = (id_parts[col].astype(int).values for col in (0, 1))

    raw_predictions = pred_fn_callable(sids_sub_vals, pids_sub_vals)
    submission["rating"] = np.clip(raw_predictions, 1.0, 5.0)

    submission.to_csv(filename, index=False)
    print(f"Submission file created: {filename}")

def plot_training_curves(n_total_epochs_run: int, train_rmse_hist: List[float], val_rmse_hist: List[float], title_prefix: str = ""):
    """Plot per-epoch training RMSE and, where available, validation RMSE.

    The x-axis covers as many epochs as ``train_rmse_hist`` has entries.
    Validation entries that are NaN (epochs without an evaluation) are left out
    of the validation curve. ``n_total_epochs_run`` is accepted but not used.
    """
    epochs_ran = len(train_rmse_hist)
    if not epochs_ran:
        print(f"{title_prefix}: No training history to plot.")
        return

    epoch_axis = range(1, epochs_ran + 1)
    plt.figure(figsize=(10, 5))
    plt.plot(epoch_axis, train_rmse_hist, "bo-", label="Training RMSE")

    # Keep only the (epoch, score) pairs for which a validation score exists.
    measured = [
        (epoch_no, score)
        for epoch_no, score in zip(epoch_axis, val_rmse_hist[:epochs_ran])
        if not np.isnan(score)
    ]
    if measured:
        val_epochs = [epoch_no for epoch_no, _ in measured]
        val_scores = [score for _, score in measured]
        plt.plot(val_epochs, val_scores, "ro-", label="Validation RMSE")

    plt.title(f"{title_prefix} Training & Validation RMSE (Up to {epochs_ran} Epochs)")
    plt.xlabel("Epochs")
    plt.ylabel("RMSE")
    plt.legend()
    plt.grid(True)
    plt.show()

## SVD++ with Integrated Wishlist Data

In [None]:
def train_svdpp_with_wishlist_integrated(train_df: pd.DataFrame, tbr_df: pd.DataFrame, 
                                         num_factors: int, lr: float, reg: float, n_epochs: int,
                                         valid_df: pd.DataFrame = None, 
                                         early_stopping_patience: int = 5,
                                         evaluate_every_n_epochs: int = 1) -> Tuple[Dict, List[float], List[float]]:
    """Train SVD++ by SGD, using rated plus wishlisted items as implicit feedback.

    Prediction rule: ``mu + b_u + b_i + q_i . (p_u + |N'(u)|^-1/2 * sum_{j in N'(u)} y_j)``
    where ``N'(u)`` is the union of the items user ``u`` rated in ``train_df`` and
    the items listed for ``u`` in ``tbr_df``.

    Args:
        train_df: ratings with columns ``sid``, ``pid``, ``rating``.
        tbr_df: wishlist pairs with columns ``sid``, ``pid``.
        num_factors: latent dimensionality of ``p``, ``q`` and ``y``.
        lr: SGD learning rate, shared by all parameters.
        reg: L2 regularization strength, shared by all parameters.
        n_epochs: maximum number of passes over the ratings.
        valid_df: optional validation ratings. When given (and non-empty), the best
            model and early stopping are driven by validation RMSE only; otherwise
            by training RMSE.
        early_stopping_patience: stop after this many consecutive metric checks
            without improvement (``<= 0`` disables early stopping). With validation
            data, only epochs on which validation is evaluated count as checks.
        evaluate_every_n_epochs: validation frequency; the last epoch is always
            evaluated. Values below 1 are treated as 1.

    Returns:
        ``(model, train_rmse_history, val_rmse_history)``. ``model`` is the parameter
        dict expected by ``svdpp_pred_wishlist_integrated`` (a snapshot of the best
        epoch, or the last state if no epoch ever improved). Both histories have one
        entry per epoch run; validation entries are NaN for epochs without evaluation.

    Note:
        The cached implicit vector ``y_sum[u]`` is kept exact for the user being
        updated, but other users sharing the same items only see those ``y``
        changes when ``y_sum`` is rebuilt at the start of the next epoch. This
        staleness is a speed/accuracy trade-off kept from the original code.
    """
    # 1) Remap raw ids to contiguous indices over ratings + wishlist.
    sids_all = np.union1d(train_df["sid"].unique(), tbr_df["sid"].unique())
    pids_all = np.union1d(train_df["pid"].unique(), tbr_df["pid"].unique())
    user2ind = {sid: i for i, sid in enumerate(sids_all)}
    item2ind = {pid: i for i, pid in enumerate(pids_all)}
    n_users, n_items = len(sids_all), len(pids_all)
    print(f"SVD++ (Wishlist Integrated): {n_users} users, {n_items} items")

    # 2) Index arrays for the explicit ratings (only from train_df).
    train_df_mappable = train_df[train_df["sid"].isin(sids_all) & train_df["pid"].isin(pids_all)]
    user_arr   = train_df_mappable["sid"].map(user2ind).to_numpy(dtype=np.int32)
    item_arr   = train_df_mappable["pid"].map(item2ind).to_numpy(dtype=np.int32)
    rating_arr = train_df_mappable["rating"].to_numpy(dtype=np.float32)

    # 3) Global mean of the ratings (3.5 as a neutral fallback when there are none).
    mu = np.float32(rating_arr.mean()) if len(rating_arr) > 0 else np.float32(3.5)

    # 4) Parameter initialisation. The draw order (p, q, y) is kept stable so a
    #    fixed global NumPy seed gives the same starting point across runs.
    b_u = np.zeros(n_users, np.float32)
    b_i = np.zeros(n_items, np.float32)
    p   = np.random.normal(0, 0.01, (n_users, num_factors)).astype(np.float32)
    q   = np.random.normal(0, 0.01, (n_items, num_factors)).astype(np.float32)
    y   = np.random.normal(0, 0.01, (n_items, num_factors)).astype(np.float32)

    # 5) Combined implicit lists: N'(u) = Rated(u) U Wishlisted(u).
    print("Building combined implicit feedback lists (ratings + wishlist)...")
    implicit_combined: Dict[int, Set[int]] = {u_idx: set() for u_idx in range(n_users)}
    for source_df in (train_df, tbr_df):
        for sid_orig, pid_orig in zip(source_df["sid"], source_df["pid"]):
            if sid_orig in user2ind and pid_orig in item2ind:
                implicit_combined[user2ind[sid_orig]].add(item2ind[pid_orig])

    Nu_list_combined: List[np.ndarray] = [np.array(list(implicit_combined[u_idx]), dtype=np.int32) for u_idx in range(n_users)]
    Nu_count_combined = np.array([len(a) for a in Nu_list_combined], dtype=np.int32)
    # sqrt(|N'(u)|), with 1.0 for users without implicit items. Clamping the count
    # to >= 1 before the sqrt avoids the divide-by-zero warning of 1 / sqrt(0).
    sqrt_Nu_for_pred = np.sqrt(np.maximum(Nu_count_combined, 1).astype(np.float32))
    sqrt_Nu_combined_inv = np.where(
        Nu_count_combined > 0, np.float32(1.0) / sqrt_Nu_for_pred, np.float32(0.0)
    ).astype(np.float32)

    def _model_dict(copy_arrays: bool) -> Dict:
        """Bundle the current parameters; copy the trainable arrays for a snapshot."""
        keep = (lambda arr: arr.copy()) if copy_arrays else (lambda arr: arr)
        return {
            "mu": mu, "b_u": keep(b_u), "b_i": keep(b_i),
            "p": keep(p), "q": keep(q), "y": keep(y),
            "user2ind": user2ind, "item2ind": item2ind,
            # Never mutated after construction, so sharing them is safe.
            "Nu_list_combined": Nu_list_combined,
            "sqrt_Nu_for_pred": sqrt_Nu_for_pred,
            "num_factors": num_factors,
        }

    # History and best-model tracking.
    train_rmse_history: List[float] = []
    val_rmse_history: List[float] = []
    best_metric = float("inf")
    epochs_no_improve = 0
    best_model_params: Dict = {}

    n_ratings = len(user_arr)
    if n_ratings == 0:
        print("SVD++ Wishlist: No ratings for training.")
        return _model_dict(copy_arrays=False), [], []

    use_validation = valid_df is not None and not valid_df.empty
    eval_every = max(1, int(evaluate_every_n_epochs))

    print(f"SVD++ Wishlist: Starting SGD ({n_epochs} epochs on {n_ratings} ratings)...")
    for epoch in range(n_epochs):
        epoch_start_time = time.time()
        # Running float64 sum instead of a list of n_ratings float32 scalars.
        squared_error_sum = 0.0

        # Rebuild y_sum[u] = sum_{j in N'(u)} y_j / sqrt(|N'(u)|) from the current y.
        y_sum = np.zeros((n_users, num_factors), np.float32)
        for u_idx in range(n_users):
            if Nu_count_combined[u_idx] > 0:
                y_sum[u_idx] = y[Nu_list_combined[u_idx]].sum(axis=0) / sqrt_Nu_for_pred[u_idx]

        perm = np.random.permutation(n_ratings)
        for idx in perm:
            u = user_arr[idx]; i = item_arr[idx]; r = rating_arr[idx]

            # `imp` is a view into y_sum; it must be consumed before y_sum[u] is updated below.
            imp = y_sum[u]
            pred = mu + b_u[u] + b_i[i] + q[i].dot(p[u] + imp)
            err  = r - pred
            squared_error_sum += float(err) * float(err)

            b_u[u] += lr * (err - reg * b_u[u])
            b_i[i] += lr * (err - reg * b_i[i])

            # All factor gradients are evaluated at the pre-step values of p_u and q_i.
            p_old = p[u].copy()
            q_old = q[i].copy()
            p[u] += lr * (err * q_old - reg * p_old)
            q[i] += lr * (err * (p_old + imp) - reg * q_old)

            if Nu_count_combined[u] > 0:
                idxs = Nu_list_combined[u]
                # d(pred)/d(y_j) = q_i / sqrt(|N'(u)|) for every j in N'(u).
                y[idxs] += lr * (err * q_old * sqrt_Nu_combined_inv[u] - reg * y[idxs])
                # Summing the |N'(u)| updates above and dividing by sqrt(|N'(u)|)
                # gives err * q_i (the two 1/sqrt factors cancel against the count),
                # so no extra 1/sqrt scaling belongs here.
                y_sum[u] += lr * (err * q_old - reg * y_sum[u])

        epoch_train_rmse = float(np.sqrt(squared_error_sum / n_ratings))
        train_rmse_history.append(epoch_train_rmse)

        evaluated_now = use_validation and ((epoch + 1) % eval_every == 0 or epoch == n_epochs - 1)
        if evaluated_now:
            current_val_rmse = evaluate_with_model(_model_dict(copy_arrays=False), valid_df, svdpp_pred_wishlist_integrated)
            val_rmse_history.append(current_val_rmse)
            print(f"  SVD++ Wishlist: Epoch {epoch+1}/{n_epochs}. Train RMSE: {epoch_train_rmse:.4f}, Valid RMSE: {current_val_rmse:.4f}. Took: {time.time() - epoch_start_time:.2f}s.")
        else:
            val_rmse_history.append(np.nan)
            print(f"  SVD++ Wishlist: Epoch {epoch+1}/{n_epochs}. Train RMSE: {epoch_train_rmse:.4f}. Took: {time.time() - epoch_start_time:.2f}s. (Val skipped)")
            if use_validation:
                # Train RMSE is not comparable with validation RMSE; wait for the
                # next evaluated epoch before touching best_metric / patience.
                continue

        stopping_metric = current_val_rmse if evaluated_now else epoch_train_rmse
        if stopping_metric < best_metric:
            best_metric = stopping_metric
            epochs_no_improve = 0
            best_model_params = _model_dict(copy_arrays=True)
            metric_label = "Valid" if evaluated_now else "Train"
            print(f"    New best metric ({metric_label} RMSE): {best_metric:.4f}. Model params saved.")
        else:
            epochs_no_improve += 1
        if early_stopping_patience > 0 and epochs_no_improve >= early_stopping_patience:
            print(f"    SVD++ Wishlist: Early stopping at epoch {epoch+1}.")
            break

    if not best_model_params:  # Loop did not run, or the metric never improved (e.g. NaN)
        print("SVD++ Wishlist: No best model found, returning last state.")
        best_model_params = _model_dict(copy_arrays=False)
    return best_model_params, train_rmse_history, val_rmse_history

def svdpp_pred_wishlist_integrated(model: Dict, sids_arr: np.ndarray, pids_arr: np.ndarray) -> np.ndarray:
    """Predict ratings with an SVD++ model trained with integrated wishlist data.

    Args:
        model: parameter dict produced by ``train_svdpp_with_wishlist_integrated``
            (keys ``mu``, ``b_u``, ``b_i``, ``p``, ``q``, ``y``, ``user2ind``,
            ``item2ind``, ``Nu_list_combined``, ``sqrt_Nu_for_pred``, ``num_factors``).
        sids_arr: raw user ids, one per requested prediction.
        pids_arr: raw item ids, aligned with ``sids_arr``.

    Returns:
        float32 array of unclipped predictions. Pairs whose user or item is
        unknown to the model fall back to the global mean ``mu``.

    Raises:
        ValueError: if ``sids_arr`` and ``pids_arr`` differ in length.
    """
    mu = model["mu"]
    user2ind = model["user2ind"]
    item2ind = model["item2ind"]
    b_u = model["b_u"]
    b_i = model["b_i"]
    p = model["p"]
    q = model["q"]
    y = model["y"]
    num_factors = model["num_factors"]
    Nu_list_combined = model["Nu_list_combined"]  # rated + wishlisted item indices per user
    sqrt_Nu_for_pred = model["sqrt_Nu_for_pred"]  # matching sqrt(|N'(u)|) normaliser

    n_preds = len(sids_arr)
    if len(pids_arr) != n_preds:
        raise ValueError(
            f"sids_arr and pids_arr must have the same length, got {n_preds} and {len(pids_arr)}"
        )
    preds = np.full(n_preds, mu, dtype=np.float32)

    # The implicit-feedback vector depends only on the user, so compute it once
    # per user instead of re-summing |N'(u)| factor rows for every prediction.
    implicit_cache: Dict[int, np.ndarray] = {}
    no_implicit = np.zeros(num_factors, dtype=np.float32)

    for k_idx, (sid, pid) in enumerate(zip(sids_arr, pids_arr)):
        u = user2ind.get(sid)
        i = item2ind.get(pid)
        if u is None or i is None:
            continue  # unknown user or item: keep the global-mean fallback

        imp_sum = implicit_cache.get(u)
        if imp_sum is None:
            user_implicit_items = Nu_list_combined[u]
            norm_factor = sqrt_Nu_for_pred[u]
            if user_implicit_items.size > 0 and norm_factor > 1e-9:
                imp_sum = np.sum(y[user_implicit_items], axis=0) / norm_factor
            else:
                imp_sum = no_implicit
            implicit_cache[u] = imp_sum

        preds[k_idx] = mu + b_u[u] + b_i[i] + np.dot(q[i], p[u] + imp_sum)
    return preds

### Main routine for training and evaluation (using SVD++ with Wishlist)

In [None]:
# Load all necessary data
train_df_split, valid_df_split = read_data_df() # For validation run
tbr_df_global = pd.read_csv(os.path.join(DATA_DIR, "train_tbr.csv"))

print("--- Training SVD++ with Integrated Wishlist (on validation split) ---")
svdpp_wishlist_model_val, train_hist_val, val_hist_val = train_svdpp_with_wishlist_integrated(
    train_df_split, 
    tbr_df_global, 
    num_factors=SVD_N_FACTORS, 
    lr=SVD_LR, 
    reg=SVD_REG, 
    n_epochs=SVD_N_EPOCHS_VALID,
    valid_df=valid_df_split,
    early_stopping_patience=SVD_VALID_PATIENCE,
    evaluate_every_n_epochs=1
)

plot_training_curves(SVD_N_EPOCHS_VALID, train_hist_val, val_hist_val, "SVD++ Integrated Wishlist (Validation Run)")
# The trainer returns the parameters of its best epoch, so report that epoch's
# score rather than the last one (which, after early stopping, is a worse epoch).
# Fall back to the best training RMSE when no validation score was recorded.
evaluated_val_rmses = [score for score in val_hist_val if not np.isnan(score)]
if evaluated_val_rmses:
    final_val_rmse = min(evaluated_val_rmses)
elif train_hist_val:
    final_val_rmse = min(train_hist_val)
else:
    final_val_rmse = np.nan
print(f"Final RMSE on validation set (SVD++ with Wishlist): {final_val_rmse:.4f}")


print("\n--- Training SVD++ with Integrated Wishlist (on FULL data for submission) ---")
full_train_data = read_full_training_data()
svdpp_wishlist_model_full, train_hist_full, _ = train_svdpp_with_wishlist_integrated(
    full_train_data, 
    tbr_df_global, 
    num_factors=SVD_N_FACTORS, 
    lr=SVD_LR, 
    reg=SVD_REG, 
    n_epochs=SVD_N_EPOCHS_FULL, # Use full epochs here
    valid_df=None, # No separate validation set for full training
    early_stopping_patience=SVD_FULL_PATIENCE, # Based on training RMSE
    evaluate_every_n_epochs=1 # Will only print train RMSE
)
plot_training_curves(SVD_N_EPOCHS_FULL, train_hist_full, [], "SVD++ Integrated Wishlist (Full Training)")
if train_hist_full:  # empty when the trainer had no ratings to fit
    print(f"Lowest Training RMSE on full data: {min(train_hist_full):.4f}")
else:
    print("Lowest Training RMSE on full data: n/a (no training epochs were run)")

# Make submission
def submission_pred_fn(sids_arr: np.ndarray, pids_arr: np.ndarray) -> np.ndarray:
    """Predict with the model trained on the full data (the signature make_submission expects)."""
    return svdpp_pred_wishlist_integrated(svdpp_wishlist_model_full, sids_arr, pids_arr)

submission_filename = f"svdpp_integrated_wishlist_f{SVD_N_FACTORS}_lr{SVD_LR}_reg{SVD_REG}_ep{len(train_hist_full) if train_hist_full else SVD_N_EPOCHS_FULL}.csv"
make_submission(submission_pred_fn, submission_filename)