# Baseline using SVD++
All models are built with the [Surprise](https://surprise.readthedocs.io/en/stable/index.html) library.

In [None]:
# Replace whatever NumPy is installed with a pinned 1.x release before
# installing scikit-surprise, which requires NumPy 1.
!pip uninstall -y numpy 
!pip install numpy==1.26.4
!pip install scikit-surprise # requires numpy 1 🫠

In [None]:
from surprise import SVDpp, accuracy
import pandas as pd
from sklearn.model_selection import train_test_split as sklearn_split
from surprise import Dataset, Reader
import pandas as pd
from tqdm import tqdm
import numpy as np

from src.dataloader import Dataloader

In [None]:
def calculate_rmse(predictions):
    """Return the root-mean-square error over a collection of predictions.

    Args:
        predictions: Iterable of 4-tuples whose last two elements are the
            true rating and the predicted rating; the first two elements
            are ignored. Entries whose predicted rating is ``None`` are
            skipped.

    Returns:
        The RMSE over all entries that have a prediction, or NaN when there
        is no such entry (empty input, or every prediction is ``None``).
    """
    squared_errors = [
        (true_rating - pred_rating) ** 2
        for _, _, true_rating, pred_rating in predictions
        if pred_rating is not None
    ]

    # Without this guard the mean below would be 0 / 0 and raise an opaque
    # ZeroDivisionError; the RMSE of nothing is undefined, so report NaN.
    if not squared_errors:
        return float("nan")

    return np.sqrt(sum(squared_errors) / len(squared_errors))

## SVDpp, explicit data only (ratings)

In [None]:
# Load the training data. tbr_df is loaded here but not used by this cell.
ratings_df = Dataloader.load_train_ratings()
tbr_df = Dataloader.load_train_tbr()

# Hold out 25% of the ratings for evaluation; the fixed seed keeps the split
# reproducible between runs.
ratings_train, ratings_test = sklearn_split(
    ratings_df,
    test_size=0.25,
    random_state=42,
)

reader = Reader(line_format='user item rating', sep=',', skip_lines=0)

# Surprise is fed the (sid, pid, rating) columns as its user/item/rating triples.
train_data = Dataset.load_from_df(ratings_train[['sid', 'pid', 'rating']], reader=reader)
trainset = train_data.build_full_trainset()

# The held-out ratings are turned into a list of (uid, iid, rating) tuples.
test_data = Dataset.load_from_df(ratings_test[['sid', 'pid', 'rating']], reader=reader)
testset = test_data.build_full_trainset().build_testset()

# Single SVD++ model with 30 latent factors.
model = SVDpp(n_factors=30, n_epochs=20, random_state=42)
model = model.fit(trainset)

# Keep both the raw estimate and the estimate rounded to the nearest integer
# so the two can be scored side by side.
predictions = []
predictions_int = []
for uid, iid, true_rating in testset:
    pred = model.predict(uid, iid).est
    pred_rounded = np.round(pred, decimals=0)
    predictions.append((uid, iid, true_rating, pred))
    predictions_int.append((uid, iid, true_rating, pred_rounded))

rmse = calculate_rmse(predictions)
rmse_int = calculate_rmse(predictions_int)
print(f"RMSE: {rmse:.4f}")
print(f"RMSE int: {rmse_int:.4f}")

## SVDpp ensemble, explicit data only (ratings)

In [None]:
# Load the training data. tbr_df is loaded here but not used by this cell.
ratings_df = Dataloader.load_train_ratings()
tbr_df = Dataloader.load_train_tbr()

# Same 75/25 split and seed as the single-model cell.
ratings_train, ratings_test = sklearn_split(
    ratings_df,
    test_size=0.25,
    random_state=42,
)

reader = Reader(line_format='user item rating', sep=',', skip_lines=0)

train_data = Dataset.load_from_df(ratings_train[['sid', 'pid', 'rating']], reader=reader)
trainset = train_data.build_full_trainset()

test_data = Dataset.load_from_df(ratings_test[['sid', 'pid', 'rating']], reader=reader)
testset = test_data.build_full_trainset().build_testset()

# (n_factors, reg_all) for each ensemble member; regularisation grows with the
# number of latent factors.
svdpp_configs = [
    (10, 0.005),
    (20, 0.01),
    (25, 0.01),
    (30, 0.013),
    (35, 0.015),
    (40, 0.016),
    (45, 0.018),
    (50, 0.02),
    (55, 0.025),
    (60, 0.03),
]

# Each model is keyed by 'svdpp<n_factors>'; all share the epoch count and seed.
models = {
    f'svdpp{n_factors}': SVDpp(
        n_factors=n_factors,
        n_epochs=20,
        random_state=42,
        reg_all=reg_all,
    )
    for n_factors, reg_all in svdpp_configs
}

# Fit every member on the same training set.
trained_models = {}
for name, algo in models.items():
    print(f"Training {name}...")
    fitted = algo.fit(trainset)
    trained_models[name] = fitted

def predict_ensemble(uid, iid, weights=None):
    """Return the weighted average of every trained model's estimate.

    Each model in the module-level ``trained_models`` dict is asked for its
    estimate of the (uid, iid) pair, and the estimates are combined using
    ``weights`` after normalising them to sum to 1.

    Args:
        uid: User id passed to each model's ``predict``.
        iid: Item id passed to each model's ``predict``.
        weights: Optional mapping of model name to weight. Defaults to equal
            weights over ``trained_models``. The mapping is not modified.

    Returns:
        The weighted average of the models' estimates.

    Raises:
        KeyError: If ``weights`` has no entry for one of the trained models.
        ZeroDivisionError: If the weights sum to zero.
    """
    if weights is None:
        weights = {name: 1/len(trained_models) for name in trained_models}

    predictions = {
        name: model.predict(uid, iid).est
        for name, model in trained_models.items()
    }

    # Normalise into a fresh dict: dividing in place would rewrite the
    # caller's weights as a side effect of asking for a prediction.
    weight_sum = sum(weights.values())
    normalized = {name: weight / weight_sum for name, weight in weights.items()}

    # Calculate weighted average
    return sum(predictions[name] * normalized[name] for name in predictions)

print("Making ensemble predictions...")

# Score the ensemble on the held-out set, keeping both the raw estimate and
# the estimate rounded to the nearest integer.
ensemble_predictions = []
ensemble_predictions_int = []
for uid, iid, true_rating in testset:
    pred = predict_ensemble(uid, iid)
    pred_rounded = np.round(pred, decimals=0)
    ensemble_predictions.append((uid, iid, true_rating, pred))
    ensemble_predictions_int.append((uid, iid, true_rating, pred_rounded))

ensemble_rmse = calculate_rmse(ensemble_predictions)
ensemble_rmse_int = calculate_rmse(ensemble_predictions_int)
print(f"RMSE: {ensemble_rmse:.4f}")
print(f"RMSE int: {ensemble_rmse_int:.4f}")

## Prediction for submission

In [None]:
# Predict a rating for every (sid, pid) pair in the sample submission using
# the equally weighted ensemble.
df_pred = Dataloader.load_sample_submission()
predictions = []

# preds collects the distinct predicted values; it is not read again in this cell.
preds = set()
for i, row in df_pred.iterrows():
    sid, pid = row['sid'], row['pid']
    pred = predict_ensemble(sid, pid)
    preds.add(pred)
    predictions.append([sid, pid, pred])

results_df = pd.DataFrame(predictions, columns=['sid', 'pid', 'rating'])

# Ratings are submitted as raw floats; the rounding step is left disabled.
results_df['rating'] = results_df['rating']#.round().astype(int)

# Write the submission by hand: one '<sid>_<pid>,<rating>' line per pair,
# with the ids cast to int.
with open('test_submission.csv', 'w') as f:
    f.write('sid_pid,rating\n')
    for i, row in results_df.iterrows():
        submission_id = f'{int(row["sid"])}_{int(row["pid"])}'
        f.write(f'{submission_id},{row["rating"]}\n')