In [52]:
import os 
import numpy as np 
from time import time
 
from train_test import get_train_test, convert_df_to_matrix, _read_df_in_format

# Surprise package for dataloading and evaluation
from surprise import Dataset, Reader
from surprise import accuracy as acc    

# Baseline 1: Averaging
from averaging import UserAverage, ItemAverage, UserItemAverage

# Baseline 2: Iterative SVD 
from svdals import normalize, ALS

# Baseline 3: Neural Collaborative Filtering
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam, SGD
from torch.nn import MSELoss

from ncf import CFDataset, GMF, MLP, NeuMF
from ncf import nn_train, nn_predict

# BFM + Adaptions 
from bfm import run_bfm, generate_clusters, create_augmented_dataset, run_bfm_augmented 

In [10]:
# For evaluationn
def print_scores(y_true, y_pred, name):
    rmse = np.sqrt(np.mean((y_true - y_pred)**2))
    mae = np.mean(np.abs(y_true - y_pred))
    print(f"Method: {name}, RMSE: {rmse:.4f}, MAE: {mae:.4f}")

# Data Loading

NOTE: These experiments are conducted on a custom train-test split for comparison. To generate the final results for submission, the data will just be the full training data set instead

In [50]:
data_folder = '../data/'
train_df, test_df = get_train_test(os.path.join(data_folder, 'data_train.csv'), split_num=0)
train_matrix = convert_df_to_matrix(train_df)

y_true = test_df['Prediction'].to_numpy()

train_df_full = _read_df_in_format(os.path.join(data_folder, 'data_train.csv'))
train_matrix_full = convert_df_to_matrix(train_df_full)

In [53]:
# For usage with Surprise package
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(train_df, reader)
test_data = Dataset.load_from_df(test_df, reader)

trainset = train_data.build_full_trainset()
testset = test_data.build_full_trainset().build_testset()
anti_trainset = trainset.build_anti_testset()

# Full trainset
train_data_full = Dataset.load_from_df(train_df_full, reader)
trainset_full = train_data_full.build_full_trainset()
anti_trainset_full = trainset_full.build_anti_testset()

# Baseline 1: Averages

This is performed primarily as a sanity check for future methods; it represents the average across the various dimensions

In [4]:
methods = [UserAverage, ItemAverage, UserItemAverage]

for method in methods:
    start_time = time()
    algo = method()
    algo.fit(trainset)
    predictions = algo.test(testset)
    print(f"Method: {method.__name__}, RMSE: {acc.rmse(predictions, False):.4f}, MAE: {acc.mae(predictions, False):.4f}, Time: {time() - start_time:.2f}", end = '\n')

Method: UserAverage, RMSE: 1.0949, MAE: 0.9009, Time: 1.82
Method: ItemAverage, RMSE: 1.0309, MAE: 0.8398, Time: 1.58
Method: UserItemAverage, RMSE: 1.0314, MAE: 0.8482, Time: 1.59


# Baseline 2: SVD + ALS

This method first applies SVD iteratively with shrinkage as an initialization for the U and V matrices. After which, the decomposition is limited to $k$ ranks and Alternating Least Squares is performed to optimize the performance

In [5]:
train_matrix_na = train_matrix.copy()
train_matrix_na[train_matrix_na == 0] = np.nan
A, mean, std = normalize(train_matrix_na)

A = A.to_numpy()
A[np.isnan(A)] = 0
mask_A = A != 0

In [6]:
als = ALS()
start = time()
U, V = als.ALS(A, mask_A, k=3, shrinkage=30, lambd=0.1, n_iter_svd=5, n_iter_als=20)
end = time(); print(f"Time: {end - start:.2f}")

Initializing IterSVD
IterSVD completelete
Iteration 1		Error after solving for U matrix: 0.88878528090379		Error after solving for V matrix: 0.8699788788828307
Iteration 2		Error after solving for U matrix: 0.8694005409783845		Error after solving for V matrix: 0.8636261330729882
Iteration 3		Error after solving for U matrix: 0.8646307420528284		Error after solving for V matrix: 0.8616478322078692
Iteration 4		Error after solving for U matrix: 0.8625725718395331		Error after solving for V matrix: 0.8607290666194622
Iteration 5		Error after solving for U matrix: 0.8614763785422687		Error after solving for V matrix: 0.8602207445063061
Iteration 6		Error after solving for U matrix: 0.8608178563885811		Error after solving for V matrix: 0.8599073772503655
Iteration 7		Error after solving for U matrix: 0.8603893940900001		Error after solving for V matrix: 0.8596993060114979
Iteration 8		Error after solving for U matrix: 0.8600941682702217		Error after solving for V matrix: 0.8595534570369412


In [8]:
predictions = als.predict(U, V, mean, std)
row_ids = test_df.row.to_numpy() - 1
col_ids = test_df.col.to_numpy() - 1
test_preds = predictions[row_ids, col_ids]

In [14]:
print_scores(y_true, test_preds, "Iterative SVD with ALS")

Method: Iterative SVD with ALS, RMSE: 0.9921, MAE: 0.7896


# Baseline 3: NCF

This uses Neural Collaborative Filtering. To improve the process, a Generalzied Factorization Machine and a Multi-Layer Perceptron are first trained separately, then used as pre-trained weights for the final model

In [None]:
trainset = CFDataset(train_df.values)
train_loader = DataLoader(trainset, batch_size=256, shuffle=True)

testset = CFDataset(test_df.values)
test_loader = DataLoader(testset, batch_size=256, shuffle=False)

In [None]:
latent_dim = 32
hidden_dims = [64, 32]
num_users, num_items = train_matrix.shape

## GMF

In [None]:
gmf = GMF(latent_dim=latent_dim, num_users=num_users, num_items=num_items)
loss_function = MSELoss()
optimizer = Adam(gmf.parameters(), lr=0.001)

model = nn_train(gmf, train_loader, loss_function, optimizer)
torch.save(model.state_dict(), 'models/gmf.pth')

Epoch: 9, Batch: 3600, Loss: 1.038

In [None]:
y_pred_gmf = nn_predict(gmf, test_loader)
y_pred_gmf = np.clip(y_pred_gmf, 1, 5)
print_scores(y_true, y_pred_gmf, "GMF Only")

Method: GMF Only, RMSE: 1.0822, MAE: 0.8795


## MLP

In [None]:
mlp = MLP(latent_dim=latent_dim, num_users=num_users, num_items=num_items, hidden_layers=hidden_dims)
loss_function = MSELoss()
optimizer = Adam(mlp.parameters(), lr=0.001)

model = nn_train(mlp, train_loader, loss_function, optimizer)
torch.save(model.state_dict(), 'models/mlp.pth')

Epoch: 9, Batch: 3600, Loss: 1.024

In [None]:
y_pred_mlp = nn_predict(mlp, test_loader)
y_pred_mlp = np.clip(y_pred_mlp, 1, 5)
print_scores(y_true, y_pred_mlp, "MLP Only")

Method: MLP Only, RMSE: 1.0029, MAE: 0.8105


## NeuMF with pre-training
This makes use of the previously learnt GMF and MLP as initializations for the NeuMF model. The models are weighted by an $\alpha$ value where $\alpha=0$ fully uses the MLP model, while $\alpha=1$ fully uses the GMF model.  
After tuning for various $\alpha$ values, an appropriate value was selected

In [None]:
neumf_pretrained = NeuMF(latent_dim=latent_dim, num_users=num_users, num_items=num_items, hidden_layers=hidden_dims, pretrained=True, alpha=0.05)
neumf_dict = neumf_pretrained.state_dict()

In [None]:
gmf_state_dict = torch.load('models/gmf.pth')
mlp_state_dict = torch.load('models/mlp.pth')

pretrained_dict_gmf = {k: v for k, v in gmf_state_dict.items() if k in neumf_dict}
pretrained_dict_mlp = {k: v for k, v in mlp_state_dict.items() if k in neumf_dict}

neumf_dict.update(pretrained_dict_gmf)
neumf_dict.update(pretrained_dict_mlp)
neumf_pretrained.load_state_dict(neumf_dict)

<All keys matched successfully>

In [None]:
loss_function = MSELoss()
optimizer = SGD(neumf_pretrained.parameters(), lr=0.001)

model = nn_train(neumf_pretrained, train_loader, loss_function, optimizer)

Epoch: 9, Batch: 3600, Loss: 0.997

In [None]:
y_pred_neumf_pretrained = nn_predict(neumf_pretrained, test_loader)
y_pred_neumf_pretrained = np.clip(y_pred_neumf_pretrained, 1, 5)
print_scores(y_true, y_pred_neumf_pretrained, "NeuMF Pretrained")

Method: NeuMF Pretrained, RMSE: 1.0041, MAE: 0.8092


# Bayesian Factorization Machines

## Baseline
This only uses the individual ratings, without utilising any other knowledge about user/item

In [43]:
y_pred_bfm, fm = run_bfm(train_df, test_df, rank=10, fm_kind='classifier') 

w0 = 0.12, cutpoint = ['-1.983', '-1.237', '-0.263', '0.613'] : 100%|██████████| 200/200 [02:43<00:00,  1.22it/s]


In [44]:
print_scores(y_true, y_pred_bfm, "BFM Baseline")

Method: BFM Baseline, RMSE: 0.9777, MAE: 0.7809


## Fusing with KNN

When performing preliminary experiments with various methods, we noticed that K-Nearest-Neighbours worked relatively well despite its simplicity. Therefore, we wanted to supplement the BFM with KNN predictions.  
Using KNN to train the model, we generated predictions for all datapoints within rating matrix and their corresponding clusters.

In [27]:
antitrain_df = generate_clusters(trainset, anti_trainset, n_clusters=30)
# antitrain_df.to_csv('models/knn_clusters.csv', index=False)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [46]:
y_pred_ensemble = run_bfm_augmented(train_df, antitrain_df, test_df, n_samples_per_cluster=50000, rank=10, seed_lst=[1, 42, 66, 88, 420])

alpha = 2.49 w0 = 2.58 : 100%|██████████| 200/200 [04:47<00:00,  1.44s/it]
alpha = 2.49 w0 = 2.58 : 100%|██████████| 200/200 [05:52<00:00,  1.76s/it]
alpha = 2.49 w0 = 2.58 : 100%|██████████| 200/200 [05:14<00:00,  1.57s/it]
alpha = 2.49 w0 = 2.58 : 100%|██████████| 200/200 [05:17<00:00,  1.59s/it]
alpha = 2.49 w0 = 2.58 : 100%|██████████| 200/200 [07:21<00:00,  2.21s/it]


In [47]:
print_scores(y_true, y_pred_ensemble['Prediction_avg'], "BFM with KNN Preds")

Method: BFM with KNN Preds, RMSE: 0.9840, MAE: 0.7808
