In [1]:
import pandas as pd
from surprise import KNNBasic
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

from read_and_split_data import split_data

In [2]:
def get_dataset():
    """Load the users table and attach each anime's ``type`` to it.

    Reads ``../datasets/anime.parquet`` (only ``anime_id`` and ``type`` are
    kept) and ``../datasets/users.parquet``, then left-joins them on
    ``anime_id`` so every row of the users table is preserved; rows whose
    ``anime_id`` has no match in the anime table get a missing ``type``.

    Returns
    -------
    pandas.DataFrame
        The users table with one extra ``type`` column.
    """
    anime_types = pd.read_parquet("../datasets/anime.parquet")[["anime_id", "type"]]
    users_df = pd.read_parquet("../datasets/users.parquet")
    return users_df.merge(anime_types, on="anime_id", how="left")

# Load the merged users/anime frame once and keep it around as `base_df`.
base_df = get_dataset()
# `split_data` is a project helper (read_and_split_data.py); its result is what
# gets handed to `kf.split(...)` below — presumably a surprise Dataset, TODO confirm.
data = split_data(base_df)

In [3]:
# Seed the splitter: with an integer random_state every call to kf.split()
# yields the same five folds, so all similarity measures below are scored on
# identical train/test partitions and a fresh "Restart & Run All" reproduces
# the reported metrics.
kf = KFold(n_splits=5, random_state=42)

In [4]:
def predict_and_error(data, algo, cv=None):
    """Cross-validate ``algo`` on ``data`` and print error metrics per fold.

    For every (trainset, testset) pair produced by the splitter, ``algo`` is
    re-fitted on the trainset and evaluated on the testset; RMSE, MSE, MAE and
    FCP of that fold are printed through ``surprise.accuracy``.

    Parameters
    ----------
    data
        Dataset object accepted by the splitter's ``split`` method.
    algo
        Estimator exposing ``fit(trainset)`` and ``test(testset)``.
    cv : optional
        Any object with a ``split(data)`` method yielding
        (trainset, testset) pairs. When omitted, the notebook-level ``kf``
        splitter is used.

    Returns
    -------
    list
        The predictions of the LAST fold only — earlier folds are evaluated
        and printed but their predictions are not kept. An empty list is
        returned if the splitter yields no folds.
    """
    splitter = kf if cv is None else cv
    # Bound up front so an empty splitter cannot raise UnboundLocalError.
    predictions = []
    for trainset, testset in splitter.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        # Root Mean Squared Error
        accuracy.rmse(predictions, verbose=True)
        # Mean Squared Error
        accuracy.mse(predictions, verbose=True)
        # Mean Absolute Error
        accuracy.mae(predictions, verbose=True)
        # Fraction of Concordant Pairs
        accuracy.fcp(predictions, verbose=True)
    return predictions

### Cosine Similarity

In [5]:
# Basic k-NN with cosine similarity; user_based=False asks Surprise to compute
# similarities between items (anime) rather than between users. k=7 caps the
# number of neighbours used per prediction.
algo = KNNBasic(k=7, sim_options={"name": "cosine", "user_based": False})

In [6]:
# Runs the cross-validation loop (metrics are printed once per fold); the value
# returned here holds the predictions of the final fold only.
predictions = predict_and_error(data, algo)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.2053
MSE: 4.8632
MAE:  1.5258
FCP:  0.6230
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.2056
MSE: 4.8648
MAE:  1.5262
FCP:  0.6213
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.2056
MSE: 4.8648
MAE:  1.5277
FCP:  0.6229
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.2029
MSE: 4.8530
MAE:  1.5250
FCP:  0.6216
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.2054
MSE: 4.8637
MAE:  1.5260
FCP:  0.6215


In [7]:
# One row per prediction; the column names relabel the five fields of each
# prediction record in order.
df = pd.DataFrame(
    predictions,
    columns=["user", "anime", "actual", "est", "details"],
)

In [8]:
# Spot-check a single randomly chosen prediction row (unseeded, so the row
# shown differs between runs).
df.sample()

Unnamed: 0,user,anime,actual,est,details
286417,28620,6045,8.0,5.849256,"{'actual_k': 7, 'was_impossible': False}"


In [9]:
# Persist the cosine-similarity predictions as both CSV and Parquet.
# Assumes the ../predictions/ directory already exists.
df.to_csv("../predictions/knn_basic_cosine.csv")
df.to_parquet("../predictions/knn_basic_cosine.parquet")

### Mean Squared Difference

In [10]:
# Same item-based k-NN setup (k=7, user_based=False), now with the
# mean-squared-difference similarity.
algo = KNNBasic(k=7, sim_options={"name": "msd", "user_based": False})

In [11]:
# Cross-validate the MSD model; per-fold metrics are printed and the
# final fold's predictions are kept.
predictions_msd = predict_and_error(data, algo)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.1883
MSE: 4.7886
MAE:  1.5157
FCP:  0.6401
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.1847
MSE: 4.7728
MAE:  1.5152
FCP:  0.6379
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.1902
MSE: 4.7968
MAE:  1.5166
FCP:  0.6392
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.1894
MSE: 4.7934
MAE:  1.5178
FCP:  0.6371
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.1903
MSE: 4.7974
MAE:  1.5176
FCP:  0.6395


In [12]:
# Tabulate the MSD predictions (rebinds `df`, replacing the previous table).
df = pd.DataFrame(
    predictions_msd,
    columns=["user", "anime", "actual", "est", "details"],
)

In [13]:
# Persist the MSD predictions as both CSV and Parquet.
# Assumes the ../predictions/ directory already exists.
df.to_csv("../predictions/knn_basic_msd.csv")
df.to_parquet("../predictions/knn_basic_msd.parquet")

### Pearson

In [14]:
# Same item-based k-NN setup (k=7, user_based=False), now with the
# Pearson correlation similarity.
algo = KNNBasic(k=7, sim_options={"name": "pearson", "user_based": False})

In [15]:
# Cross-validate the Pearson model; per-fold metrics are printed and the
# final fold's predictions are kept.
predictions_pearson = predict_and_error(data, algo)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 2.2440
MSE: 5.0358
MAE:  1.5921
FCP:  0.5807
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 2.2484
MSE: 5.0552
MAE:  1.5965
FCP:  0.5812
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 2.2446
MSE: 5.0381
MAE:  1.5955
FCP:  0.5791
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 2.2481
MSE: 5.0539
MAE:  1.5957
FCP:  0.5816
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 2.2458
MSE: 5.0437
MAE:  1.5950
FCP:  0.5817


In [16]:
# Tabulate the Pearson predictions (rebinds `df`, replacing the previous table).
df = pd.DataFrame(
    predictions_pearson,
    columns=["user", "anime", "actual", "est", "details"],
)

In [17]:
# Persist the Pearson predictions as both CSV and Parquet.
# Assumes the ../predictions/ directory already exists.
df.to_csv("../predictions/knn_basic_pearson.csv")
df.to_parquet("../predictions/knn_basic_pearson.parquet")

### Pearson Baseline

In [18]:
# Same item-based k-NN setup (k=7, user_based=False), now with the
# baseline-centred Pearson similarity ("pearson_baseline").
algo = KNNBasic(k=7, sim_options={"name": "pearson_baseline", "user_based": False})

In [19]:
# Cross-validate the Pearson-baseline model; per-fold metrics are printed and
# the final fold's predictions are kept.
predictions_pearson_baseline = predict_and_error(data, algo)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 2.0870
MSE: 4.3555
MAE:  1.4204
FCP:  0.6859
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 2.0860
MSE: 4.3516
MAE:  1.4213
FCP:  0.6863
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 2.0869
MSE: 4.3552
MAE:  1.4220
FCP:  0.6858
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 2.0872
MSE: 4.3563
MAE:  1.4208
FCP:  0.6864
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 2.0877
MSE: 4.3586
MAE:  1.4225
FCP:  0.6869


In [20]:
# Tabulate the Pearson-baseline predictions (rebinds `df`, replacing the
# previous table).
df = pd.DataFrame(
    predictions_pearson_baseline,
    columns=["user", "anime", "actual", "est", "details"],
)

In [21]:
# Persist the Pearson-baseline predictions as both CSV and Parquet.
# Assumes the ../predictions/ directory already exists.
df.to_csv("../predictions/knn_basic_pearson_baseline.csv")
df.to_parquet("../predictions/knn_basic_pearson_baseline.parquet")