In [1]:
import pandas as pd
from surprise import KNNWithMeans
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold

from read_and_split_data import split_data

In [2]:
def get_dataset(anime_path="../datasets/anime.parquet", users_path="../datasets/users.parquet"):
    """Load the users table and attach each anime's ``type`` to it.

    Args:
        anime_path: Parquet file that contains at least the ``anime_id`` and
            ``type`` columns. Defaults to the notebook's original location.
        users_path: Parquet file that contains at least an ``anime_id``
            column. Defaults to the notebook's original location.

    Returns:
        pandas.DataFrame: the users table left-joined with ``type`` on
        ``anime_id``. ``type`` is missing for rows whose ``anime_id`` does
        not appear in the anime table.
    """
    # Only the join key and the attribute we want to attach are kept.
    anime_types = pd.read_parquet(anime_path)[["anime_id", "type"]]
    users = pd.read_parquet(users_path)
    # Left join: rows from `users` are kept even when the anime is unknown.
    return users.merge(anime_types, on="anime_id", how="left")

# Build the merged users/type table, then hand it to the project helper
# `split_data` (imported from read_and_split_data).
# NOTE(review): `data` is later passed to `kf.split(...)`, so `split_data`
# presumably returns a Surprise dataset object — confirm in the helper module.
base_df = get_dataset()
data = split_data(base_df)

In [3]:
# Fixed seed: KFold shuffles by default, and with random_state=None every
# kf.split() call reshuffles, so each similarity measure evaluated below would
# be scored on different folds and a rerun would not reproduce the numbers.
# An int seed makes every split() call yield the same 5 folds.
kf = KFold(n_splits=5, random_state=42)

In [4]:
def predict_and_error(data, algo, cv=None):
    """Cross-validate ``algo`` on ``data`` and print error metrics per fold.

    For every ``(trainset, testset)`` pair produced by the splitter, the
    algorithm is refit on the trainset and evaluated on the testset, and
    RMSE, MSE, MAE and FCP are printed for that fold.

    Args:
        data: Dataset object accepted by the splitter's ``split`` method.
        algo: Algorithm exposing ``fit(trainset)`` and ``test(testset)``.
        cv: Optional splitter exposing ``split(data)``. When omitted, the
            notebook-level ``kf`` is used, which is the original behaviour.

    Returns:
        The predictions of the LAST fold only; earlier folds are printed but
        not kept. An empty list is returned if the splitter yields no folds.
    """
    splitter = kf if cv is None else cv
    # Bound up front so an empty splitter cannot raise UnboundLocalError.
    predictions = []
    for trainset, testset in splitter.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        # Root Mean Squared Error
        accuracy.rmse(predictions, verbose=True)
        # Mean Squared Error
        accuracy.mse(predictions, verbose=True)
        # Mean Absolute Error
        accuracy.mae(predictions, verbose=True)
        # Fraction of Concordant Pairs
        accuracy.fcp(predictions, verbose=True)
    return predictions

### Cosine Similarity

In [5]:
# KNNWithMeans with k=7 neighbours and cosine similarity; user_based=False
# asks for item-item (anime-anime) similarities rather than user-user ones.
algo = KNNWithMeans(k=7, sim_options={"name": "cosine", "user_based": False})

In [6]:
# Runs the 5-fold evaluation and prints RMSE/MSE/MAE/FCP for each fold.
# Only the final fold's predictions are returned and kept here.
predictions = predict_and_error(data, algo)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.1599
MSE: 4.6651
MAE:  1.4769
FCP:  0.6783
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.1584
MSE: 4.6588
MAE:  1.4763
FCP:  0.6789
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.1623
MSE: 4.6756
MAE:  1.4791
FCP:  0.6767
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.1587
MSE: 4.6599
MAE:  1.4763
FCP:  0.6780
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.1622
MSE: 4.6752
MAE:  1.4792
FCP:  0.6778


In [7]:
# Tabulate the (last-fold) cosine predictions: one row per prediction, with
# its five fields labelled user / anime / actual / est / details.
df = pd.DataFrame(columns=["user", "anime", "actual", "est", "details"], data=predictions)

In [8]:
# Spot-check a single randomly chosen row (unseeded, so it differs per run).
df.sample()

Unnamed: 0,user,anime,actual,est,details
960914,58736,14645,7.0,6.436638,"{'actual_k': 7, 'was_impossible': False}"


In [9]:
# Persist the cosine predictions as CSV and Parquet. The CSV also carries the
# DataFrame index, because index=False is not passed.
# NOTE(review): the path says anime_type_tv/no_filter, but no filtering on
# `type` is visible in this notebook — presumably done in split_data; confirm.
df.to_csv("../predictions/knn_with_means/anime_type_tv/no_filter/knn_with_means_cosine.csv")
df.to_parquet("../predictions/knn_with_means/anime_type_tv/no_filter/knn_with_means_cosine.parquet")

### Mean Squared Difference

In [10]:
# Same model settings (k=7, item-item via user_based=False), but using mean
# squared difference ("msd") as the similarity measure.
algo = KNNWithMeans(k=7, sim_options={"name": "msd", "user_based": False})

In [11]:
# Evaluate the MSD model fold by fold; only the final fold's predictions are
# returned and kept here.
predictions_msd = predict_and_error(data, algo)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.1645
MSE: 4.6852
MAE:  1.4770
FCP:  0.6757
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.1662
MSE: 4.6923
MAE:  1.4791
FCP:  0.6756
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.1647
MSE: 4.6860
MAE:  1.4777
FCP:  0.6760
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.1702
MSE: 4.7099
MAE:  1.4799
FCP:  0.6752
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 2.1649
MSE: 4.6866
MAE:  1.4795
FCP:  0.6744


In [12]:
# Tabulate the (last-fold) MSD predictions. This rebinds `df`, replacing the
# cosine table built earlier in the notebook.
df = pd.DataFrame(columns=["user", "anime", "actual", "est", "details"], data=predictions_msd)

In [13]:
# Persist the MSD predictions as CSV and Parquet; the CSV includes the
# DataFrame index because index=False is not passed.
df.to_csv("../predictions/knn_with_means/anime_type_tv/no_filter/knn_with_means_msd.csv")
df.to_parquet("../predictions/knn_with_means/anime_type_tv/no_filter/knn_with_means_msd.parquet")

### Pearson

In [14]:
# Same model settings (k=7, item-item via user_based=False), but using the
# Pearson correlation coefficient as the similarity measure.
algo = KNNWithMeans(k=7, sim_options={"name": "pearson", "user_based": False})

In [15]:
# Evaluate the Pearson model fold by fold; only the final fold's predictions
# are returned and kept here.
predictions_pearson = predict_and_error(data, algo)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 2.1746
MSE: 4.7290
MAE:  1.4890
FCP:  0.6729
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 2.1702
MSE: 4.7099
MAE:  1.4857
FCP:  0.6725
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 2.1683
MSE: 4.7015
MAE:  1.4857
FCP:  0.6743
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 2.1755
MSE: 4.7328
MAE:  1.4883
FCP:  0.6729
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 2.1742
MSE: 4.7273
MAE:  1.4888
FCP:  0.6723


In [16]:
# Tabulate the (last-fold) Pearson predictions. This rebinds `df`, replacing
# the table built for the previous similarity measure.
df = pd.DataFrame(columns=["user", "anime", "actual", "est", "details"], data=predictions_pearson)

In [17]:
# Persist the Pearson predictions as CSV and Parquet; the CSV includes the
# DataFrame index because index=False is not passed.
df.to_csv("../predictions/knn_with_means/anime_type_tv/no_filter/knn_with_means_pearson.csv")
df.to_parquet("../predictions/knn_with_means/anime_type_tv/no_filter/knn_with_means_pearson.parquet")

### Pearson Baseline

In [18]:
# Same model settings (k=7, item-item via user_based=False), but using the
# "pearson_baseline" similarity. The run log for this model shows biases
# being estimated with ALS before the similarity matrix is computed.
algo = KNNWithMeans(k=7, sim_options={"name": "pearson_baseline", "user_based": False})

In [19]:
# Evaluate the Pearson-baseline model fold by fold; only the final fold's
# predictions are returned and kept here.
predictions_pearson_baseline = predict_and_error(data, algo)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 2.0524
MSE: 4.2121
MAE:  1.3842
FCP:  0.7187
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 2.0497
MSE: 4.2013
MAE:  1.3831
FCP:  0.7175
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 2.0517
MSE: 4.2095
MAE:  1.3842
FCP:  0.7183
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 2.0487
MSE: 4.1973
MAE:  1.3826
FCP:  0.7182
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 2.0535
MSE: 4.2170
MAE:  1.3829
FCP:  0.7182


In [20]:
# Tabulate the (last-fold) Pearson-baseline predictions. This rebinds `df`,
# replacing the table built for the previous similarity measure.
df = pd.DataFrame(columns=["user", "anime", "actual", "est", "details"], data=predictions_pearson_baseline)

In [21]:
# Persist the Pearson-baseline predictions as CSV and Parquet; the CSV
# includes the DataFrame index because index=False is not passed.
df.to_csv("../predictions/knn_with_means/anime_type_tv/no_filter/knn_with_means_pearson_baseline.csv")
df.to_parquet("../predictions/knn_with_means/anime_type_tv/no_filter/knn_with_means_pearson_baseline.parquet")