In [1]:
import numpy as np
import pandas as pd
import ast
from tqdm.auto import tqdm
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the ratings table (columns used below: user_id, anime_id, rating).
rating_csv_path = '/content/drive/MyDrive/Colab Notebooks/Anime Recommendation System/Data/Rating/enable_rating.csv'
rating = pd.read_csv(rating_csv_path)

In [3]:
# Each 'short_vector' cell is a Python-literal list stored as text; parse it
# once and index the resulting float arrays by anime_id.
short_vector = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Anime Recommendation System/Data/Encode/short_vector.csv')
short_vdict = {
    anime_id: np.array(ast.literal_eval(encoded), dtype=float)
    for anime_id, encoded in zip(short_vector['anime_id'], short_vector['short_vector'])
}

In [4]:
# Distinct user ids, in order of first appearance in the ratings table.
user_id_column = rating['user_id']
user_ids = user_id_column.unique()

# Train: fit one Bayesian ridge model per user

In [None]:
# Fit one BayesianRidge model per user on that user's (anime vector -> rating)
# pairs and accumulate hold-out error statistics over all users.
#
# Results:
#   UW_SHORT_DICT[uid] = (W, b)  -- weight vector and intercept for each user
#   MAE            -- running SUM of absolute hold-out errors (divide by number_of_test)
#   MSE            -- running SUM of squared hold-out errors (divide by number_of_test)
#   number_of_test -- number of hold-out ratings scored
UW_SHORT_DICT = {}
# Dimension of the anime vectors; taken from the loaded vectors so the zero
# weights for single-rating users always match (43 kept as the fallback).
d = len(next(iter(short_vdict.values()))) if short_vdict else 43
MAE = 0
MSE = 0
number_of_test = 0
# Group once instead of boolean-filtering the whole table for every user.
# sort=False keeps users in first-appearance order (the same order as
# rating['user_id'].unique()) and rows keep their original order per user.
ratings_by_user = rating.groupby('user_id', sort=False)
for uid, user_rows in tqdm(ratings_by_user, total=ratings_by_user.ngroups):
    X_tmp = user_rows[['anime_id', 'rating']].values
    if X_tmp.shape[0] == 1:
        # A single rating cannot be split or fitted: predict that rating for
        # everything (zero weights, rating as intercept).
        W = np.zeros(d)
        b = float(X_tmp[0][1])
    else:
        clf = BayesianRidge(fit_intercept=True)
        x_train_tmp, x_test_tmp = train_test_split(X_tmp, test_size=0.25, random_state=42)
        # np.stack needs a real sequence; passing a generator is deprecated
        # in NumPy and rejected by newer releases.
        a_train = np.stack([short_vdict[aid] for aid in x_train_tmp[:, 0]])
        scores_train = x_train_tmp[:, 1]

        clf.fit(a_train, scores_train)
        W = clf.coef_
        b = clf.intercept_

        scores_test = x_test_tmp[:, 1]
        a_test = np.stack([short_vdict[aid] for aid in x_test_tmp[:, 0]])
        Yhat_test = a_test.dot(W) + b
        n = Yhat_test - scores_test  # per-rating hold-out residuals
        MAE += np.abs(n).sum()
        MSE += (n * n).sum(axis=0)
        number_of_test += n.shape[0]
    UW_SHORT_DICT[uid] = (W, b)

HBox(children=(FloatProgress(value=0.0, max=73515.0), HTML(value='')))




In [None]:
# Mean absolute error on the hold-out ratings: MAE holds the summed absolute
# errors accumulated by the training loop, number_of_test the rating count.
MAE / number_of_test

1.3562877426651485

In [None]:
# Root-mean-square error on the hold-out ratings: MSE holds the summed squared
# errors, so it is divided by the rating count before taking the square root.
np.sqrt(MSE / number_of_test)

2.0575522916314815

In [None]:
# One row per user: the learned weight vector (as a plain list so it
# round-trips through CSV as text) and the intercept.
vector = pd.DataFrame(data={'user_id': user_ids}).assign(
    W=lambda frame: frame['user_id'].map(lambda uid: UW_SHORT_DICT[uid][0].tolist()),
    b=lambda frame: frame['user_id'].map(lambda uid: UW_SHORT_DICT[uid][1]),
)
vector.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/Anime Recommendation System/Data/Vector/bayesianridge_short.csv',
    index=False,
)

# Evaluate: NDCG of the saved per-user models on the test set

In [5]:
# Reload the saved per-user models (columns: user_id, W as list text, b).
user_model_csv_path = '/content/drive/MyDrive/Colab Notebooks/Anime Recommendation System/Data/Vector/bayesianridge_short.csv'
vector = pd.read_csv(user_model_csv_path)

In [6]:
# Index the saved models by user id; 'W' is stored as list text and is parsed
# back into a float array, 'b' is kept as read.
u_vdict = {
    uid: {'W': np.array(ast.literal_eval(weights_text), dtype=float), 'b': bias}
    for uid, weights_text, bias in zip(vector['user_id'], vector['W'], vector['b'])
}

In [9]:
# Load the per-user test set (columns read below: user_id, anime_ids, ratings).
test_csv_path = '/content/drive/MyDrive/Colab Notebooks/Anime Recommendation System/Data/Test/test_data.csv'
test_df = pd.read_csv(test_csv_path)

In [10]:
# Parse each user's test items: both columns hold Python-literal lists stored
# as text and are converted to int32 arrays.
test_dict = {}
test_columns = zip(test_df['user_id'], test_df['anime_ids'], test_df['ratings'])
for uid, anime_ids_text, ratings_text in tqdm(test_columns):
    parsed_ids = np.array(ast.literal_eval(anime_ids_text), dtype=np.int32)
    parsed_ratings = np.array(ast.literal_eval(ratings_text), dtype=np.int32)
    test_dict[uid] = {'anime_ids': parsed_ids, 'ratings': parsed_ratings}

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [11]:
def ndcg(uid):
    """Return the NDCG of one user's linear model on that user's test items.

    The user's test items are scored as ``vector . W + b``, ranked by that
    score (highest first), and the true ratings are used as relevance. DCG is
    ``sum(rel_i / log2(i + 2))`` over the ranking; it is divided by the DCG of
    the ideal ordering (ratings sorted descending).

    Parameters
    ----------
    uid : hashable
        User id; must be a key of ``u_vdict`` and ``test_dict``.

    Returns
    -------
    float
        ``dcg / idcg``, or ``1.0`` when the user's test ratings sum to zero
        (the ideal DCG would be zero, so the ratio is undefined).

    Raises
    ------
    KeyError
        If ``uid`` or one of its test anime ids is missing from the lookups.

    Notes
    -----
    ``test_dict`` is not modified, so calling this repeatedly for the same
    user returns the same value.
    """
    W = u_vdict[uid]['W']
    b = u_vdict[uid]['b']
    u_test = test_dict[uid]
    scores_test = u_test['ratings']
    if scores_test.sum() == 0:
        return 1.0
    # np.stack needs a real sequence; generators are deprecated/rejected.
    a_test = np.stack([short_vdict[aid] for aid in u_test['anime_ids']])
    Yhat_test = a_test.dot(W) + b

    # Positional discount log2(rank + 1) for ranks 1..n.
    dlog2 = np.log2(np.arange(scores_test.shape[0]) + 2)

    # Rank by predicted score, highest first; the stable sort keeps tied
    # predictions in their original order so the result is deterministic.
    ranking = np.argsort(-Yhat_test, kind='stable')
    dcg = (scores_test[ranking] / dlog2).sum()

    # np.sort returns a copy: sorting scores_test in place would leave the
    # ratings stored in test_dict misaligned with their anime_ids.
    ideal_scores = np.sort(scores_test)[::-1]
    idcg = (ideal_scores / dlog2).sum()

    return dcg / idcg

In [12]:
# Mean NDCG over all users. The accumulator is deliberately not called `sum`:
# rebinding that name at notebook level would shadow the builtin for every
# cell executed afterwards.
ndcg_total = 0
for uid in tqdm(user_ids):
    ndcg_total += ndcg(uid)
ndcg_total / len(user_ids)

HBox(children=(FloatProgress(value=0.0, max=73515.0), HTML(value='')))




0.9377913777756427