In [1]:
import pandas as pd
import numpy as np
import json
import itertools


def load_recommender_data(path_in_str):
    json_file = open(path_in_str)
    data = json.load(json_file)
    df = pd.DataFrame.from_dict(data, orient='index')
    df = df[df['error'].isna()]

    # %%
    df['user'] = df.index
    df.reset_index(inplace=True)
    df['user_id'] = df.index

    # %%
    def melt_series(s):
        lengths = s.str.len().values
        flat = [i for i in itertools.chain.from_iterable(s.values.tolist())]
        idx = np.repeat(s.index.values, lengths)
        return pd.Series(flat, idx, name=s.name)


    user_df = melt_series(df.data).to_frame().join(df.drop('data', 1))

    # %%
    user_df['anime_id'] = user_df['data'].apply(lambda x : x['node']['id'])
    user_df['status'] = user_df['data'].apply(lambda x : x['list_status']['status'])
    user_df['score'] = user_df['data'].apply(lambda x : x['list_status']['score'])
    user_df['is_rewatching'] = user_df['data'].apply(lambda x : x['list_status']['is_rewatching'])


    # %%
    user_df = user_df[['user', 'user_id', 'anime_id', 'status', 'score', 'is_rewatching']]
    user_df = user_df.rename({'score': 'user_score'}, axis=1)

    # %%
    path_in_str = '/content/anime_list_final_231.json'
    json_file = open(path_in_str)
    data = json.load(json_file)
    anime_df_raw = pd.DataFrame.from_dict(data, orient='index')
    anime_df_raw = anime_df_raw[anime_df_raw['error'] != 'not_found']
    anime_df = anime_df_raw[['id', 'title', 'mean', 'genres', 'statistics', 'num_episodes']]
    anime_df = anime_df.dropna()

    # %%
    anime_df['genres_name'] = anime_df['genres'].apply(lambda x : [a['name'] for a in x])
    anime_df['genres_id'] = anime_df['genres'].apply(lambda x : [a['id'] for a in x])

    # %%
    anime_df['watching'] = anime_df['statistics'].apply(lambda x : x['status']['watching'])
    anime_df['num_list_users'] = anime_df['statistics'].apply(lambda x : x['num_list_users'])
    anime_df['completed'] = anime_df['statistics'].apply(lambda x : x['status']['completed'])
    anime_df['plan_to_watch'] = anime_df['statistics'].apply(lambda x : x['status']['plan_to_watch'])
    anime_df['dropped'] = anime_df['statistics'].apply(lambda x : x['status']['dropped'])
    anime_df['on_hold'] = anime_df['statistics'].apply(lambda x : x['status']['on_hold'])

    # %%
    anime_df.drop(['genres', 'statistics'], axis=1, inplace=True)

    # %%
    anime_df.rename({'id': 'anime_id'}, axis=1, inplace=True)



    # %%
    df_merge_raw = pd.merge(anime_df, user_df, on = 'anime_id')

    # %%
    df_merge = df_merge_raw[['anime_id', 'title', 'genres_name', 'num_episodes', 'mean', 'num_list_users', 'user_id', 'user_score']]

    # %%
    df_merge.rename({'title': 'name', 'genres_name': 'genre', 'num_episodes': 'episodes', 'mean': 'rating_x', 'num_list_users': 'members', 'user_score': 'rating_y'}, axis=1, inplace= True)

    # %%
    df_merge['rating_x'] = df_merge['rating_x'].astype(int)
    df_merge['rating_x'] = df_merge['rating_x'].round()

    return df_merge, anime_df, user_df


def main():
    path_in_str = '/content/user_data1.0.json'
    load_recommender_data(path_in_str)


In [2]:
# from load_recommender_data import load_recommender_data
import torch
df_merge, anime_df, user_df =  load_recommender_data('user_data1.0.json')
df_merge.head()
sample = df_merge.sample(n=5000)
sample.shape

FileNotFoundError: ignored

In [None]:
sample.dtypes

anime_id    float64
name         object
genre        object
episodes    float64
rating_x      int64
members       int64
user_id       int64
rating_y      int64
dtype: object

In [None]:
sample.head()

Unnamed: 0,anime_id,name,genre,episodes,rating_x,members,user_id,rating_y
45316,38409.0,Cike Wu Liuqi,"[Action, Comedy, Drama, Martial Arts, Mystery,...",10.0,7,45087,33,7
16464,34542.0,Inuyashiki,"[Action, Drama, Psychological, Sci-Fi, Seinen]",11.0,7,520171,35,9
8253,12467.0,Nazo no Kanojo X,"[Romance, School, Seinen]",13.0,7,245117,13,9
27341,2236.0,Toki wo Kakeru Shoujo,"[Drama, Romance, School, Sci-Fi]",1.0,8,689355,86,0
4261,3603.0,JoJo no Kimyou na Bouken: Phantom Blood,"[Action, Adventure, Horror, Shounen, Vampire]",1.0,7,56315,5,5


In [None]:
!pip3 install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 3.7 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1619408 sha256=c8b4be441e83c0b41fabfe4f6cdfddb4796280396e4bb25f1ea729be1debfb7e
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [None]:
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds 

from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline

from surprise.prediction_algorithms import knns
from surprise.prediction_algorithms import SVD

from surprise.similarities import cosine, msd, pearson

from surprise import accuracy 
from surprise import Reader
from surprise import Dataset
from surprise import dataset
# from numba import jit, cuda 
# import numpy as np
import cupy as np

data = sample[['user_id', 'anime_id', 'rating_x']]
reader = Reader(line_format='user item rating', sep='')
anime_loaded_data = Dataset.load_from_df(data, reader)

#train_test_split
trainset, testset = train_test_split(anime_loaded_data, test_size=.2)

In [None]:
svd = SVD()
# svd = np.linalg.svd
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f587d53dc50>

In [None]:
predictions = svd.test(testset) #
accuracy.rmse(predictions)


RMSE: 2.1705


2.170483817032507

In [None]:
predictions[:10]


[Prediction(uid=61, iid=21995.0, r_ui=7.0, est=5, details={'was_impossible': False}),
 Prediction(uid=53, iid=10589.0, r_ui=7.0, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=32615.0, r_ui=7.0, est=5, details={'was_impossible': False}),
 Prediction(uid=34, iid=14835.0, r_ui=7.0, est=5, details={'was_impossible': False}),
 Prediction(uid=68, iid=35135.0, r_ui=6.0, est=5, details={'was_impossible': False}),
 Prediction(uid=75, iid=34393.0, r_ui=4.0, est=5, details={'was_impossible': False}),
 Prediction(uid=91, iid=15609.0, r_ui=6.0, est=5, details={'was_impossible': False}),
 Prediction(uid=73, iid=48707.0, r_ui=7.0, est=5, details={'was_impossible': False}),
 Prediction(uid=13, iid=37492.0, r_ui=7.0, est=5, details={'was_impossible': False}),
 Prediction(uid=27, iid=7366.0, r_ui=6.0, est=5, details={'was_impossible': False})]

In [None]:
print("Number of users:", sample.user_id.nunique())


Number of users: 94


In [None]:
user = 5
item = 100
svd.predict(user, item)


Prediction(uid=5, iid=100, r_ui=None, est=5, details={'was_impossible': False})

In [None]:
params = {'n_factors': [20,50,100],
         'reg_all': [.02,.05, .10]}
gridsearch_svd1 = GridSearchCV(SVD, param_grid=params, n_jobs=-1, joblib_verbose=3)
gridsearch_svd1.fit(anime_loaded_data)

print(gridsearch_svd1.best_score)
print(gridsearch_svd1.best_params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    7.1s


{'rmse': 2.1746149454635506, 'mae': 2.0218}
{'rmse': {'n_factors': 20, 'reg_all': 0.02}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   11.9s finished
