In [None]:
!pip3 install cupy

Collecting cupy
  Downloading cupy-9.6.0.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 5.2 MB/s 
Building wheels for collected packages: cupy
  Building wheel for cupy (setup.py) ... [?25l[?25hcanceled
[31mERROR: Operation cancelled by user[0m


In [28]:
import pandas as pd
import numpy as np
import torch
import json
import itertools


def load_recommender_data(path_in_str):
    json_file = open(path_in_str)
    data = json.load(json_file)
    df = pd.DataFrame.from_dict(data, orient='index')
    df = df[df['error'].isna()]

    # %%
    df['user'] = df.index
    df.reset_index(inplace=True)
    df['user_id'] = df.index

    # %%
    def melt_series(s):
        lengths = s.str.len().values
        flat = [i for i in itertools.chain.from_iterable(s.values.tolist())]
        idx = np.repeat(s.index.values, lengths)
        return pd.Series(flat, idx, name=s.name)


    user_df = melt_series(df.data).to_frame().join(df.drop('data', 1))

    # %%
    user_df['anime_id'] = user_df['data'].apply(lambda x : x['node']['id'])
    user_df['status'] = user_df['data'].apply(lambda x : x['list_status']['status'])
    user_df['score'] = user_df['data'].apply(lambda x : x['list_status']['score'])
    user_df['is_rewatching'] = user_df['data'].apply(lambda x : x['list_status']['is_rewatching'])


    # %%
    user_df = user_df[['user', 'user_id', 'anime_id', 'status', 'score', 'is_rewatching']]
    user_df = user_df.rename({'score': 'user_score'}, axis=1)

    # %%
    path_in_str = '/content/data/anime_list_final_231.json'
    json_file = open(path_in_str)
    data = json.load(json_file)
    anime_df_raw = pd.DataFrame.from_dict(data, orient='index')
    anime_df_raw = anime_df_raw[anime_df_raw['error'] != 'not_found']
    anime_df = anime_df_raw[['id', 'title', 'mean', 'genres', 'statistics', 'num_episodes']]
    anime_df = anime_df.dropna()

    # %%
    anime_df['genres_name'] = anime_df['genres'].apply(lambda x : [a['name'] for a in x])
    anime_df['genres_id'] = anime_df['genres'].apply(lambda x : [a['id'] for a in x])

    # %%
    anime_df['watching'] = anime_df['statistics'].apply(lambda x : x['status']['watching'])
    anime_df['num_list_users'] = anime_df['statistics'].apply(lambda x : x['num_list_users'])
    anime_df['completed'] = anime_df['statistics'].apply(lambda x : x['status']['completed'])
    anime_df['plan_to_watch'] = anime_df['statistics'].apply(lambda x : x['status']['plan_to_watch'])
    anime_df['dropped'] = anime_df['statistics'].apply(lambda x : x['status']['dropped'])
    anime_df['on_hold'] = anime_df['statistics'].apply(lambda x : x['status']['on_hold'])

    # %%
    anime_df.drop(['genres', 'statistics'], axis=1, inplace=True)

    # %%
    anime_df.rename({'id': 'anime_id'}, axis=1, inplace=True)



    # %%
    df_merge_raw = pd.merge(anime_df, user_df, on = 'anime_id')

    # %%
    df_merge = df_merge_raw[['anime_id', 'title', 'genres_name', 'num_episodes', 'mean', 'num_list_users', 'user_id', 'user_score']]

    # %%
    df_merge.rename({'title': 'name', 'genres_name': 'genre', 'num_episodes': 'episodes', 'mean': 'rating_x', 'num_list_users': 'members', 'user_score': 'rating_y'}, axis=1, inplace= True)

    # %%
    df_merge['rating_x'] = df_merge['rating_x'].astype(int)
    df_merge['rating_x'] = df_merge['rating_x'].round()

    return df_merge, anime_df, user_df


def main():
    path_in_str = '/content/data/user_data.json'
    load_recommender_data(path_in_str)


## Load data

In [None]:
df_merge, anime_df, user_df =  load_recommender_data('data/user_data.json')

## Data preprocessing

In [None]:
new_indices = range(len(anime_df.index))
dict_ids = dict(zip(list(anime_df['anime_id']), new_indices))

In [31]:
anime_df.replace({'anime_id': dict_ids}, inplace=True)
user_df.replace({'anime_id': dict_ids}, inplace=True)
df_merge.replace({'anime_id': dict_ids}, inplace=True)

In [None]:
df_merge.head()
sample = df_merge.sample(n=10000)

In [None]:
df_merge['anime_id'] = df_merge['anime_id'].astype(np.int32)
df_merge['rating_x'] = df_merge['rating_x'].astype(np.int32)
df_merge['user_id'] = df_merge['user_id'].astype(np.int32)
df_merge = df_merge[['anime_id', 'rating_x','user_id']]

In [None]:
df_merge = df_merge.sample(n=5)

## Create Required Matrices

In [None]:
# rating matrix with rows as animes and columns as users
matrix_rating_user = np.ndarray(
    shape=(np.max(df_merge.anime_id.values), np.max(df_merge.user_id.values)+1),
    dtype=np.uint8)
matrix_rating_user

In [None]:
matrix_rating_user[df_merge.anime_id.values-1, df_merge.user_id.values] = df_merge.rating_x.values
matrix_rating_user

In [None]:
matrix_norm = matrix_rating_user - np.asarray([(np.mean(matrix_rating_user, 1))]).T
matrix_norm

## Get SVD Decomposition

In [39]:
# Compute the Singular Value Decomposition (SVD).
cuda  = torch.device('cuda')
A = matrix_norm.T / np.sqrt(matrix_rating_user.shape[0] - 1)
t = torch.from_numpy(A).to(device=cuda)     # converted to tensor
U, S, Vt = torch.svd(t)
V = Vt.cpu().detach().numpy()   #convert Vt to numpy array

## Recommender helper functions

In [72]:
def cosine_similarity_sort(r_data, anime_id, top_n=10):
    ind = anime_id -1
    anime_row = r_data[ind, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', r_data, r_data))
    matrix_similarity = np.dot(anime_row, r_data.T) / (magnitude[ind] * magnitude)
    sorted_indices = np.argsort(-matrix_similarity)
    return sorted_indices[:top_n]

In [73]:
def get_most_similar_anime(anime_df, anime_id, index_list):
    print('Best recommendations for {}:'.format(anime_df[anime_df.anime_id == anime_id].title.values[0]))
    count = 1
    for id in index_list + 1:
        print('{0}: {1}'.format(count, anime_df[anime_df.anime_id == id].title.values[0]))
        count = count +1

In [74]:
def get_anime_id(anime_df, fav_anime):
    anime_id = anime_df.loc[anime_df['title'] == fav_anime, 'anime_id'].iloc[0]
    anime_id = int(anime_id)
    return anime_id

In [75]:
def make_recommendation(anime_name, top_n=10):
    anime_id = get_anime_id(anime_df, anime_name) # get anime id
    k = 50      #k-principal components to represent anime, anime_id to find recommendations
    rep_data = V.T[:, :k] # representative data
    index_list = cosine_similarity_sort(rep_data, anime_id, top_n)

    #Get the top N recommendations
    get_most_similar_anime(anime_df, anime_id, index_list)

In [None]:
## Make recommendations
Enter your anime name and the numbre of recommendations needed in top_n field, and the recommender will generate recommendations

In [76]:
anime_name = 'Cowboy Bebop'
top_n = 10

make_recommendation(anime_name, top_n)

Best recommendations for Cowboy Bebop:
1: Initial D First Stage
2: Trigun
3: El Hazard: The Alternative World
4: Mobile Suit Gundam SEED
5: Beck
6: Saiyuuki Reload Gunlock
7: Yakitate!! Japan
8: Hunter x Hunter: Greed Island Final
9: Pita Ten
10: Green Green
