## Recommendation Module

#### ToDo

- [x] Calculate user similarities
    - [x] Query by user_id -> users with similar taste -> recommend
    - [x] Query by picking out the favorite titles -> users with similar taste -> recommend
- [x] Similarity Evaluation: average of title overlap ratio of top 10 similar users
    - [x] cosine -> 0.256
    - [x] euclidean -> 0.231
    - [x] manhattan -> 0.275
- [ ] Recommendation
    - [ ] Try item-user matrix recommendation
    - [x] From the "not-yet-seen" list:
        - [x] a) Pick the most popular titles
        - [x] b) From top 10 similar users, which title have you not seen/read but the others have?
- [ ] Refactor
    - [x] Separate sections for scaled & not scaled codes
    - [x] Convert methods into one class -> UserBasedFiltering
    - [ ] Refactor UserBasedFiltering: 
        - [ ] Add parameter to recommendation method to filter manga/anime recommendations
        - [ ] Stop doing everything in df to make things faster

#### Other
- [ ] Extra: Convert media list to network visualization
- [ ] Weight the media by its status (CURRENT/DROPPED etc)
- [ ] Fuzzy match problem with the title name

In [44]:
# !pip install fuzzywuzzy
# !pip install python-Levenshtein

In [56]:
import sys
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from joypy import joyplot
import sweetviz as sv
from tqdm import tqdm
from collections import ChainMap
import sklearn
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from scipy import sparse
# Use the fully qualified option key: the short form "max_columns" also matches
# "styler.render.max_columns" on pandas >= 1.4 and raises OptionError (ambiguous pattern).
pd.set_option("display.max_columns", 200)
from fuzzywuzzy import fuzz

In [2]:
# user_id to query; also used as the default value of the query-user arguments
# of the UserBasedFiltering methods defined below
QUERY_USER_ID = 1

In [3]:
class UserBasedFiltering:
    """
    User-based filtering: find users with a similar genre distribution and
    recommend titles from their media lists.

    All data is read from CSV files once, in the constructor, and kept as
    pandas data frames on the instance.
    """

    def __init__(self, assets_dir="../assets"):
        """
        Load the data frames used by the other methods.

        :params
            assets_dir: directory containing the CSV assets (defaults to "../assets")
        """
        # pathlib is not imported at the top of the notebook, so import it locally
        from pathlib import Path

        assets = Path(assets_dir)
        self.df_titles = pd.read_csv(assets / "titles_2000p.csv")
        self.df_titles_genre = pd.read_csv(assets / "ryota_title_genre_2000p.csv")
        self.df_mlist = pd.read_csv(assets / "media_list_all_users.csv")
        self.df_mlist_genre = pd.read_csv(assets / "ryota_media_list_genre.csv")
        self.df_user_genre_dist = pd.read_csv(assets / "ryota_user_genre_dist.csv")

    def get_similar_users_from_user_id(self, start_col, dist_metric="cosine_similarity", query_user_id=QUERY_USER_ID, ascending=None):
        """
        Query the 10 users most similar to a given user_id.

        :params
            start_col: position of the first feature column in df_user_genre_dist
            dist_metric: "euclidean_distances", "manhattan_distances" or
                "cosine_similarity"; any other value falls back to cosine similarity
            query_user_id: user_id to find similar users for
            ascending: sort direction of the scores. None (default) picks the
                direction that puts the most similar users first: ascending for
                the two distances, descending for cosine similarity
        :returns
            list of up to 10 user_ids, best match first
        :raises
            KeyError: query_user_id is not present in df_user_genre_dist
        """
        df = self.df_user_genre_dist
        user_ids = df["user_id"].to_numpy()
        query_rows = np.flatnonzero(user_ids == query_user_id)
        if query_rows.size == 0:
            raise KeyError(query_user_id)
        query_row = query_rows[0]

        # Only the query user's scores are needed, so compare every user against
        # that single row instead of building the full user x user matrix.
        features = df.iloc[:, start_col:]
        query_features = features.iloc[[query_row]]
        if dist_metric == "euclidean_distances":
            scores = euclidean_distances(features, query_features)
            is_distance = True
        elif dist_metric == "manhattan_distances":
            scores = manhattan_distances(features, query_features)
            is_distance = True
        else:
            scores = cosine_similarity(features, query_features)
            is_distance = False
        scores = scores.reshape(-1)

        if ascending is None:
            # smaller distance = more similar, larger similarity = more similar
            ascending = is_distance

        # Drop the query user by id rather than assuming it sorts to the first
        # position (ties with identical users would break that assumption).
        other_rows = np.delete(np.arange(len(user_ids)), query_row)
        other_scores = scores[other_rows]
        order = np.argsort(other_scores if ascending else -other_scores, kind="stable")
        return user_ids[other_rows[order[:10]]].tolist()

    def get_similar_users_from_titles(self, q_titles, threshold=50):
        """
        Query the 10 users whose genre distribution is most similar to the
        average genre vector of a list of favorite title_ids.

        :params
            q_titles: list of title_ids
            threshold: only users whose mlist_count is greater than this value
                are considered (more titles -> more stable genre distribution)
        :returns
            numpy array of up to 10 user_ids, ordered from least to most similar
        :raises
            ValueError: none of q_titles is present in df_titles_genre
        """
        # users with more than `threshold` titles in their media list
        counts = self.df_mlist_genre[["user_id", "mlist_count"]]
        ref_user_ids = counts.loc[counts["mlist_count"] > threshold, "user_id"].values

        # limit df_user_genre_dist to those users
        genre_dist = self.df_user_genre_dist
        genre_dist_ref = genre_dist[genre_dist["user_id"].isin(ref_user_ids)]

        # average genre vector of the queried titles
        titles_genre = self.df_titles_genre[self.df_titles_genre["title_id"].isin(q_titles)]
        if len(titles_genre) == 0:
            raise ValueError("none of the queried title_ids were found in the title genre table")
        genre_profile = titles_genre.sum(axis=0) / len(titles_genre)

        # the first column of both tables is the id column, so it is skipped
        ex_genre_dist = genre_profile.iloc[1:].values.reshape(1, -1)
        user_genre_dist = genre_dist_ref.iloc[:, 1:].values
        res = cosine_similarity(user_genre_dist, ex_genre_dist).reshape(-1)
        high_sim_idx = np.argsort(res)[-10:]

        # `res` is aligned with the rows of genre_dist_ref, so the ids must be
        # read from that same frame; ref_user_ids comes from another table whose
        # row order (and length) is not guaranteed to match.
        return genre_dist_ref["user_id"].values[high_sim_idx]

    def evaluate_by_overlap_titles(self, similar_user_ids, query_user_id=QUERY_USER_ID):
        """
        Work out the average ratio of overlapping titles, used as a direct
        evaluation metric: a higher ratio means a better similarity calculation.

        For each similar user the ratio is
        (number of distinct titles shared with the query user) / (number of
        entries in that user's media list). A user without any media list entry
        contributes 0.0.

        :params
            similar_user_ids: user_ids to compare against the query user
            query_user_id: the querying user_id
        :returns
            average overlap ratio as a float; 0.0 when similar_user_ids is empty
        """
        df = self.df_mlist
        query_titles = set(df.loc[df["user_id"] == query_user_id, "title_id"])

        # Collect every similar user's titles in one pass instead of filtering
        # the whole media list once per user.
        candidate_rows = df[df["user_id"].isin(similar_user_ids)]
        titles_by_user = {
            user_id: titles.tolist()
            for user_id, titles in candidate_rows.groupby("user_id")["title_id"]
        }

        overlap_ratios = []
        for user_id in similar_user_ids:
            user_titles = titles_by_user.get(user_id, [])
            if not user_titles:
                # no media list entries: nothing can overlap
                overlap_ratios.append(0.0)
                continue
            overlap_ratios.append(len(set(user_titles) & query_titles) / len(user_titles))

        if not overlap_ratios:
            return 0.0
        return sum(overlap_ratios) / len(overlap_ratios)

    def recommend_from_unread_titles(self, n_titles, similar_user_list, query_user=QUERY_USER_ID, method="refer_others"):
        """
        Retrieve the media lists of similar users and recommend titles the
        querying user does not have in their own media list.

        :params
            n_titles: how many titles to recommend
            similar_user_list: list of similar user_ids
            query_user: querying user_id
            method: "refer_popularity" ranks the candidate titles by their
                "favorites" count in df_titles; any other value ("refer_others")
                ranks them by how often they appear in the similar users' lists
        :returns
            list of title_id as recommendation (at most n_titles)
        """
        mlist = self.df_mlist

        # entries of similar users for titles the querying user does not have
        seen_by_query_user = mlist.loc[mlist["user_id"] == query_user, "title_id"]
        unread = mlist[mlist["user_id"].isin(similar_user_list) & ~mlist["title_id"].isin(seen_by_query_user)]

        if method == "refer_popularity":
            candidates = self.df_titles[self.df_titles["title_id"].isin(unread["title_id"].unique())]
            ranked = candidates[["title_id", "favorites"]].sort_values(by="favorites", ascending=False)
            return list(ranked["title_id"].iloc[:n_titles])

        # refer_others: number of media list entries per title among similar users
        counts = unread.groupby("title_id").size().sort_values(ascending=False)
        return list(counts.index[:n_titles])


In [13]:
# Initialize the recommender; its constructor reads the CSV assets from disk
ubf = UserBasedFiltering()

### Test: Recommendation from user_id

In [14]:
# Query by user_id

# rank the other users by cosine similarity to the query user
top_10_similar_user_ids = ubf.get_similar_users_from_user_id(
    start_col=1,
    dist_metric="cosine_similarity",
    query_user_id=QUERY_USER_ID,
)
print(top_10_similar_user_ids)

# evaluation: average ratio of titles shared with the query user
print(ubf.evaluate_by_overlap_titles(similar_user_ids=top_10_similar_user_ids))

# recommend 10 unread titles, ranked by their "favorites" count
ubf.recommend_from_unread_titles(
    n_titles=10,
    similar_user_list=top_10_similar_user_ids,
    method="refer_popularity",
)

[6803, 532, 5708, 3883, 1393, 8103, 6155, 2871, 54, 3934]
0.2560350993885577


[11061, 21, 113415, 101922, 1735, 20464, 30013, 20, 104578, 21507]

### Test: Recommendation from list of title_ids

In [15]:
# Query by title_ids

# example favorite-title lists, one per genre
ex_titles_action = [30002, 105778, 53390, 87216, 85486, 30656, 30642, 31706, 31133, 30025]
ex_titles_romance = [72451, 97852, 85135, 101583, 87395, 59211, 132182, 30145, 41514, 86481]

# users whose genre distribution is closest to the romance example
top_10_similar_user_ids = ubf.get_similar_users_from_titles(q_titles=ex_titles_romance)
print(top_10_similar_user_ids)

# evaluation: average ratio of titles shared with the query user
print(ubf.evaluate_by_overlap_titles(similar_user_ids=top_10_similar_user_ids))

# recommend 10 unread titles, ranked by their "favorites" count
ubf.recommend_from_unread_titles(
    n_titles=10,
    similar_user_list=top_10_similar_user_ids,
    method="refer_popularity",
)

[9392 2866 9494 2154 5626 8157 8612 1434 8839 7633]
0.3838791368593074


[1735, 112641, 199, 20447, 20613, 853, 20789, 9919, 11771, 1210]

### Test: Recommendation from a title, referring to the title-user matrix

In [40]:
# Query by title_id

# load the pre-computed sparse title-user matrix from disk
# (presumably one row per title and one column per user - confirm against the code that built it)
mat_title_user = sparse.load_npz("../assets/ryota_title_user.npz")
print(mat_title_user.shape)

(52400, 6738)


In [83]:
# Make recommendations using cosine similarity:
# fit a nearest-neighbour index over the rows of the title-user matrix
model = NearestNeighbors(metric="cosine", n_neighbors=20)
model.fit(mat_title_user)
# Query with row 5 of the matrix; the n_neighbors=10 passed here is used for this
# call instead of the 20 given to the constructor.
# NOTE(review): row 5 is part of the fitted data, so it presumably comes back as its
# own nearest neighbour - confirm whether it should be dropped from the results.
distances, indices = model.kneighbors(mat_title_user[5], n_neighbors=10)

In [89]:
# Map the neighbour row positions back to title_ids.
# Presumably entry i of this array is the title_id of row i of the title-user
# matrix - confirm against the code that saved it.
# (The variable keeps its original spelling in case other cells refer to it.)
titlle_idx = np.load("../assets/ryota_title_user_idx.npy")
recommended_title_ids = titlle_idx[indices].reshape(-1)
print(recommended_title_ids)

# Look the titles up in the table already loaded by `ubf` (the same
# titles_2000p.csv). The global `df_titles` is only assigned in a later cell,
# so relying on it breaks a top-to-bottom run of the notebook.
df_rec_titles = ubf.df_titles[ubf.df_titles["title_id"].isin(recommended_title_ids)]
display(df_rec_titles)

[  15  170  558 1842 5028 3226  263 5040 7655 5258]


Unnamed: 0,title_id,title_english,title_romaji,type,duration,start_year,chapters,volume,publishing_status,country,adult,genres,average_score,mean_score,popularity,favorites,score_10,score_20,score_30,score_40,score_50,score_60,score_70,score_80,score_90,score_100,count_CURRENT,count_PLANNING,count_COMPLETED,count_DROPPED,count_PAUSED,ranking_RATED,ranking_POPULAR,synopsis,cover_image_url
106,263,Fighting Spirit,Hajime no Ippo: THE FIGHTING!,ANIME,23.0,2000.0,,,FINISHED,JP,False,"['Comedy', 'Drama', 'Sports']",86.0,86.0,67185,3934,107.0,23.0,38.0,84.0,265.0,528.0,2037.0,5234.0,7885.0,6259.0,4995,25220,31689,1438,3843,1.0,2.0,Makunouchi Ippo has been bullied his entire li...,{'large': 'https://s4.anilist.co/file/anilistc...
147,5258,Fighting Spirit: New Challenger,Hajime no Ippo: New Challenger,ANIME,23.0,2009.0,,,FINISHED,JP,False,"['Comedy', 'Drama', 'Sports']",85.0,85.0,30215,712,73.0,13.0,15.0,50.0,168.0,374.0,1532.0,4104.0,5621.0,3504.0,871,7037,21686,211,410,1.0,2.0,Ippo Makunouchi continues his boxing career an...,{'large': 'https://s4.anilist.co/file/anilistc...
339,170,Slam Dunk,SLAM DUNK,ANIME,23.0,1993.0,,,FINISHED,JP,False,"['Comedy', 'Drama', 'Slice of Life', 'Sports']",82.0,82.0,38052,1572,84.0,31.0,41.0,65.0,261.0,537.0,1539.0,3220.0,3567.0,2515.0,2804,16375,15285,1197,2391,1.0,1.0,"Hanamichi Sakuragi, an entering Shohoku high s...",{'large': 'https://s4.anilist.co/file/anilistc...
349,5028,,Major S5,ANIME,24.0,2009.0,,,FINISHED,JP,False,"['Comedy', 'Drama', 'Romance', 'Sports']",82.0,83.0,8776,250,57.0,7.0,9.0,15.0,89.0,172.0,567.0,1249.0,1373.0,912.0,208,1925,6452,96,95,3.0,12.0,"After the baseball season was over, Goro retur...",{'large': 'https://s4.anilist.co/file/anilistc...
492,7655,,Major S6,ANIME,24.0,2010.0,,,FINISHED,JP,False,"['Comedy', 'Drama', 'Sports']",81.0,82.0,8102,179,46.0,4.0,16.0,24.0,79.0,209.0,581.0,1258.0,1167.0,799.0,208,1725,5963,96,110,4.0,18.0,"The sixth season begins after the ""Baseball Wo...",{'large': 'https://s4.anilist.co/file/anilistc...
578,5040,One Outs,One Outs,ANIME,23.0,2008.0,,,FINISHED,JP,False,"['Psychological', 'Sports']",81.0,81.0,33720,975,45.0,19.0,74.0,122.0,310.0,658.0,2010.0,4338.0,3952.0,1827.0,1195,13618,16852,794,1261,2.0,9.0,"The story begins when Hiromichi Kojima, the st...",{'large': 'https://s4.anilist.co/file/anilistc...
708,1842,,Major S3,ANIME,25.0,2007.0,,,FINISHED,JP,False,"['Drama', 'Sports']",80.0,80.0,8842,140,14.0,10.0,14.0,34.0,148.0,287.0,822.0,1597.0,1234.0,716.0,187,1491,6968,91,105,2.0,6.0,After leaving Kaido in a promise to meet them ...,{'large': 'https://s4.anilist.co/file/anilistc...
804,3226,,Major S4,ANIME,24.0,2008.0,,,FINISHED,JP,False,"['Comedy', 'Drama', 'Sports']",79.0,80.0,8381,112,41.0,3.0,8.0,32.0,103.0,233.0,740.0,1518.0,1031.0,574.0,186,1424,6590,90,91,3.0,12.0,Next season of Major.<br><br>\nAfter the final...,{'large': 'https://s4.anilist.co/file/anilistc...
806,558,,Major S2,ANIME,25.0,2005.0,,,FINISHED,JP,False,"['Comedy', 'Drama', 'Sports']",79.0,80.0,9308,142,21.0,7.0,10.0,25.0,157.0,294.0,887.0,1773.0,1298.0,629.0,203,1531,7356,112,106,2.0,36.0,This series picks up a few years after season ...,{'large': 'https://s4.anilist.co/file/anilistc...
1841,15,Eyeshield 21,Eyeshield 21,ANIME,23.0,2005.0,,,FINISHED,JP,False,"['Action', 'Comedy', 'Sports']",75.0,76.0,21492,434,38.0,26.0,66.0,137.0,377.0,735.0,1855.0,2263.0,1516.0,779.0,1442,6721,10096,1458,1775,3.0,4.0,Welcome To the Gridiron of the Damned! Huge ...,{'large': 'https://s4.anilist.co/file/anilistc...


### Fuzzy match

In [55]:
# compare the three fuzzywuzzy scorers on one example pair
for scorer in (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio):
    print(scorer("Gintama: THE VERY FINAL", "Gintama."))

df_titles = pd.read_csv("../assets/titles_2000p.csv")

q_title = "Gintama: THE VERY FINAL"
q_title_lower = q_title.lower()

# keep the first title with the highest case-insensitive partial_ratio score
best_match_title = None
best_match_score = 0
for t_id, title in zip(df_titles["title_id"], df_titles["title_romaji"]):
    score = fuzz.partial_ratio(title.lower(), q_title_lower)
    if score <= best_match_score:
        continue
    best_match_score = score
    best_match_title = [t_id, title, score]

print(best_match_title)

45
88
48
[30044, 'Gintama', 100]


---
# Calculating user similarities
---

### Developing a similarity evaluation metric: Checking similarities by looking at overlapping titles in media_list

In [16]:
# Load the media list of all users into a stand-alone data frame
# (the same CSV file that UserBasedFiltering loads into its own df_mlist attribute)
df_mlist = pd.read_csv("../assets/media_list_all_users.csv")

In [17]:
# similar users by cosine similarity
top_10_similar_user_ids = ubf.get_similar_users_from_user_id(
    start_col=1,
    dist_metric="cosine_similarity",
    query_user_id=QUERY_USER_ID,
)

# evaluation: average ratio of titles shared with the query user
ubf.evaluate_by_overlap_titles(similar_user_ids=top_10_similar_user_ids)

0.2560350993885577

### Use other metrics to get the similar users

In [20]:
# User similarities with the Euclidean distance

# a distance is smallest for the most similar users, hence ascending=True
top_10_similar_user_ids = ubf.get_similar_users_from_user_id(
    start_col=1,
    dist_metric="euclidean_distances",
    query_user_id=QUERY_USER_ID,
    ascending=True,
)

# evaluation: average ratio of titles shared with the query user
ubf.evaluate_by_overlap_titles(similar_user_ids=top_10_similar_user_ids)

0.2307651556697182

In [19]:
# User similarities with the Manhattan distance

# a distance is smallest for the most similar users, hence ascending=True
top_10_similar_user_ids = ubf.get_similar_users_from_user_id(
    start_col=1,
    dist_metric="manhattan_distances",
    query_user_id=QUERY_USER_ID,
    ascending=True,
)

# evaluation: average ratio of titles shared with the query user
ubf.evaluate_by_overlap_titles(similar_user_ids=top_10_similar_user_ids)

0.27521065158710084