## Recommendation Module

#### ToDo

- [x] Calculate user similarities
    - [x] Query by user_id -> users with similar taste -> recommend
    - [x] Query by picking out the favorite titles -> users with similar taste -> recommend
- [x] Similarity Evaluation: average of title overlap ratio of top 10 similar users
    - [x] cosine -> 0.256
    - [x] euclidean -> 0.231
    - [x] manhattan -> 0.275
- [x] Recommendation
    - [x] Try item-user matrix recommendation
    - [x] Work out what the querying user has not yet seen or watched. From that list:
        - [x] a) Pick the most popular titles
        - [x] b) From the top 10 similar users, which titles has the querying user not seen/read that the others have?
- [ ] Refactor
    - [x] Run recommendation but show matching df, so we can see how good it might be
    - [ ] Re-check unread recommendation -> seems to return "already read" titles sometimes
    - [x] Separate sections for scaled & not scaled codes
    - [x] Convert methods into one class -> UserBasedFiltering
    - [ ] Refactor UserBasedFiltering: 
        - [ ] Add parameter to recommendation method to filter manga/anime recommendations
        - [ ] Stop doing everything in df to make things faster

#### Other
- [x] Extra: Convert media list to network visualization -> too many edges to be visualized
- [ ] Weight the media by its status (CURRENT/DROPPED etc)
- [ ] Fuzzy match problem with the title name

In [44]:
# !pip install fuzzywuzzy
# !pip install python-Levenshtein

In [5]:
import sys
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from joypy import joyplot
import sweetviz as sv
from tqdm import tqdm
from collections import ChainMap
import sklearn
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from scipy import sparse
# Use the fully-qualified option key: the abbreviated "max_columns" pattern can
# match more than one registered pandas option, which raises OptionError.
pd.set_option("display.max_columns", 200)
from fuzzywuzzy import fuzz

---
# User-based recommendation module
---

In [32]:
class UserBasedFiltering:
    '''
    User-based recommendation helpers working on pre-computed asset files.

    The constructor loads the title, media-list and genre-distribution tables
    plus a sparse title:user matrix. The methods then
      - find users similar to a given user or to a list of favourite titles,
      - score a set of similar users by media-list overlap,
      - recommend titles the querying user does not have in their media list,
      - look up titles with similar user histories in the title:user matrix.
    '''

    # Metric names for which a SMALLER value means "more similar".
    _DISTANCE_METRICS = ("euclidean_distances", "manhattan_distances")

    def __init__(self):
        # Load various data first
        self.df_titles = pd.read_csv("../assets/titles_2000p.csv")
        self.df_titles_genre = pd.read_csv("../assets/ryota_title_genre_2000p.csv")
        self.df_mlist = pd.read_csv("../assets/media_list_all_users.csv")
        self.df_mlist_genre = pd.read_csv("../assets/ryota_media_list_genre.csv")
        self.df_user_genre_dist = pd.read_csv("../assets/ryota_user_genre_dist.csv")
        self.mat_title_user = sparse.load_npz("../assets/ryota_title_user.npz")
        # Position i of this list holds the title_id of row i of mat_title_user
        # (this is how recommend_from_other_user_histories uses it).
        self.titlle_idx_list = list(np.load("../assets/ryota_title_user_idx.npy"))

    @staticmethod
    def _resolve_query_user(user_id):
        '''
        Return user_id, falling back to the module-level QUERY_USER_ID.

        The fallback is looked up at call time, so the constant may be defined
        (or changed) after the class itself has been defined.

        :raises
            ValueError: no user id was passed and QUERY_USER_ID is not defined
        '''
        if user_id is not None:
            return user_id
        try:
            return globals()["QUERY_USER_ID"]
        except KeyError:
            raise ValueError(
                "no query user given and QUERY_USER_ID is not defined"
            ) from None

    def get_similar_users_from_user_id(self, start_col, dist_metric="cosine_similarity", query_user_id=None, ascending=None, n_users=10):
        '''
        query similar users from user_id

        :params
            start_col: index of the first feature column in df_user_genre_dist
            dist_metric: "cosine_similarity", "euclidean_distances" or
                "manhattan_distances"; any other value falls back to cosine
            query_user_id: querying user_id (default: module-level QUERY_USER_ID)
            ascending: sort direction of the scores. None picks it from the
                metric (ascending for distances, descending for similarities)
            n_users: how many similar users to return
        :returns
            list of user_ids, best match first; the querying user is excluded
        :raises
            KeyError: query_user_id is not present in df_user_genre_dist
        '''
        query_user_id = self._resolve_query_user(query_user_id)
        df = self.df_user_genre_dist
        user_ids = df["user_id"].to_numpy()
        query_rows = np.flatnonzero(user_ids == query_user_id)
        if query_rows.size == 0:
            raise KeyError(query_user_id)

        # Only the query user's row is needed, so compare every user against
        # that single row instead of building the full user x user matrix.
        features = df.iloc[:, start_col:].to_numpy()
        query_vec = features[query_rows[0]].reshape(1, -1)
        if dist_metric == "euclidean_distances":
            scores = euclidean_distances(features, query_vec)
        elif dist_metric == "manhattan_distances":
            scores = manhattan_distances(features, query_vec)
        else:
            scores = cosine_similarity(features, query_vec)
        scores = scores.reshape(-1)

        if ascending is None:
            ascending = dist_metric in self._DISTANCE_METRICS

        # Remove the querying user by id rather than by assuming it sorts first:
        # ties or a mismatched sort direction would otherwise drop a real
        # neighbour and keep the querying user in the result.
        others = user_ids != query_user_id
        ranked = pd.Series(scores[others], index=user_ids[others]).sort_values(ascending=ascending)
        return list(ranked.index[:n_users])

    def get_similar_users_from_titles(self, q_titles, threshold=50, n_users=10):
        '''
        query similar users from list of favorite title_ids

        :params
            q_titles: list of title_ids describing the queried taste
            threshold: only users with more than this many titles in their
                media list are considered (more stable genre distribution)
            n_users: how many similar users to return
        :returns
            array of user_ids ordered by increasing similarity (best match last)
        :raises
            ValueError: none of q_titles is present in df_titles_genre
        '''
        # refer to users with more than `threshold` titles
        df_counts = self.df_mlist_genre[["user_id", "mlist_count"]]
        eligible_ids = df_counts.loc[df_counts["mlist_count"] > threshold, "user_id"].values

        # limit df_user_genre_dist to those users
        df_ref = self.df_user_genre_dist[self.df_user_genre_dist["user_id"].isin(eligible_ids)]

        # create genre map from title_id list (first column is skipped as the id)
        df_query = self.df_titles_genre[self.df_titles_genre["title_id"].isin(q_titles)]
        if len(df_query) == 0:
            raise ValueError("none of the queried title_ids were found in df_titles_genre")
        query_genre_dist = (df_query.sum(axis=0) / len(df_query)).iloc[1:].values.reshape(1, -1)

        # cosine similarity between every referenced user and the queried taste
        ref_genre_dist = df_ref.iloc[:, 1:].values
        sims = cosine_similarity(ref_genre_dist, query_genre_dist).reshape(-1)

        # Positions in `sims` refer to rows of df_ref, so the ids must be read
        # from df_ref as well; the id list taken from df_mlist_genre may be in a
        # different order or contain users that are missing from df_ref.
        ref_ids = df_ref["user_id"].values
        top_idx = np.argsort(sims)[::-1][:n_users][::-1]
        return ref_ids[top_idx]

    def evaluate_by_overlap_titles(self, similar_user_ids, query_user_id=None):
        '''
        Work out the average ratio of titles overlap and use it as direct evaluation metric
        Higher the ratio of overlap = better similarity calculation

        For each similar user the ratio is
        (titles shared with the querying user) / (titles of the similar user).

        :params
            similar_user_ids: iterable of user_ids to evaluate
            query_user_id: querying user_id (default: module-level QUERY_USER_ID)
        :returns
            mean overlap ratio; 0.0 when similar_user_ids is empty. A similar
            user without any media-list rows contributes a ratio of 0.0.
        '''
        query_user_id = self._resolve_query_user(query_user_id)
        df = self.df_mlist
        # build the lookup set once instead of once per similar user
        query_titles = set(df.loc[df["user_id"] == query_user_id, "title_id"])

        overlap_ratios = []
        for user_id in similar_user_ids:
            user_titles = list(df.loc[df["user_id"] == user_id, "title_id"])
            if not user_titles:
                # avoid dividing by zero for users missing from the media list
                overlap_ratios.append(0.0)
                continue
            n_shared = len(query_titles.intersection(user_titles))
            overlap_ratios.append(n_shared / len(user_titles))

        if not overlap_ratios:
            return 0.0
        return sum(overlap_ratios) / len(overlap_ratios)

    def recommend_unread_titles(self, n_titles, similar_user_list, query_user=None, method="refer_others"):
        '''
        It retrieves the media list of similar users and then recommend based on specified logic

        :params
            n_titles: how many titles to recommend
            similar_user_list: list of similar user_ids
            query_user: querying user_id (default: module-level QUERY_USER_ID)
            method: "refer_popularity" ranks the candidate titles by their
                "favorites" count in df_titles; anything else ("refer_others")
                ranks them by how often they appear in the similar users'
                media lists
        :returns
            list of title_id as recommendation, best first
        '''
        query_user = self._resolve_query_user(query_user)
        mlist = self.df_mlist

        # title_ids that the similar users have but the querying user has not
        seen_by_query_user = list(mlist.loc[mlist["user_id"] == query_user, "title_id"])
        from_similar_users = mlist[mlist["user_id"].isin(similar_user_list)]
        candidates = from_similar_users[~from_similar_users["title_id"].isin(seen_by_query_user)]

        if method == "refer_popularity":
            # refer_popularity method: get "favorites" count of the unread titles and return top n titles
            unread_ids = list(candidates["title_id"].unique())
            ranked = self.df_titles[self.df_titles["title_id"].isin(unread_ids)]
            ranked = ranked[["title_id", "favorites"]].sort_values(by="favorites", ascending=False)
            return list(ranked["title_id"].iloc[:n_titles])

        # refer_others method: get count of titles and return top n titles
        counts = candidates.groupby("title_id").size().sort_values(ascending=False)
        return list(counts.index[:n_titles])

    def recommend_from_other_user_histories(self, q_title_id, model_neighbors=20, model_metric="cosine", output_neighbors=10):
        '''
        Query by the given title_id. Refers to the title:user matrix

        :params
            q_title_id: title_id to find neighbours for
            model_neighbors: default n_neighbors of the NearestNeighbors model
                (the actual query below asks for output_neighbors + 1)
            model_metric: metric passed to NearestNeighbors
            output_neighbors: maximum number of title_ids to return
        :returns
            array of at most output_neighbors title_ids, nearest first,
            never containing q_title_id
        :raises
            ValueError: q_title_id is not part of the title index list
        '''
        q_title_idx = self.titlle_idx_list.index(q_title_id)
        model = NearestNeighbors(metric=model_metric, algorithm="brute", n_neighbors=model_neighbors)
        model.fit(self.mat_title_user)

        # Ask for one extra neighbour because the queried title is normally its
        # own nearest neighbour; never ask for more rows than the matrix has.
        n_query = min(output_neighbors + 1, self.mat_title_user.shape[0])
        _, indices = model.kneighbors(self.mat_title_user[q_title_idx], n_neighbors=n_query)

        indices = indices.reshape(-1)
        # Remove the queried title, then truncate: when the queried title is not
        # among its own neighbours (e.g. identical rows) nothing is removed and
        # one result too many would otherwise be returned.
        indices = indices[indices != q_title_idx][:output_neighbors]

        return np.asarray(self.titlle_idx_list)[indices].reshape(-1)

In [33]:
# Initialize the recommender; the constructor reads all asset files from ../assets
ubf = UserBasedFiltering()

# Load the titles table separately, only to display recommended title_ids
# as readable rows in the test cells below
df_titles = pd.read_csv("../assets/titles_2000p.csv")

In [16]:
# Set the querying user_id.
# The UserBasedFiltering methods use this as their default querying user, so
# run this cell before the class cell and before the test cells below.
QUERY_USER_ID = 2000

### Test: Recommendation from user_id

In [19]:
# Query by user_id: similar users -> overlap evaluation -> recommendation

# calculate similarity and similar user ids
# (start_col=1 skips the first column of the genre-distribution table)
top_10_similar_user_ids = ubf.get_similar_users_from_user_id(start_col=1, dist_metric="cosine_similarity", query_user_id=QUERY_USER_ID)
print(top_10_similar_user_ids)

# Work out the average ratio of overlapping titles
# (no user id is passed, so the method's default querying user is used)
print(ubf.evaluate_by_overlap_titles(top_10_similar_user_ids))

# make recommendation: 10 unread titles ranked by their "favorites" count
recommended_titles = ubf.recommend_unread_titles(10, top_10_similar_user_ids, method="refer_popularity")

# show recommendations (only the first 3 matching rows, in df_titles order)
display(df_titles[df_titles["title_id"].isin(recommended_titles)].head(3))


[3594, 1272, 4854, 138, 8311, 1480, 8884, 8265, 4062, 358]
0.15751964101492064


Unnamed: 0,title_id,title_english,title_romaji,type,duration,start_year,chapters,volume,publishing_status,country,adult,genres,average_score,mean_score,popularity,favorites,score_10,score_20,score_30,score_40,score_50,score_60,score_70,score_80,score_90,score_100,count_CURRENT,count_PLANNING,count_COMPLETED,count_DROPPED,count_PAUSED,ranking_RATED,ranking_POPULAR,synopsis,cover_image_url
3,30013,One Piece,ONE PIECE,MANGA,,1997.0,,,RELEASING,JP,False,"['Action', 'Adventure', 'Comedy', 'Fantasy']",91.0,91.0,113083,23264,394.0,58.0,109.0,177.0,453.0,805.0,2170.0,4773.0,12472.0,29853.0,80409,15977,4344,3211,9142,1.0,1.0,"As a child, Monkey D. Luffy was inspired to be...",{'large': 'https://s4.anilist.co/file/anilistc...
9,104578,Attack on Titan Season 3 Part 2,Shingeki no Kyojin 3 Part 2,ANIME,24.0,2019.0,,,FINISHED,JP,False,"['Action', 'Drama', 'Fantasy', 'Mystery']",90.0,90.0,340480,20389,775.0,136.0,274.0,460.0,1282.0,2536.0,8866.0,26721.0,70960.0,93286.0,9756,25181,303207,912,1424,1.0,2.0,The battle to retake Wall Maria begins now! Wi...,{'large': 'https://s4.anilist.co/file/anilistc...
15,11061,Hunter x Hunter (2011),HUNTER×HUNTER (2011),ANIME,24.0,2011.0,,,FINISHED,JP,False,"['Action', 'Adventure', 'Fantasy']",89.0,89.0,488883,54771,1027.0,253.0,438.0,782.0,2140.0,3951.0,12943.0,32075.0,75488.0,105797.0,71230,78737,296691,10182,32043,1.0,1.0,A new adaption of the manga of the same name b...,{'large': 'https://s4.anilist.co/file/anilistc...


### Test: Recommendation from list of title_ids

In [21]:
# Query by title_ids: favourite titles -> similar users -> recommendation

# example query title_id list (only the romance list is used in this cell)
ex_titles_action = [30002, 105778, 53390, 87216, 85486, 30656, 30642, 31706, 31133, 30025]
ex_titles_romance = [72451, 97852, 85135, 101583, 87395, 59211, 132182, 30145, 41514, 86481]

top_10_similar_user_ids = ubf.get_similar_users_from_titles(ex_titles_romance)
print(top_10_similar_user_ids)

# Work out the average ratio of overlapping titles
# NOTE(review): no user id is passed, so the overlap is measured against the
# default querying user, not against the queried title list - confirm intended.
print(ubf.evaluate_by_overlap_titles(top_10_similar_user_ids))

# make recommendation: 10 unread titles ranked by how often the similar users list them
# NOTE(review): "unread" is likewise relative to the default querying user, so
# titles from ex_titles_romance are not excluded here - confirm intended.
recommended_titles = ubf.recommend_unread_titles(10, top_10_similar_user_ids, method="refer_others")

# show recommendations (only the first 3 matching rows, in df_titles order)
display(df_titles[df_titles["title_id"].isin(recommended_titles)].head(3))


[9392 2866 9494 2154 5626 8157 8612 1434 8839 7633]
0.3838791368593074


Unnamed: 0,title_id,title_english,title_romaji,type,duration,start_year,chapters,volume,publishing_status,country,adult,genres,average_score,mean_score,popularity,favorites,score_10,score_20,score_30,score_40,score_50,score_60,score_70,score_80,score_90,score_100,count_CURRENT,count_PLANNING,count_COMPLETED,count_DROPPED,count_PAUSED,ranking_RATED,ranking_POPULAR,synopsis,cover_image_url
2017,17895,Golden Time,Golden Time,ANIME,24.0,2013.0,,,FINISHED,JP,False,"['Comedy', 'Drama', 'Romance']",75.0,75.0,150726,4104,521.0,386.0,865.0,1657.0,3359.0,6156.0,13821.0,18580.0,12799.0,7116.0,7054,43859,87742,6334,5737,14.0,3.0,"Due to a tragic accident, Banri Tada is struck...",{'large': 'https://s4.anilist.co/file/anilistc...
2927,8937,A Certain Magical Index II,Toaru Majutsu no Index II,ANIME,23.0,2010.0,,,FINISHED,JP,False,"['Action', 'Sci-Fi', 'Supernatural']",73.0,73.0,68329,736,180.0,137.0,394.0,768.0,1898.0,3847.0,9115.0,9642.0,4859.0,2137.0,2141,14751,48449,1261,1727,7.0,4.0,"Index is still a fugitive, and many powerful m...",{'large': 'https://s4.anilist.co/file/anilistc...
4124,15451,,High School DxD New,ANIME,27.0,2013.0,,,FINISHED,JP,False,"['Action', 'Comedy', 'Ecchi', 'Fantasy', 'Roma...",71.0,71.0,118832,1887,761.0,739.0,1457.0,2489.0,5060.0,8921.0,17768.0,15831.0,8724.0,6973.0,2949,8766,103854,1752,1511,14.0,7.0,The devilish haremking Issei Hyoudou is back f...,{'large': 'https://s4.anilist.co/file/anilistc...


### Test: Recommendation from a title, referring to the title-user matrix

- Insight: the more popular the queried title, the more accurate the results

In [38]:
# Query by a single title_id (nearest neighbours in the title:user matrix)

# query_title_id = 15 # Eyeshield 21 (popular, older, American football manga) -> recommends SLAM DUNK, Major, Hajime no Ippo (boxing), so that's pretty good
query_title_id = 105778 # Chainsaw Man (popular, recent, dark fantasy) -> SPYxFAMILY (recent), Jujutsu Kaisen (popular recent title, dark fantasy)
# query_title_id = 87395 # Grand Blue (popular, recent, comedy, romance) -> Kaguya-sama (popular recent rom-com), ReLife (recent romance but no comedy), so not sure about this one

res = ubf.recommend_from_other_user_histories(query_title_id)
print(res)

# show recommendations (rows appear in df_titles order, not in neighbour order)
display(df_titles[df_titles["title_id"].isin(res)])


[ 86635 100230  85849  98235  86300  85533  85952  97553  86720  85486]


Unnamed: 0,title_id,title_english,title_romaji,type,duration,start_year,chapters,volume,publishing_status,country,adult,genres,average_score,mean_score,popularity,favorites,score_10,score_20,score_30,score_40,score_50,score_60,score_70,score_80,score_90,score_100,count_CURRENT,count_PLANNING,count_COMPLETED,count_DROPPED,count_PAUSED,ranking_RATED,ranking_POPULAR,synopsis,cover_image_url
34,86635,Kaguya-sama: Love is War,Kaguya-sama wa Kokurasetai: Tensaitachi no Ren...,MANGA,,2015.0,,,RELEASING,JP,False,"['Comedy', 'Psychological', 'Romance', 'Slice ...",88.0,88.0,72920,9840,168.0,66.0,79.0,103.0,243.0,421.0,1275.0,3659.0,8311.0,8821.0,41637,23199,1895,1514,4675,1.0,1.0,As leaders of their prestigious academy’s stud...,{'large': 'https://s4.anilist.co/file/anilistc...
98,97553,,"Jumyou wo Kaitotte Moratta. Ichinen ni Tsuki, ...",MANGA,,2016.0,18.0,3.0,FINISHED,JP,False,"['Drama', 'Psychological', 'Romance', 'Superna...",86.0,86.0,45184,4681,63.0,39.0,88.0,127.0,263.0,507.0,1569.0,3495.0,6252.0,5771.0,2122,16803,25763,183,313,2.0,4.0,A twenty-year-old with little hope for the fut...,{'large': 'https://s4.anilist.co/file/anilistc...
124,85849,ReLIFE,ReLife,MANGA,,2013.0,238.0,15.0,FINISHED,JP,False,"['Comedy', 'Drama', 'Psychological', 'Romance'...",85.0,85.0,35236,2885,42.0,10.0,37.0,96.0,192.0,364.0,1271.0,3028.0,4607.0,3690.0,4295,12064,16641,843,1393,3.0,2.0,When the responsibilities of being an adult an...,{'large': 'https://s4.anilist.co/file/anilistc...
636,85486,My Hero Academia,Boku no Hero Academia,MANGA,,2014.0,,,RELEASING,JP,False,"['Action', 'Adventure', 'Comedy', 'Drama', 'Fa...",80.0,80.0,117943,8520,276.0,114.0,278.0,510.0,1285.0,2233.0,5995.0,11418.0,13012.0,7815.0,75721,19792,5425,6151,10854,15.0,1.0,Middle school student Izuku Midoriya wants to ...,{'large': 'https://s4.anilist.co/file/anilistc...
719,86720,Helck,Helck,MANGA,,2014.0,111.0,12.0,FINISHED,JP,False,"['Action', 'Adventure', 'Comedy', 'Fantasy']",80.0,81.0,8082,335,8.0,4.0,12.0,25.0,55.0,108.0,310.0,576.0,627.0,323.0,1132,4084,2434,198,234,16.0,30.0,Helck is the hero that defeated one of the Dem...,{'large': 'https://s4.anilist.co/file/anilistc...
893,85533,Teasing Master Takagi-san,Karakai Jouzu no Takagi-san,MANGA,,2012.0,,,RELEASING,JP,False,"['Comedy', 'Romance', 'Slice of Life']",79.0,79.0,19905,871,21.0,13.0,23.0,61.0,174.0,335.0,913.0,1666.0,1275.0,727.0,10272,6672,485,902,1574,20.0,16.0,"<i>""If you blush, you lose.""</i><br>\n<br>\nLi...",{'large': 'https://s4.anilist.co/file/anilistc...
963,100230,The Quintessential Quintuplets,Go-toubun no Hanayome,MANGA,,2017.0,122.0,14.0,FINISHED,JP,False,"['Comedy', 'Drama', 'Romance']",78.0,78.0,47393,4170,140.0,68.0,190.0,317.0,755.0,1486.0,3993.0,7017.0,5834.0,3185.0,8165,9455,26978,1215,1580,27.0,4.0,"One day, a poor high school second-year named ...",{'large': 'https://s4.anilist.co/file/anilistc...
1341,86300,Tomo-chan is a Girl!,Tomo-chan wa Onnanoko!,MANGA,,2015.0,961.0,8.0,FINISHED,JP,False,"['Comedy', 'Romance', 'Slice of Life']",77.0,77.0,27989,1001,30.0,11.0,68.0,127.0,426.0,790.0,2167.0,3169.0,2298.0,954.0,5990,8552,11473,796,1178,31.0,6.0,Boyish high school girl Aizawa Tomo finally ma...,{'large': 'https://s4.anilist.co/file/anilistc...
2270,98235,We Never Learn,Bokutachi wa Benkyou ga Dekinai,MANGA,,2017.0,187.0,21.0,FINISHED,JP,False,"['Comedy', 'Ecchi', 'Romance', 'Slice of Life']",74.0,74.0,22781,1097,49.0,29.0,117.0,174.0,473.0,765.0,1868.0,2438.0,1503.0,743.0,6620,6183,7760,1110,1108,71.0,13.0,Nariyuki Yuiga is in his last and most painful...,{'large': 'https://s4.anilist.co/file/anilistc...
3816,85952,The Legendary Hero Is Dead!,Yuusha ga Shinda! Murabito no Ore ga Hotta Oto...,MANGA,,2014.0,202.0,20.0,FINISHED,JP,False,"['Action', 'Adventure', 'Comedy', 'Ecchi', 'Fa...",71.0,72.0,6304,128,10.0,9.0,32.0,54.0,92.0,181.0,336.0,343.0,254.0,129.0,2475,2045,880,487,417,76.0,44.0,Three years ago a hero sealed a Hell Gate. The...,{'large': 'https://s4.anilist.co/file/anilistc...


---
# Extras
---

## Fuzzy match

In [55]:
# Compare three fuzzywuzzy scorers on one example pair of titles
print(fuzz.ratio("Gintama: THE VERY FINAL", "Gintama."))
print(fuzz.partial_ratio("Gintama: THE VERY FINAL", "Gintama."))
print(fuzz.token_sort_ratio("Gintama: THE VERY FINAL", "Gintama."))

df_titles = pd.read_csv("../assets/titles_2000p.csv")
# display(df_titles)

# Free-text title to resolve to a title_id
q_title = "Gintama: THE VERY FINAL"

# Linear scan over all romaji titles, keeping the first title that reaches the
# highest case-insensitive partial_ratio score (strict ">" keeps the earliest
# match on ties; the result stays None if no title scores above 0).
# NOTE(review): title.lower() assumes every title_romaji is a string; a missing
# value (NaN) would raise AttributeError - confirm the column has no gaps.
title_id_zip = zip(df_titles["title_id"], df_titles["title_romaji"])
best_match_title = None
best_match_score = 0
for t_id, title in title_id_zip:
    partial_ratio = fuzz.partial_ratio(title.lower(), q_title.lower())
    if partial_ratio > best_match_score:
        best_match_score = partial_ratio
        best_match_title = [t_id, title, partial_ratio]

# [title_id, matched title, score]
print(best_match_title)

45
88
48
[30044, 'Gintama', 100]


## Compare different methods of user similarities calculation

Develop a custom similarity evaluation metric by checking overlapping titles in media_list

In [16]:
# Get the media_list df
# NOTE(review): the cells below go through `ubf`, which loads its own copy of
# this file, and do not reference df_mlist - check whether this is still needed.
df_mlist = pd.read_csv("../assets/media_list_all_users.csv")

In [17]:
# Calculate user similarities: Cosine

# calculate similarity and similar user ids
top_10_similar_user_ids = ubf.get_similar_users_from_user_id(start_col=1, dist_metric="cosine_similarity", query_user_id=QUERY_USER_ID)

# Work out the average ratio of overlapping titles (shown as the cell output)
ubf.evaluate_by_overlap_titles(top_10_similar_user_ids)

0.2560350993885577

In [20]:
# Calculate user similarities: Euclidean

# calculate similarity and similar user ids
# ascending=True is required here: for a distance metric the smallest values
# are the most similar users
top_10_similar_user_ids = ubf.get_similar_users_from_user_id(start_col=1, dist_metric="euclidean_distances", query_user_id=QUERY_USER_ID, ascending=True)

# Work out the average ratio of overlapping titles (shown as the cell output)
ubf.evaluate_by_overlap_titles(top_10_similar_user_ids)

0.2307651556697182

In [19]:
# Calculate user similarities: Manhattan

# calculate similarity and similar user ids
# (ascending=True because smaller distances mean more similar users)
top_10_similar_user_ids = ubf.get_similar_users_from_user_id(start_col=1, dist_metric="manhattan_distances", query_user_id=QUERY_USER_ID, ascending=True)

# Work out the average ratio of overlapping titles (shown as the cell output)
ubf.evaluate_by_overlap_titles(top_10_similar_user_ids)

0.27521065158710084