## Recommendation Module

#### ToDo

- [x] Calculate user similarities
    - [x] Query by user_id -> users with similar taste -> recommend
    - [x] Query by picking out the favorite titles -> users with similar taste -> recommend
- [x] Similarity Evaluation: average of title overlap ratio of top 10 similar users
    - [x] cosine -> 0.256
    - [x] euclidean -> 0.231
    - [x] manhattan -> 0.275
- [ ] Recommendation
    - [ ] Try item-user matrix recommendation
    - [x] From list of "not-yet-seen" list:
        - [x] a) Pick the most popular titles
        - [x] b) From top 10 similar users, which title have you not seen/read but the others have?
- [ ] Refactor
    - [x] Separate sections for scaled & not scaled codes
    - [x] Convert methods into one class -> UserBasedFiltering
    - [ ] Refactor UserBasedFiltering: 
        - [ ] Add parameter to recommendation method to filter manga/anime recommendations
        - [ ] Stop doing everything in df to make things faster

#### Other
- [ ] Extra: Convert media list to network visualization
- [ ] Weight the experienced media by its status (CURRENT/DROPPED etc)

In [1]:
# !pip install sweetviz
# !pip install joypy

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from joypy import joyplot
import sweetviz as sv
from tqdm import tqdm
from collections import ChainMap
import sklearn
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances
pd.set_option("max_columns", 200)

In [2]:
# query user_id
QUERY_USER_ID = 1

In [12]:
class UserBasedFiltering:
    def __init__(self):
        # Load various data first
        self.df_titles = pd.read_csv("../assets/titles_2000p.csv")
        self.df_titles_genre = pd.read_csv("../assets/ryota_title_genre_2000p.csv")
        self.df_mlist = pd.read_csv("../assets/media_list_all_users.csv")
        self.df_mlist_genre = pd.read_csv("../assets/ryota_media_list_genre.csv")
        self.df_user_genre_dist = pd.read_csv("../assets/ryota_user_genre_dist.csv")


    def get_similar_users_from_user_id(self, start_col, dist_metric="cosine_similarity", query_user_id=QUERY_USER_ID, ascending=False):
        '''
        query similar users from user_id
        '''
        # Calculate user similarities
        df = self.df_user_genre_dist
        id_list = df["user_id"]
        if dist_metric=="euclidean_distances":
            df_sim_mat = pd.DataFrame(euclidean_distances(df.iloc[:, start_col:]))
        elif dist_metric=="manhattan_distances":
            df_sim_mat = pd.DataFrame(manhattan_distances(df.iloc[:, start_col:]))
        else:
            df_sim_mat = pd.DataFrame(cosine_similarity(df.iloc[:, start_col:]))
        df_sim_mat.index = id_list
        df_sim_mat.columns = id_list

        similar_users = df_sim_mat[query_user_id].sort_values(ascending=ascending).reset_index()
        top_10_similar_user_ids = list(similar_users.iloc[1:11, 0])
        return top_10_similar_user_ids


    def get_similar_users_from_titles(self, q_titles, threshold=50):
        '''
        query similar users from list of favorite title_ids
        '''
        # refer to users with more than 50 titles -> more stable genre distribution
        df_user_mlist_count = self.df_mlist_genre[["user_id", "mlist_count"]]
        df_user_mlist_count = df_user_mlist_count[df_user_mlist_count["mlist_count"]>threshold]
        ref_user_ids = df_user_mlist_count["user_id"].values
        
        # limit df_user_genre_dist to users with more than threshold n titles in their media list
        df_user_genre_dist_thresh = self.df_user_genre_dist[self.df_user_genre_dist["user_id"].isin(ref_user_ids)]

        # create genre map from title_id list
        df_titles_genre_ex = self.df_titles_genre[self.df_titles_genre["title_id"].isin(q_titles)]
        df_titles_genre_ex = df_titles_genre_ex.sum(axis=0) / len(df_titles_genre_ex)

        # get the genre distribution values and work out cosine similarity
        ex_genre_dist = df_titles_genre_ex.iloc[1:].values.reshape(1,-1)
        user_genre_dist = df_user_genre_dist_thresh.iloc[:,1:].values
        res = cosine_similarity(user_genre_dist, ex_genre_dist)
        res = res.reshape(-1)
        high_sim_idx = np.argsort(res)[-10:]
        top_10_similar_user_ids = ref_user_ids[high_sim_idx]
        # print("shapes of: queried genre dist, referenced users' genre dist, reshaped similarity matrix", ex_genre_dist.shape, user_genre_dist.shape, res.shape)
        # print("similarity score, user_id of similar users: ", res[high_sim_idx], ref_user_ids[high_sim_idx])
        return top_10_similar_user_ids


    def evaluate_by_overlap_titles(self, similar_user_ids, query_user_id=QUERY_USER_ID):
        '''
        Work out the average ratio of titles overlap and use it as direct evaluation metric
        Higher the ratio of overlap = better similarity calculation
        '''
        df = self.df_mlist
        overlap_ratios = []
        df_q = df[df["user_id"]==query_user_id]
        q_u_titles = list(df_q["title_id"])

        for user_id in similar_user_ids:
            df_sim = df[df["user_id"]==user_id]
            sim_u_titles = list(df_sim["title_id"])
            overlap = list(set(sim_u_titles) & set(q_u_titles))
            # print("for user_id: ", user_id)
            # print("all titles: ", len(sim_u_titles))
            # print("overlapping titles: ", len(overlap))
            overlap_ratios.append(len(overlap) / len(sim_u_titles))
        avg_overlap_ratio = sum(overlap_ratios) / len(overlap_ratios)
        return avg_overlap_ratio


    def recommend_from_unread_titles(self, n_titles, similar_user_list, query_user=QUERY_USER_ID, method="refer_others"):
        '''
        It retrieves the media list of similar users and then recommend based on specified logic

        :params
            df_titles: titles df
            df_mlist: media list df
            n_titles: how many titles to recommend
            query_user: querying user_id
            similar_user_list: list of similar user_ids
            method: which method to make recommendation
        :returns
            list of title_id as recommendation
        '''
        # get title_ids that the querying user hasn't read but similar users have
        df_mlist_similar_user = self.df_mlist[self.df_mlist["user_id"].isin(similar_user_list)]
        df_mlist_q_user = self.df_mlist[self.df_mlist["user_id"]==query_user]
        q_users_titles = list(df_mlist_q_user["title_id"])
        df_mlist_similar_user_not_read = df_mlist_similar_user[~df_mlist_similar_user["title_id"].isin(q_users_titles)]

        if method=="refer_popularity":
            # refer_popularity method: get "favorites" count of the unread titles and return top n titles
            unread_list = list(df_mlist_similar_user_not_read["title_id"].unique())
            df_recommend_list = self.df_titles[self.df_titles["title_id"].isin(unread_list)]
            df_recommend_list = df_recommend_list[["title_id", "favorites"]].sort_values(by="favorites", ascending=False).iloc[:n_titles]
            recommend_list = list(df_recommend_list["title_id"])
        else:
            # refer_others method: get count of titles and return top n titles
            df_recommend_list = df_mlist_similar_user_not_read.groupby("title_id").size().sort_values(ascending=False).iloc[:n_titles]
            recommend_list = list(df_recommend_list.index)
        return recommend_list


In [13]:
# Initialize
ubf = UserBasedFiltering()

In [14]:
# Query by user_id

# calculate similarity and similar user ids
top_10_similar_user_ids = ubf.get_similar_users_from_user_id(start_col=1, dist_metric="cosine_similarity", query_user_id=QUERY_USER_ID)
print(top_10_similar_user_ids)

# Work out the average ratio of overlapping titles
print(ubf.evaluate_by_overlap_titles(top_10_similar_user_ids))

# make recommendation
ubf.recommend_from_unread_titles(10, top_10_similar_user_ids, method="refer_popularity")

[6803, 532, 5708, 3883, 1393, 8103, 6155, 2871, 54, 3934]
0.2560350993885577


[11061, 21, 113415, 101922, 1735, 20464, 30013, 20, 104578, 21507]

In [15]:
# Query by title_ids

# example query title_id list
ex_titles_action = [30002, 105778, 53390, 87216, 85486, 30656, 30642, 31706, 31133, 30025]
ex_titles_romance = [72451, 97852, 85135, 101583, 87395, 59211, 132182, 30145, 41514, 86481]

top_10_similar_user_ids = ubf.get_similar_users_from_titles(ex_titles_romance)
print(top_10_similar_user_ids)

# Work out the average ratio of overlapping titles
print(ubf.evaluate_by_overlap_titles(top_10_similar_user_ids))

# make recommendation
ubf.recommend_from_unread_titles(10, top_10_similar_user_ids, method="refer_popularity")

[9392 2866 9494 2154 5626 8157 8612 1434 8839 7633]
0.3838791368593074


[1735, 112641, 199, 20447, 20613, 853, 20789, 9919, 11771, 1210]

---
# Calculating user similarities
---

### Developing a similarity evaluation metric: Checking similarities by looking at overlapping titles in media_list

In [16]:
# Get the media_list df
df_mlist = pd.read_csv("../assets/media_list_all_users.csv")

In [17]:
# calculate similarity and similar user ids
top_10_similar_user_ids = ubf.get_similar_users_from_user_id(start_col=1, dist_metric="cosine_similarity", query_user_id=QUERY_USER_ID)

# Work out the average ratio of overlapping titles
ubf.evaluate_by_overlap_titles(top_10_similar_user_ids)

0.2560350993885577

### Use other metrics to get the similar users

In [20]:
# Calculate user similarities: Euclidean

# calculate similarity and similar user ids
# need to add ascending=True
top_10_similar_user_ids = ubf.get_similar_users_from_user_id(start_col=1, dist_metric="euclidean_distances", query_user_id=QUERY_USER_ID, ascending=True)

# Work out the average ratio of overlapping titles
ubf.evaluate_by_overlap_titles(top_10_similar_user_ids)

0.2307651556697182

In [19]:
# Calculate user similarities: Manhattan

# calculate similarity and similar user ids
top_10_similar_user_ids = ubf.get_similar_users_from_user_id(start_col=1, dist_metric="manhattan_distances", query_user_id=QUERY_USER_ID, ascending=True)

# Work out the average ratio of overlapping titles
ubf.evaluate_by_overlap_titles(top_10_similar_user_ids)

0.27521065158710084