In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Data Gathered By Me Via the MyAnimeList API and Jikan REST API
anime_df = pd.read_csv('./data/anime.csv')
user_ratings_df = pd.read_csv('./data/user_ratings.csv', parse_dates=['updated_at'])

In [8]:
anime_df.head()

Unnamed: 0,id,title,start_date,end_date,synopsis,score,rank,popularity,num_list_users,num_scoring_users,media_type,status,genres,num_episodes,start_season,broadcast_day,broadcast_time,source,rating,studios
0,5114,Fullmetal Alchemist: Brotherhood,2009-04-05,2010-07-04,After a horrific alchemy experiment goes wrong...,9.1,1.0,3,3240265,2056814,tv,finished_airing,"Action,Adventure,Drama,Fantasy,Military,Shounen",64,spring,sunday,17:00,manga,r,Bones
1,9253,Steins;Gate,2011-04-06,2011-09-14,Eccentric scientist Rintarou Okabe has a never...,9.07,2.0,13,2489957,1361494,tv,finished_airing,"Drama,Psychological,Sci-Fi,Suspense,Time Travel",24,spring,wednesday,02:05,visual_novel,pg_13,White Fox
2,28977,Gintama°,2015-04-08,2016-03-30,"Gintoki, Shinpachi, and Kagura return as the f...",9.06,3.0,336,610158,243298,tv,finished_airing,"Action,Comedy,Gag Humor,Historical,Parody,Samu...",51,spring,wednesday,18:00,manga,pg_13,Bandai Namco Pictures
3,41467,Bleach: Sennen Kessen-hen,2022-10-11,2022-12-27,Substitute Soul Reaper Ichigo Kurosaki spends ...,9.05,4.0,432,489199,245969,tv,finished_airing,"Action,Adventure,Fantasy,Shounen",13,fall,tuesday,00:00,manga,r,Pierrot
4,39486,Gintama: The Final,2021-01-08,2021-01-08,Two years have passed following the Tendoshuu'...,9.05,5.0,1550,140032,67119,movie,finished_airing,"Action,Comedy,Drama,Gag Humor,Historical,Parod...",1,winter,,,manga,pg_13,Bandai Namco Pictures


In [9]:
user_ratings_df.drop(columns=['start_date', 'finish_date'], inplace=True)

# Handling Missing Values

In [10]:
user_ratings_df.isna().sum()

user_id                 0
anime_id                0
score                   0
status                  1
num_episodes_watched    0
updated_at              0
dtype: int64

Although the above functions shows that there are no missing score values, a score of 0 indicates that the score was missing since it's not possible on MyAnimeList to rate an anime 0 unless you choose not to rate the anime.

## Remove Users who have only given scores of 0

In [11]:
users_to_remove = []
for user_id in user_ratings_df.user_id.unique():
  users_ratings = user_ratings_df.loc[user_ratings_df.user_id == user_id]
  num_non_zero_scores = len(users_ratings.loc[users_ratings.score > 0])

  if num_non_zero_scores == 0:
    users_to_remove.append(user_id)

print(users_to_remove)

[20, 35, 40, 57, 88, 116, 122, 124, 127, 131, 137, 139, 148, 185, 198, 202, 210, 217, 225, 226, 229, 232, 234, 235, 253, 271, 308, 309, 315, 321, 322, 364, 376, 384, 385, 387, 393, 419, 426, 443, 462, 476, 479, 487, 504, 506, 516, 532, 559, 562, 564, 569, 570, 590, 599, 612, 615, 617, 626, 644, 660, 664, 691, 723, 732, 733, 741, 745, 751, 761, 763, 766, 770, 771, 776, 785, 788, 794, 804, 815, 816, 844, 853, 860, 868, 879, 883, 886, 890, 897, 898, 906, 926, 961, 992, 999, 1023, 1029, 1033, 1054, 1059, 1065, 1077, 1090, 1099, 1105, 1108, 1114, 1127, 1129, 1143, 1145, 1147, 1156, 1157, 1170, 1180, 1223, 1227, 1228, 1230, 1237, 1258, 1268, 1271, 1272, 1273, 1310, 1314, 1351, 1358, 1370, 1386, 1387, 1388, 1392, 1406, 1409, 1420, 1427, 1446, 1455, 1458, 1469, 1477, 1490, 1491, 1511, 1522, 1529, 1541, 1542, 1549, 1557, 1570, 1576, 1587, 1594, 1596, 1604, 1607, 1616, 1641, 1650, 1652, 1653, 1655, 1662, 1667, 1691, 1693, 1715, 1720, 1732, 1746, 1752, 1755, 1764, 1766, 1795, 1806, 1808, 1840, 18

In [12]:
user_ratings_df = user_ratings_df.loc[~user_ratings_df.user_id.isin(users_to_remove)]

# Splitting User Ratings into Train and Test Datasets
Split based on the update_at column

In [13]:
import math
split_ratio = 0.7 # The ratio of anime per user to put in the training set

def train_test_split(user_ratings_df: pd.DataFrame, split_ratio: float):
  train_df = None
  test_df = None

  for user_id in user_ratings_df['user_id'].unique():
    # Get the users ratings and sort by the date the rating was last updated
    users_ratings = user_ratings_df.loc[user_ratings_df.user_id == user_id]
    users_ratings = users_ratings.sort_values(by=['updated_at'])

    # Determine what index to split the user's ratings by
    split_index = int(math.floor((len(users_ratings)-1) * split_ratio))

    # Get the split datasets
    train_users_ratings = users_ratings[:split_index]
    test_users_ratings = users_ratings[split_index+1:]

    if train_df is None:
      train_df = train_users_ratings
    else:
      train_df = pd.concat([train_df, train_users_ratings], ignore_index=True)

    if test_df is None:
      test_df = test_users_ratings
    else:
      test_df = pd.concat([test_df, test_users_ratings], ignore_index=True)

  return (train_df, test_df)

(train_df, test_df) = train_test_split(user_ratings_df, split_ratio)

In [14]:
train_df

Unnamed: 0,user_id,anime_id,score,status,num_episodes_watched,updated_at
0,0,48561,9,completed,1,2023-01-01 09:37:49+00:00
1,0,20583,9,completed,25,2023-01-01 09:39:58+00:00
2,0,38883,8,completed,13,2023-01-01 09:40:05+00:00
3,0,28891,10,completed,25,2023-01-01 09:40:24+00:00
4,0,32935,9,completed,10,2023-01-01 09:40:42+00:00
...,...,...,...,...,...,...
2912948,9071,35247,4,completed,7,2022-09-27 03:09:59+00:00
2912949,9071,31181,4,completed,12,2022-09-27 03:10:01+00:00
2912950,9071,29803,3,completed,13,2022-09-27 03:10:02+00:00
2912951,9071,34134,3,completed,12,2022-09-27 03:10:04+00:00


In [15]:
test_df

Unnamed: 0,user_id,anime_id,score,status,num_episodes_watched,updated_at
0,0,38826,9,completed,1,2023-04-15 14:38:56+00:00
1,0,40028,10,completed,16,2023-04-15 14:42:02+00:00
2,0,51535,9,watching,1,2023-04-15 14:42:16+00:00
3,0,48583,10,completed,12,2023-04-15 14:42:25+00:00
4,0,41467,10,completed,13,2023-04-15 14:42:51+00:00
...,...,...,...,...,...,...
1253543,9071,21,6,watching,1048,2023-08-19 14:33:51+00:00
1253544,9071,37171,5,completed,12,2023-09-07 12:21:07+00:00
1253545,9071,659,5,completed,1,2023-09-07 12:21:41+00:00
1253546,9071,35838,5,completed,12,2023-09-07 12:22:10+00:00


# Vectorizing the Anime Data

# Create the Recommender Class

In [16]:
class CBFRecommender:
  def __init__(self, anime_data: pd.DataFrame, user_ratings_data: pd.DataFrame):
    # Cleanup Anime Data
    self.anime_df = anime_data
    self.anime_df.fillna({"genres": ""}, inplace=True)
    self.anime_df.fillna({"synopsis": ""}, inplace=True)

    self.user_ratings_df = user_ratings_df
    self.user_profiles = {}

  # Uses TF-IDF to Vectorize the Anime DataFrame
  def vectorize_anime_data(self):
    vectorizer = TfidfVectorizer()
    anime_features_df = self.anime_df[['id']]
    anime_features_df['combined'] =  self.anime_df.apply(lambda x: self.anime_df['genres'] + " " + self.anime_df['synopsis'], axis=1)

    anime_features_tfidf_matrix = vectorizer.fit_transform(anime_features_df['combined'])

    self.anime_tfidf_df = pd.DataFrame(data=genres_tfidf_matrix.toarray())
    self.anime_tfidf_df['anime_id'] = self.anime_df['id']
    self.anime_tfidf_df = self.anime_tfidf_df.set_index('anime_id')

    return self.anime_tfidf_df

  # Creates a profile of the specified user
  def create_user_profile(self, user_id: int):
    # Get only this user's ratings
    users_ratings_df = user_ratings_df[user_ratings_df.user_id == user_id]

    # Prefer to only use anime they rated higher than their avg. rating
    average_rating = np.average(users_ratings_df['score'])
    selected_user_ratings_df = users_ratings_df[users_ratings_df.score > average_rating]

    # If the user hasn't rated any anime higher than thier avg.,
    # then use the median instead
    if selected_user_ratings_df.empty:
      average_rating = np.median(users_ratings_df['score'])
      selected_user_ratings_df = users_ratings_df[users_ratings_df.score >= average_rating]

    # Get the Anime they rated highly
    user_anime_rated_df = self.anime_df[self.anime_df.id.isin(selected_user_ratings_df['anime_id'])]

    # Get the weighted average of the TF-IDF values for the anime they rated
    user_tfidf_df = anime_tfidf_df.loc[user_anime_rated_df['id']]
    weighted_tfidf_avg = user_tfidf_df.mean()

    # Append to the user profiles dictionary
    self.user_profiles[user_id] = {
        "weighted_tfidf_avg": weighted_tfidf_avg,
        "tfidf_df": user_tfidf_df
    }

    return self.user_profiles[user_id]

  # Get a DataFrame of How Similar Anime Are to the User's Preference
  def get_user_anime_similarity(self, user_id):
    # Find user profile in user profiles
    # TODO: Check if the user_id exists first
    user_profile = self.user_profiles[user_id]

    if user_profile is None:
      print("Could not find user profile. Creating new one...")
      user_profile = self.create_user_profile(user_id)

    # Calculate the similarity matrix of the user's weighted tfidf instance compared to all instances in the anime tfidf df
    similarity_mat = cosine_similarity(np.array([user_profile["weighted_tfidf_avg"].tolist()]).reshape((1, -1)), anime_tfidf_df).reshape(-1)

    # Convert the matrix into a dataframe
    similarity_df = pd.DataFrame(data=similarity_mat.tolist())
    similarity_df = similarity_df.rename(columns={0: "cosine_similarity"})
    similarity_df['id'] = self.anime_df['id']

    # The closer to 1 the cosine similarity is the more similar the anime is to the weighted vector
    similarity_df = similarity_df.sort_values(by='cosine_similarity', ascending=False)

    # Remove all anime that the user has already watched
    similarity_df = similarity_df.loc[~similarity_df.id.isin(user_profile["tfidf_df"].index)]

    return similarity_df

  # Recommend Anime to the User
  def recommend_user(self, user_id: int, num_recommendations: int, add_anime_info: bool = True):
    similarity_df = self.get_user_anime_similarity(user_id)

    # Get the top recommendations and merge the anime data with the dataframe
    top_similar_anime_df = similarity_df.iloc[0:num_recommendations]

    # Only merge if we want to the anime information too.
    if add_anime_info:
      top_similar_anime_df = top_similar_anime_df.merge(self.anime_df, on='id', how='inner')

    return top_similar_anime_df

  # Recommend Anime to Multiple Users
  def make_recommendations(self, num_recommendations: int, should_create_user_profiles: bool = True):
    unique_user_ids = self.user_ratings_df['user_id'].unique()

    recommendations_dict = {}

    for user_id in unique_user_ids:

      if should_create_user_profiles:
        self.create_user_profile(user_id)

      # Get this user's recommendations
      users_top_similar_anime_df = self.recommend_user(user_id, num_recommendations, add_anime_info = False)

      # Add the user's recommendations to the recommendation dictionary
      recommendations_dict[user_id] = users_top_similar_anime_df

    # recommendations_df = recommendations_df.merge(self.anime_df, on='id', how='inner')
    return recommendations_dict

  # Evaluate the Effectiveness of the Model
  def evaluate(self, recommendations: pd.DataFrame, test_data: pd.DataFrame):
    accuracy = 0
    num_total_recommendations = 0
    num_total_correct_recommendations = 0

    for user_id, user_recommendations in recommendations.items():
      test_user_recommendations = test_data.loc[test_data.user_id == user_id]
      num_recommendations = len(user_recommendations.index)
      num_correct_recommendations = 0

      for anime_id in user_recommendations['id']:
        if anime_id in test_user_recommendations['anime_id'].values:
          num_correct_recommendations += 1

      num_total_recommendations += num_recommendations
      num_total_correct_recommendations += num_correct_recommendations

    accuracy = (num_total_correct_recommendations / num_total_recommendations) * 100

    return {
        "accuracy": accuracy
    }

In [None]:
anime_cbf_recommeder = CBFRecommender(anime_df, train_df)
anime_cbf_recommeder.vectorize_anime_data()

Unexpected exception formatting exception. Falling back to standard exception


## Test Recommending Anime for One User

In [None]:
test_user_id = 1
anime_cbf_recommeder.create_user_profile(test_user_id)
test_user_recommendations = anime_cbf_recommeder.recommend_user(test_user_id, 10)
test_user_recommendations.head()

## Test Recommending Anime for Every User

In [None]:
test_recommendations = anime_cbf_recommeder.make_recommendations(10)

In [None]:
test_recommendations[104]

In [None]:
evaluation = anime_cbf_recommeder.evaluate(test_recommendations, test_df)

In [None]:
evaluation["accuracy"]