In [1]:
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import difflib
import string
from collections import Counter
from tqdm import tqdm
import pickle

In [2]:
df = pd.read_csv("/Users/justinvhuang/Desktop/CSE-6242-Group-Project/users-score-2023.csv")

In [3]:
df2 = pd.read_json("/Users/justinvhuang/Desktop/CSE-6242-Group-Project/fin_anime_dfv2.json")

In [4]:
anime_list = df2['anime_id'].tolist()

In [5]:
df = df[df['anime_id'].isin(anime_list)]

In [6]:
anime_count = df.groupby('user_id')['anime_id'].count()

user_ids_to_drop = anime_count[anime_count < 10].index

filtered_df = df[~df['user_id'].isin(user_ids_to_drop)]

In [7]:
user_item_matrix = filtered_df.pivot_table(index='user_id', columns='Anime Title', values='rating')

user_item_matrix = user_item_matrix.fillna(0)

user_ratings_mean = user_item_matrix.mean(axis=1)
user_item_matrix_centered = user_item_matrix.sub(user_ratings_mean, axis=0)

item_similarity = cosine_similarity(user_item_matrix_centered.T)

In [8]:
def jaccard_similarity(title1, title2):
    title1_tokens = set(title1.lower().translate(str.maketrans('', '', string.punctuation)).split())
    title2_tokens = set(title2.lower().translate(str.maketrans('', '', string.punctuation)).split())
    intersection = len(title1_tokens.intersection(title2_tokens))
    union = len(title1_tokens.union(title2_tokens))
    return intersection / union if union else 0

def item_collaborative_recommender(anime_title, user_item_matrix, item_similarity, top_n=10):
    sim_scores = item_similarity[user_item_matrix.columns.get_loc(anime_title)]
    
    top_indices = np.argsort(sim_scores)[::-1][1:2*top_n+1]  
    
    similar_titles = []
    for idx in top_indices:
        title = user_item_matrix.columns[idx]
        if jaccard_similarity(anime_title, title) < 0.5:  
            similar_titles.append(title)
        if len(similar_titles) >= top_n:
            break
    
    return similar_titles[:top_n]


In [16]:
recommendations = item_collaborative_recommender('Hunter x Hunter', user_item_matrix, item_similarity)
print(recommendations)

['Hunter x Hunter: Original Video Animation', 'Hunter x Hunter: Greed Island Final', 'Hajime no Ippo', 'Fullmetal Alchemist', 'Naruto', 'Death Note', 'Bleach', 'D.Gray-man', 'Great Teacher Onizuka', 'Code Geass: Hangyaku no Lelouch']


In [10]:
# Create an empty dictionary to store recommendations
anime_recommendations_dict = {}

# Group the DataFrame by 'Anime Title' to avoid repeated computations
grouped_df = filtered_df.groupby('Anime Title')

# Loop through unique anime titles with tqdm for a progress bar
for anime_title, anime_group in tqdm(grouped_df, total=len(grouped_df), desc="Processing Anime Titles"):
    # Assuming you want to use the first anime_id for each anime_title (you may need to adjust this logic)
    anime_id = anime_group['anime_id'].iloc[0]
    
    # Generate recommendations for the current anime title
    recommendations = item_collaborative_recommender(anime_title, user_item_matrix, item_similarity)
    
    # Store the recommendations in the dictionary with anime_id as the key
    anime_recommendations_dict[anime_id] = recommendations

Processing Anime Titles: 100%|██████████| 7253/7253 [00:05<00:00, 1239.23it/s]


In [11]:
import pickle

# File path to save the dictionary
file_path = "anime_recommendations_item_knn_CF_10k_fin.pkl"

# Save the dictionary using pickle
with open(file_path, 'wb') as f:
    pickle.dump(anime_recommendations_dict, f)

In [12]:
title2id = {x:y for x,y in zip(df['Anime Title'], df['anime_id'])}

In [13]:
def replace_anime_titles(user_dict, replacement_dict):
    for user_id, anime_list in user_dict.items():
        for i, anime_title in enumerate(anime_list):
            if anime_title in replacement_dict:
                user_dict[user_id][i] = replacement_dict[anime_title]
    return user_dict

In [14]:
cf_dict_recs = replace_anime_titles(anime_recommendations_dict, title2id)

In [15]:
cf_dict_recs

{8481: [5177, 9776, 9117, 11235, 8676, 11339, 10067, 3087, 7059, 9213],
 6076: [33978, 28685, 29421, 29425, 32773, 23861, 7524, 9949, 9947, 16007],
 52034: [47917, 44511, 46569, 41457, 48569, 51019, 48549, 49596, 42310, 49918],
 34595: [54323, 53844, 40745, 29373, 52767, 38640, 41446, 51014, 42845, 49491],
 38603: [52691, 19751, 31510, 41276, 42013, 10602, 16620, 36735, 38119, 3497],
 52691: [50028, 19751, 31510, 41276, 42013, 10602, 16620, 36735, 38119, 3497],
 51268: [51109, 54323, 53844, 51014, 42845, 38701, 52767, 53721, 40476, 43651],
 2928: [3269, 4469, 873, 1487, 1143, 9332, 298, 48, 15219, 2205],
 3269: [2928, 873, 1487, 298, 1143, 9332, 48, 15219, 408, 317],
 4469: [2928, 1487, 1143, 15219, 873, 9332, 298, 3960, 5246, 55225],
 1143: [1487, 2928, 48, 873, 298, 3269, 4469, 9332, 15219, 49],
 9332: [3269, 2928, 873, 1487, 298, 1143, 48, 4469, 15219, 8514],
 873: [298, 48, 3269, 2928, 1487, 1143, 9332, 4469, 857, 60],
 48: [298, 873, 1143, 1487, 3269, 121, 1, 227, 317, 59],
 298: 

In [17]:
import pickle

# File path to save the dictionary
file_path = "anime_recommendations_item_knn_CF_10k_num_fin.pkl"

# Save the dictionary using pickle
with open(file_path, 'wb') as f:
    pickle.dump(cf_dict_recs, f)