In [126]:
import pandas as pd
import numpy as np 
from tqdm import tqdm
import pickle

In [128]:
df = pd.read_csv("/Users/justinvhuang/Desktop/CSE-6242-Group-Project/users-score-2023.csv")

In [129]:
# Open the pickle file in binary read mode and load the pickled object
with open("my_animelist.pkl", 'rb') as f:
    loaded_list = pickle.load(f)

In [140]:
df2 = pd.read_json("/Users/justinvhuang/Desktop/CSE-6242-Group-Project/web_scraping/fin_anime_df.json")

In [130]:
len(loaded_list)

11565

In [131]:
filtered_df = df[df['anime_id'].isin(loaded_list)]

In [132]:
# Get unique anime and user IDs
anime_ids = filtered_df['anime_id'].unique()
user_ids = filtered_df['user_id'].unique()

# Create a dictionary to map anime IDs to indices
anime_id_to_index = {anime_id: index for index, anime_id in enumerate(anime_ids)}

In [133]:
class ThompsonSamplingPopularity:
    def __init__(self, num_anime):
        self.num_anime = num_anime
        self.alpha = np.ones(num_anime)  
        self.beta = np.ones(num_anime)   

    def update_parameters(self, user_ratings, user_interactions):
        self.alpha += user_ratings
        self.beta += user_interactions - user_ratings

    def recommend_top_popular_anime(self, anime_popularity, num_recommendations=50):
        sorted_anime_indices = np.argsort(anime_popularity)[::-1]  
        return sorted_anime_indices[:num_recommendations]

    def recommend_anime(self, user_ratings, user_interactions, anime_popularity):
        self.update_parameters(user_ratings, user_interactions)

        expected_theta = self.alpha / (self.alpha + self.beta)
        adjusted_theta = expected_theta * anime_popularity

        return self.recommend_top_popular_anime(adjusted_theta)

In [134]:
#Calculate popularity of each anime based on the total number of interactions
anime_interactions = filtered_df.groupby('anime_id')['user_id'].count().sort_values(ascending=False)
anime_popularity = anime_interactions.values / anime_interactions.values.sum()

# Initialize Thompson Sampling with popularity
thompson_sampling = ThompsonSamplingPopularity(len(anime_ids))

In [135]:
# Initialize an empty set to store unique recommended anime IDs
all_recommended_anime_ids = set()

# Iterate through user IDs with tqdm for progress tracking
for user_id in tqdm(user_ids, desc="Processing users"):
    user_data =filtered_df[filtered_df['user_id'] == user_id]
    user_ratings = np.zeros(len(anime_ids))
    user_interactions = np.zeros(len(anime_ids))

    for _, row in user_data.iterrows():
        anime_index = anime_id_to_index[row['anime_id']]
        user_ratings[anime_index] += row['rating']
        user_interactions[anime_index] += 1

    recommended_anime_indices = thompson_sampling.recommend_anime(user_ratings, user_interactions, anime_popularity)
    recommended_anime_ids = [anime_ids[index] for index in recommended_anime_indices]

    # Add recommended anime IDs to the set
    all_recommended_anime_ids.update(recommended_anime_ids)

# Convert the set to a list
all_recommended_anime_ids = list(all_recommended_anime_ids)

Processing users: 100%|██████████| 269008/269008 [56:06<00:00, 79.90it/s] 


In [149]:
df3 = df2[df2['anime_anime_id'].isin(all_recommended_anime_ids)].drop_duplicates('anime_Genres')
# Filter and drop rows containing 'Ecchi' and 'Hentai'
filtered_df = df3[~df3['Genres'].str.contains('Ecchi|Hentai')]

In [156]:
popular_dict_10 = filtered_df.sort_values("Favorites",ascending = False).head(10)

In [136]:
popular_dict = df[df['anime_id'].isin(all_recommended_anime_ids)].drop_duplicates('anime_id')[['anime_id', 'Anime Title']].to_dict(orient='records')

In [158]:
popular_dict_10 = filtered_df[filtered_df['anime_id'].isin(all_recommended_anime_ids)].drop_duplicates('anime_id')[['anime_id', 'Name']].to_dict(orient='records')

In [137]:
file_path = "popular_dict.pkl"

# Open the file in binary write mode and save the list using pickle.dump()
with open(file_path, 'wb') as f:
    pickle.dump(popular_dict, f)

In [159]:
file_path = "popular_dict_10.pkl"

# Open the file in binary write mode and save the list using pickle.dump()
with open(file_path, 'wb') as f:
    pickle.dump(popular_dict_10, f)