<a href="https://colab.research.google.com/github/ciddy0/Anime-Recommender/blob/main/AnimeRecommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
# Load the anime catalogue and the user ratings from the Colab working directory.
anime_df, ratings_df = map(pd.read_csv, ('/content/anime.csv', '/content/rating.csv'))


In [None]:
# Report (rows, columns) for both frames.
anime_shape, ratings_shape = anime_df.shape, ratings_df.shape
print('The dimensions of anime dataframe are: ', anime_shape, '\nTHe dimensions of ratings dataframe are:', ratings_shape)

The dimensions of anime dataframe are:  (12294, 7) 
THe dimensions of ratings dataframe are: (1393382, 3)


In [None]:
# Preview the first rows of the anime catalogue (anime_df).
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [None]:
# Display the ratings frame (ratings_df); the rendered output is abbreviated for large frames.
ratings_df

Unnamed: 0,user_id,anime_id,rating
0,1,20.0,-1.0
1,1,24.0,-1.0
2,1,79.0,-1.0
3,1,226.0,-1.0
4,1,241.0,-1.0
...,...,...,...
710588,6695,12673.0,8.0
710589,6695,12679.0,8.0
710590,6695,12685.0,5.0
710591,6695,12729.0,7.0


In [None]:
# Keep only rating rows without any missing value; the raw frame is left untouched.
ratings_df_cleaned = ratings_df.dropna(axis=0, how='any')

In [None]:
 # Anime ID -> anime name mapping, used later to turn IDs back into titles.
 anime_names = anime_df.set_index('anime_id')['name'].to_dict()
 # Distinct users / anime among the cleaned ratings; these size the embedding tables.
 n_users = ratings_df_cleaned.user_id.nunique()
 n_items = ratings_df_cleaned.anime_id.nunique()
 n_ratings = len(ratings_df_cleaned)
 print("Number of unqiue users: ", n_users)
 print("Number of unqiue animes: ", n_items)
 print("The full rating matrix will have: ", n_users*n_items, 'elements.')
 print('-------------')
 # Count the cleaned rows so this figure agrees with the fill percentage below
 # (previously the raw frame was counted here but the cleaned one was used for the ratio).
 print("Number of ratings: ", n_ratings)
 print("Therefore: ", n_ratings/(n_users*n_items)*100, '% of the matrix will be filled.')
 # Matrix factorization models the rating matrix implicitly, so the full
 # (mostly empty) matrix never has to be materialised.


Number of unqiue users:  6695
Number of unqiue animes:  7758
The full rating matrix will have:  51939810 elements.
-------------
Number of ratings:  710593
Therefore:  1.3681066603824696 % of the matrix will be filled.


In [None]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
  """Dot-product matrix factorization model.

  Every user and every item gets an ``n_factors``-dimensional embedding; the
  predicted rating for a (user, item) pair is the dot product of the two.

  Args:
    n_users: number of rows in the user embedding table.
    n_items: number of rows in the item embedding table.
    n_factors: embedding dimensionality.
  """

  def __init__(self, n_users, n_items, n_factors=20):
    super().__init__()
    # create user embeddings
    self.user_factors = torch.nn.Embedding(n_users, n_factors)
    # create item embeddings
    self.item_factors = torch.nn.Embedding(n_items, n_factors)

    # Start from small positive weights drawn uniformly from [0, 0.05).
    self.user_factors.weight.data.uniform_(0, 0.05)
    self.item_factors.weight.data.uniform_(0, 0.05)

  def forward(self, data):
    """Score a batch of (user index, item index) pairs.

    Args:
      data: integer tensor of shape (N, 2); column 0 holds user indices,
        column 1 holds item indices.

    Returns:
      Tensor of shape (N,) with one dot-product score per row.
    """
    users, items = data[:, 0], data[:, 1]
    return (self.user_factors(users) * self.item_factors(items)).sum(1)

  def predict(self, user, item):
    """Score user/item index pairs given as two separate arguments.

    ``forward`` takes a single (N, 2) tensor, so the two inputs are flattened
    and stacked into that layout first (the previous implementation passed
    them as two positional arguments and always raised ``TypeError``).

    Args:
      user: user index or indices (int, sequence, or tensor).
      item: item index or indices, same length as ``user``.

    Returns:
      Tensor of shape (N,); N is 1 for scalar inputs.
    """
    device = self.user_factors.weight.device
    users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
    items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
    return self.forward(torch.stack((users, items), dim=1))



In [None]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
  """Torch dataset of (user index, anime index) -> rating samples.

  Raw user and anime IDs are remapped to contiguous 0-based indices so they
  can be used directly as embedding-table rows.

  Args:
    ratings: DataFrame with ``user_id``, ``anime_id`` and ``rating`` columns.
      Defaults to the notebook-level ``ratings_df_cleaned`` frame, which keeps
      the original ``Loader()`` call working unchanged.

  NOTE(review): the ratings preview contains -1.0 values; if that is a
  "not rated" sentinel, those rows are being used as regression targets here.
  Confirm whether they should be filtered out before training.
  """

  def __init__(self, ratings=None):
    if ratings is None:
      ratings = ratings_df_cleaned
    # Work on a copy so the caller's frame keeps its raw IDs.
    self.ratings = ratings.copy()

    # Extract all user IDs and anime IDs (in order of first appearance)
    users = self.ratings.user_id.unique()
    animes = self.ratings.anime_id.unique()

    # Raw ID -> contiguous index
    self.userid2idx = {o: i for i, o in enumerate(users)}
    self.animeid2idx = {o: i for i, o in enumerate(animes)}

    # Contiguous index -> raw ID (inverse lookups)
    self.idx2userid = {i: o for o, i in self.userid2idx.items()}
    self.idx2animeid = {i: o for o, i in self.animeid2idx.items()}

    # Replace the raw IDs with their contiguous indices
    self.ratings['anime_id'] = self.ratings['anime_id'].map(self.animeid2idx)
    self.ratings['user_id'] = self.ratings['user_id'].map(self.userid2idx)

    # Select the feature columns by name: the model reads column 0 as the user
    # and column 1 as the item, so the order must not depend on how the frame
    # happens to be laid out or on extra columns being absent.
    self.x = self.ratings[['user_id', 'anime_id']].values
    self.y = self.ratings['rating'].values

    # Transforms the data to tensors (embedding lookups need integer indices)
    self.x = torch.tensor(self.x, dtype=torch.long)
    self.y = torch.tensor(self.y)

  def __getitem__(self, index):
    """Return the ``(features, rating)`` pair at ``index``."""
    return (self.x[index], self.y[index])

  def __len__(self):
    """Number of rating samples."""
    return len(self.ratings)





In [None]:
# Training configuration
num_epochs = 128
cuda = torch.cuda.is_available()
print('Is running on GPU: ', cuda)

# Model with 8 latent factors per user / anime
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)

# Show the freshly initialised trainable weights
for name, param in model.named_parameters():
  if not param.requires_grad:
    continue
  print(name, param.data)

# Move the model to the GPU when one is available
if cuda:
  model = model.cuda()

# Mean squared error between predicted and observed ratings
loss_fn = torch.nn.MSELoss()

# Adam optimiser over all model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 samples
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)


Is running on GPU:  True
MatrixFactorization(
  (user_factors): Embedding(6695, 8)
  (item_factors): Embedding(7758, 8)
)
user_factors.weight tensor([[0.0191, 0.0115, 0.0202,  ..., 0.0317, 0.0376, 0.0083],
        [0.0272, 0.0350, 0.0348,  ..., 0.0497, 0.0005, 0.0213],
        [0.0149, 0.0319, 0.0181,  ..., 0.0307, 0.0400, 0.0396],
        ...,
        [0.0263, 0.0135, 0.0093,  ..., 0.0300, 0.0327, 0.0498],
        [0.0338, 0.0454, 0.0247,  ..., 0.0224, 0.0125, 0.0211],
        [0.0188, 0.0433, 0.0133,  ..., 0.0145, 0.0017, 0.0330]])
item_factors.weight tensor([[0.0039, 0.0042, 0.0440,  ..., 0.0425, 0.0196, 0.0310],
        [0.0367, 0.0157, 0.0387,  ..., 0.0136, 0.0290, 0.0226],
        [0.0244, 0.0375, 0.0463,  ..., 0.0146, 0.0145, 0.0297],
        ...,
        [0.0150, 0.0203, 0.0413,  ..., 0.0495, 0.0253, 0.0371],
        [0.0348, 0.0087, 0.0205,  ..., 0.0315, 0.0370, 0.0454],
        [0.0147, 0.0331, 0.0486,  ..., 0.0334, 0.0086, 0.0259]])


In [None]:
# Persist the model weights to Google Drive.
# NOTE(review): in notebook order this cell sits before the training loop and
# before the cell that mounts Drive, so a top-to-bottom run would store the
# untrained weights (and needs /content/drive to be mounted already) — confirm
# whether it should run after training instead.
checkpoint_path = '/content/drive/My Drive/matrix_factorization_model.pth'
torch.save(model.state_dict(), checkpoint_path)

In [None]:
# Train for num_epochs passes over the data, reporting the mean batch loss once per epoch.
for it in tqdm(range(num_epochs)):
  losses = []
  for x, y in train_loader:
    # Only the device transfer depends on CUDA; the optimisation step below
    # must run on CPU as well (it used to be nested under this check, so a
    # CPU run trained nothing and then divided by zero).
    if cuda:
      x, y = x.cuda(), y.cuda()
    optimizer.zero_grad()
    outputs = model(x)
    # Targets are cast to float32 to match the dtype of the model output.
    loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
    losses.append(loss.item())
    loss.backward()
    optimizer.step()
  # One line per epoch rather than one per batch, which flooded the output.
  print("iter #{}".format(it), "loss: ", sum(losses)/len(losses))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
iter #127 loss:  3.0109160672906943
iter #127 loss:  3.011992616128405
iter #127 loss:  3.011193739186536
iter #127 loss:  3.0093360768376494
iter #127 loss:  3.007257968133827
iter #127 loss:  3.0066760883109116
iter #127 loss:  3.004921619495467
iter #127 loss:  3.00211931339332
iter #127 loss:  3.0027739410944374
iter #127 loss:  3.001056845926305
iter #127 loss:  3.0011237400251427
iter #127 loss:  3.0022645871689977
iter #127 loss:  3.005525261110964
iter #127 loss:  3.0082422638107946
iter #127 loss:  3.0077143779297146
iter #127 loss:  3.0075116245679454
iter #127 loss:  3.008840274307975
iter #127 loss:  3.010144807581316
iter #127 loss:  3.0133235049540024
iter #127 loss:  3.0169608467942353
iter #127 loss:  3.0153042662622207
iter #127 loss:  3.0157335057906574
iter #127 loss:  3.017904919748721
iter #127 loss:  3.0183438315159745
iter #127 loss:  3.0188754389976094
iter #127 loss:  3.022618766886965
iter #127 l

In [None]:
# After training, the embedding tables hold the tuned user and item factors.
# Print every trainable parameter and keep the first one in `uw` and the
# last of the remaining ones in `iw`.
uw = iw = 0
c = 0
for name, param in model.named_parameters():
  if not param.requires_grad:
    continue
  print(name, param.data)
  if c != 0:
    iw = param.data
    continue
  uw = param.data
  c += 1

user_factors.weight tensor([[-0.1760, -0.2113, -0.0914,  ..., -0.2890, -0.3261, -0.0600],
        [ 0.2487,  2.5490,  0.4062,  ...,  0.4388,  1.1042, -2.0371],
        [ 1.5199,  0.7508,  1.2671,  ...,  1.2541,  2.0453,  0.5709],
        ...,
        [ 0.6211,  1.6103,  1.6067,  ...,  1.6276,  1.6313,  1.6450],
        [ 1.6046,  2.5964,  1.1132,  ..., -0.7323, -0.5183,  0.8456],
        [ 1.6999,  1.1999,  1.3011,  ...,  1.3645,  1.4622,  1.1135]],
       device='cuda:0')
item_factors.weight tensor([[-0.1432, -0.3199,  0.8894,  ...,  1.6290,  1.7455,  0.6059],
        [ 0.9814,  0.8274, -0.3438,  ...,  0.9180,  0.9437,  0.6034],
        [ 0.3109,  1.2454, -0.2834,  ...,  0.6934,  1.6442,  0.9609],
        ...,
        [ 0.7475,  0.7460,  0.7753,  ...,  0.7821,  0.7541,  0.7628],
        [ 0.7665,  0.7437,  0.7533,  ...,  0.7662,  0.5218,  0.7861],
        [ 1.0670,  1.5506,  1.0998,  ...,  1.0852,  1.0607,  1.0775]],
       device='cuda:0')


In [None]:
# Copy the learned item (anime) embedding table to host memory as a NumPy array.
item_weights = model.item_factors.weight.data
trained_anime_embeddings = item_weights.cpu().numpy()

In [None]:
len(trained_anime_embeddings) # number of rows in the anime embedding table (one factor vector per anime index)

7758

In [None]:
from sklearn.cluster import KMeans
# Group the anime into 20 clusters based on their learned embedding vectors.
kmeans = KMeans(n_clusters=20, random_state=0)
kmeans.fit(trained_anime_embeddings)



In [None]:
# Anime that land in the same cluster tend to share genres. Note that the
# algorithm never sees titles or genres: the grouping comes purely from the
# numbers describing how users rated each anime.
#
# Rating rows per anime are counted once up front; re-filtering the whole
# ratings frame for every anime repeated a full-table scan thousands of times
# and relied on positional `[0]` indexing of a label-indexed Series.
rating_counts = ratings_df['anime_id'].value_counts()

# Iterate over however many clusters the fitted model actually has.
for cluster in range(kmeans.n_clusters):
  print("Cluster #{}".format(cluster))
  anms = []
  for anmsidx in np.where(kmeans.labels_ == cluster)[0]:
    # Embedding row index -> original anime ID
    ansid = train_set.idx2animeid[anmsidx]
    anms.append((anime_names[ansid], rating_counts.get(ansid, 0)))
  # Show the 10 most-rated titles of the cluster
  for anm in sorted(anms, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", anm[0])

Cluster #0
	 Shingeki no Kyojin
	 Sword Art Online
	 Mirai Nikki (TV)
	 Highschool of the Dead
	 Ao no Exorcist
	 Another
	 No Game No Life
	 Noragami
	 Deadman Wonderland
	 Kill la Kill
Cluster #1
	 Clannad: Mou Hitotsu no Sekai, Tomoyo-hen
	 Clannad: After Story - Mou Hitotsu no Sekai, Kyou-hen
	 Darker than Black: Kuro no Keiyakusha Special
	 Clannad Movie
	 Ookami to Koushinryou II: Ookami to Kohakuiro no Yuuutsu
	 Love Hina Again
	 K-On!: Live House!
	 Full Metal Panic! The Second Raid: Wari to Hima na Sentaichou no Ichinichi
	 Zero no Tsukaima: Princesses no Rondo - Yuuwaku no Sunahama
	 Love Hina Christmas Special: Silent Eve
Cluster #2
	 Dragon Ball
	 Dragon Ball GT
	 Elfen Lied Special
	 Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo!
	 Yu☆Gi☆Oh! Duel Monsters
	 Neon Genesis Evangelion: Death &amp; Rebirth
	 Mirai Nikki
	 Yu☆Gi☆Oh!
	 Mobile Suit Gundam Wing
	 Fairy Tail Movie 1: Houou no Miko
Cluster #3
	 Lucky☆Star: Original na Visual to Animation
	 Mahou 

In [None]:
import csv
# Cluster label for every anime embedding row (row i <-> train_set.idx2animeid[i]).
all_anime_cluster_labels = kmeans.predict(trained_anime_embeddings)
print(all_anime_cluster_labels)

# Destination CSV on Google Drive (Drive must already be mounted).
file_path = '/content/drive/My Drive/anime_cluster_labels_with_ratings.csv'

# Number of rating rows per anime ID, computed once for all rows written below.
rating_counts = ratings_df['anime_id'].value_counts()

# Write one row per anime: name, rating count, cluster label.
with open(file_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['Anime Name', 'Rating Count', 'Cluster Label'])
    for idx, cluster_label in enumerate(all_anime_cluster_labels):
        # Embedding row index -> original anime ID -> title. The previous
        # version wrote the loop counter as the name and referenced an
        # undefined `rating_count`.
        anime_id = train_set.idx2animeid[idx]
        # Fall back to the raw ID if the catalogue has no title for it, so the
        # export is not aborted half-way through.
        anime_name = anime_names.get(anime_id, anime_id)
        writer.writerow([anime_name, int(rating_counts.get(anime_id, 0)), cluster_label])


[ 8 12 12 ... 14 14 14]
Anime ID 2 not found in the dataset.
Anime ID 3 not found in the dataset.
Anime ID 4 not found in the dataset.
Anime ID 9 not found in the dataset.
Anime ID 10 not found in the dataset.
Anime ID 11 not found in the dataset.
Anime ID 12 not found in the dataset.
Anime ID 13 not found in the dataset.
Anime ID 14 not found in the dataset.
Anime ID 34 not found in the dataset.
Anime ID 35 not found in the dataset.
Anime ID 36 not found in the dataset.
Anime ID 37 not found in the dataset.
Anime ID 38 not found in the dataset.
Anime ID 39 not found in the dataset.
Anime ID 40 not found in the dataset.
Anime ID 41 not found in the dataset.
Anime ID 42 not found in the dataset.
Anime ID 70 not found in the dataset.
Anime ID 78 not found in the dataset.
Anime ID 140 not found in the dataset.
Anime ID 172 not found in the dataset.
Anime ID 224 not found in the dataset.
Anime ID 409 not found in the dataset.
Anime ID 410 not found in the dataset.
Anime ID 414 not found in

In [None]:
from joblib import dump
from google.colab import drive
drive.mount('/content/drive')
# Create a folder in the root directory
!mkdir -p "/content/drive/My Drive/animerec"
# Save the trained KMeans model to a file
with open('/content/drive/My Drive/animerec/kmeans_model2.joblib', 'wb') as f:
    dump(kmeans, 'kmeans_model2.joblib')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Persist the learned anime embeddings. The array is named
# `trained_anime_embeddings` in this notebook; `trained_movie_embeddings` was
# never defined. The output file name is kept as-is so existing readers of the
# file keep working.
np.save('/content/drive/My Drive/trained_movie_embeddings.npy', trained_anime_embeddings)

In [None]:
def recommend_anime(input_anime_id, model, kmeans, train_set, anime_names, top_n=10, ratings=None):
    """Recommend the most-rated anime from the same embedding cluster.

    Args:
        input_anime_id: raw anime ID to base the recommendations on.
        model: trained model exposing ``item_factors`` (the anime embedding table).
        kmeans: fitted clustering model with ``predict`` and ``labels_``.
        train_set: dataset exposing ``animeid2idx`` / ``idx2animeid`` mappings.
        anime_names: mapping from raw anime ID to title.
        top_n: maximum number of recommendations to return (default 10).
        ratings: DataFrame with an ``anime_id`` column used to count ratings;
            defaults to the notebook-level ``ratings_df``.

    Returns:
        List of ``(title, rating_count)`` tuples sorted by rating count,
        highest first, excluding the input anime itself.

    Raises:
        KeyError: if ``input_anime_id`` is not in ``train_set.animeid2idx`` or
            a cluster member has no entry in ``anime_names``.
    """
    if ratings is None:
        ratings = ratings_df

    # 1. Get the embedding for the input anime
    input_idx = train_set.animeid2idx[input_anime_id]
    input_anime_embedding = model.item_factors.weight.data[input_idx].cpu().numpy()

    # 2. Predict cluster for the input anime
    input_anime_cluster = kmeans.predict(input_anime_embedding.reshape(1, -1))[0]

    # 3. Find anime in the same cluster
    similar_anime_indices = np.where(kmeans.labels_ == input_anime_cluster)[0]

    # Count rating rows per anime once, instead of re-filtering the whole
    # ratings frame for every cluster member.
    rating_counts = ratings['anime_id'].value_counts()

    # 4. Recommend anime
    recommendations = []
    for anime_idx in similar_anime_indices:
        # Never recommend the anime the user asked about.
        if anime_idx == input_idx:
            continue
        anime_id = train_set.idx2animeid[anime_idx]
        recommendations.append((anime_names[anime_id], int(rating_counts.get(anime_id, 0))))

    # Sort recommendations by rating count in descending order
    recommendations.sort(key=lambda rec: rec[1], reverse=True)

    return recommendations[:top_n]

# ID of the anime to base the recommendations on (replace with the user's choice)
input_anime_id = 20583
recommendations = recommend_anime(input_anime_id, model, kmeans, train_set, anime_names)

# Print a numbered list of the recommended titles with their rating counts
header = "Recommendations for anime '{}':".format(anime_names[input_anime_id])
print(header)
for rank, (title, rating_total) in enumerate(recommendations, start=1):
    print("{}. {} (Rating Count: {})".format(rank, title, rating_total))

Recommendations for anime 'Haikyuu!!':
1. One Punch Man (Rating Count: 1260)
2. Kiseijuu: Sei no Kakuritsu (Rating Count: 1035)
3. Zankyou no Terror (Rating Count: 1016)
4. Death Parade (Rating Count: 938)
5. Gekkan Shoujo Nozaki-kun (Rating Count: 922)
6. Boku dake ga Inai Machi (Rating Count: 859)
7. Shigatsu wa Kimi no Uso (Rating Count: 858)
8. Shokugeki no Souma (Rating Count: 763)
9. Magi: The Kingdom of Magic (Rating Count: 761)
10. Nanatsu no Taizai (Rating Count: 754)


In [None]:
# Number of distinct anime known to the dataset. len(train_set) is the number
# of rating rows, not anime, which is why this check used to report hundreds
# of thousands of "unassigned" anime.
total_anime = len(train_set.animeid2idx)

# Number of anime that received a cluster label: kmeans.labels_ holds one
# label per fitted sample (np.unique of it only counts the distinct clusters).
anime_in_clusters = len(kmeans.labels_)

# Calculate the number of anime not assigned to any cluster
anime_not_in_clusters = total_anime - anime_in_clusters

if anime_not_in_clusters == 0:
    print("All anime in the dataset are assigned to clusters.")
else:
    print("There are {} anime not assigned to any cluster.".format(anime_not_in_clusters))

There are 710572 anime not assigned to any cluster.


In [None]:
# Report how many clusters the KMeans model is configured with.
num_clusters = kmeans.n_clusters
print("Number of clusters:", num_clusters, sep=' ')

Number of clusters: 10
