# NEURAL NETWORK HYBRID COLLOBORATIVE FILTERING

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

dataframe_anime = pd.read_csv('./anime.csv')
dataframe_anime = dataframe_anime[['anime_id', 'name', 'genre']]
dataframe_anime = dataframe_anime.drop_duplicates()

dataframe_ratings = pd.read_csv('./rating.csv')
dataframe_ratings_cp = dataframe_ratings[['user_id', 'anime_id', 'rating']]
dataframe_ratings_cp = dataframe_ratings_cp[dataframe_ratings_cp['user_id'] <= 1000] # only get the first 1000 users
dataframe_ratings_cp = dataframe_ratings_cp[dataframe_ratings_cp['rating'] >= 0]
dataframe_ratings_cp['rating'] = dataframe_ratings_cp['rating'] / 10  # Scale the scores to the range of 0 to 5
dataframe_ratings_cp = dataframe_ratings_cp.sample(frac=1).reset_index(drop=True)

user_mapping = {user_id: idx for idx, user_id in enumerate(dataframe_ratings_cp.user_id.unique())}
anime_mapping = {anime_id: idx for idx, anime_id in enumerate(dataframe_ratings_cp.anime_id.unique())}


dataframe_ratings_cp['user_id'] = dataframe_ratings_cp['user_id'].apply(lambda x: user_mapping[x])
dataframe_ratings_cp['anime_id'] = dataframe_ratings_cp['anime_id'].apply(lambda x: anime_mapping[x])

genres_18_above = ['Hentai', 'Ecchi', 'Harem', 'Yuri', 'Yaoi']
def is_18_above(genre_str):
    if isinstance(genre_str, str):
        for genre in genres_18_above:
            if genre in genre_str:
                return 1
        return 0
dataframe_anime['18_above'] = dataframe_anime['genre'].apply(is_18_above)

# ======================== neural network-based item similarity and rating prediction model ========================
class ItemSimilarityAndRating(nn.Module):
    def __init__(self, n_users, n_anime, n_factors):
        super(ItemSimilarityAndRating, self).__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.anime_factors = nn.Embedding(n_anime, n_factors)
        self.user_bias = nn.Embedding(n_users, 1)
        self.anime_bias = nn.Embedding(n_anime, 1)

    def rating(self, user, anime):
        dot_product = (self.user_factors(user) * self.anime_factors(anime)).sum(1)
        rating = dot_product + self.user_bias(user).squeeze() + self.anime_bias(anime).squeeze()
        return torch.sigmoid(rating) * 10


    def similarity(self, anime1, anime2):
        dot_product = (self.anime_factors(anime1) * self.anime_factors(anime2)).sum(1)
        return dot_product

n_users = len(user_mapping)
n_anime = len(anime_mapping)
n_factors = 200

model = ItemSimilarityAndRating(n_users, n_anime, n_factors)
loss_func = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 25
batch_size = 1024

print("Start training:")
for epoch in range(epochs):
    train_data, val_data = train_test_split(dataframe_ratings_cp, test_size=0.2, random_state=42)
    shuffled_indices = torch.randperm(len(train_data))
    for batch_start in range(0, len(train_data), batch_size):
        batch_indices = shuffled_indices[batch_start:batch_start + batch_size]
        user_batch = torch.tensor(train_data.iloc[batch_indices]['user_id'].values, dtype=torch.long)
        anime_batch = torch.tensor(train_data.iloc[batch_indices]['anime_id'].values, dtype=torch.long)
        score_batch = torch.tensor(train_data.iloc[batch_indices]['rating'].values, dtype=torch.float32)

        optimizer.zero_grad()
        predictions = model.rating(user_batch, anime_batch)
        loss = loss_func(predictions, score_batch)
        loss.backward()
        optimizer.step()

    print(f"epoch {epoch + 1}/{epochs} - MSELoss: {loss.item()}")
print("Done training.")

Start training:
epoch 1/25 - MSELoss: 40.406768798828125
epoch 2/25 - MSELoss: 37.85776901245117
epoch 3/25 - MSELoss: 38.86270523071289
epoch 4/25 - MSELoss: 29.537490844726562
epoch 5/25 - MSELoss: 32.16763687133789
epoch 6/25 - MSELoss: 30.63105010986328
epoch 7/25 - MSELoss: 29.91755485534668
epoch 8/25 - MSELoss: 30.3470401763916
epoch 9/25 - MSELoss: 30.441120147705078
epoch 10/25 - MSELoss: 29.861879348754883
epoch 11/25 - MSELoss: 27.396835327148438
epoch 12/25 - MSELoss: 30.411827087402344
epoch 13/25 - MSELoss: 29.024381637573242
epoch 14/25 - MSELoss: 28.306671142578125
epoch 15/25 - MSELoss: 27.371484756469727
epoch 16/25 - MSELoss: 27.954891204833984
epoch 17/25 - MSELoss: 26.81529998779297
epoch 18/25 - MSELoss: 28.00058364868164
epoch 19/25 - MSELoss: 26.667510986328125
epoch 20/25 - MSELoss: 27.445369720458984
epoch 21/25 - MSELoss: 25.456159591674805
epoch 22/25 - MSELoss: 26.408143997192383
epoch 23/25 - MSELoss: 26.91847038269043
epoch 24/25 - MSELoss: 27.87816810607

In [21]:
dataframe_anime.columns

Index(['anime_id', 'name', 'genre', '18_above'], dtype='object')

In [22]:
dataframe_ratings.columns

Index(['user_id', 'anime_id', 'rating'], dtype='object')

In [23]:
dataframe_anime = dataframe_anime[dataframe_anime['18_above'] == 0].reset_index(drop=True)

In [24]:
model.eval()

val_user_batch = torch.tensor(val_data['user_id'].values, dtype=torch.long)
val_anime_batch = torch.tensor(val_data['anime_id'].values, dtype=torch.long)
val_score_batch = torch.tensor(val_data['rating'].values, dtype=torch.float32)

with torch.no_grad():
    val_predictions = model.rating(val_user_batch, val_anime_batch)
    val_loss = loss_func(val_predictions, val_score_batch)

print(f"Validation Loss: {val_loss.item()}")

Validation Loss: 39.21752166748047


In [25]:
val_predictions_np = val_predictions.numpy()
result_df = pd.DataFrame({
    'user_id': val_data['user_id'].values,
    'anime_id': val_data['anime_id'].values,
    'original_score': val_data['rating'].values*10,
    'predicted_score': np.round(val_predictions_np,0)})

In [26]:
result_df

Unnamed: 0,user_id,anime_id,original_score,predicted_score
0,96,656,8.0,0.0
1,79,3668,4.0,2.0
2,366,482,8.0,4.0
3,109,490,8.0,10.0
4,357,2569,7.0,0.0
...,...,...,...,...
15577,282,204,9.0,6.0
15578,166,416,8.0,0.0
15579,205,752,8.0,10.0
15580,21,22,7.0,10.0


In [27]:
mean_abs_err = torch.mean(torch.abs(val_predictions - val_score_batch)).item()
print(f"mean_abs_err: {mean_abs_err}")

mean_abs_err: 4.771821975708008


In [28]:
torch.save(model.state_dict(), 'item_sim_and_rating_model.pth')

In [29]:
loaded_model = ItemSimilarityAndRating(n_users, n_anime, n_factors)
loaded_model.load_state_dict(torch.load('item_sim_and_rating_model.pth'))
loaded_model.eval()

ItemSimilarityAndRating(
  (user_factors): Embedding(941, 200)
  (anime_factors): Embedding(4510, 200)
  (user_bias): Embedding(941, 1)
  (anime_bias): Embedding(4510, 1)
)

In [30]:
# Similarity
anime_id = 15
anime_index = anime_mapping[anime_id]
anime_tensor = torch.tensor([anime_index] * n_anime, dtype=torch.long)
anime_tensor_2 = torch.tensor(list(range(n_anime)), dtype=torch.long)

anime_sim = loaded_model.similarity(anime_tensor, anime_tensor_2)
top_10_similar = torch.topk(anime_sim, 10).indices

top_10_anime_ids = [list(anime_mapping.keys())[list(anime_mapping.values()).index(idx)] for idx in top_10_similar.tolist()]
recommended_anime = dataframe_anime[dataframe_anime['anime_id'].isin(top_10_anime_ids)].reset_index(drop=True)
recommended_anime = recommended_anime[recommended_anime['18_above'] == 0].reset_index(drop=True)

top_10_sims = anime_sim[top_10_similar].tolist()
print(f"Top 10 similar animes and sim-scores for anime_id[{anime_id}]:")
for id, row in recommended_anime.iterrows():
    print(f"{row['name']} (sim-score: {top_10_sims[id]:.2f})")

Top 10 similar animes and sim-scores for anime_id[15]:
Sakamichi no Apollon (sim-score: 176.77)
Eyeshield 21 (sim-score: 46.05)
SKET Dance OVA (sim-score: 43.79)
AKB0048: Next Stage (sim-score: 42.89)
3x3 Eyes Seima Densetsu (sim-score: 41.77)
Dragonaut: The Resonance Special (sim-score: 39.60)
Project A-Ko (sim-score: 39.06)
Bakumatsu Rock (sim-score: 38.37)


In [31]:
dataframe_ratings

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [32]:
user_id = 15
user_index = user_mapping[user_id]
user_tensor = torch.tensor([user_index] * n_anime, dtype=torch.long)

predicted_ratings = loaded_model.rating(user_tensor, anime_tensor_2)

unique_anime_ids = dataframe_ratings_cp['anime_id'].unique()
unique_animes_df = dataframe_anime[dataframe_anime['anime_id'].isin(unique_anime_ids)].reset_index(drop=True)
predicted_ratings_df = pd.DataFrame({'name': unique_animes_df['name'], 'predicted_rating': predicted_ratings.tolist()})

predicted_ratings_df = predicted_ratings_df.sort_values(by='predicted_rating', ascending=False).head(10)
print(f"\nPredicted Ratings for user_id[{user_id}]:")
print(predicted_ratings_df['predicted_rating'])

ValueError: ignored