# **Collaborative Filtering using Matrix Fractionalisation**

In [1]:
import pandas as pd
import numpy as np
import pickle

The dataset used is the [Anime Recommendation Database (2020)](https://www.kaggle.com/hernan4444/anime-recommendation-database-2020). Lets load the dataset using pandas



In [2]:
animelist = pd.read_csv('anime.csv')
x = pd.read_csv('rating_complete.csv')

Taking only users that has rated more than 50 animes

In [7]:
new_ratings = x['user_id'].value_counts()
new_x = x[x['user_id'].isin(new_ratings[new_ratings >= 50])].copy()
user_ids = {int(j): i for i, j in enumerate(new_x['user_id'].unique())}
anime_ids = {int(j): i for i, j in enumerate(new_x['anime_id'].unique())}

Calculating the Anime-User matrix


In [8]:
num_animes = len(anime_ids)
num_users = len(user_ids)
print(f"Number of animes: {num_animes}, Number of users: {num_users}")
y = np.zeros((num_animes, num_users))
r = np.zeros((num_animes, num_users))
print(np.shape(y))
for user_id, anime_id, rating in new_x.values:
    y[anime_ids[anime_id], user_ids[user_id]] = rating

r = np.where(y == 0, 0, 1)

Number of animes: 11053, Number of users: 2002
(11053, 2002)


Defining cost and partial derivative functions

In [49]:
def cost(X, Theta, y, ld, num_animes, num_users, num_features):
  J = ((np.sum(((np.dot(X, Theta.T) * r) - y) ** 2)) / 2) + ((ld * np.sum(X ** 2))/ 2) + ((ld * np.sum(Theta ** 2))/ 2)
  return J

def partial_deriv(X, Theta, y, ld, num_animes, num_users, num_features):  
  X_grad = np.zeros(np.shape(X))
  Theta_grad = np.zeros(np.shape(Theta))

  for i in range(num_animes):
    idx = np.argwhere(r[i, :] == 1)
    idx = np.reshape(idx, (idx.shape[0],))
    Theta_temp = Theta[idx, :]
    y_temp = y[i, idx]

    p_1 = (np.dot(X[i, :], Theta_temp.T) - y_temp)
    p_2 = np.dot(p_1, Theta_temp)
    X_grad[i, :] = p_2 + (ld * X[i, :])

  for i in range(num_users):
    idx = np.argwhere(r[:, i] == 1)
    idx = np.reshape(idx, (idx.shape[0],))
    X_temp = X[idx, :]
    y_temp = y[idx, i]

    p_1 = (np.dot(X_temp, Theta[i, :].T) - y_temp)
    p_2 = np.dot(p_1.T, X_temp)
    Theta_grad[i, :] = p_2 + (ld * Theta[i, :])
  
  return X_grad, Theta_grad

Initializing parameters and running for 12000 iterations

In [None]:
norm_y = ((y.T - np.mean(y, axis=1)).T ) * r
num_features = 10
X = np.random.randn(num_animes, num_features)
Theta = np.random.randn(num_users, num_features)

alpha = 3e-5
num_iters = 12000
ld = 10

costs = []

for i in range(num_iters):
  J = cost(X, Theta, norm_y, ld, num_animes, num_users, num_features)
  costs.append(J)
  print(f"Cost: {J}")

  X_grad, Theta_grad = partial_deriv(X, Theta, norm_y, ld, num_animes, num_users)

  X = X - (alpha * X_grad)
  Theta = Theta - (alpha * Theta_grad)

After 12000 iterations, we get a cost of nearly 30000. Now, lets try to calculate similiar animes to the given anime using the anime feature matrix (X).

In [3]:
from scipy import spatial

def calculate_similiar(X, anime_id, num=5):
  anime_index = anime_ids[anime_id]
  anime_params = X[anime_index, :]
  #sum_mat = np.sum((X - anime_params) ** 2, axis=1)
  m = X.shape[0]
  sum_mat = np.zeros(m)
  for i in range(m):
    sum_mat[i] = spatial.distance.cosine(X[i, :], anime_params)
  sum_ind = np.argsort(sum_mat)
  suggested_animes = []
  for i in range(1, num+1):
    idx = np.where(sum_ind == i)[0][0]
    suggested_animes.append(list(anime_ids.keys())[idx])
  return suggested_animes

def get_animenames(ids: list):
  info = []
  for id in ids:
    details = animelist[animelist["MAL_ID"] == id]
    name = details.at[details.index[0], 'Name']
    english_name = details.at[details.index[0], 'English name']
    genre = details.at[details.index[0], 'Genres']
    info.append((name, english_name, genre))
  return info

In [52]:
anime_name = "Naruto"
anime_details = animelist[animelist["English name"] == anime_name]
id_anime = anime_details.at[anime_details.index[0], 'MAL_ID']
ids = calculate_similiar(X, id_anime, num=10)
anime_rec = get_animenames(ids)
for a in range(len(anime_rec)):
  print(f"{a+1}. Name: {anime_rec[a][0]} | English Name: {anime_rec[a][1]} | Genres: {anime_rec[a][2]}")


1. Name: Onegai☆Teacher | English Name: Please Teacher! | Genres: Sci-Fi, Comedy, Drama, Romance, School
2. Name: Toradora! | English Name: Toradora! | Genres: Slice of Life, Comedy, Romance, School
3. Name: Tengen Toppa Gurren Lagann Movie Zenyasai: Viral no Amai Yume | English Name: Unknown | Genres: Comedy
4. Name: Free!: Take Your Marks | English Name: Free! -Take Your Marks- | Genres: School, Slice of Life, Sports
5. Name: Tengen Toppa Gurren Lagann | English Name: Gurren Lagann | Genres: Action, Adventure, Comedy, Mecha, Sci-Fi
6. Name: Ayakashi | English Name: Unknown | Genres: Action, Sci-Fi, Horror, Fantasy
7. Name: Kidou Senkan Nadesico | English Name: Martian Successor Nadesico | Genres: Action, Comedy, Mecha, Military, Parody, Romance, Sci-Fi, Shounen, Space
8. Name: Owarimonogatari 2nd Season | English Name: Owarimonogatari Second Season | Genres: Mystery, Comedy, Supernatural, Vampire
9. Name: Kyougoku Natsuhiko: Kousetsu Hyaku Monogatari | English Name: Requiem from the 

This is not at all related to Naruto. 
Now, lets try **Collaborative Filtering using KNN**

In [9]:
from scipy.sparse import csr_matrix

#Converting y into sparse matrix for efficient computation
sparse_y = csr_matrix(y)

Now lets define our KNN model


In [10]:
from sklearn.neighbors import NearestNeighbors

knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn_model.fit(sparse_y)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [11]:
def get_knnrecommendation(knn_model, anime_id, y):
  distances, indices = knn_model.kneighbors(y[anime_ids[anime_id], :].reshape(1, -1), n_neighbors=11)
  reverse_ids = {v: k for k, v in anime_ids.items()}
  anime_list = [reverse_ids[i] for i in indices.reshape(indices.shape[1])]
  anime_list.pop(0)
  return anime_list

In [12]:
anime_name = "Hunter x Hunter"
anime_details = animelist[animelist["English name"] == anime_name]
id_anime = anime_details.at[anime_details.index[0], 'MAL_ID']
ids = get_knnrecommendation(knn_model, id_anime, sparse_y)
anime_rec = get_animenames(ids)
for a in range(len(anime_rec)):
  print(f"{a+1}. Name: {anime_rec[a][0]} | English Name: {anime_rec[a][1]} | Genres: {anime_rec[a][2]}")


1. Name: Hunter x Hunter: Original Video Animation | English Name: Unknown | Genres: Action, Adventure, Super Power, Shounen
2. Name: Hunter x Hunter: Greed Island | English Name: Unknown | Genres: Action, Adventure, Super Power, Fantasy, Shounen
3. Name: Hunter x Hunter: Greed Island Final | English Name: Unknown | Genres: Action, Adventure, Super Power, Fantasy, Shounen
4. Name: Yuu☆Yuu☆Hakusho | English Name: Yu Yu Hakusho:Ghost Files | Genres: Action, Comedy, Demons, Supernatural, Martial Arts, Shounen
5. Name: Fullmetal Alchemist | English Name: Fullmetal Alchemist | Genres: Action, Adventure, Comedy, Drama, Fantasy, Magic, Military, Shounen
6. Name: Hajime no Ippo | English Name: Fighting Spirit | Genres: Comedy, Sports, Drama, Shounen
7. Name: Naruto | English Name: Naruto | Genres: Action, Adventure, Comedy, Super Power, Martial Arts, Shounen
8. Name: Hunter x Hunter (2011) | English Name: Hunter x Hunter | Genres: Action, Adventure, Fantasy, Shounen, Super Power
9. Name: Fullm

This algorithm is better.
Lets save the model and use it to make a **telegram bot**

In [13]:
import pickle
var_file = open("/content/gdrive/MyDrive/Data/AnimeRecommendation/knnmodel", "wb")
pickle.dump(knn_model, var_file)
var_file.close()