# Predicting Anime Ratings using Matrix Factorization in PyTorch

In [2]:
!pip install jovian

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jovian
  Downloading jovian-0.2.45-py2.py3-none-any.whl (68 kB)
[K     |████████████████████████████████| 68 kB 5.7 MB/s 
Collecting uuid
  Downloading uuid-1.30.tar.gz (5.8 kB)
Building wheels for collected packages: uuid
  Building wheel for uuid (setup.py) ... [?25l[?25hdone
  Created wheel for uuid: filename=uuid-1.30-py3-none-any.whl size=6501 sha256=c8c46a78fdec250c5f66ecd780a8a7180a66458682e32e6cd11c5552c0370a1f
  Stored in directory: /root/.cache/pip/wheels/2a/ea/87/dd57f1ecb4f0752f3e1dbf958ebf8b36d920d190425bcdc24d
Successfully built uuid
Installing collected packages: uuid, jovian
Successfully installed jovian-0.2.45 uuid-1.30


In [3]:
#library imports
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse
import jovian

#Problem Statement :
Given a set of user ratings for anime, predict the rating for each user-anime pair

In [4]:
anime_ratings_df = pd.read_csv("rating.csv")
anime_ratings_df.shape
print(anime_ratings_df.head())

   user_id  anime_id  rating
0        1        20      -1
1        1        24      -1
2        1        79      -1
3        1       226      -1
4        1       241      -1


In [5]:
anime_ratings = anime_ratings_df.loc[anime_ratings_df.rating != -1].reset_index()[['user_id','anime_id','rating']]
print(anime_ratings.shape)
anime_ratings.head()

(6337241, 3)


Unnamed: 0,user_id,anime_id,rating
0,1,8074,10
1,1,11617,10
2,1,11757,10
3,1,15451,10
4,2,11771,10


In [6]:
Counter(anime_ratings.rating) # Distribution of Ratings

Counter({10: 955715,
         8: 1646019,
         6: 637775,
         9: 1254096,
         7: 1375287,
         3: 41453,
         5: 282806,
         4: 104291,
         1: 16649,
         2: 23150})

In [7]:
Counter(anime_ratings.groupby(['user_id']).count()['anime_id']) # Number of ratings per user

Counter({4: 1513,
         1: 3249,
         92: 194,
         459: 10,
         343: 25,
         12: 818,
         3: 1773,
         110: 186,
         22: 561,
         123: 119,
         8: 1089,
         407: 14,
         23: 568,
         21: 623,
         19: 701,
         180: 98,
         72: 232,
         52: 377,
         38: 430,
         79: 231,
         11: 916,
         129: 117,
         35: 469,
         76: 255,
         17: 703,
         18: 680,
         5: 1356,
         68: 301,
         253: 43,
         51: 340,
         54: 351,
         85: 219,
         6: 1231,
         280: 27,
         112: 159,
         26: 578,
         334: 24,
         34: 462,
         90: 226,
         116: 149,
         60: 305,
         40: 414,
         32: 511,
         16: 787,
         30: 452,
         7: 1195,
         42: 425,
         96: 184,
         20: 651,
         25: 572,
         57: 322,
         2: 2095,
         37: 440,
         73: 237,
         173: 87,
     

In [8]:
np.mean(anime_ratings.groupby(['user_id']).count()['anime_id']) #Average number of ratings per user

91.05231321839081

## Train-Valid Split

In [9]:
train_df, valid_df = train_test_split(anime_ratings, test_size=0.2)

#resetting indices to avoid indexing errors in the future
train_df = train_df.reset_index()[['user_id', 'anime_id', 'rating']]
valid_df = valid_df.reset_index()[['user_id', 'anime_id', 'rating']]

## Training
### Encoding columns with continuous ids

In [10]:
def encode_column(column):
    """ Encodes a pandas column with continous IDs"""
    keys = column.unique()
    key_to_id = {key:idx for idx,key in enumerate(keys)}
    return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

In [11]:
def encode_df(anime_df):
    """Encodes rating data with continuous user and anime ids"""
    
    anime_ids, anime_df['anime_id'], num_anime = encode_column(anime_df['anime_id'])
    user_ids, anime_df['user_id'], num_users = encode_column(anime_df['user_id'])
    return anime_df, num_users, num_anime, user_ids, anime_ids

In [12]:
anime_df, num_users, num_anime, user_ids, anime_ids = encode_df(train_df)
print("Number of users :", num_users)
print("Number of anime :", num_anime)
anime_df.head()

Number of users : 68845
Number of anime : 9775


Unnamed: 0,user_id,anime_id,rating
0,0,0,8
1,1,1,7
2,2,2,8
3,3,3,8
4,4,4,8


## Initializing user and item embeddings

In [13]:
def create_embeddings(n, K):
    """
    Creates a random numpy matrix of shape n, K with uniform values in (0, 11/K)
    n: number of items/users
    K: number of factors in the embedding 
    """
    return 11*np.random.random((n, K)) / K

## Creating Sparse utility matrix

In [14]:
def create_sparse_matrix(df, rows, cols, column_name="rating"):
    """ Returns a sparse utility matrix""" 
    return sparse.csc_matrix((df[column_name].values,(df['user_id'].values, df['anime_id'].values)),shape=(rows, cols))

In [15]:
anime_df, num_users, num_anime, user_ids, anime_ids = encode_df(train_df)
Y = create_sparse_matrix(anime_df, num_users, num_anime)

In [16]:
# to view matrix
Y.todense()

matrix([[8, 0, 0, ..., 0, 0, 0],
        [0, 7, 0, ..., 0, 0, 0],
        [0, 0, 8, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

## Making predictions

In [17]:
def predict(df, emb_user, emb_anime):
    """ This function computes df["prediction"] without doing (U*V^T).
    
    Computes df["prediction"] by using elementwise multiplication of the corresponding embeddings and then 
    sum to get the prediction u_i*v_j. This avoids creating the dense matrix U*V^T.
    """
    df['prediction'] = np.sum(np.multiply(emb_anime[df['anime_id']],emb_user[df['user_id']]), axis=1)
    return df

## Cost

In [18]:
lmbda = 0.0002

In [19]:
def cost(df, emb_user, emb_anime):
    """ Computes mean square error"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_anime), emb_user.shape[0], emb_anime.shape[0], 'prediction')
    return np.sum((Y-predicted).power(2))/df.shape[0] 

In [20]:
def gradient(df, emb_user, emb_anime):
    """ Computes the gradient for user and anime embeddings"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_anime), emb_user.shape[0], emb_anime.shape[0], 'prediction')
    delta =(Y-predicted)
    grad_user = (-2/df.shape[0])*(delta*emb_anime) + 2*lmbda*emb_user
    grad_anime = (-2/df.shape[0])*(delta.T*emb_user) + 2*lmbda*emb_anime
    return grad_user, grad_anime

In [21]:
def gradient_descent(df, emb_user, emb_anime, iterations=2000, learning_rate=0.01, df_val=None):
    """ 
    Computes gradient descent with momentum (0.9) for given number of iterations.
    emb_user: the trained user embedding
    emb_anime: the trained anime embedding
    """
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_anime.shape[0])
    beta = 0.9
    grad_user, grad_anime = gradient(df, emb_user, emb_anime)
    v_user = grad_user
    v_anime = grad_anime
    for i in range(iterations):
        grad_user, grad_anime = gradient(df, emb_user, emb_anime)
        v_user = beta*v_user + (1-beta)*grad_user
        v_anime = beta*v_anime + (1-beta)*grad_anime
        emb_user = emb_user - learning_rate*v_user
        emb_anime = emb_anime - learning_rate*v_anime
        if(not (i+1)%50):
            print("\niteration", i+1, ":")
            print("train mse:",  cost(df, emb_user, emb_anime))
            if df_val is not None:
                print("validation mse:",  cost(df_val, emb_user, emb_anime))
    return emb_user, emb_anime

In [22]:
emb_user = create_embeddings(num_users, 3)
emb_anime = create_embeddings(num_anime, 3)
emb_user, emb_anime = gradient_descent(anime_df, emb_user, emb_anime, iterations=800, learning_rate=1)


iteration 50 :
train mse: 16.1229656318067

iteration 100 :
train mse: 12.338856324616625

iteration 150 :
train mse: 10.542786497041181

iteration 200 :
train mse: 9.4538002956885

iteration 250 :
train mse: 8.709082155133354

iteration 300 :
train mse: 8.161566199624808

iteration 350 :
train mse: 7.739134228586291

iteration 400 :
train mse: 7.401921340306754

iteration 450 :
train mse: 7.12599991064492

iteration 500 :
train mse: 6.896115833508122

iteration 550 :
train mse: 6.702081283298155

iteration 600 :
train mse: 6.536831374488087

iteration 650 :
train mse: 6.395309553772087

iteration 700 :
train mse: 6.273795836993777

iteration 750 :
train mse: 6.169485350695725

iteration 800 :
train mse: 6.080214982539077


## Making predictions on new data

In [23]:
def encode_new_data(valid_df, user_ids, anime_ids):
    """ Encodes valid_df with the same encoding as train_df.
    """
    df_val_chosen = valid_df['anime_id'].isin(anime_ids.keys()) & valid_df['user_id'].isin(user_ids.keys())
    valid_df = valid_df[df_val_chosen]
    valid_df['anime_id'] =  np.array([anime_ids[x] for x in valid_df['anime_id']])
    valid_df['user_id'] = np.array([user_ids[x] for x in valid_df['user_id']])
    return valid_df

In [24]:
print("before encoding:", valid_df.shape)
valid_df = encode_new_data(valid_df, user_ids, anime_ids)
print("after encoding:", valid_df.shape)

before encoding: (1267449, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


after encoding: (751697, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [25]:
train_mse = cost(train_df, emb_user, emb_anime)
val_mse = cost(valid_df, emb_user, emb_anime)
print(train_mse, val_mse)

6.080214982539077 12.223649311325055


In [26]:
#looking at the predictions
valid_df[70:80].head()

Unnamed: 0,user_id,anime_id,rating,prediction
131,49224,174,7,5.362968
135,44935,72,8,6.011655
136,38842,2129,5,7.516004
137,23849,651,6,4.443083
139,58144,1,8,7.415189
