In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [None]:
import pickle

def get_and_save_or_load_maps(d):
  mm = "./processed/movie_to_index.pkl"
  uu = "./processed/user_to_index.pkl"
  try:
    with open(mm, 'rb') as f: d["movie_to_index"] = pickle.load(f)
    with open(uu, 'rb') as f: d["user_to_index"] = pickle.load(f)
  except:
    print("COULD NOT LOAD MAPS")
    d["movie_to_index"] = {m: i for i, m in enumerate(d["all_movies"])}
    d["user_to_index"] = {u: i for i, u in enumerate(d["all_users"])}
    with open(mm, 'wb') as f: pickle.dump(d["movie_to_index"], f)
    with open(uu, 'wb') as f: pickle.dump(d["user_to_index"], f)
  return d

In [None]:
def get_and_save_or_load_movies(d):
  mm = "./processed/all_movies.pkl"
  try:
    with open(mm, 'rb') as f: d["all_movies"] = pickle.load(f)
  except:
    print("COULD NOT LOAD MOVIES")
    d["all_movies"] = ratings_data.movie_id.unique()
    with open(mm, 'wb') as f: pickle.dump(d["all_movies"], f)
  return d


In [None]:
from sklearn.model_selection import train_test_split

def get_and_save_or_load_users(d, train_ratio, test_ratio):
  uu = "./processed/all_users.pkl"
  tt1 = "./processed/train_users.pkl"
  tt2 = "./processed/test_users.pkl"
  try:
    with open(uu, 'rb') as f: d["all_users"] = pickle.load(f)
    with open(tt1, 'rb') as f: d["train_users"] = pickle.load(f)
    with open(tt2, 'rb') as f: d["test_users"] = pickle.load(f)
  except:
    print("COULD NOT LOAD USERS")
    d["all_users"] = ratings_data.user_id.unique()
    d["train_users"], d["test_users"] = train_test_split(d["all_users"], train_size=train_ratio, test_size=test_ratio)
    with open(uu, 'wb') as f: pickle.dump(d["all_users"], f)
    with open(tt1, 'wb') as f: pickle.dump(d["train_users"], f)
    with open(tt2, 'wb') as f: pickle.dump(d["test_users"], f)
  return d


In [None]:
from scipy.sparse import coo_matrix

def get_and_save_or_load_matrix(d, test_or_train):
  s = test_or_train + "_matrix"
  tt = "./processed/" + s + ".pkl"
  try:
    with open(tt, 'rb') as f: d[s] = pickle.load(f)
  except:
    print("COULD NOT LOAD MATRIX: " + test_or_train)
    shape = (len(d["all_users"]), len(d["all_movies"]))
    df = ratings_data[ratings_data["user_id"].isin(d[test_or_train + "_users"])]
    row = df['user_id'].map(d['user_to_index']).values
    col = df['movie_id'].map(d['movie_to_index']).values
    data = df['rating_val'].values
    d[s] = coo_matrix((data, (row, col)), shape=shape)
    d[s] = d[s].tocsr()
    with open(tt, 'wb') as f: pickle.dump(d[s], f)
  return d


In [None]:
def get_and_save_or_load_sample(train_ratio=0.8, test_ratio=0.2):
  d = dict()
  d = get_and_save_or_load_movies(d)
  d = get_and_save_or_load_users(d, train_ratio, test_ratio)
  d = get_and_save_or_load_maps(d)
  for s in ("test", "train"):
    d = get_and_save_or_load_matrix(d, s)
  return d

In [None]:
# I added the processed.zip file manually
! unzip processed.zip -d .

Archive:  processed.zip
   creating: ./content/processed/
  inflating: ./content/processed/test_users.pkl  
  inflating: ./content/processed/user_to_index.pkl  
  inflating: ./content/processed/all_users.pkl  
  inflating: ./content/processed/train_matrix.pkl  
  inflating: ./content/processed/train_users.pkl  
  inflating: ./content/processed/movie_to_index.pkl  
  inflating: ./content/processed/test_matrix.pkl  
  inflating: ./content/processed/all_movies.pkl  


In [None]:
! mkdir /content/processed
! mv /content/content/processed/* /content/processed/

In [None]:
! rm -rf /content/content
! rm -rf /content/processed.zip

In [None]:
import numpy as np
from scipy.sparse import find

def subtract_column(sparse_matrix, column):
    column = column.flatten()
    nonzero_rows, nonzero_cols = sparse_matrix.nonzero()
    nonzero_values = sparse_matrix.data
    nonzero_values -= column[nonzero_rows]
    new_sparse_matrix = sparse_matrix.copy()
    new_sparse_matrix.data[:] = nonzero_values
    return new_sparse_matrix

def demean_matrix(mat):
    sums = mat.sum(axis=1).A1
    counts = np.diff(mat.indptr)
    averages = sums / counts
    averages = averages.reshape(-1, 1)
    return subtract_column(mat, averages)

d = get_and_save_or_load_sample()
d["train_matrix_demeaned"] = demean_matrix(d["train_matrix"].asfptype())
d["test_matrix_demeaned"] = demean_matrix(d["test_matrix"].asfptype())
d["train_matrix_demeaned"].data

  averages = sums / counts


array([ 0.56193742,  0.56193742,  0.56193742, ...,  2.78800631,
       -0.21199369, -1.21199369])

In [None]:
print(d["train_matrix"][0])
print(d["train_matrix_demeaned"][0])
print(d["train_matrix"][500])
print(d["train_matrix_demeaned"][500])

  (0, 0)	7
  (0, 1)	7
  (0, 2)	7
  (0, 3)	4
  (0, 4)	5
  (0, 5)	5
  (0, 6)	10
  (0, 7)	7
  (0, 8)	7
  (0, 9)	6
  (0, 10)	8
  (0, 11)	6
  (0, 12)	10
  (0, 13)	5
  (0, 14)	7
  (0, 15)	4
  (0, 16)	5
  (0, 17)	8
  (0, 18)	8
  (0, 19)	5
  (0, 20)	8
  (0, 21)	7
  (0, 22)	8
  (0, 23)	4
  (0, 24)	8
  :	:
  (0, 251340)	9
  (0, 251341)	8
  (0, 251344)	9
  (0, 251369)	7
  (0, 251370)	8
  (0, 251374)	6
  (0, 251752)	8
  (0, 253141)	7
  (0, 254264)	7
  (0, 256409)	8
  (0, 257470)	6
  (0, 257565)	6
  (0, 259445)	7
  (0, 260445)	7
  (0, 262341)	4
  (0, 262342)	6
  (0, 266629)	4
  (0, 273154)	9
  (0, 274406)	7
  (0, 274873)	7
  (0, 274887)	8
  (0, 274900)	9
  (0, 275309)	6
  (0, 280808)	6
  (0, 282608)	6
  (0, 0)	0.5619374196313762
  (0, 1)	0.5619374196313762
  (0, 2)	0.5619374196313762
  (0, 3)	-2.4380625803686238
  (0, 4)	-1.4380625803686238
  (0, 5)	-1.4380625803686238
  (0, 6)	3.5619374196313762
  (0, 7)	0.5619374196313762
  (0, 8)	0.5619374196313762
  (0, 9)	-0.4380625803686238
  (0, 10)	1.561937

In [None]:
from scipy.sparse.linalg import svds
U, sigma, Vh = svds(d["train_matrix_demeaned"], k=500)
U, sigma, Vh

(array([[ 0.00212523,  0.00151282, -0.00319171, ..., -0.01612611,
          0.00839345, -0.01589729],
        [-0.00374893,  0.01278453, -0.00255126, ..., -0.02973947,
         -0.00684841, -0.01634006],
        [ 0.00539789, -0.00617903,  0.034954  , ..., -0.01235481,
         -0.00528474, -0.01589895],
        ...,
        [ 0.00656284, -0.00125372,  0.0085012 , ..., -0.01323971,
          0.00847087, -0.01312287],
        [ 0.00072445, -0.00394366,  0.00105144, ..., -0.01040594,
          0.00168204, -0.00668816],
        [-0.00939986, -0.00715633,  0.01661847, ..., -0.00048983,
          0.0165631 , -0.01616254]]),
 array([ 116.54542151,  116.67910475,  116.71982743,  116.7502984 ,
         116.92202991,  116.92913427,  116.98716484,  117.0767229 ,
         117.12322094,  117.14408382,  117.20827829,  117.25435961,
         117.46000482,  117.59227078,  117.61962909,  117.69108463,
         117.73149586,  117.76780123,  117.84293208,  117.8916043 ,
         117.98947355,  118.02813

In [None]:
import warnings
from sklearn.metrics.pairwise import cosine_similarity
def predict_scores(d, uname, num_neighbors, return_dict=False):
    # Get the score array for the user
    user_scores = d["test_matrix"][d["user_to_index"][uname]]

    # Get the average user rating to add it back later
    arr_nonzero_avg = user_scores.sum() / user_scores.count_nonzero()

    # Calculate latent vector by dot product with sigma reciprocal times Vh transpose    
    latent_vector = user_scores.dot(1 / sigma * Vh.transpose())
    
    # Get cosine similarities between latent vector and U
    sims = cosine_similarity(latent_vector, U)
    sims = sims.flatten()

    neighbor_indices = sims.argsort()[::-1][:num_neighbors]
    neighbor_ratings = d["train_matrix"][neighbor_indices, :].toarray().astype('float')
    neighbor_ratings[neighbor_ratings < 0.5] = 'nan'
    with warnings.catch_warnings():
      warnings.simplefilter("ignore", category=RuntimeWarning)
      movie_ratings = np.nanmean(neighbor_ratings, axis=0)  # Ignores NaNs by default
    movie_ratings[np.isnan(movie_ratings)] = arr_nonzero_avg  # Predict avg score when forced
    if return_dict: return {d["all_movies"][i]: movie_ratings[i] for i in range(len(movie_ratings))}
    else: return movie_ratings

import random
uname = random.choice(d["test_users"])
map = predict_scores(d, uname, 100, return_dict=True)
map

{'feast-2014': 8.0,
 'loving-2016': 7.769230769230769,
 'scripted-content': 6.0,
 'the-future': 6.0,
 'mank': 6.8,
 'embers-2015': 6.493140243902439,
 'the-social-network': 8.89655172413793,
 'miss-you-already': 5.666666666666667,
 'saw-iii': 5.6521739130434785,
 'recess-schools-out': 5.333333333333333,
 'the-girl-on-the-train-2016': 6.129032258064516,
 'yes-man': 5.894736842105263,
 'insidious': 6.6875,
 'the-alphabet': 7.111111111111111,
 'first-girl-i-loved': 6.5,
 'red-mist': 3.5,
 'sydney-white': 3.8,
 'bo-burnham-what': 6.166666666666667,
 'all-cheerleaders-die': 4.411764705882353,
 'the-wave-2015': 7.0,
 'next-floor': 7.266666666666667,
 'nerve-2016': 5.333333333333333,
 'the-disappearance-of-eleanor-rigby-her': 7.333333333333333,
 'mud': 7.884615384615385,
 'doubt': 7.7368421052631575,
 'walk-hard-the-dewey-cox-story': 7.578947368421052,
 'hush-2016': 6.542857142857143,
 'deadpool': 7.174603174603175,
 'the-invitation-2015': 7.142857142857143,
 'trick-r-treat': 7.28,
 'queen-of

In [None]:
def test_scores(d, uname, num_neighbors, func):
    target = d["test_matrix"][d["user_to_index"][uname], :].toarray().reshape(1, -1)
    pred = func(d, uname, num_neighbors)
    target = target.flatten().astype('float')
    target[target < 0.5] = 'nan'
    mse = np.nanmean((target - pred) ** 2)
    mae = np.nanmean(abs(target - pred))
    return mse, mae


In [None]:
import random

NUM_NEIGHBORS = 100
NUM_ITERATIONS = 100

d = get_and_save_or_load_sample()
mse_arr = list()
mae_arr = list()
for i in range(NUM_ITERATIONS):
  uname = random.choice(d["test_users"])
  mse, mae = test_scores(d, uname, NUM_NEIGHBORS, func=predict_scores)
  mse_arr.append(mse)
  mae_arr.append(mae)
print("Average mean squared error for the iterations")
print(np.mean(mse_arr))
print("Average mean absolute error for the iterations")
print(np.mean(mae_arr))

Average mean squared error for the iterations
3.000892033533085
Average mean absolute error for the iterations
1.3131210044440709
