# In part 2 of the project, we build a collaborative-filtering based movie recommender by following the matrix factorization approach. We subtract the weighted average movie rating from the actual rating so that, for a new user (or a user who hasn't rated any movies), the predicted rating is the weighted average movie rating instead of zero. At the end we evaluate our model with RMSE, MAE and Mean Average Precision. The result is compared to the global recommender baseline.

## import dependencies

In [None]:
import time
import gc
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
import keras
from tensorflow.keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model

## load output of Part 1

### load lists

In [None]:
# Each list produced by Part 1 was pickled to its own file (binary payload despite the .txt suffix).
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
_loaded_lists = []
for _pickle_path in ('test_user_ids_list.txt',
                     'test_movie_ids_list.txt',
                     'train_movie_ids_list.txt',
                     'train_movie_average_ratings_list.txt'):
    with open(_pickle_path, 'rb') as fp:
        _loaded_lists.append(pickle.load(fp))

(test_user_ids_list,
 test_movie_ids_list,
 train_movie_ids_list,
 train_movie_average_ratings_list) = _loaded_lists

# drop the temporary holder so it does not keep the four lists alive once they are deleted
del _loaded_lists, _pickle_path

### convert list to dictionaries

In [None]:
# Each Part 1 list holds either bare user ids or [key, value] pairs; turn them into
# dictionaries for constant-time lookup.

# {user_id_1: True, user_id_2: True, ...}
test_user_ids = dict.fromkeys(test_user_ids_list, True)

# {user_id_1: [movie_id_1, movie_id_2, ...], ...}
train_movie_ids = {pair[0]: pair[1] for pair in train_movie_ids_list}

# {user_id_1: rating_1, user_id_2: rating_2, ...}
train_movie_average_ratings = {pair[0]: pair[1] for pair in train_movie_average_ratings_list}

# {user_id_1: [movie_id_3, ...], ...}
# test_movie_ids has fewer users than the test set as a whole, because some test users
# rated every movie negatively (lower than their average in the train set)
test_movie_ids = {pair[0]: pair[1] for pair in test_movie_ids_list}

# the source lists are no longer needed once the dictionaries exist
del test_movie_ids_list, test_user_ids_list, train_movie_average_ratings_list, train_movie_ids_list

In [None]:
# sanity check: number of entries in each lookup dictionary
tuple(len(lookup) for lookup in (test_user_ids, test_movie_ids, train_movie_average_ratings, train_movie_ids))

(66852, 42001, 66852, 66852)

### load dataframes

In [None]:
# Train / test / validation rating frames and the combined movie table, all pickled by Part 1.
df_train, df_test, df_val, combined_df = (
    pd.read_pickle(pkl_name)
    for pkl_name in ("df_train.pkl", "df_test.pkl", "df_val.pkl", "combined_df.pkl")
)

## preparation for the collaborative filtering model

In [None]:
# Create user and movie-id mapping to convert to numbers
user_id_mapping = {id:i for i, id in enumerate(df_train['User'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(df_train['Movie'].unique())}

In [None]:
# use dataframe map function to map users & movies to mapped ids based on above mapping
X_train_user = df_train['User'].map(user_id_mapping).values
X_train_movie = df_train['Movie'].map(movie_id_mapping).values

# do the same for val and test sets
X_val_user = df_val['User'].map(user_id_mapping).values
X_val_movie = df_val['Movie'].map(movie_id_mapping).values

X_test_user = df_test['User'].map(user_id_mapping).values
X_test_movie = df_test['Movie'].map(movie_id_mapping).values

### <font color='red'> Subtract the weighted score from the actual rating. As a result, for users who haven't rated any movies we will use weighted average scores of movies as predictions </font> 

**rank_dic defined for global recommender mean average precision evaluation**

In [None]:
from collections import OrderedDict

# Movie ids with their weighted score, sorted best-first.
ranked_scores = (combined_df.reset_index()[['Id', 'weighted score']]
                 .sort_values('weighted score', ascending=False))

# {movie_id: weighted_score, ....} in descending weighted-score order.
# Built in one pass over the two column arrays instead of one .iloc lookup per row;
# iterating the numpy arrays keeps the same numpy scalar keys/values the .iloc loop produced.
rank_dic = OrderedDict(zip(ranked_scores['Id'].to_numpy(),
                           ranked_scores['weighted score'].to_numpy()))

del ranked_scores

In [None]:
# Training targets are residuals: actual rating minus the movie's weighted score.
# The subtraction already allocates a fresh array, so the inputs need no explicit copy.
# NOTE(review): a movie absent from rank_dic would map to NaN here - confirm every movie is ranked.
Y_train = df_train['Rating'].to_numpy() - df_train['Movie'].map(rank_dic).to_numpy()

Y_val = df_val['Rating'].to_numpy() - df_val['Movie'].map(rank_dic).to_numpy()

_ = gc.collect()

### <font color='red'> reserve space for new users, here we don't handle new movies </font> 

In [None]:
# Get input variable-sizes
users = int(len(user_id_mapping) * 1.1)  # reserve space for new users
movies = len(movie_id_mapping)

## build the collaborative filtering model

In [None]:
embedding_size = 100

# Each sample carries a single user index and a single movie index.
user_id_input = Input(name='user', shape=(1,))
movie_id_input = Input(name='movie', shape=(1,))

# Latent factors for users: one `embedding_size`-dimensional row per user index,
# with a very light L2 penalty on the embedding weights.
user_embedding = Embedding(
    input_dim=users,
    output_dim=embedding_size,
    input_length=1,
    embeddings_regularizer=tf.keras.regularizers.l2(1e-7),
    name='user_embedding',
)(user_id_input)

# Latent factors for movies, configured the same way.
movie_embedding = Embedding(
    input_dim=movies,
    output_dim=embedding_size,
    input_length=1,
    embeddings_regularizer=tf.keras.regularizers.l2(1e-7),
    name='movie_embedding',
)(movie_id_input)

# Drop the length-1 sequence axis: (batch, 1, embedding_size) -> (batch, embedding_size)
user_vector = Reshape((embedding_size,))(user_embedding)
movie_vector = Reshape((embedding_size,))(movie_embedding)

# Matrix factorization: the prediction is the plain dot product of the two latent vectors.
output = Dot(axes=1, normalize=False)([user_vector, movie_vector])

model = Model(inputs=[user_id_input, movie_id_input], outputs=output)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    loss='mse',
)

2021-12-29 23:20:49.726285: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-29 23:20:49.942054: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-29 23:20:49.942833: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-29 23:20:49.958775: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-29 23:20:49.959429: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read f

## train the model

In [None]:
batch_size = 1024
epochs = 20

# Stop once validation loss fails to improve for a single epoch and roll back
# to the weights of the best epoch.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=1,
    restore_best_weights=True,
    verbose=1,
)

model.fit(
    x=[X_train_user, X_train_movie],
    y=Y_train,
    validation_data=([X_val_user, X_val_movie], Y_val),
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    callbacks=[es],
    verbose=1,
)

# the training / validation arrays are not used again; release them
del X_train_user, X_train_movie, Y_train, X_val_user, X_val_movie, Y_val
_ = gc.collect()

2021-12-29 23:20:52.611877: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 590483320 exceeds 10% of free system memory.
2021-12-29 23:20:53.104315: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 590483320 exceeds 10% of free system memory.
2021-12-29 23:20:53.811121: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 295241660 exceeds 10% of free system memory.
2021-12-29 23:20:54.100831: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 590483320 exceeds 10% of free system memory.
2021-12-29 23:20:54.502385: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 590483320 exceeds 10% of free system memory.
2021-12-29 23:20:55.084721: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Restoring model weights from the end of the best epoch.
Epoch 00004: early stopping


In [None]:
# persist the trained model (HDF5 format, chosen by the .h5 extension)
model.save(filepath='collaborative.h5')

In [None]:
# Persist both id -> index vocabularies as lists of [raw_id, index] pairs,
# one pickle file per vocabulary.
user_id_mapping_list = [[raw_id, index] for raw_id, index in user_id_mapping.items()]

with open('user_id_mapping_list.txt', 'wb') as fp:
    pickle.dump(user_id_mapping_list, fp)

movie_id_mapping_list = [[raw_id, index] for raw_id, index in movie_id_mapping.items()]

with open('movie_id_mapping_list.txt', 'wb') as fp:
    pickle.dump(movie_id_mapping_list, fp)

## model evaluation

### evaluate rmse on test set

In [None]:
# predicted residual for every (user, movie) row of the test set
y_pred = model.predict([X_test_user, X_test_movie]).flatten()

# add back the weighted score that was subtracted from the training targets
y_pred += df_test['Movie'].map(rank_dic).values

# clip the predicted score that's lower than 1 or larger than 5
# (vectorised; replaces a per-element Python lambda over the whole test set)
y_pred = np.clip(y_pred, 1.0, 5.0)

y_true = df_test['Rating'].values

rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
# these metrics belong to the collaborative-filtering model evaluated in this cell
print("The RMSE Value for the Collaborative Filtering Recommender:", rmse)
print("The MAE Value for the Collaborative Filtering Recommender:", mae)

The RMSE Value for the Mean Rating Recommender: 0.7896245334449726
The MAE Value for the Mean Rating Recommender: 0.6041737135076523


### define a function that makes prediction

In [None]:
def make_pred(user_id, movie_id, model, rank_dic):
    """Predict the rating `user_id` would give `movie_id`.

    The model predicts a residual; the movie's weighted score from `rank_dic`
    is added back to obtain the final rating.

    Parameters
    ----------
    user_id : str
        Raw user id. A user missing from `user_id_mapping` is scored with the
        last reserved embedding row (index ``users - 1``).
    movie_id
        Raw movie id; must be a key of `movie_id_mapping` and of `rank_dic`.
    model
        Object whose ``predict([user_indices, movie_indices])`` returns a
        2-D array of residuals.
    rank_dic : dict
        ``{movie_id: weighted_score}``.

    Returns
    -------
    The predicted rating (element ``[0, 0]`` of the prediction array), or
    ``None`` after printing a message when `user_id` is not a string or
    `movie_id` is unknown.
    """
    if not isinstance(user_id, str):
        print('please enter a string for user id')
        return None
    if movie_id not in movie_id_mapping:
        print('movie id non-existent')
        # stop here: falling through would raise KeyError on the lookup below
        return None

    # unknown / new users share the last reserved row of the user embedding
    user = user_id_mapping.get(user_id, users - 1)
    movie = movie_id_mapping[movie_id]

    pred = model.predict([np.array([user]), np.array([movie])])
    # add the weighted score back onto the predicted residual
    pred += rank_dic[movie_id]
    return pred[0, 0]

**for an unknown/new user, the predicted score is simply the weighted average score**

In [None]:
# 'asdf' is presumably not a known user id, so make_pred should fall back to the reserved row
user_id, movie_id = 'asdf', 5
(make_pred(user_id, movie_id, model, rank_dic), rank_dic[movie_id])

(3.697078, 3.6970780962528687)

**for a known user, compare the predicted score and the weighted average score**

In [None]:
# take the 8th user id of the test set (dict insertion order) and the same movie as above
user_id, movie_id = list(test_user_ids)[7], 5
(make_pred(user_id, movie_id, model, rank_dic), rank_dic[movie_id])

(3.3203886, 3.6970780962528687)

### evaluate mean average precision

**define the function that calculates average precision**

In [None]:
def average_precision_at_k(rel: list, pred: list, k: int) -> float:
    """Average precision at cutoff `k` for a single user.

    Parameters
    ----------
    rel : list
        Movie ids of all relevant movies of the user in the test set.
    pred : list
        Movie ids recommended by the model, ordered from highest to lowest
        score and excluding movies already rated in the training set.
        Only the first ``min(k, len(pred))`` entries are inspected.
    k : int
        Rank cutoff.

    Returns
    -------
    float
        Sum of precision@i over every rank ``i <= k`` that holds a relevant
        movie, divided by ``len(rel)`` (not by ``min(k, len(rel))``).
        Returns 0.0 when `rel` is empty.
    """
    # no relevant movies: define the score as 0 instead of dividing by zero
    if len(rel) == 0:
        return 0.0

    # constant-time membership test; fall back to the sequence itself if ids are unhashable
    try:
        relevant = set(rel)
    except TypeError:
        relevant = rel

    precision_sum = 0
    true_positive = 0
    for i in range(min(k, len(pred))):
        if pred[i] in relevant:
            true_positive += 1
            # precision among the first (i + 1) recommendations
            precision_sum += true_positive / (i + 1)
    return precision_sum / len(rel)

**loop through users in test set and apply the function we defined**

In [None]:
result1 = []  # per-user average precision of the collaborative-filtering ranking
result2 = []  # per-user average precision of the global (weighted-score) ranking
user_list = list(test_movie_ids.keys())

# Report running means roughly every 1% of the users.
# max(..., 1) avoids a modulo-by-zero when fewer than 100 users are evaluated.
mile = max(len(user_list) // 100, 1)

# every movie id in descending weighted-score order; built once instead of copying
# the whole rank_dic twice per user
ranked_movie_ids = list(rank_dic.keys())

for ct, user in enumerate(user_list):

    # `1 % mile` equals 1 for mile > 1 and 0 for mile == 1, so progress is still printed
    # for small user lists; `result1` guards against averaging an empty list
    if result1 and ct % mile == 1 % mile:
        print(ct//mile)
        print('collaborative ', sum(result1)/len(result1))
        print('global ', sum(result2)/len(result2))

    # relevant movies of this user in the test set
    rel = test_movie_ids[user]

    # movies already rated in the train set must not be recommended again
    already = set(train_movie_ids[user])
    candidates = [m for m in ranked_movie_ids if m not in already]

    # map movie ids to movie vocabulary number
    X_movie = pd.Series(np.array(candidates)).map(movie_id_mapping).values
    # map the user id to its vocabulary number; unknown users use the reserved last row,
    # the same fallback make_pred applies
    user_index = user_id_mapping.get(user, users - 1)
    X_user = np.full(X_movie.shape[0], user_index)

    # predicted residuals, one per candidate movie
    Y = model.predict([X_user, X_movie], verbose=0)[:, 0]

    # add the weighted score back to obtain the final predicted rating
    scores = Y + np.array([rank_dic[m] for m in candidates])

    # sort by score from high to low; a stable argsort on the negated scores keeps
    # tied movies in their original order, like list.sort(reverse=True)
    order = np.argsort(-scores, kind='stable')
    pred = [candidates[j] for j in order]
    result1.append(average_precision_at_k(rel, pred, 100000))

    # global recommender: the candidates are already in descending weighted-score order
    result2.append(average_precision_at_k(rel, candidates, 100000))

0
collaborative  0.00423728813559322
global  0.00036310820624546115
1
collaborative  0.02624879994333637
global  0.012302635034416741
2
collaborative  0.029275886069042332
global  0.014614935907681693
3
collaborative  0.028088908452808797
global  0.016210416627352297
4
collaborative  0.02594172146169525
global  0.015184705657032668
5
collaborative  0.02438266916698019
global  0.01531582999101874
6
collaborative  0.025526495972614725
global  0.015647165657557506
7
collaborative  0.023941465586283708
global  0.014725234550717711
8
collaborative  0.023894768601314113
global  0.014813367882146098
9
collaborative  0.023929927981204004
global  0.014292813199572325
10
collaborative  0.023695308729588006
global  0.014337856500880392
11
collaborative  0.023539372004827162
global  0.014675268495120084
12
collaborative  0.024407801181734433
global  0.01501648340979014
13
collaborative  0.024144303703626146
global  0.014641143121788899
14
collaborative  0.02421373445390641
global  0.01419455400571

In [None]:
# Summary: test-set size, users without any relevant test movie, and the mean of the
# per-user average precisions for both recommenders.
for template, value in (
    ('there are {} users in the test set', len(test_user_ids)),
    ('there are {} users whose ratings in the test set are all negative',
     len(test_user_ids)-len(test_movie_ids)),
    ('mean average precision for collaborative method is {}', sum(result1)/len(result1)),
    ('mean average precision for gloabl method is {}', sum(result2)/len(result2)),
):
    print(template.format(value))

there are 66852 users in the test set
there are 24851 users whose ratings in the test set are all negative
mean average precision for collaborative method is 0.024886198324415267
mean average precision for gloabl method is 0.014362832245891459


**Compute again the MAP@k, k=100 for the global recommender. This should agree with the result in Part 1.**

In [None]:
# per-user average precision at k=100 for the global recommender
result = []

user_list = list(test_movie_ids.keys())

for user in user_list:
    # start from the full descending-score ranking and strike out every movie
    # the user already rated in the train set
    copy_dic = rank_dic.copy()
    for seen_movie in train_movie_ids[user]:
        del copy_dic[seen_movie]

    # the remaining keys are the recommendations, best first
    result.append(average_precision_at_k(test_movie_ids[user], list(copy_dic), 100))

In [None]:
# mean of the per-user AP@100 values of the global recommender
total_ap = sum(result)
total_ap / len(result)

0.013133213156647231