# In Part 3 of this project, we build a content-based movie recommender. For each movie, we feed its overview (a short description of the movie) to DistilBERT to obtain a document embedding (a 768-dimensional vector). We then learn a user embedding whose dot product with the document embedding yields the predicted movie rating. Finally, we evaluate the model performance by calculating RMSE, MAE and Mean Average Precision, and compare the results with the global movie recommender baseline.

### import dependencies

In [1]:
%%capture
!pip3 install pandas
!pip3 install sklearn
!pip3 install matplotlib
!pip3 install seaborn
!pip3 install transformers

In [2]:
import random
import time
import sys
import gc
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
import keras
from tensorflow.keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model
import pickle
from tqdm import tqdm
import transformers

**remove useless warnings**

In [3]:
import warnings
warnings.filterwarnings('ignore')

## import data

In [4]:
# Load the movie table, the train/validation/test rating splits and the combined
# table from pickle files in the working directory.
# NOTE(review): presumably these files were written by the earlier parts of the project — confirm.
# pd.read_pickle can execute arbitrary code while loading, so only open trusted files.
df_movie_title_filtered=pd.read_pickle('df_movie_titles_filtered.pkl')
df_train_filtered=pd.read_pickle('df_train_filtered.pkl')
df_val_filtered=pd.read_pickle('df_val_filtered.pkl')
df_test_filtered=pd.read_pickle('df_test_filtered.pkl')
combined_df=pd.read_pickle('combined_df.pkl')

## prepare tokens and masks for the BERT model

**create a function that makes BERT input features from overview text** <br>
**the function is copied from: https://github.com/dipanjanS/deep_transfer_learning_nlp_dhs2019/blob/master/notebooks/6%20-%20Transformers%20-%20DistilBERT.ipynb**

In [5]:
def create_bert_input_features(tokenizer, docs, max_seq_length):
    """Convert raw texts into padded token-id and attention-mask sequences.

    Every document is tokenized, cut to at most ``max_seq_length - 2`` tokens,
    wrapped as ``[CLS] ... [SEP]`` and right-padded with zeros up to
    ``max_seq_length``.

    Args:
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        docs: iterable of text strings.
        max_seq_length: target length of each id/mask sequence.

    Returns:
        ``[all_ids, all_masks]`` - two parallel lists with one entry per
        document; a mask value is 1 for a real token and 0 for padding.
    """
    room_for_text = max_seq_length - 2  # two slots are reserved for [CLS] and [SEP]
    id_rows = []
    mask_rows = []

    for text in docs:
        pieces = tokenizer.tokenize(text)
        if len(pieces) > room_for_text:
            pieces = pieces[0:room_for_text]

        token_ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + pieces + ['[SEP]'])
        attention = [1] * len(token_ids)

        # Zero-pad both sequences up to the requested length.
        missing = max_seq_length - len(token_ids)
        if missing > 0:
            token_ids.extend([0] * missing)
            attention.extend([0] * missing)

        id_rows.append(token_ids)
        mask_rows.append(attention)

    return [id_rows, mask_rows]

In [6]:
%%capture
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [7]:
# Pre-trained tokenizer matching the 'distilbert-base-uncased' encoder loaded further down
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [8]:
# Expose the index as a regular 'Movie' column.
# NOTE(review): the tokenisation cell reads column position 3 as the movie id, which is
# presumably this new column — confirm the frame has exactly three columns before this line.
df_movie_title_filtered['Movie']=df_movie_title_filtered.index

In [9]:
MAX_SEQ_LENGTH = 300 # slightly larger than 274

# {movie_id: token ids} and {movie_id: attention mask}, one padded sequence per movie
feature_id_dic={}
feature_mask_dic={}

for i in tqdm(range(len(df_movie_title_filtered))):
    # positional columns: 2 is read as the overview text, 3 as the movie id
    overview=df_movie_title_filtered.iloc[i, 2]
    movie=df_movie_title_filtered.iloc[i, 3]
    # a single document is encoded per call, so entry 0 of each returned list is taken
    token_ids, token_masks=create_bert_input_features(tokenizer, [overview], max_seq_length=MAX_SEQ_LENGTH)
    feature_id_dic[movie], feature_mask_dic[movie] = token_ids[0], token_masks[0]

100%|██████████| 6551/6551 [00:05<00:00, 1137.52it/s]


## use a BERT model to do paragraph embedding. We won't train this model

In [10]:
# Two integer inputs per movie overview: the token ids and the matching attention mask
inp_id = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_ids")
inp_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype='int32', name="bert_input_masks")
inputs = [inp_id, inp_mask]

# Pre-trained DistilBERT, frozen: it is only used as a feature extractor
layer=transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')
layer.trainable=False
# first output is the per-token hidden state; its shape is printed below
hidden_state = layer(inputs)[0]
print(hidden_state.shape)
# keep only position 0, i.e. the [CLS] token that create_bert_input_features puts first
output = hidden_state[:, 0]

# NOTE(review): this model is only used through predict(); compiling it with a
# loss/optimizer looks unnecessary — confirm before removing.
model = Model(inputs=[inp_id, inp_mask], outputs=output)
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5))

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

2021-12-30 14:52:48.320553: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-30 14:52:48.576819: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-30 14:52:48.577521: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-30 14:52:48.582386: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-30 14:52:48.583018: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1050] successful NUMA node read f

(None, 300, 768)


**bert_dic is the actual dictionary that maps movie ids to movie overview embeddings**

In [11]:
# {movie_id: overview embedding as a list of floats}
bert_dic={}

# Encode all overviews with batched predict() calls instead of one predict() per movie.
# `model` is the frozen DistilBERT encoder here; it is rebound to the recommender later,
# so this cell must run before the recommender is built.
# Local names are used on purpose so the Keras Input tensors inp_id / inp_mask are not overwritten.
movie_ids=list(feature_id_dic.keys())

if movie_ids:
    all_token_ids=np.array([feature_id_dic[movie] for movie in movie_ids])
    all_token_masks=np.array([feature_mask_dic[movie] for movie in movie_ids])
    embeddings=model.predict([all_token_ids, all_token_masks], batch_size=64)

    for movie, embedding in zip(movie_ids, embeddings):
        bert_dic[movie]=list(embedding)

    del all_token_ids, all_token_masks, embeddings

2021-12-30 14:52:56.482669: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [12]:
# Persist the embeddings as a list of [movie_id, embedding] pairs
# (the file is written with pickle despite its .txt suffix)
bert_dic_list=[[movie_key, bert_dic[movie_key]] for movie_key in bert_dic]

with open('bert_dic_list.txt', 'wb') as fp:
    pickle.dump(bert_dic_list, fp)

**make sure all users in val and test sets are included in train set**

In [13]:
# Unique user ids per split (l1: train, l2: validation, l3: test); the cell displays their counts
l1=list(df_train_filtered['User'].unique())
l2=list(df_val_filtered['User'].unique())
l3=list(df_test_filtered['User'].unique())
len(l1), len(l2), len(l3)

(150245, 57042, 57069)

In [14]:
# Verify that every user of the validation (l2) and test (l3) sets also occurs
# in the train set (l1). A dict gives constant-time membership tests.
l1_dic=dict.fromkeys(l1, True)

check=all(ele in l1_dic for ele in l2) and all(ele in l1_dic for ele in l3)

# report the actual outcome of the check (previously the result was ignored)
if check:
    print('no problem')
else:
    print('error')

no problem


**make sure all movies in val and test sets are included in train set**

In [15]:
# Unique movie ids per split (l1: train, l2: validation, l3: test); the cell displays their counts
l1=list(df_train_filtered['Movie'].unique())
l2=list(df_val_filtered['Movie'].unique())
l3=list(df_test_filtered['Movie'].unique())
len(l1), len(l2), len(l3)

(4239, 4037, 4046)

In [16]:
# Verify that every movie of the validation (l2) and test (l3) sets also occurs
# in the train set (l1). A dict gives constant-time membership tests.
l1_dic=dict.fromkeys(l1, True)

check=all(ele in l1_dic for ele in l2) and all(ele in l1_dic for ele in l3)

# report the actual outcome of the check (previously the result was ignored)
if check:
    print('no problem')
else:
    print('error')

del l1, l2, l3, l1_dic

no problem


**user_id_mapping**

In [17]:
# {raw user id: consecutive integer index}, in order of first appearance in the train set
user_id_mapping = {raw_user: position for position, raw_user in enumerate(df_train_filtered['User'].unique())}

In [18]:
# Persist the mapping as a list of [raw user id, index] pairs
# (the file is written with pickle despite its .txt suffix)
user_id_mapping_list=[[raw_user, user_id_mapping[raw_user]] for raw_user in user_id_mapping]

with open('filtered_user_id_mapping_list.txt', 'wb') as fp:
    pickle.dump(user_id_mapping_list, fp)

**convert user_id using user_id_mapping**

In [19]:
# Replace raw user ids by their integer index from user_id_mapping.
# Series.map yields NaN for ids missing from the mapping, so every validation/test
# user must also occur in the train set.
train_user=df_train_filtered['User'].map(user_id_mapping).values
val_user=df_val_filtered['User'].map(user_id_mapping).values
test_user=df_test_filtered['User'].map(user_id_mapping).values

**create movie id feature**

In [20]:
# Raw movie ids per rating row; they are turned into overview embeddings via bert_dic later on
train_movie=df_train_filtered['Movie'].values
val_movie=df_val_filtered['Movie'].values
test_movie=df_test_filtered['Movie'].values

**again we need rank_dic which stores the weighted average movie ratings sorted from high to low**

In [53]:
from collections import OrderedDict

temp=combined_df.reset_index()[['Id','weighted score']].sort_values('weighted score', ascending=False)

# movies that occur in the train set; a set keeps the membership test O(1)
train_movies=set(df_train_filtered['Movie'].unique())

# {movie_id: weighted_score, ....}, ordered from the highest to the lowest weighted score
# and restricted to movies that occur in the train set
rank_dic=OrderedDict()
for i in range(temp.shape[0]):
    movie=temp.iloc[i,0]
    if movie in train_movies:
        rank_dic[movie]=temp.iloc[i,1]

del temp, train_movies

**create labels**

In [22]:
# Rating labels per split; .copy() gives arrays that are independent of the DataFrames
train_Y=df_train_filtered['Rating'].values.copy()
val_Y=df_val_filtered['Rating'].values.copy()
test_Y=df_test_filtered['Rating'].values.copy()

**subtract the weighted average score from the label**

In [23]:
# Residual labels: rating minus the movie's weighted average score.
# They are recomputed from the DataFrames (instead of an in-place `-=`) so that
# re-running this cell does not subtract the score a second time.
train_Y=df_train_filtered['Rating'].values-df_train_filtered['Movie'].map(rank_dic).values
val_Y=df_val_filtered['Rating'].values-df_val_filtered['Movie'].map(rank_dic).values
test_Y=df_test_filtered['Rating'].values-df_test_filtered['Movie'].map(rank_dic).values

## model building

**our dataset will be too large in the original form of shape=(m, 768), so we need to define a batch_generator**

In [24]:
def batch_generator(X0, X1, Y, batch_size, embedding_lookup=None):
    """Endlessly yield shuffled training batches for the recommender.

    Movie ids are expanded into their overview embeddings per batch, so the full
    (n_samples, embedding_dim) matrix never has to be held in memory.

    Args:
        X0: array of user indices, one per rating.
        X1: array of movie ids, one per rating.
        Y: array of labels, one per rating.
        batch_size: number of samples per batch.
        embedding_lookup: optional {movie_id: embedding} dict; defaults to the
            notebook-level ``bert_dic``.

    Yields:
        ``([user_batch, embedding_batch], label_batch)``. One pass consists of
        ``n_samples // batch_size`` batches (at least one); the sample order is
        reshuffled after every pass.
    """
    if embedding_lookup is None:
        embedding_lookup = bert_dic

    n_samples = np.shape(Y)[0]
    # derived from the data (a leftover hard-coded 1000 used to override this)
    number_of_batches = max(1, n_samples // batch_size)

    shuffle_index = np.arange(n_samples)
    np.random.shuffle(shuffle_index)
    counter = 0

    while True:
        index_batch = shuffle_index[batch_size*counter:batch_size*(counter+1)]
        X0_batch = X0[index_batch]
        # movie ids -> embedding matrix of shape (len(index_batch), embedding_dim)
        X1_batch = np.array(pd.Series(X1[index_batch]).map(embedding_lookup).tolist())
        Y_batch = Y[index_batch]
        counter += 1
        yield [X0_batch, X1_batch], Y_batch
        if counter >= number_of_batches:
            # pass finished: draw a new sample order
            np.random.shuffle(shuffle_index)
            counter = 0

In [25]:
# reserve some spaces for new users, for users who haven't rated any movies, recommend weighted average
# (the embedding table gets 10% more rows than there are known users)
users=int(len(user_id_mapping)*1.1)
embedding_size = 100

# use Input() to create tensors for - 'user' and 'movie'
user_id_input = Input(shape=(1,), name='user')

# Create embedding layer for users 
# (L2-regularised lookup table: one embedding_size-dimensional row per user index)
user_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           embeddings_regularizer=tf.keras.regularizers.l2(0.0001),
                           name='user_embedding')(user_id_input)

# drop the length-1 sequence axis: (batch, 1, embedding_size) -> (batch, embedding_size)
user_vector = Reshape([embedding_size])(user_embedding)
#################################################################################################

# the 768-dimensional overview embedding is projected to embedding_size by a Dense layer
# created without an activation argument
inp_bert = Input(shape=(768,), name='movie')
movie_vector = Dense(embedding_size)(inp_bert)

################################################################################################

# predicted value = dot product of the user vector and the projected movie vector
output = Dot(1, normalize=False)([user_vector, movie_vector])

# NOTE: `model` is rebound here; before this cell it referred to the DistilBERT encoder
model = Model(inputs=[user_id_input, inp_bert], outputs=output)
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4))

**map movie ids to movie overview embeddings**

In [26]:
# Overview embeddings for every validation row, stacked into one array
val_bert=np.array(pd.Series(val_movie).map(bert_dic).tolist())

In [27]:
batch_size=1024*16
nb_epoch=20
# at least one step per epoch, even when the train set is smaller than one batch
steps_per_epoch=max(1, train_user.shape[0]//batch_size)


# stop as soon as the validation loss fails to improve for one epoch and keep the best weights
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                      patience=1,
                                      restore_best_weights=True,
                                      verbose=1)

# Model.fit accepts Python generators directly; Model.fit_generator is deprecated
# and no longer available in recent Keras releases.
model.fit(batch_generator(train_user, train_movie, train_Y, batch_size),
          epochs=nb_epoch,
          callbacks=[es],
          validation_data=([val_user, val_bert], val_Y),
          steps_per_epoch=steps_per_epoch)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Restoring model weights from the end of the best epoch.
Epoch 00004: early stopping


<keras.callbacks.History at 0x7f6ab53d9f70>

In [28]:
# model = keras.models.load_model('./content_based.h5')

In [29]:
model.save('content_based.h5')

## model evaluation

### define a function that makes prediction

In [30]:
def make_pred(user_id, movie_id, model, rank_dic):
    """Predict the rating a user would give to a movie.

    The model predicts the deviation from the movie's weighted average score;
    that score is added back and the result is limited to the range [1, 5].

    Args:
        user_id: raw user id (must be a string).
        movie_id: movie id; must be a key of ``rank_dic``.
        model: trained recommender taking ``[user_index, overview_embedding]``.
        rank_dic: {movie_id: weighted average score}.

    Returns:
        The predicted rating, or ``None`` (after printing a message) when
        ``user_id`` is not a string or ``movie_id`` is unknown.

    Relies on the notebook-level ``user_id_mapping``, ``users`` and ``bert_dic``.
    """
    if not isinstance(user_id, str):
        print('please enter a string for user id')
        return None
    if movie_id not in rank_dic:
        print('movie id non-existent')
        # stop here: the lookups below would raise KeyError for an unknown movie
        return None

    # users not seen during training use the last row of the embedding table
    user=user_id_mapping.get(user_id, users-1)

    movie=bert_dic[movie_id]
    pred=model.predict([np.array([user]), np.array([movie])])[0,0]
    pred+=rank_dic[movie_id]
    # limit the prediction to the valid rating range
    if pred<1: pred=1
    elif pred>5: pred=5
    return pred

**as you can see, for new users, we simply predict weighted average ratings of movies**

In [31]:
# Example with the user id 'asdf' (presumably absent from user_id_mapping, i.e. a new user):
# shows the prediction next to the movie's weighted average score
movie_id=list(rank_dic.keys())[13]
make_pred('asdf', movie_id, model, rank_dic), rank_dic[movie_id]

(4.473463689227797, 4.473463689227797)

In [32]:
# Same movie, but for a user taken from the test set:
# shows the prediction next to the movie's weighted average score
movie_id=list(rank_dic.keys())[13]
user_id=list(df_test_filtered['User'].unique())[7]
make_pred(user_id, movie_id, model, rank_dic), rank_dic[movie_id]

(4.652288099070288, 4.473463689227797)

### MAE and RMSE evaluation

In [33]:
# overview embeddings for every test row
test_bert=np.array(pd.Series(test_movie).map(bert_dic).tolist())

y_pred=model.predict([test_user, test_bert]).flatten()

# add back the weighted score
y_pred+=df_test_filtered['Movie'].map(rank_dic).values

# clip the predicted score that's lower than 1 or larger than 5
# (vectorised; replaces an element-by-element Python map over the whole test set)
y_pred = np.clip(y_pred.astype(np.float64), 1.0, 5.0)

# the labels are residuals, so add the weighted score back to recover the true ratings
y_true = test_Y.copy()
y_true+=df_test_filtered['Movie'].map(rank_dic).values

rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print("The RMSE Value for the content-based recommender:", rmse)
print("The MAE Value for the content-based recommender:", mae)

The RMSE Value for the content-based recommender: 0.8797305199855675
The MAE Value for the content-based recommender: 0.6921320720882057


**compare that with global recommender, we can see that content-based method is better**

In [34]:
# Baseline: predict each movie's weighted average score for every user.
# Reuses y_true from the previous cell and overwrites y_pred, rmse and mae.
y_pred = df_test_filtered['Movie'].map(rank_dic).values
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print("The RMSE Value for the global rated average recommender:", rmse)
print("The MAE Value for the global rated average recommender:", mae)

The RMSE Value for the global rated average recommender: 0.9849409787626872
The MAE Value for the global rated average recommender: 0.7940365856690194


### MAP@K Evaluation

**list of unique users in the test**

In [35]:
# Unique test users stored as dict keys (every value is True) for constant-time membership tests
test_user_ids=dict.fromkeys(df_test_filtered['User'].unique(), True)

len(test_user_ids)

57069

**create dictionaries that store movie id's and the average ratings of users in the train set**

In [36]:
# {user_id: [movie_id_1, movie_id_2, ...], ...}
train_movie_ids={}
# {user_id: average_rating, ...}
train_movie_average_ratings={}
ct=0

for i in range(df_train_filtered.shape[0]):
    if i % (df_train_filtered.shape[0]//100)==0: 
        print(str(ct)+' percent of job done')
        ct+=1
        
    user = df_train_filtered.iloc[i, 0]
    if user in test_user_ids:
        try:
            train_movie_ids[user].append(df_train_filtered.iloc[i,2])
            train_movie_average_ratings[user]+=df_train_filtered.iloc[i,1]
        except:
            train_movie_ids[user]=[df_train_filtered.iloc[i,2]]
            train_movie_average_ratings[user]=df_train_filtered.iloc[i,1]

del user

0 percent of job done
1 percent of job done
2 percent of job done
3 percent of job done
4 percent of job done
5 percent of job done
6 percent of job done
7 percent of job done
8 percent of job done
9 percent of job done
10 percent of job done
11 percent of job done
12 percent of job done
13 percent of job done
14 percent of job done
15 percent of job done
16 percent of job done
17 percent of job done
18 percent of job done
19 percent of job done
20 percent of job done
21 percent of job done
22 percent of job done
23 percent of job done
24 percent of job done
25 percent of job done
26 percent of job done
27 percent of job done
28 percent of job done
29 percent of job done
30 percent of job done
31 percent of job done
32 percent of job done
33 percent of job done
34 percent of job done
35 percent of job done
36 percent of job done
37 percent of job done
38 percent of job done
39 percent of job done
40 percent of job done
41 percent of job done
42 percent of job done
43 percent of job don

**divide the sum of ratings by the number of movies to get average rating for each user**

In [37]:
# Turn each user's rating sum into a mean by dividing by their number of train ratings.
# NOTE(review): the dict is modified in place, so running this cell twice divides twice;
# it also raises KeyError for a test user without any train rating.
for key in test_user_ids:
    train_movie_average_ratings[key]/=len(train_movie_ids[key])

**test_movie_ids is a dictionary that stores, for each user in the test set, the test-set movies they rated positively (i.e. above their average rating in the train set)**

In [38]:
# {user_id: [movie_id_1, movie_id_2, ...], ...}
test_movie_ids={}

for i in range(df_test_filtered.shape[0]):
    user=df_test_filtered.iloc[i,0]
    if df_test_filtered.iloc[i,1]>train_movie_average_ratings[user]:
        try:
            test_movie_ids[user].append(df_test_filtered.iloc[i,2])
        except:
            test_movie_ids[user]=[df_test_filtered.iloc[i,2]]

**define a function that calculates map@k**

In [39]:
def average_precision_at_k(rel: [int], pred: [int], k: int) -> float:
    """Average precision at cutoff ``k`` for a single user.

    Args:
        rel: list of movie ids of all relevant movies in the test set.
        pred: list of movie ids ranked from the highest to the lowest predicted
            score, excluding movies already rated in the training set.
        k: cutoff; only the first ``k`` entries of ``pred`` are inspected.

    Returns:
        Sum of precision@i over every rank i (1-based, i <= k) that holds a
        relevant movie, divided by ``len(rel)``. Returns 0.0 when ``rel`` is empty.
    """
    if len(rel) == 0:
        # no relevant item: define AP as 0 instead of dividing by zero
        return 0.0

    relevant = set(rel)  # constant-time membership instead of scanning the list
    temp = 0
    true_positive = 0
    for i in range(min(k, len(pred))):
        if pred[i] in relevant:
            true_positive += 1
            temp += true_positive/(i+1)
    return temp/len(rel)

**loop through users in test set and apply the function we defined**

In [None]:
result1=[]   # average precision per user, content-based model
result2=[]   # average precision per user, global (weighted average) baseline
user_list=list(test_movie_ids.keys())

# Work that does not depend on the user is done once, outside the loop.
# rank_dic is ordered from the highest to the lowest weighted score, so
# all_movie_ids is already the ranking of the global baseline.
all_movie_ids=list(rank_dic.keys())
all_movie_bert=np.array([bert_dic[movie] for movie in all_movie_ids])
all_movie_base=np.array([rank_dic[movie] for movie in all_movie_ids], dtype=np.float64)

# report progress roughly every percent; at least 1 to avoid a modulo by zero
mile=max(1, len(user_list)//100)

for ct, user in enumerate(user_list):

    if ct%mile==1:
        print(ct//mile)
        print('content-based method', sum(result1)/len(result1))
        print('global method ', sum(result2)/len(result2))

    rel=test_movie_ids[user]

    # movies already rated in the train set are excluded from the candidates;
    # a set tolerates duplicates and gives constant-time lookups
    already=set(train_movie_ids[user])
    keep=np.array([movie not in already for movie in all_movie_ids], dtype=bool)
    candidates=[movie for movie, kept in zip(all_movie_ids, keep) if kept]

    # model inputs: the user's embedding index repeated for every candidate movie
    X_bert=all_movie_bert[keep]
    X_user=np.full(len(candidates), user_id_mapping[user])

    # predicted residual + weighted score = predicted rating
    Y=model.predict([X_user, X_bert])[:,0]
    scores=Y+all_movie_base[keep]

    # rank from high to low; the stable sort keeps ties in weighted-score order
    order=np.argsort(-scores, kind='stable')
    pred=[candidates[j] for j in order]
    result1.append(average_precision_at_k(rel, pred, 100000))

    # global baseline: the same candidates in weighted-score order
    result2.append(average_precision_at_k(rel, candidates, 100000))

0
content-based method 0.0029850746268656717
global method  0.000851063829787234
1
content-based method 0.017941247634556026
global method  0.017442572183923515
2
content-based method 0.01621911981626531
global method  0.018556598545060064
3
content-based method 0.01687817226272306
global method  0.018372700251581824
4
content-based method 0.0191977682906878
global method  0.020920093557158792
5
content-based method 0.019271091369626635
global method  0.020115653498201515
6
content-based method 0.019335563452047563
global method  0.019465341722946804
7
content-based method 0.01952718839051206
global method  0.02014665886058497
8
content-based method 0.019433979096104843
global method  0.020661304626061017
9
content-based method 0.01977118693688922
global method  0.020433044029210458
10
content-based method 0.019453081877083653
global method  0.020242167582991032
11
content-based method 0.019204173723092195
global method  0.01964518832870463
12
content-based method 0.019679139970014733


**Interestingly, the mean average precision of the content-based method is comparable to the global method, although the content based method apparently has better RMSE and MAE.**

In [None]:
# Mean average precision = mean of the per-user average precisions collected above
map_content_based=sum(result1)/len(result1)
map_global=sum(result2)/len(result2)
print(f'mean average precision for content-based method is {map_content_based}')
print(f'mean average precision for global method is {map_global}')