# NMF - movie recommentation 

In [1]:
import numpy as np
from sklearn.decomposition import NMF
import pandas as pd
import pandas_profiling as pp 
import pickle

In [2]:
df = pd.read_csv('ml-latest-small/ratings.csv')

In [3]:
movies = pd.read_csv('ml-latest-small/movies.csv')

In [4]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
unique = pd.DataFrame(df.movieId.unique())
unique.columns = ['movieId'] 
unique

Unnamed: 0,movieId
0,1
1,3
2,6
3,47
4,50
...,...
9719,160341
9720,160527
9721,160836
9722,163937


In [6]:
movies = movies.merge(unique, how='right')

In [7]:
#pp.ProfileReport(df)

In [8]:
df = pd.merge(df, movies, on='movieId')
movies = df 

In [9]:
df = df.drop('timestamp', axis =1)

In [10]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int64  
 1   movieId  100836 non-null  int64  
 2   rating   100836 non-null  float64
 3   title    100836 non-null  object 
 4   genres   100836 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 4.6+ MB


In [11]:
df2 = df.pivot(index='userId', columns='movieId', values='rating')

In [12]:
df2 

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [13]:
df2.isna().sum().sum()

5830804

In [14]:
df2.notna().sum().sum()

100836

### filling the missing values with median

In [15]:
med_values = df2.median().median()
med_values

3.5

In [16]:
df_filled = df2.fillna(med_values)

## train the NMF model

In [17]:
m = NMF(n_components=30)

In [18]:
m.fit(df_filled)



NMF(n_components=30)

### checking the matrices

In [19]:
Q = m.components_
P = m.transform(df_filled)
error = m.reconstruction_err_ #this is an absolute score, so no intuition from looking at in isolation! 
P.shape, Q.shape, error

((610, 30), (30, 9724), 260.20170627679533)

### make a prediction based on new user input

In [20]:
# the ratings went from 0.5 to 5
ratings = [0.5, 1, 1.5, 2., 2.5, 3, 3.5, 4, 4.5, 5]

In [21]:
#finding the probabilities of a rating to appear if we have all values:
prob_ratings = []
for i in ratings:
    prob = len(df['rating'][df['rating'] == i])/ len(df)
    prob_ratings.append(prob)

In [22]:
#probabilities that there is a Nan value:
prob_nan = df2.isna().sum().sum() / (610*9724)
# probability that there is a rating for a movie for a user:
prob_notna = 1 - prob_nan

In [23]:
#probabilities for the ratings:
prob_rated = np.asarray(prob_ratings) * prob_notna

In [24]:
prob_rated = prob_rated.tolist()

In [25]:
prob_rated.append(prob_nan)

In [26]:
prob_rated 

[0.00023096479219912182,
 0.0004738992926071032,
 0.00030194010425447246,
 0.0012730037561281525,
 0.0009356602895657856,
 0.0033796724008874415,
 0.002214564606078587,
 0.0045211779541577,
 0.0014415911956895554,
 0.002227208664045692,
 0.9830003169443864]

In [27]:
ratings = [0.5, 1, 1.5, 2., 2.5, 3, 3.5, 4, 4.5, 5, np.nan]

In [28]:
# generate a user input , so a new row of 9724 values corresponding to random ratings for each of the 9724 movies
new_user_input = pd.Series(np.random.choice(ratings, 9724, p=prob_rated))

In [29]:
new_user_input

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
9719   NaN
9720   NaN
9721   NaN
9722   NaN
9723   NaN
Length: 9724, dtype: float64

In [30]:
new_user_input.isna().sum()

9547

In [31]:
#Fill missing data
new_user_input = new_user_input.fillna(med_values)

In [32]:
# make sure the new input has >1 dimension & has as many columns as there are films!
new_user_input = np.array(new_user_input).reshape(1,9724)

In [33]:
#Prediction step 1 - generate extra a user_P
user_P = m.transform(new_user_input)

In [34]:
#new user R - reconstruct R but for this new user only
user_R = np.dot(user_P,Q)

In [35]:
user_R #impute with median - order of recommendations is the same as below

array([[3.58951668, 3.52928345, 3.47297571, ..., 3.49943538, 3.49943538,
        3.4998122 ]])

In [36]:
user_R.shape

(1, 9724)

In [37]:
user_R = user_R.tolist()

In [38]:
len(user_R[0])

9724

## matching the recommendations with the movie names

In [39]:
movieId = df.movieId.unique()
movieId 

array([     1,      3,      6, ..., 160836, 163937, 163981])

In [40]:
user_recom = pd.DataFrame({'movieId' : movieId , 'recommendations': user_R[0]})

In [41]:
user_recom.sort_values(by= 'recommendations', ascending=False)

Unnamed: 0,movieId,recommendations
277,3949,3.788900
314,368,3.734204
257,122882,3.621382
322,588,3.615617
224,3703,3.615059
...,...,...
18,333,3.424615
6293,66427,3.419518
197,2997,3.415643
2013,134368,3.392463


In [42]:
user_recom = pd.merge(user_recom, movies, on='movieId')

In [43]:
user_recom.sort_values(by= 'recommendations', ascending=False)

Unnamed: 0,movieId,recommendations,userId,rating,timestamp,title,genres
18779,3949,3.788900,448,4.0,1296299329,Requiem for a Dream (2000),Drama
18731,3949,3.788900,132,4.0,1157920746,Requiem for a Dream (2000),Drama
18723,3949,3.788900,103,5.0,1431957513,Requiem for a Dream (2000),Drama
18724,3949,3.788900,105,4.5,1446573676,Requiem for a Dream (2000),Drama
18725,3949,3.788900,110,4.0,1175330135,Requiem for a Dream (2000),Drama
...,...,...,...,...,...,...,...
19279,190,3.387476,603,3.0,1000194311,Safe (1995),Thriller
19278,190,3.387476,325,5.0,1039398384,Safe (1995),Thriller
19277,190,3.387476,275,3.0,1049078712,Safe (1995),Thriller
19276,190,3.387476,195,1.0,974710304,Safe (1995),Thriller


# save the model

In [44]:
pickle.dump(m, open('nmf_small.m', 'wb'))

# conect to the db to give recommendations

In [45]:
import sqlalchemy as sql 

In [69]:
HOST = '34.89.195.148'
USERNAME = 'postgres'
PORT = '5432'
DB = 'moviedb'
PASSWORD = 'postgres'

In [70]:
engine = sql.create_engine(f'postgres://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/{DB}')

In [83]:
query = sql.text(" select * from ratings WHERE userId = 1")
results = engine.execute(query)

In [84]:
results = pd.DataFrame(results)

In [81]:
movies = pd.read_csv('ml-latest-small/movies.csv')

In [85]:
#column 0 is the user id 
#column 1 is movie id 
#column 2 is the rating 
results.columns = ['userId', 'movieId', 'ratings', 'timestamp']

In [86]:
results = results.drop('timestamp', axis = 1)

In [87]:
movies  = movies.drop('genres', axis = 1)

In [88]:
unique.columns = ['movieId'] 
movies = movies.merge(unique, how='right')

In [89]:
user_input = results.merge(movies, how = 'right')

In [90]:
user_input['input'] = user_input['ratings'].fillna(med_values)

In [91]:
user_input

Unnamed: 0,userId,movieId,ratings,title,input
0,1.0,1,4.0,Toy Story (1995),4.0
1,1.0,3,4.0,Grumpier Old Men (1995),4.0
2,1.0,6,4.0,Heat (1995),4.0
3,1.0,47,5.0,Seven (a.k.a. Se7en) (1995),5.0
4,1.0,50,5.0,"Usual Suspects, The (1995)",5.0
...,...,...,...,...,...
9719,,160341,,Bloodmoon (1997),3.5
9720,,160527,,Sympathy for the Underdog (1971),3.5
9721,,160836,,Hazard (2005),3.5
9722,,163937,,Blair Witch (2016),3.5


In [92]:
# make sure the new input has >1 dimension & has as many columns as there are films!
new_user_input = np.array(user_input['input']).reshape(1,9724)

In [93]:
user_P = m.transform(new_user_input)
user_R = np.dot(user_P,Q)
user_R = user_R.tolist()

In [94]:
user_input['recommendation'] = user_R[0]

In [95]:
user_input = user_input[user_input['ratings'].isna()]

In [96]:
user_input.movieId.iloc[0]

318

In [111]:
user_input

Unnamed: 0,userId,movieId,ratings,title,input,recommendation
232,,318,,"Shawshank Redemption, The (1994)",3.5,3.560035
233,,1704,,Good Will Hunting (1997),3.5,3.534742
234,,6874,,Kill Bill: Vol. 1 (2003),3.5,3.522012
235,,8798,,Collateral (2004),3.5,3.545696
236,,46970,,Talladega Nights: The Ballad of Ricky Bobby (2...,3.5,3.569844
...,...,...,...,...,...,...
9719,,160341,,Bloodmoon (1997),3.5,3.510320
9720,,160527,,Sympathy for the Underdog (1971),3.5,3.509505
9721,,160836,,Hazard (2005),3.5,3.509505
9722,,163937,,Blair Witch (2016),3.5,3.509505


In [97]:
return_list = []
for i in range(3):
    movieId = user_input.movieId.iloc[i]
    return_list.append(movieId)

In [98]:
return_list

[318, 1704, 6874]

In [108]:
return_list_name = []
for i in range(3):
    moviename= user_input['title'].iloc[i]
    return_list_name.append(moviename)

In [109]:
return_list_name

['Shawshank Redemption, The (1994)',
 'Good Will Hunting (1997)',
 'Kill Bill: Vol. 1 (2003)']

In [110]:
i

2