In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF

In [2]:
#import data
ratings=pd.read_csv('ml-latest-small/ratings.csv')
movies=pd.read_csv('ml-latest-small/movies.csv')
tags=pd.read_csv('ml-latest-small/tags.csv')
links=pd.read_csv('ml-latest-small/links.csv')

In [3]:
#create movie index for later
movie_index=dict(zip(movies.movieId, movies.title))

In [4]:
#create the input data matrix
Rtrue=ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0) # OR fillna:0/3/2.5/mean
Rtrue.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
#fsave the movieId for indexing results later
movieId=Rtrue.columns

In [6]:
#transform input data into a numpy array 
Rtrue=np.array(Rtrue)
Rtrue.shape #610 users, 9724 movies

(610, 9724)

In [7]:
#training the model 
m=NMF(20) #n_components recommended by lecture notes
m.fit(Rtrue)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=20, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [8]:
#reconstruction
component1=m.components_ #movieId-rating matrix
component2=m.transform(Rtrue) #userId-rating matrix
component2.shape, component1.shape

((610, 20), (20, 9724))

In [9]:
R2=np.dot(component2, component1).round(1)

In [10]:
#reconstruction error 
m.reconstruction_err_

874.9034748034848

In [11]:
#reconstructed matrix
R2.shape

(610, 9724)

In [12]:
#create a new user matrix based on the number of movies in the input data matrix
new_user=[np.random.randint(0,5,len(R2[0]))]

In [13]:
#predict new data points of missing features 
user_profile=m.transform(new_user)
user_profile.shape, component1.shape

((1, 20), (20, 9724))

In [14]:
#weights of each movie based on model 
results=np.dot(user_profile, component1)

In [15]:
#match movie preference weight to coresponding movie title 
results2=pd.DataFrame(results[0]).set_index(movieId).reset_index()
recommendations=results2.replace({'movieId':movie_index})

In [16]:
#show movie recommnedations in descneding order 
recommendations.sort_values(0, ascending=False)

Unnamed: 0,movieId,0
706,2001: A Space Odyssey (1968),5.852782
827,Reservoir Dogs (1992),5.789846
1297,"Big Lebowski, The (1998)",5.776235
902,"Good, the Bad and the Ugly, The (Buono, il bru...",5.671457
3136,Memento (2000),5.510543
...,...,...
3174,Frankie and Johnny (1966),0.000000
3906,Red Beard (Akahige) (1965),0.000000
3904,In Like Flint (1967),0.000000
531,"Thin Line Between Love and Hate, A (1996)",0.000000
