In [1]:
import pandas as pd
import sqlite3
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.decomposition import NMF
import numpy as np
import pickle
from fuzzywuzzy import process



In [2]:
def join_df(df1, df2, index):
    """Joins one dataframe to the other on a given basis (df1 on the left), then removes any additional rows created"""
    df = df1.join(df2.set_index(index), how='right', on=index)
    df = df.dropna()
    return df

In [3]:
def pivot(df, rows, columns, values, empty_val):
    """Pivots a dataframe, allowing us to select which category is desired as rows, columns and values of a new dataframe.
    Also imputes missing values with a given value"""
    R = df.groupby([rows, columns])[[values]].first().unstack()
    R.fillna(empty_val, inplace=True)
    return R

In [5]:
ratings = pd.read_csv('ratings.csv')

In [6]:
movies = pd.read_csv('movies.csv')

We join the movies dataframe onto the ratings one in order to get the movie information for each rating:

In [7]:
df = join_df(ratings, movies, 'movieId')

In [8]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1.0,1,4.0,964982700.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
516,5.0,1,4.0,847435000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
874,7.0,1,4.5,1106636000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1434,15.0,1,2.5,1510578000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1667,17.0,1,4.5,1305696000.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


### We take a look at Non-negative Matrix Factorisation:

We start by running the 'pivot' function to take a dataframe and put it into the desired shape - in this case users as rows, movies as columns, ratings as values. We also impute the empty values with 3, the median possible rating:

In [9]:
R = pivot(df, 'userId', 'movieId', 'rating', 3.0)

In [10]:
R.head()

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1.0,4.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
5.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0


The we define the model, which produces our matrices P and Q whose dot product should approximate R. We give our P and Q an *n-components* value of n, meaning n hidden features:

In [11]:
nmf = NMF(n_components=3, init='random', random_state=10)

In [12]:
nmf.fit(R)

NMF(alpha=0.0, beta_loss='frobenius', init='random', l1_ratio=0.0,
  max_iter=200, n_components=3, random_state=10, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [13]:
Q = nmf.components_
Q.shape

(3, 9724)

In [14]:
P = nmf.transform(R)
P.shape

(610, 3)

We calculate the error between the training matrix and the reconstructed matrix from the fitted model (this is similar to the mean squared error in the gradient descent algorithm):

In [15]:
print(nmf.reconstruction_err_)

355.68957380660083


(...and store the model in a pickle)

In [16]:
binary = pickle.dumps(nmf)
open('nmf_model.bin', 'wb').write(binary)

233982

In [17]:
binary = open('nmf_model.bin', 'rb').read()
nmf = pickle.loads(binary)

Then  we view our reconstructed matrix:

In [18]:
nR = np.dot(P, Q)

In [19]:
nR

array([[3.35264953, 3.10172281, 3.04714722, ..., 3.02662218, 3.02564113,
        3.02685496],
       [3.31813476, 3.07380821, 3.01666826, ..., 2.99353023, 2.99486802,
        2.99533524],
       [3.31354692, 3.06853897, 3.00922071, ..., 2.98776913, 2.98504413,
        2.98476237],
       ...,
       [3.32871307, 3.08186057, 3.02676756, ..., 3.0045168 , 3.00584524,
        3.00689961],
       [3.31678603, 3.07065793, 3.01402998, ..., 2.99254238, 2.99165864,
        2.99216957],
       [3.41765713, 3.163031  , 3.11139152, ..., 3.08814168, 3.09339589,
        3.09592193]])

Then we can subtract our two matrices to see how our reconstructed matrix differs from the one we started with:

In [20]:
diff = R - nR
diff.head()

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1.0,0.64735,-0.101723,0.952853,-0.017814,-0.02939,0.814222,-0.042445,-0.023299,-0.028228,-0.134313,...,-0.025924,-0.023943,-0.026943,-0.027167,-0.025169,-0.026518,-0.024849,-0.026622,-0.025641,-0.026855
2.0,-0.318135,-0.073808,-0.016668,0.014463,-0.000613,-0.150977,-0.00895,0.007678,0.002749,-0.099981,...,0.005446,0.005376,0.004878,0.005367,0.004782,0.00454,0.004879,0.00647,0.005132,0.004665
3.0,-0.313547,-0.068539,-0.009221,0.014073,0.00389,-0.145596,-0.004782,0.017403,0.011428,-0.094756,...,0.014986,0.011269,0.013724,0.011015,0.012929,0.012163,0.007752,0.012231,0.014956,0.015238
4.0,-0.327239,-0.078489,-0.03051,0.013973,-0.006055,-0.164711,-0.020409,-0.012034,-0.014694,-0.113483,...,-0.014415,-0.003861,-0.014173,-0.008019,-0.010089,-0.010835,0.001393,-0.007981,-0.014453,-0.017567
5.0,0.681584,-0.072838,-0.016822,0.014076,-0.000114,-0.151945,-0.009903,0.007158,0.002304,-0.100959,...,0.004819,0.005604,0.004149,0.004589,0.004686,0.004122,0.005149,0.005505,0.00468,0.003935


### We create a profile for a new user:

Firstly we generate a list of all movie ID's:

In [21]:
movieslist = list(list(zip(*R.columns))[1])

Then we generate an "empty" set of selections for the new user:

In [22]:
profile = {movie: np.nan for movie in movieslist}

Then we assign some ratings (manually selected ones for now...)

In [23]:
profile[5106] = 5
profile[7451] = 5
profile[39] = 5

Our Xpred needs to be a matrix of these ratings. We convert them to a dataframe so that we can replace the NaN's with a value (in this case 3 - the neutral rating):

In [24]:
df2 = pd.DataFrame(list(profile.items()))

In [25]:
df2.sort_values(by=0, inplace=True)

In [26]:
df2[1].fillna(3.0, inplace=True)

In [27]:
Xpred = [df2[1]]

We produce the "hidden" profile based on our Xpred:

In [28]:
hidden_profile = nmf.transform(Xpred)
hidden_profile

array([[1.22743924, 1.01403513, 0.95350832]])

In [29]:
nmf.components_.shape

(3, 9724)

And reconstruct to get our new user's predicted ratings for all movies:

In [30]:
ypred = np.dot(hidden_profile, nmf.components_)
ypred

array([[3.31565391, 3.07230689, 3.01405729, ..., 2.99053992, 2.9917284 ,
        2.99188658]])

### Now we wish to output the best-ranking movies that the user has not rated yet - our recommendations!

In [31]:
df3 = pd.DataFrame(ypred.T, columns=['rating'], index=movieslist)

In [32]:
df3['profile'] = profile.values()

In [33]:
df3.head()

Unnamed: 0,rating,profile
1,3.315654,
2,3.072307,
3,3.014057,
4,2.983799,
5,2.998892,


We need to get movieId into the table:

In [44]:
lookuptable = dict(movies[['movieId', 'title']].groupby('movieId')['title'].first())
lookuptable2 = dict(movies[['title', 'movieId']].groupby('title')['movieId'].first())

In [34]:
df4 = df3[df3['profile'].isna()].sort_values(by='rating', ascending=False)
df4['movieId'] = df4.index
df4.head()

Unnamed: 0,rating,profile,movieId
318,3.732788,,318
356,3.619126,,356
296,3.591657,,296
2571,3.535687,,2571
593,3.523176,,593


In [36]:
df4['title'] = df4['movieId'].map(lambda x: lookuptable[x])
df4.head()

Unnamed: 0,rating,profile,movieId,title
318,3.732788,,318,"Shawshank Redemption, The (1994)"
356,3.619126,,356,Forrest Gump (1994)
296,3.591657,,296,Pulp Fiction (1994)
2571,3.535687,,2571,"Matrix, The (1999)"
593,3.523176,,593,"Silence of the Lambs, The (1991)"


### Now we need to be able to match user input to a valid movie title in order to then lookup the ID:

In [46]:
def find_movies(input1, input2, input3):
    """Generates recommendations from user's input"""
    titles = list(movies['title'])
    movie1 = process.extractOne(input1, titles)
    id1 = movie1.map(lambda x: lookuptable2[x])
    movie2 = process.extractOne(input2, titles)
    id2 = movie2.map(lambda x: lookuptable2[x])
    movie3 = process.extractOne(input3, titles)
    id3 = movie3.map(lambda x: lookuptable2[x])
    return [id1, id2, id3]

In [47]:
print(find_movies('Mean Gurls', 'Titonic', 'Djongo'))

AttributeError: 'tuple' object has no attribute 'map'