# Unsupervised learning: movie recommender

## Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.decomposition import NMF

In [3]:
import pickle

## Import data

In [4]:
ratings = pd.read_csv("ratings.csv")

In [5]:
movies = pd.read_csv("movies.csv")

Inspect the tables.

In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
ratings.shape

(100836, 4)

In [8]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
movies.shape

(9742, 3)

## Check for duplicate titles in movies

In [10]:
movies.duplicated(subset = "title").value_counts()

False    9737
True        5
dtype: int64

## Drop duplicates

In [11]:
# Keep the first row per title. Rebinding (instead of inplace=True) leaves the
# frame object returned by read_csv untouched.
movies = movies.drop_duplicates(subset = "title", keep = "first")

In [12]:
movies.shape

(9737, 3)

## Merge tables

Merge ratings and movies.

In [13]:
# Right join keeps every row of `ratings`; a rating whose movieId has no row left in
# `movies` (presumably the duplicate titles dropped above) ends up with a NaN title.
df = pd.merge(movies, ratings, on = "movieId", how = "right")

In [14]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,964981247
2,6,Heat (1995),Action|Crime|Thriller,1,4.0,964982224
3,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1,5.0,964983815
4,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1,5.0,964982931


In [15]:
df.shape

(100836, 6)

## Drop genres, timestamp and movieId columns

In [16]:
df = df.drop(columns = ["genres", "timestamp", "movieId"])

In [17]:
df.isna().sum()

title     6
userId    0
rating    0
dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   title   100830 non-null  object 
 1   userId  100836 non-null  int64  
 2   rating  100836 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 3.1+ MB


## Filter out movies that have been rated by 20 or fewer users

In [19]:
df["reviews"] = df.groupby(["title"])["rating"].transform("count")

In [20]:
df.head()

Unnamed: 0,title,userId,rating,reviews
0,Toy Story (1995),1,4.0,215.0
1,Grumpier Old Men (1995),1,4.0,52.0
2,Heat (1995),1,4.0,102.0
3,Seven (a.k.a. Se7en) (1995),1,5.0,203.0
4,"Usual Suspects, The (1995)",1,5.0,204.0


In [21]:
df.shape

(100836, 4)

In [22]:
# Keep only titles with strictly more than 20 ratings, and reorder the columns.
# Rows whose "reviews" is NaN fail the comparison and are dropped as well.
df = df[df["reviews"] > 20][["userId", "title", "rating"]]

In [23]:
df.head()

Unnamed: 0,userId,title,rating
0,1,Toy Story (1995),4.0
1,1,Grumpier Old Men (1995),4.0
2,1,Heat (1995),4.0
3,1,Seven (a.k.a. Se7en) (1995),5.0
4,1,"Usual Suspects, The (1995)",5.0


In [24]:
df.shape

(66658, 3)

## Convert the long-format ratings into a wide user × movie matrix

In [25]:
df_pivot = pd.pivot_table(df, index = "userId", columns = "title", values = "rating" )

In [26]:
df_pivot

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,4.0,,,,...,5.0,,,,,,,,,4.0
2,,,,,,,,,,,...,,,,,3.0,,,,,
3,,,,,,,,,,0.5,...,,,,,,,,,,
4,,,,,5.0,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,5.0,,...,3.5,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,3.0,,...,,,,,,3.0,,4.5,3.5,
609,,,,,,,,,,,...,,,,,,,,,,


## Impute the missing values

In [27]:
# Encode "no rating" as 0 so the matrix contains no NaN; note that the
# factorisation below will treat these zeros like real ratings.
df_pivot = df_pivot.fillna(0)

In [28]:
df_pivot.head()

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
df_pivot.shape

(610, 1235)

## NMF

In [30]:
R = df_pivot.values

In [31]:
len(R)

610

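Check the number of unique genre combinations in `movies` (each `genres` value is a `|`-separated combination, so this is not the number of individual genres) as a rough guide for the number of components for the NMF.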

In [32]:
movies["genres"].nunique()

951

In [33]:
# NOTE(review): n_components=450 is large relative to the 610 x 1235 rating matrix;
# presumably chosen from the genre count above. Confirm it does not simply memorise the input.
# max_iter=200 equals the estimator default (the fitted repr below omits it).
m = NMF(n_components = 450, init = "random", random_state = 10, max_iter = 200)

In [34]:
m.fit(R)



NMF(init='random', n_components=450, random_state=10)

In [35]:
Q = m.components_

In [36]:
Q

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.07816135, 0.        , 0.        , ..., 0.02872464, 0.03046956,
        0.07182464],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.01815564, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.13936332, 0.        , ..., 0.69663316, 0.        ,
        0.27872838],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [37]:
P = m.transform(R)

In [38]:
P

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [39]:
print(m.reconstruction_err_)

178.1992642070337


In [40]:
nR = np.dot(P, Q)

In [41]:
nR

array([[5.39915407e-07, 2.47310491e-07, 3.33702512e-07, ...,
        4.51421320e-04, 0.00000000e+00, 3.99968435e+00],
       [1.75492217e-01, 1.57998483e-01, 6.44679860e-02, ...,
        3.97695312e-05, 0.00000000e+00, 2.44023768e-04],
       [4.52737599e-06, 1.03983324e-03, 5.33088907e-02, ...,
        9.07705648e-03, 0.00000000e+00, 2.07464446e-02],
       ...,
       [1.10116325e-04, 3.32620959e-03, 8.35773998e-04, ...,
        4.50107599e+00, 3.49686447e+00, 7.88179098e-04],
       [1.71905620e-02, 2.43819605e-02, 2.26830484e-02, ...,
        3.19460877e-03, 0.00000000e+00, 0.00000000e+00],
       [3.50016099e+00, 1.32099733e-06, 3.58758344e-06, ...,
        4.65993805e-03, 2.00006934e+00, 0.00000000e+00]])

In [42]:
nR.shape

(610, 1235)

## Save the model

In [43]:
filename = "my_nmf_model.sav"

In [44]:
# Use a context manager so the file is flushed and closed even if dumping fails.
with open(filename, "wb") as model_file:
    pickle.dump(m, model_file)

## Load the model

In [45]:
# NOTE: pickle.load can execute arbitrary code; only load model files you created yourself.
# The context manager closes the file handle once the model has been read.
with open(filename, "rb") as model_file:
    model = pickle.load(model_file)

## Create a film dataframe for all the films in the pivot table.

In [46]:
mv = df_pivot.columns

In [47]:
mv = pd.DataFrame(mv)

In [48]:
mv

Unnamed: 0,title
0,(500) Days of Summer (2009)
1,10 Things I Hate About You (1999)
2,101 Dalmatians (1996)
3,101 Dalmatians (One Hundred and One Dalmatians...
4,12 Angry Men (1957)
...,...
1230,Zoolander (2001)
1231,Zootopia (2016)
1232,eXistenZ (1999)
1233,xXx (2002)


In [49]:
# Persist the titles used as pivot columns. With the default index=True the
# row index is written too, as an unnamed first column.
mv.to_csv("movies_list.csv")

## Assign some ratings to a user.

In [50]:
user_query = { "Matrix, The (1999)" : 1, "Shawshank Redemption, The (1994)" : 3, "Jewel of the Nile, The (1985)" : 4.5}

In [61]:
def recommend_nmf(query, movies_list, model = "my_nmf_model.sav", k = 10):
    """
    Recommend the top k unseen movies for a user query with a trained NMF model.

    Parameters
    ----------
    query : dict
        Movies already rated by the user, mapping title to rating,
        e.g. {"movie_A": 3, "movie_B": 4.5}. Titles that do not occur in
        ``movies_list`` are ignored.

    movies_list : pandas.DataFrame
        Frame with a "title" column. The titles must be in the same order as
        the columns of the matrix the model was fitted on.

    model : fitted model or path, optional
        Object exposing ``transform`` and ``components_`` (e.g. a fitted
        NMF), or the path of a pickle file containing such an object.
        By default "my_nmf_model.sav".

    k : int, optional
        no. of top movies to recommend, by default 10

    Returns
    -------
    list
        Titles of the k highest-scoring movies that are not in ``query``,
        best first.
    """

    # A path was passed (or the default was used): read the model from disk.
    # NOTE: pickle.load can run arbitrary code - only load trusted files.
    if not hasattr(model, "transform"):
        with open(model, "rb") as model_file:
            model = pickle.load(model_file)

    # Take the movie order from the argument, not from a notebook global.
    titles = pd.Index(movies_list["title"])

    # Build the user's rating vector aligned to that movie order;
    # movies without a rating are encoded as 0.
    ratings = pd.Series(query, dtype = float).reindex(titles).fillna(0)
    user_vector = ratings.to_numpy().reshape(1, -1)

    # calculate the score with the NMF model
    user_factors = model.transform(user_vector)
    scores = np.dot(user_factors, model.components_)
    rec = pd.Series(scores[0], index = titles)

    # remove the movies already seen by the user; unknown titles are skipped
    rec = rec.drop(labels = list(query), errors = "ignore")

    # return the titles of the top-k highest scored movies
    return rec.sort_values(ascending = False).head(k).index.tolist()

In [62]:
recommendations = recommend_nmf(user_query, mv, model, k = 10)

In [63]:
recommendations

['Good Will Hunting (1997)',
 'Saving Private Ryan (1998)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Mummy, The (1999)',
 'Total Recall (1990)',
 'Beautiful Mind, A (2001)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Monty Python and the Holy Grail (1975)',
 'Princess Bride, The (1987)',
 'Men in Black (a.k.a. MIB) (1997)']