# Unsupervised learning-movie recommender

## Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.decomposition import NMF

# Project goal

In this project, you will build a proof of concept: A web application that showcases different movie recommendation algorithms.

1. Download a small version of the MovieLens-dataset


**2. Implement a baseline recommender**


3. Derive a user-item matrix


4. Pick and implement a Collaborative Filtering recommender:

    a) Collaborative Filtering with Matrix Factorization
    
    b) Neighbourhood based Collaborative Filtering
    
    
5. Write a flask web interface


6. Connect your recommender-model to flask



# Import data

Import all 4 csv files of the *ml-latest-small.zip* dataset.

In [3]:
ratings = pd.read_csv("ratings.csv")

In [4]:
movies = pd.read_csv("movies.csv")

In [5]:
tags = pd.read_csv("tags.csv")

In [6]:
links = pd.read_csv("links.csv")

Inspect the tables.

In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
ratings.shape

(100836, 4)

In [9]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
movies.shape

(9742, 3)

In [11]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [12]:
tags.shape

(3683, 4)

In [13]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [14]:
links.shape

(9742, 3)

# Merge tables

Merge ratings and movies.

In [15]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [16]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [17]:
movies["movieId"].nunique()

9742

In [18]:
df = pd.merge(movies, ratings, on = "movieId", how = "right")

In [19]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,964981247
2,6,Heat (1995),Action|Crime|Thriller,1,4.0,964982224
3,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1,5.0,964983815
4,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1,5.0,964982931


In [20]:
df.shape

(100836, 6)

Drop genres and timestamp columns.

In [21]:
df = df.drop(columns = ["genres", "timestamp"])

In [22]:
df

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,3,Grumpier Old Men (1995),1,4.0
2,6,Heat (1995),1,4.0
3,47,Seven (a.k.a. Se7en) (1995),1,5.0
4,50,"Usual Suspects, The (1995)",1,5.0
...,...,...,...,...
100831,166534,Split (2017),610,4.0
100832,168248,John Wick: Chapter Two (2017),610,5.0
100833,168250,Get Out (2017),610,5.0
100834,168252,Logan (2017),610,5.0


In [23]:
df.isna().sum()

movieId    0
title      0
userId     0
rating     0
dtype: int64

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   movieId  100836 non-null  int64  
 1   title    100836 non-null  object 
 2   userId   100836 non-null  int64  
 3   rating   100836 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 3.8+ MB


# Calculate the average rating for each movie in the dataset

In [25]:
df.groupby(["title"])["rating"].mean().sort_values(ascending = False)

title
Gena the Crocodile (1969)                  5.0
True Stories (1986)                        5.0
Cosmic Scrat-tastrophe (2015)              5.0
Love and Pigeons (1985)                    5.0
Red Sorghum (Hong gao liang) (1987)        5.0
                                          ... 
Don't Look Now (1973)                      0.5
Journey 2: The Mysterious Island (2012)    0.5
Joe Dirt 2: Beautiful Loser (2015)         0.5
Jesus Christ Vampire Hunter (2001)         0.5
Fullmetal Alchemist 2018 (2017)            0.5
Name: rating, Length: 9719, dtype: float64

# Filter out movies that have been watched by less than 20 users

In [26]:
df["reviews"] = df.groupby(["title"])["rating"].transform("count")

In [27]:
df.head()

Unnamed: 0,movieId,title,userId,rating,reviews
0,1,Toy Story (1995),1,4.0,215
1,3,Grumpier Old Men (1995),1,4.0,52
2,6,Heat (1995),1,4.0,102
3,47,Seven (a.k.a. Se7en) (1995),1,5.0,203
4,50,"Usual Suspects, The (1995)",1,5.0,204


In [28]:
df.shape

(100836, 5)

In [29]:
df = df[df["reviews"] > 20][["userId", "title", "movieId", "rating"]]

In [30]:
df.head()

Unnamed: 0,userId,title,movieId,rating
0,1,Toy Story (1995),1,4.0
1,1,Grumpier Old Men (1995),3,4.0
2,1,Heat (1995),6,4.0
3,1,Seven (a.k.a. Se7en) (1995),47,5.0
4,1,"Usual Suspects, The (1995)",50,5.0


In [31]:
df.shape

(66661, 4)

# Extract film, movieId, rating lists

In [32]:
df = df.sort_values(by = ["movieId"], ascending = True)

In [33]:
df.head()

Unnamed: 0,userId,title,movieId,rating
0,1,Toy Story (1995),1,4.0
61023,396,Toy Story (1995),1,5.0
22638,155,Toy Story (1995),1,3.0
22684,156,Toy Story (1995),1,4.0
60405,391,Toy Story (1995),1,3.0


In [34]:
df.shape

(66661, 4)

Films.

In [35]:
films = df["title"].unique().tolist()

In [36]:
films

['Toy Story (1995)',
 'Jumanji (1995)',
 'Grumpier Old Men (1995)',
 'Father of the Bride Part II (1995)',
 'Heat (1995)',
 'Sabrina (1995)',
 'GoldenEye (1995)',
 'American President, The (1995)',
 'Casino (1995)',
 'Sense and Sensibility (1995)',
 'Ace Ventura: When Nature Calls (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Powder (1995)',
 'Leaving Las Vegas (1995)',
 'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
 'Dangerous Minds (1995)',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 'Babe (1995)',
 'Dead Man Walking (1995)',
 'Clueless (1995)',
 'Mortal Kombat (1995)',
 'To Die For (1995)',
 'Seven (a.k.a. Se7en) (1995)',
 'Pocahontas (1995)',
 'Usual Suspects, The (1995)',
 'Mighty Aphrodite (1995)',
 'Postman, The (Postino, Il) (1994)',
 'Indian in the Cupboard, The (1995)',
 "Mr. Holland's Opus (1995)",
 'Bio-Dome (1996)',
 'From Dusk Till Dawn (1996)',
 'Broken Arrow (1996)',
 'Bottle Rocket (1996)',
 'Happy Gilmore (1996)',
 'Bridges of Madison Cou

MovieIds.

In [37]:
movieIds = df["movieId"].unique().tolist()

In [38]:
movieIds

[1,
 2,
 3,
 5,
 6,
 7,
 10,
 11,
 16,
 17,
 19,
 21,
 22,
 24,
 25,
 29,
 31,
 32,
 34,
 36,
 39,
 44,
 45,
 47,
 48,
 50,
 52,
 58,
 60,
 62,
 65,
 70,
 95,
 101,
 104,
 105,
 107,
 110,
 111,
 112,
 135,
 141,
 145,
 150,
 151,
 153,
 158,
 160,
 161,
 163,
 165,
 168,
 170,
 172,
 173,
 180,
 185,
 186,
 193,
 196,
 198,
 203,
 204,
 207,
 208,
 215,
 216,
 223,
 224,
 225,
 227,
 230,
 231,
 235,
 236,
 237,
 246,
 247,
 252,
 253,
 256,
 260,
 261,
 262,
 265,
 266,
 272,
 273,
 276,
 277,
 282,
 288,
 292,
 293,
 296,
 300,
 303,
 307,
 315,
 316,
 317,
 318,
 327,
 329,
 333,
 337,
 339,
 342,
 344,
 345,
 348,
 349,
 350,
 353,
 355,
 356,
 357,
 362,
 364,
 367,
 368,
 370,
 371,
 372,
 374,
 376,
 377,
 379,
 380,
 383,
 410,
 413,
 420,
 431,
 432,
 434,
 435,
 440,
 441,
 442,
 454,
 455,
 457,
 466,
 468,
 471,
 474,
 475,
 480,
 485,
 494,
 497,
 500,
 508,
 509,
 515,
 519,
 520,
 524,
 527,
 529,
 531,
 539,
 541,
 543,
 546,
 551,
 552,
 553,
 555,
 562,
 585,
 586,
 

Rating.

In [39]:
rates = df.groupby(["movieId"])["rating"].mean().values.tolist()

In [40]:
rates

[3.9209302325581397,
 3.4318181818181817,
 3.2596153846153846,
 3.0714285714285716,
 3.946078431372549,
 3.185185185185185,
 3.496212121212121,
 3.6714285714285713,
 3.926829268292683,
 3.7761194029850746,
 2.727272727272727,
 3.49438202247191,
 3.2222222222222223,
 3.125,
 3.625,
 4.0131578947368425,
 3.1842105263157894,
 3.983050847457627,
 3.65234375,
 3.8358208955223883,
 3.293269230769231,
 2.5434782608695654,
 3.3125,
 3.9753694581280787,
 3.1470588235294117,
 4.237745098039215,
 3.4642857142857144,
 4.027027027027027,
 3.235294117647059,
 3.70625,
 2.532258064516129,
 3.5090909090909093,
 3.0238095238095237,
 3.782608695652174,
 3.4393939393939394,
 3.282608695652174,
 3.326923076923077,
 4.031645569620253,
 4.105769230769231,
 3.5434782608695654,
 2.9516129032258065,
 3.494186046511628,
 3.2450980392156863,
 3.845771144278607,
 3.5454545454545454,
 2.9160583941605838,
 2.806451612903226,
 2.8771929824561404,
 3.6359223300970873,
 3.5606060606060606,
 3.5555555555555554,
 3.0833

# Calculate again the average rating for each movie in the dataset

In [41]:
df.groupby(["title"])["rating"].mean().sort_values(ascending = False)

title
Shawshank Redemption, The (1994)                 4.429022
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)    4.333333
Philadelphia Story, The (1940)                   4.310345
Lawrence of Arabia (1962)                        4.300000
In the Name of the Father (1993)                 4.300000
                                                   ...   
I Know What You Did Last Summer (1997)           2.109375
Inspector Gadget (1999)                          2.095238
Super Mario Bros. (1993)                         2.000000
Godzilla (1998)                                  1.954545
Anaconda (1997)                                  1.925926
Name: rating, Length: 1235, dtype: float64

# Calculate the number of ratings per user

In [42]:
df.groupby(["userId"])["rating"].count().sort_values(ascending = False)

userId
414    1070
599     926
68      853
474     724
274     700
       ... 
320       9
324       9
397       9
578       8
175       4
Name: rating, Length: 610, dtype: int64

# Calculate the number of ratings per film

In [43]:
df.groupby(["title"])["rating"].count().sort_values(ascending = False)

title
Forrest Gump (1994)                 329
Shawshank Redemption, The (1994)    317
Pulp Fiction (1994)                 307
Silence of the Lambs, The (1991)    279
Matrix, The (1999)                  278
                                   ... 
Keeping the Faith (2000)             21
Joe Dirt (2001)                      21
Jewel of the Nile, The (1985)        21
Other Guys, The (2010)               21
Heavenly Creatures (1994)            21
Name: rating, Length: 1235, dtype: int64

# Ratings dictionary

In [52]:
def dictionary(Ids, rating):

    """
    
    Creates a dictionary of the films and their ratings 
    and returns it.
    
    Parameters:
    ----------
    Ids: List of movieIds.
    rating: List of ratings.
    
    """
    
    keys = []
    values = []
    for movie in Ids:
        keys.append(movie)
    for n in rating:
        values.append(n)

    dic = {keys[i]: values[i] for i in range(len(keys))}
 
    return dic

In [53]:
ratings = dictionary(movieIds, rates)

In [55]:
#ratings

Alternatively run the following code.

In [56]:
#ratings = df.groupby(["movieId"])["rating"].mean().to_dict()
#ratings

Convert dictionary to a dataframe.

In [57]:
df_ratings = pd.DataFrame(list(ratings.items()), columns = ["movieId", "rating"])

In [59]:
df_ratings.head()

Unnamed: 0,movieId,rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,5,3.071429
4,6,3.946078


In [61]:
df_ratings.shape

(1237, 2)

In [60]:
df_ratings.columns

Index(['movieId', 'rating'], dtype='object')

# Baseline recommender

In [62]:
user_query = { 246 : 1, 318 : 3, 50 : 4.5}

In [63]:
#ratings

In [65]:
def recommend_popular(query, ratings, k = 10):
    
    """
    
    This function recommends popular movies depeding on their rating.
    
    Parameters:
    -----------
    query: A dictionary referring to ratings of seen movies from one user,
    with movieIds as keys and ratings as values.
    
    ratings: A dataframe with movieIds and their ratings.
    
    k: the number of movies to be recommended.
    """
    user = pd.DataFrame(list(query.items()), columns = ["movieId", "rating"])
    user.set_index("movieId", inplace = True)
    
    ratings = ratings.sort_values(by = "rating", ascending = False)
    ratings_index = ratings.set_index("movieId")
    
    for i in user.index:
        ratings_index.drop(index = [i], inplace = True)
        
    ratings_index.reset_index(inplace = True)
    
    return ratings_index.head(k)

In [66]:
rec = recommend_popular(user_query, df_ratings, 10)

In [67]:
rec

Unnamed: 0,movieId,rating
0,922,4.333333
1,898,4.310345
2,475,4.3
3,1204,4.3
4,858,4.289062
5,1235,4.288462
6,168252,4.28
7,2959,4.272936
8,1276,4.27193
9,750,4.268041


In [None]:
#df

In [None]:
#df["userId"].max()

In [None]:
#user

In [None]:
#user["userId"] = 611

In [None]:
#seen_movies = user[["movieId"]]
#seen_movies.set_index("movieId", inplace = True)
#seen_movies

In [None]:
#all_movies = df_ratings[["movieId"]]
#all_movies.set_index("movieId", inplace = True)
#all_movies

In [None]:
#for i in seen_movies.index:
    #all_movies = all_movies.drop(index = [i])
    #df_test = df_ratings.loc[seen_movies[i]]
