# Unsupervised learning-movie recommender

## Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["figure.figsize"] = (16, 9)

# Project goal

In this project, you will build a proof of concept: A web application that showcases different movie recommendation algorithms.

1. Download a small version of the MovieLens-dataset


2. Implement a baseline recommender


3. Derive a user-item matrix


4. Pick and implement a Collaborative Filtering recommender:

    a) Collaborative Filtering with Matrix Factorization
    
    **b) Neighbourhood based Collaborative Filtering**
    
    
5. Write a flask web interface


6. Connect your recommender-model to flask



# Import data

## Import the movies and ratings csv files of the *ml-latest-small.zip* dataset

In [3]:
ratings = pd.read_csv("ratings.csv")

In [4]:
movies = pd.read_csv("movies.csv")

## Inspect the tables

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
ratings.shape

(100836, 4)

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
movies.shape

(9742, 3)

## Check for duplicates in movies

In [9]:
movies.duplicated(subset = "title").value_counts()

False    9737
True        5
dtype: int64

### Drop duplicates

In [10]:
movies.drop_duplicates(subset = "title", keep = "first", inplace = True)

In [11]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


# Merge tables

## Merge ratings and movies

In [13]:
ratings["movieId"].nunique()

9724

In [14]:
movies["movieId"].nunique()

9737

In [15]:
df = pd.merge(movies, ratings, on = "movieId", how = "right")

In [16]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,964981247
2,6,Heat (1995),Action|Crime|Thriller,1,4.0,964982224
3,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1,5.0,964983815
4,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1,5.0,964982931


In [17]:
df.shape

(100836, 6)

## Drop genres, timestamp and movieId columns

In [19]:
df = df.drop(columns = ["genres", "timestamp", "movieId"])

In [20]:
df

Unnamed: 0,title,userId,rating
0,Toy Story (1995),1,4.0
1,Grumpier Old Men (1995),1,4.0
2,Heat (1995),1,4.0
3,Seven (a.k.a. Se7en) (1995),1,5.0
4,"Usual Suspects, The (1995)",1,5.0
...,...,...,...
100831,Split (2017),610,4.0
100832,John Wick: Chapter Two (2017),610,5.0
100833,Get Out (2017),610,5.0
100834,Logan (2017),610,5.0


In [21]:
df.isna().sum()

title     6
userId    0
rating    0
dtype: int64

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   title   100830 non-null  object 
 1   userId  100836 non-null  int64  
 2   rating  100836 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 3.1+ MB


# Filter out movies that have been watched by less than 20 users

In [23]:
df["reviews"] = df.groupby(["title"])["rating"].transform("count")

In [24]:
df.head()

Unnamed: 0,title,userId,rating,reviews
0,Toy Story (1995),1,4.0,215.0
1,Grumpier Old Men (1995),1,4.0,52.0
2,Heat (1995),1,4.0,102.0
3,Seven (a.k.a. Se7en) (1995),1,5.0,203.0
4,"Usual Suspects, The (1995)",1,5.0,204.0


In [25]:
df.shape

(100836, 4)

In [27]:
df = df[df["reviews"] > 20][["userId", "title", "rating"]]

In [31]:
df.head()

Unnamed: 0,userId,title,rating
0,1,Toy Story (1995),4.0
1,1,Grumpier Old Men (1995),4.0
2,1,Heat (1995),4.0
3,1,Seven (a.k.a. Se7en) (1995),5.0
4,1,"Usual Suspects, The (1995)",5.0


In [32]:
df.shape

(66658, 3)

# Traspose the dataframe

In [40]:
df_pivot = pd.pivot_table(df, index = "title", columns = "userId", values = "rating")

## Impute the missing values

In [41]:
df_pivot = df_pivot.fillna(0)

In [42]:
df_pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
df_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zoolander (2001),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0
Zootopia (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0


# Cosine similarity matrix

In [44]:
def cosim(vec1, vec2):
    
    """
    
    Returns the cosine similarity between vectors.
    
    """
    num = np.dot(vec1, vec2)
    den = np.sqrt(np.dot(vec1, vec1)) * np.sqrt(np.dot(vec2, vec2))
    
    return num / den

In [45]:
cosim_table = []

for i in df_pivot.columns:
    row = []
    
    for k in df_pivot.columns:
        row.append(cosim(df_pivot[i], df_pivot[k]))
    
    cosim_table.append(row)

In [46]:
df_cosim = pd.DataFrame(cosim_table, index = df_pivot.columns, columns = df_pivot.columns).round(2)

In [47]:
df_cosim

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.00,0.03,0.12,0.28,0.15,0.19,0.19,0.15,0.09,0.03,...,0.10,0.20,0.38,0.09,0.20,0.27,0.32,0.36,0.13,0.23
2,0.03,1.00,0.00,0.01,0.02,0.04,0.03,0.03,0.00,0.10,...,0.24,0.02,0.02,0.00,0.00,0.05,0.02,0.06,0.04,0.17
3,0.12,0.00,1.00,0.01,0.01,0.01,0.00,0.01,0.00,0.00,...,0.01,0.01,0.07,0.00,0.03,0.03,0.04,0.05,0.00,0.06
4,0.28,0.01,0.01,1.00,0.15,0.13,0.15,0.08,0.02,0.05,...,0.11,0.15,0.42,0.08,0.13,0.31,0.19,0.22,0.03,0.18
5,0.15,0.02,0.01,0.15,1.00,0.41,0.12,0.45,0.00,0.04,...,0.07,0.46,0.15,0.31,0.20,0.16,0.19,0.16,0.33,0.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.27,0.05,0.03,0.31,0.16,0.17,0.32,0.15,0.13,0.15,...,0.29,0.19,0.50,0.11,0.26,1.00,0.26,0.45,0.12,0.41
607,0.32,0.02,0.04,0.19,0.19,0.21,0.23,0.22,0.02,0.02,...,0.11,0.25,0.34,0.16,0.18,0.26,1.00,0.34,0.21,0.24
608,0.36,0.06,0.05,0.22,0.16,0.26,0.40,0.22,0.14,0.12,...,0.19,0.24,0.40,0.20,0.25,0.45,0.34,1.00,0.17,0.49
609,0.13,0.04,0.00,0.03,0.33,0.34,0.12,0.51,0.00,0.04,...,0.05,0.45,0.09,0.31,0.15,0.12,0.21,0.17,1.00,0.10


# Predictions

## Pick a user

In [48]:
df_20 = pd.DataFrame(df_pivot[20])

In [49]:
df_20

Unnamed: 0_level_0,20
title,Unnamed: 1_level_1
(500) Days of Summer (2009),0.0
10 Things I Hate About You (1999),0.0
101 Dalmatians (1996),3.0
101 Dalmatians (One Hundred and One Dalmatians) (1961),4.0
12 Angry Men (1957),0.0
...,...
Zoolander (2001),3.5
Zootopia (2016),0.0
eXistenZ (1999),0.0
xXx (2002),0.5


## Extract unseen movies for the user

In [55]:
df_20 = df_20[df_20[20] == 0]

In [56]:
df_20

Unnamed: 0_level_0,20
title,Unnamed: 1_level_1
(500) Days of Summer (2009),0.0
10 Things I Hate About You (1999),0.0
12 Angry Men (1957),0.0
13 Going on 30 (2004),0.0
"13th Warrior, The (1999)",0.0
...,...
Zodiac (2007),0.0
Zombieland (2009),0.0
Zootopia (2016),0.0
eXistenZ (1999),0.0


## Get the list of the users who have seen these movies

In [66]:
df_user = df_pivot.loc[df_pivot[20] == 0]

In [68]:
df_user

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13 Going on 30 (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
"13th Warrior, The (1999)",4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zodiac (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
Zombieland (2009),0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
Zootopia (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0


## Cosine similarity ł predictions

In [69]:
cosim_user = pd.DataFrame(df_cosim[20])
cosim_user

Unnamed: 0_level_0,20
userId,Unnamed: 1_level_1
1,0.18
2,0.02
3,0.02
4,0.16
5,0.12
...,...
606,0.23
607,0.10
608,0.33
609,0.00


In [70]:
num = np.dot(df_user, cosim_user)
num

array([[27.175],
       [38.505],
       [32.61 ],
       ...,
       [19.265],
       [15.26 ],
       [14.52 ]])

In [71]:
df_count = df_user.where(df_user == 0, 1)

In [72]:
df_count

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13 Going on 30 (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"13th Warrior, The (1999)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zodiac (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Zombieland (2009),0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Zootopia (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [73]:
denom = np.dot(df_count, cosim_user)

In [74]:
denom

array([[ 7.36],
       [10.83],
       [ 7.79],
       ...,
       [ 5.12],
       [ 3.98],
       [ 4.64]])

In [75]:
predict_ratings = num/denom

In [76]:
predict_ratings

array([[3.69225543],
       [3.55540166],
       [4.18613607],
       ...,
       [3.76269531],
       [3.83417085],
       [3.12931034]])

In [77]:
df_predict_ratings = pd.DataFrame(predict_ratings, columns = ["prediction"])

In [78]:
df_predict_ratings

Unnamed: 0,prediction
0,3.692255
1,3.555402
2,4.186136
3,3.127599
4,2.702614
...,...
1068,3.667411
1069,3.957871
1070,3.762695
1071,3.834171


In [79]:
df_predict_ratings.isna().sum()

prediction    0
dtype: int64

In [80]:
recommendations = pd.DataFrame(df_20.reset_index())

In [81]:
recommendations["prediction"] = df_predict_ratings["prediction"]

In [82]:
recommendations

Unnamed: 0,title,20,prediction
0,(500) Days of Summer (2009),0.0,3.692255
1,10 Things I Hate About You (1999),0.0,3.555402
2,12 Angry Men (1957),0.0,4.186136
3,13 Going on 30 (2004),0.0,3.127599
4,"13th Warrior, The (1999)",0.0,2.702614
...,...,...,...
1068,Zodiac (2007),0.0,3.667411
1069,Zombieland (2009),0.0,3.957871
1070,Zootopia (2016),0.0,3.762695
1071,eXistenZ (1999),0.0,3.834171


# Recommendation

## Find the most similar users

In [83]:
df_cosim

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.00,0.03,0.12,0.28,0.15,0.19,0.19,0.15,0.09,0.03,...,0.10,0.20,0.38,0.09,0.20,0.27,0.32,0.36,0.13,0.23
2,0.03,1.00,0.00,0.01,0.02,0.04,0.03,0.03,0.00,0.10,...,0.24,0.02,0.02,0.00,0.00,0.05,0.02,0.06,0.04,0.17
3,0.12,0.00,1.00,0.01,0.01,0.01,0.00,0.01,0.00,0.00,...,0.01,0.01,0.07,0.00,0.03,0.03,0.04,0.05,0.00,0.06
4,0.28,0.01,0.01,1.00,0.15,0.13,0.15,0.08,0.02,0.05,...,0.11,0.15,0.42,0.08,0.13,0.31,0.19,0.22,0.03,0.18
5,0.15,0.02,0.01,0.15,1.00,0.41,0.12,0.45,0.00,0.04,...,0.07,0.46,0.15,0.31,0.20,0.16,0.19,0.16,0.33,0.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.27,0.05,0.03,0.31,0.16,0.17,0.32,0.15,0.13,0.15,...,0.29,0.19,0.50,0.11,0.26,1.00,0.26,0.45,0.12,0.41
607,0.32,0.02,0.04,0.19,0.19,0.21,0.23,0.22,0.02,0.02,...,0.11,0.25,0.34,0.16,0.18,0.26,1.00,0.34,0.21,0.24
608,0.36,0.06,0.05,0.22,0.16,0.26,0.40,0.22,0.14,0.12,...,0.19,0.24,0.40,0.20,0.25,0.45,0.34,1.00,0.17,0.49
609,0.13,0.04,0.00,0.03,0.33,0.34,0.12,0.51,0.00,0.04,...,0.05,0.45,0.09,0.31,0.15,0.12,0.21,0.17,1.00,0.10


## Select on user and find their neighbors

In [85]:
cosim_neighbors = pd.DataFrame(df_cosim[20])

In [86]:
cosim_neighbors.drop(index = 20, inplace = True)

In [103]:
cosim_neighbors = cosim_user.sort_values(by = 20, ascending = False).head(30)

In [104]:
cosim_neighbors

Unnamed: 0_level_0,20
userId,Unnamed: 1_level_1
20,1.0
169,0.39
381,0.39
177,0.37
517,0.37
525,0.37
232,0.35
509,0.35
380,0.35
474,0.35


In [105]:
neighbors = list(cosim_user.sort_values(by = 20, ascending = True).index[:30])

In [106]:
neighbors

[578,
 547,
 302,
 360,
 110,
 251,
 383,
 481,
 118,
 421,
 297,
 241,
 532,
 544,
 128,
 152,
 389,
 515,
 609,
 397,
 197,
 506,
 499,
 329,
 335,
 72,
 568,
 85,
 207,
 440]

## Calculate average movie rating of the neighbors

In [107]:
df_rating_neighbors = df.pivot_table(values = "rating", columns = "title", index = "userId")

In [108]:
df_rating_neighbors

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,4.0,,,,...,5.0,,,,,,,,,4.0
2,,,,,,,,,,,...,,,,,3.0,,,,,
3,,,,,,,,,,0.5,...,,,,,,,,,,
4,,,,,5.0,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,5.0,,...,3.5,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,3.0,,...,,,,,,3.0,,4.5,3.5,
609,,,,,,,,,,,...,,,,,,,,,,


In [109]:
df_rating_neighbors.fillna(0, inplace = True)

In [110]:
df_rating_neighbors

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.5,3.5,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
df_neighbors = df_rating_neighbors.loc[neighbors]

In [112]:
df_neighbors

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
421,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0


## Numerator

In [114]:
num = np.dot(cosim_neighbors.T, df_neighbors)

In [115]:
num.shape

(1, 1235)

## Denominator

In [116]:
df_rating_count = df_neighbors.where(df_neighbors == 0, 1)

In [117]:
df_rating_count

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
421,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [118]:
denom = np.dot(cosim_neighbors.T, df_rating_count)

In [119]:
denom.shape

(1, 1235)

## Prediction

In [120]:
predict_ratings = num/denom

  predict_ratings = num/denom


In [121]:
predict_ratings

array([[nan, nan, nan, ..., nan, nan,  3.]])

In [122]:
df_predict_ratings = pd.DataFrame(predict_ratings.T, columns = ["prediction"])

In [123]:
df_predict_ratings

Unnamed: 0,prediction
0,
1,
2,
3,
4,4.742647
...,...
1230,
1231,
1232,
1233,


In [124]:
df_mv = df_rating_count.T.reset_index()

In [125]:
df_mv = pd.DataFrame(df_mv["title"])

In [126]:
df_mv

Unnamed: 0,title
0,(500) Days of Summer (2009)
1,10 Things I Hate About You (1999)
2,101 Dalmatians (1996)
3,101 Dalmatians (One Hundred and One Dalmatians...
4,12 Angry Men (1957)
...,...
1230,Zoolander (2001)
1231,Zootopia (2016)
1232,eXistenZ (1999)
1233,xXx (2002)


In [137]:
recommendations_2 = df_mv.join(df_predict_ratings).set_index("title")

In [138]:
recommendations_2.sort_values(by = "prediction", ascending = False)

Unnamed: 0_level_0,prediction
title,Unnamed: 1_level_1
Mr. Holland's Opus (1995),5.0
Amadeus (1984),5.0
"Birdcage, The (1996)",5.0
Heat (1995),5.0
"Boot, Das (Boat, The) (1981)",5.0
...,...
Zombieland (2009),
Zoolander (2001),
Zootopia (2016),
eXistenZ (1999),


## Drop the seen movies of the user

In [139]:
recommendations_2 = pd.merge(recommendations_2, df_20, on = "title")

In [140]:
recommendations_2

Unnamed: 0_level_0,prediction,20
title,Unnamed: 1_level_1,Unnamed: 2_level_1
(500) Days of Summer (2009),,0.0
10 Things I Hate About You (1999),,0.0
12 Angry Men (1957),4.742647,0.0
13 Going on 30 (2004),3.000000,0.0
"13th Warrior, The (1999)",,0.0
...,...,...
Zodiac (2007),,0.0
Zombieland (2009),,0.0
Zootopia (2016),,0.0
eXistenZ (1999),,0.0


In [141]:
recommendations_2.sort_values(by = "prediction", ascending = False).drop(columns = 20).head(10)

Unnamed: 0_level_0,prediction
title,Unnamed: 1_level_1
Amadeus (1984),5.0
Mr. Holland's Opus (1995),5.0
Heat (1995),5.0
"Birdcage, The (1996)",5.0
Star Wars: Episode I - The Phantom Menace (1999),5.0
"Shining, The (1980)",5.0
Sherlock Holmes (2009),5.0
Shakespeare in Love (1998),5.0
Batman Begins (2005),5.0
"Hurt Locker, The (2008)",5.0
