<a href="https://colab.research.google.com/github/dilanbakr/netflixRecommendationMovie/blob/main/Recommendation_with_Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings('ignore')

import io
import os
import copy
import pickle
import zipfile
from urllib.error import URLError
from urllib.request import urlopen
from scipy import spatial

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
from keras.layers import Input, Dense, Embedding, Concatenate, Flatten, Dropout
from keras.models import Model, Sequential
from keras import optimizers

Using TensorFlow backend.


In [None]:
# Create the download folder; exist_ok keeps this cell re-runnable when "Data" is already there.
os.makedirs("Data", exist_ok=True)

In [None]:
# Seed NumPy's global random number generator.
# NOTE(review): this call only touches NumPy; whether the Keras backend needs its own seed
# for reproducible training is not visible here — confirm.
np.random.seed(5)

In [None]:
def try_download(url, download_path):
    """Download a zip archive and extract it into ``download_path``.

    The archive is read fully into memory and unpacked from there; nothing
    but the extracted files is written to disk.

    Args:
        url: Address of the zip archive.
        download_path: Directory the archive contents are extracted into.

    Returns:
        None. When the URL cannot be opened a message is printed and the
        function returns without extracting anything.

    Raises:
        AssertionError: If the response status is not 200.
    """
    try:
        response = urlopen(url)
    except URLError as e:
        print('Cannot download the data. Error: {0}'.format(e))
        return

    # Release the connection as soon as the payload has been read.
    with response:
        assert response.status == 200
        data = response.read()

    with zipfile.ZipFile(io.BytesIO(data)) as arch:
        arch.extractall(download_path)

    print('The archive is extracted into folder: %s' % download_path)

In [None]:
def read_data(path):
    """Load the ratings and movies tables from a dataset folder.

    Files are matched by base name (``ratings`` / ``movies``).  ``.csv`` files
    are read with their own header row; ``.dat`` files are ``::``-separated
    and headerless, so their column names are supplied here.  Every other
    file in the folder is ignored, since only these two tables are returned.

    Args:
        path: Directory containing the data files.

    Returns:
        Tuple ``(ratings, movies)`` of DataFrames.

    Raises:
        KeyError: If no ratings or no movies file was found in ``path``.
    """
    dat_columns = {
        'ratings': ['userId', 'movieId', 'rating', 'timestamp'],
        'movies': ['movieId', 'title', 'genres'],
    }

    files = {}
    for filename in os.listdir(path):
        stem, extension = os.path.splitext(filename)
        if stem not in dat_columns:
            # Other tables (e.g. users.dat) have a different layout and are
            # never returned, so they are not parsed at all.
            continue
        full_path = os.path.join(path, filename)
        if extension == '.csv':
            files[stem] = pd.read_csv(full_path)
        elif extension == '.dat':
            files[stem] = pd.read_csv(full_path, sep='::', names=dat_columns[stem], engine='python')
    return files['ratings'], files['movies']

In [None]:
# Address of the ml-1m archive and the local folder it is unpacked into.
archive_url = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
download_path = os.path.join(os.curdir, 'Data')

In [None]:
# Fetch the archive and unpack it into the download folder.
try_download(archive_url, download_path)

The archive is extracted into folder: ./Data


In [None]:
# The data files are read from the 'ml-1m' subfolder of the download folder.
ratings, movies = read_data(os.path.join(download_path, 'ml-1m'))

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [None]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# ratings.groupby('userId')['rating'].count().sort_values(ascending=False)
# ratings.groupby('movieId')['rating'].count().sort_values(ascending=False)

In [None]:
def tabular_preview(ratings, n=15):
    """Creates a cross-tabular view of the most active users vs the most rated movies.

    Args:
        ratings: DataFrame with 'userId', 'movieId' and 'rating' columns.
        n: How many top users and top movies to keep.  This argument used to
            be ignored (15 was hard-coded); a non-integer value therefore
            falls back to 15 so that existing calls keep their result.

    Returns:
        DataFrame indexed by userId with one column per movieId.  Each cell
        holds the sum of the ratings for that user/movie pair and is NaN when
        the pair has no rating.
    """
    if not isinstance(n, (int, np.integer)):
        n = 15

    user_counts = ratings.groupby('userId')['rating'].count()
    top_users = user_counts.sort_values(ascending=False).head(n).index

    movie_counts = ratings.groupby('movieId')['rating'].count()
    top_movies = movie_counts.sort_values(ascending=False).head(n).index

    # Plain filtering replaces two inner joins that each added a 'rating_r'
    # column; newer pandas versions refuse suffixes that create duplicate
    # column names, and the joined counts were never used anyway.
    keep = ratings['userId'].isin(top_users) & ratings['movieId'].isin(top_movies)
    top = ratings[keep]

    return pd.crosstab(top['userId'], top['movieId'], top['rating'], aggfunc='sum')

In [None]:
# The second parameter of tabular_preview is the table size `n`, not the movies table.
tabular_preview(ratings)

movieId,110,260,480,589,593,608,1196,1198,1210,1270,1580,2028,2571,2762,2858
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
889,4.0,4.0,3.0,5.0,5.0,4.0,4.0,,3.0,4.0,3.0,3.0,5.0,,2.0
1015,4.0,5.0,4.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,4.0,5.0,5.0,5.0,4.0
1150,2.0,5.0,,2.0,3.0,5.0,4.0,2.0,3.0,2.0,2.0,2.0,1.0,2.0,4.0
1181,3.0,4.0,2.0,5.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,4.0,5.0,4.0,3.0
1449,3.0,3.0,2.0,2.0,5.0,5.0,3.0,4.0,2.0,2.0,4.0,3.0,4.0,4.0,4.0
1680,1.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0,4.0,5.0,3.0,5.0,5.0
1941,5.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,5.0,1.0
1980,4.0,4.0,4.0,4.0,5.0,5.0,4.0,5.0,4.0,5.0,4.0,5.0,5.0,5.0,5.0
2063,5.0,4.0,4.0,2.0,5.0,2.0,4.0,4.0,4.0,4.0,3.0,2.0,5.0,4.0,5.0
2909,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0


In [None]:
# Attach the movie columns to every rating row through the shared movieId key.
# NOTE: `ratings` is rebound here, so re-running this cell merges the movie
# columns a second time.
ratings = ratings.merge(movies, on="movieId")
ratings.shape

(1000209, 6)

In [None]:
# Preview the merged table: each rating row now also carries title and genres.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


In [None]:
def create_dataset(ratings, top=None):
    """Re-index users and movies as contiguous integers and build the training arrays.

    Indices are assigned in order of first appearance in ``ratings``, starting
    at 0.

    Args:
        ratings: DataFrame with 'userId', 'movieId' and 'rating' columns.
        top: Accepted for backward compatibility; it has no effect on the
            result.  (Previously a per-user rating count was computed when it
            was given, and then discarded.)

    Returns:
        A 3-tuple:
            ``(n_users, n_movies)`` - number of distinct users and movies,
            ``(X, y)`` - DataFrame with 'user_id' / 'movie_id' index columns
            and the ratings as a float32 Series,
            ``(user_to_index, movie_to_index)`` - dicts from original id to
            assigned index.
    """
    def contiguous_index(column):
        # Map every distinct value of the column to its order of first appearance.
        distinct = column.unique()
        mapping = {original: position for position, original in enumerate(distinct)}
        return distinct.shape[0], column.map(mapping), mapping

    n_users, new_users, user_to_index = contiguous_index(ratings.userId)
    n_movies, new_movies, movie_to_index = contiguous_index(ratings.movieId)

    X = pd.DataFrame({'user_id': new_users, 'movie_id': new_movies})
    y = ratings['rating'].astype(np.float32)
    return (n_users, n_movies), (X, y), (user_to_index, movie_to_index)

In [None]:
# n / m are the numbers of distinct users / movies; they size the embedding tables of the model below.
(n, m), (X, y), (user_to_index, movie_to_index) = create_dataset(ratings)
print(f'Embeddings: {n} users, {m} movies')
print(f'Dataset shape: {X.shape}')
print(f'Target shape: {y.shape}')

Embeddings: 6040 users, 3706 movies
Dataset shape: (1000209, 2)
Target shape: (1000209,)


In [None]:
# Rescale the ratings to the [0, 1] range.  The fitted scaler is kept so that
# predictions can be mapped back with scaler.inverse_transform.
# (MinMaxScaler is imported in the import cell at the top of the notebook.)
scaler = MinMaxScaler(feature_range=(0,1))
y_normalized = scaler.fit_transform(y.to_numpy().reshape(-1, 1)).ravel()

In [None]:
# Display the rescaled targets.
y_normalized

array([1.  , 1.  , 0.75, ..., 0.  , 1.  , 0.75], dtype=float32)

# Creating The Model

In [None]:
# Two integer inputs per sample: one user index and one movie index.
input_user = Input(shape=(1,), dtype='int32', name='user_input')
input_movie = Input(shape=(1,), dtype='int32', name='movie_input')

# Each index is looked up in its own 50-dimensional embedding table
# (n rows for users, m rows for movies).
x_1 = Embedding(output_dim=50, input_dim=n, input_length=1, name="user_embeddings")(input_user)
x_2 = Embedding(output_dim=50, input_dim=m, input_length=1, name="movie_embeddings")(input_movie)

# Join the two embeddings, flatten, then apply three 300-unit ReLU layers,
# each followed by dropout.
x = Concatenate()([x_1, x_2])
x1_f = Flatten()(x)
x1 = Dropout(0.05)(x1_f)
x2_f = Dense(300, activation='relu')(x1)
x2 = Dropout(0.5)(x2_f)
x3_f = Dense(300, activation='relu')(x2)
x3 = Dropout(0.5)(x3_f)
x4 = Dense(300, activation='relu')(x3)
x4_f = Dropout(0.25)(x4)
# Single linear output unit, so the prediction is an unbounded real number.
pred = Dense(1, activation='linear')(x4_f)

model = Model(inputs=[input_user, input_movie], outputs=pred)
# 'mse' is reported as a metric in addition to being the loss.
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mse'])





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        (None, 1)            0                                            
__________________________________________________________________________________________________
user_embeddings (Embedding)     (None, 1, 50)        302000      user_input[0][0]                 
__________________________________________________________________________________________________
movie_embeddings (Embedding)    (None, 1, 50)        185300      movie_input[0][0]                
____________________________________________________________________________________________

In [None]:
# Hold out a random 25% of the samples for validation.
# `validation_split=0.25` would instead take the *last* 25% of the rows in
# their stored order, before any shuffling.  The rows appear grouped by movie
# (see the previews of `ratings`), so that hold-out would consist of movies
# the model has hardly seen during training.
X_train, X_val, y_train, y_val = train_test_split(
    X, y_normalized, test_size=0.25, random_state=5)

model.fit([X_train.user_id.to_numpy(), X_train.movie_id.to_numpy()], y_train,
          epochs=10,
          batch_size=256,
          validation_data=([X_val.user_id.to_numpy(), X_val.movie_id.to_numpy()], y_val)
          )




Train on 750156 samples, validate on 250053 samples
Epoch 1/10





Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3e18bd15c0>

In [None]:
# Vectorised dictionary lookup, the same way create_dataset derives the indices.
ratings["user_id_index"] = ratings["userId"].map(user_to_index)

In [None]:
# Vectorised dictionary lookup, the same way create_dataset derives the indices.
ratings["movie_id_index"] = ratings["movieId"].map(movie_to_index)

In [None]:
# Preview the table with the two new index columns.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,user_id_index,movie_id_index
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0,0
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama,1,0
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama,2,0
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama,3,0
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama,4,0


In [None]:
# Show the first 8 rating rows of user 387.
ratings.loc[ratings["userId"] == 387, :].head(8)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,user_id_index,movie_id_index
6054,387,1197,5,979421280,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance,3666,5
20263,387,2797,4,977280126,Big (1988),Comedy|Fantasy,3666,19
32809,387,745,5,976299594,"Close Shave, A (1995)",Animation|Comedy|Thriller,3666,29
46937,387,260,5,977280126,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,3666,44
65759,387,2628,4,977280224,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Fantasy|Sci-Fi,3666,60
70908,387,1210,3,976298892,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War,3666,64
81447,387,3108,3,977279809,"Fisher King, The (1991)",Comedy|Drama|Romance,3666,75
92680,387,1188,5,977279619,Strictly Ballroom (1992),Comedy|Romance,3666,88


In [None]:
# Predicted rating of user 387 for movie 260, mapped back to the original rating scale.
# The model works on the contiguous indices, so the ids are translated
# through the mappings instead of hard-coding the indices.
user_index = user_to_index[387]
movie_index = movie_to_index[260]
scaler.inverse_transform(model.predict([np.array([user_index]), np.array([movie_index])]))

array([[4.515724]], dtype=float32)

# Let's Inspect The Embeddings

In [None]:
# List the model's layers in order.
model.layers

[<keras.engine.input_layer.InputLayer at 0x7f3e194688d0>,
 <keras.engine.input_layer.InputLayer at 0x7f3e19468908>,
 <keras.layers.embeddings.Embedding at 0x7f3e19468898>,
 <keras.layers.embeddings.Embedding at 0x7f3e19468ef0>,
 <keras.layers.merge.Concatenate at 0x7f3e19468d68>,
 <keras.layers.core.Flatten at 0x7f3e19468a20>,
 <keras.layers.core.Dropout at 0x7f3e1a6cb358>,
 <keras.layers.core.Dense at 0x7f3e1a6dc9e8>,
 <keras.layers.core.Dropout at 0x7f3e1a6e39b0>,
 <keras.layers.core.Dense at 0x7f3e1952ffd0>,
 <keras.layers.core.Dropout at 0x7f3e19540908>,
 <keras.layers.core.Dense at 0x7f3e1954c4a8>,
 <keras.layers.core.Dropout at 0x7f3e19553cc0>,
 <keras.layers.core.Dense at 0x7f3e19561b00>]

In [None]:
# Fetch the embedding layers by the names given when the model was built,
# rather than by their position in model.layers.  Each weight matrix has one
# row per user / movie index.
user_embed = pd.DataFrame(model.get_layer("user_embeddings").get_weights()[0])
movie_embed = pd.DataFrame(model.get_layer("movie_embeddings").get_weights()[0])

In [None]:
# Preview the first rows of the movie embedding matrix.
movie_embed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
0,0.253975,0.176474,0.09001,-0.222203,-0.164028,-0.316189,0.268266,0.179048,-0.078391,0.080679,-0.161565,0.311383,-0.211504,0.134101,-0.152182,0.105115,0.066978,-0.245904,0.140979,0.164956,-0.041534,0.099636,0.072998,0.265233,0.200597,-0.247563,-0.206965,0.097533,0.04318,-0.033883,0.140361,0.28034,-0.086754,0.027525,-0.06014,-0.029645,-0.134235,0.198926,-0.235489,-0.135251,-0.228711,0.114509,-0.098725,0.186161,-0.24598,-0.203669,-0.039672,0.032241,0.124948,0.009561
1,-0.165665,-0.048002,0.09316,-0.161948,-0.071063,-0.074441,-0.054185,0.076814,-0.066596,-0.047965,-0.063991,0.133271,-0.068977,-0.005964,-0.069055,0.184843,0.198952,-0.113733,-0.021789,-0.17178,0.20195,-0.238482,0.097718,-0.050299,0.08529,-0.01026,0.032322,-0.025323,0.278816,-0.030457,0.212769,-0.051997,0.011985,-0.255332,-0.060123,0.104254,0.074663,0.011087,0.010397,-0.115894,0.045316,-0.102143,-0.074972,-0.23153,0.011884,-0.099963,-0.073384,-0.033952,0.063121,-0.132364
2,0.11456,0.320287,0.076128,0.038032,-0.280784,-0.041754,0.195205,0.19915,0.101836,0.277131,0.191714,0.064329,-0.107977,0.145088,-0.060287,-0.032678,0.318184,-0.097416,-0.111355,0.164856,-0.098647,-0.075861,-0.099364,0.088708,0.123056,-0.161307,-0.168974,0.026912,-0.150975,0.070641,0.119389,0.021584,-0.028378,0.103697,-0.090448,-0.233003,0.047752,0.134897,-0.054552,-0.015164,-0.142052,0.085196,0.128865,-0.079236,0.049123,-0.069657,-0.27601,0.115363,-0.224775,0.066536
3,0.116039,0.386919,0.111326,0.402479,0.150024,-0.048989,0.051012,0.038528,0.095663,0.25524,0.089748,0.082457,-0.051847,-0.034693,-0.168885,-0.169565,-0.025211,-0.105543,-0.121684,0.026756,-0.143365,-0.068494,-0.141014,-0.018097,-0.150213,-0.125496,-0.05631,0.164102,-0.295533,0.137185,0.042396,-0.052641,0.305431,0.158996,0.12916,-0.061563,0.251933,-0.075277,-0.109333,-0.02779,-0.015653,-0.059224,0.061455,-0.177581,0.053907,0.210007,-0.219839,0.068278,-0.34793,-0.160063
4,0.061016,0.294981,0.078544,-0.236874,-0.14085,-0.265434,0.037401,-0.028857,-0.036876,-0.148859,0.112146,-0.046558,-0.076214,0.187291,-0.111888,-0.155056,0.27906,0.132614,-0.014471,-0.063101,-0.208827,-0.245496,0.012176,0.103271,-0.111076,-0.137504,-0.022666,0.177393,0.239965,0.114255,-0.08096,-0.133743,0.037533,0.158676,0.144071,-0.137683,-0.112132,-0.054829,0.13835,-0.091319,0.021922,0.153021,-0.070286,-0.143328,-0.08311,-0.174044,-0.195893,0.129707,-0.16712,0.103428


In [None]:
# Preview the ratings table again; it carries the index columns used by the similarity helpers below.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,user_id_index,movie_id_index
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,0,0
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama,1,0
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama,2,0
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama,3,0
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama,4,0


In [None]:
class color:
    """ANSI escape sequences used to style printed text.

    Prepend one or more of these constants to a string before printing it.
    """
    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    # Text attributes rather than colours.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Resets all styling back to the default.
    END = '\033[0m'

In [None]:
def findMostSimilarMovies(ratings, movie_embed, movie_id, top_n=9):
    """Print the movies whose embeddings are most similar to a given movie.

    Similarity is the cosine similarity between rows of ``movie_embed``.

    Args:
        ratings: DataFrame with 'movieId', 'movie_id_index', 'title' and
            'genres' columns; 'movie_id_index' must be the row label of the
            movie in ``movie_embed``.
        movie_embed: DataFrame holding one embedding row per movie index.
        movie_id: Original 'movieId' of the query movie.
        top_n: Number of recommendations to print.  Defaults to 9, the number
            this function always printed before the parameter existed.

    Returns:
        None; the result is printed.

    Raises:
        IndexError: If ``movie_id`` does not occur in ``ratings``.
    """
    query_rows = ratings.loc[ratings["movieId"] == movie_id]
    selectedMovie_index = query_rows["movie_id_index"].unique()[0]
    print("Name of Movie is {}".format(query_rows["title"].unique()[0]))

    selected_embedding = movie_embed.loc[selectedMovie_index, :].to_numpy()
    embeddings = movie_embed.to_numpy()

    # One vectorised call instead of a Python loop over every movie.
    similarity = 1 - spatial.distance.cdist(
        selected_embedding[np.newaxis, :], embeddings, metric="cosine")[0]

    # Exclude the query movie explicitly instead of assuming it sorts last,
    # and skip rows whose similarity is undefined (NaN).
    similarity[movie_embed.index.get_loc(selectedMovie_index)] = np.nan
    candidates = np.flatnonzero(~np.isnan(similarity))
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    most_similars = ranked[:top_n]

    # One row per movie index, so each recommendation is a direct lookup
    # rather than a scan over the whole ratings table.
    catalog = ratings.drop_duplicates("movie_id_index").set_index("movie_id_index")

    print("\n")
    print("Recommended Movies")
    print("=" * 20)
    for sim in most_similars:
        rec_movie_name = catalog.at[sim, "title"]
        rec_movie_genres = catalog.at[sim, "genres"]
        # color.END resets the styling so it does not leak into later output.
        print(color.BOLD + color.BLUE + "Movie Name is => {0}, Movie Genres are {1}".format(rec_movie_name, rec_movie_genres) + color.END)

    return None

In [None]:
# Example: movies most similar to movieId 1210.
findMostSimilarMovies(ratings, movie_embed, 1210)

Name of Movie is Star Wars: Episode VI - Return of the Jedi (1983)


Recommended Movies
[1m[94mMovie Name is => Star Wars: Episode V - The Empire Strikes Back (1980), Movie Genres are Action|Adventure|Drama|Sci-Fi|War
[1m[94mMovie Name is => Star Wars: Episode IV - A New Hope (1977), Movie Genres are Action|Adventure|Fantasy|Sci-Fi
[1m[94mMovie Name is => Indiana Jones and the Last Crusade (1989), Movie Genres are Action|Adventure
[1m[94mMovie Name is => Star Wars: Episode I - The Phantom Menace (1999), Movie Genres are Action|Adventure|Fantasy|Sci-Fi
[1m[94mMovie Name is => Braveheart (1995), Movie Genres are Action|Drama|War
[1m[94mMovie Name is => Shawshank Redemption, The (1994), Movie Genres are Drama
[1m[94mMovie Name is => Young Guns (1988), Movie Genres are Action|Comedy|Western
[1m[94mMovie Name is => Caddyshack (1980), Movie Genres are Comedy
[1m[94mMovie Name is => Vacation (1983), Movie Genres are Comedy


In [None]:
def findMostSimilarUsers(ratings, user_embed, user_id, top_n=3):
    """Print the users whose embeddings are most similar to a given user.

    Similarity is the cosine similarity between rows of ``user_embed``.

    Args:
        ratings: DataFrame with 'userId' and 'user_id_index' columns;
            'user_id_index' must be the row label of the user in
            ``user_embed``.
        user_embed: DataFrame holding one embedding row per user index.
        user_id: Original 'userId' of the query user.
        top_n: Number of similar users to print.  Defaults to 3, the number
            this function always printed before the parameter existed.

    Returns:
        None; the result is printed.

    Raises:
        IndexError: If ``user_id`` does not occur in ``ratings``.
    """
    selectedUser_index = ratings.loc[ratings["userId"] == user_id, "user_id_index"].unique()[0]

    selected_embedding = user_embed.loc[selectedUser_index, :].to_numpy()
    embeddings = user_embed.to_numpy()

    # One vectorised call instead of a Python loop over every user.
    similarity = 1 - spatial.distance.cdist(
        selected_embedding[np.newaxis, :], embeddings, metric="cosine")[0]

    # Exclude the query user explicitly instead of assuming it sorts last,
    # and skip rows whose similarity is undefined (NaN).
    similarity[user_embed.index.get_loc(selectedUser_index)] = np.nan
    candidates = np.flatnonzero(~np.isnan(similarity))
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    most_similars = ranked[:top_n]

    # One row per user index, so each result is a direct lookup rather than
    # a scan over the whole ratings table.
    user_ids = ratings.drop_duplicates("user_id_index").set_index("user_id_index")["userId"]

    print("\n")
    # This function lists users; the header used to read "Recommended Movies".
    print("Most Similar Users")
    print("=" * 20)
    for sim in most_similars:
        rec_user_name = user_ids.at[sim]
        # color.END resets the styling so it does not leak into later output.
        print(color.BOLD + color.BLUE + "User Name is => {0}".format(rec_user_name) + color.END)

    return None

# Example: users most similar to userId 189.
findMostSimilarUsers(ratings, user_embed, 189)



Recommended Movies
[1m[94mUser Name is => 2899
[1m[94mUser Name is => 748
[1m[94mUser Name is => 278
