In [1]:
import os
import pandas as pd
import numpy as np
PATH = os.getcwd()
os.chdir(PATH)

In [2]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Flatten, Dropout
from keras.layers import Input, concatenate
from keras import regularizers, initializers
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  % self._get_c_name())


In [3]:
rating=pd.read_csv("ratings.csv")
movie = pd.read_csv("movies.csv")

In [4]:
print(rating.head())
print(rating.dtypes)
print(movie.head())
print(movie.dtypes)

   userId  movieId  rating   timestamp
0       1       31     2.5  1260759144
1       1     1029     3.0  1260759179
2       1     1061     3.0  1260759182
3       1     1129     2.0  1260759185
4       1     1172     4.0  1260759205
userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
movieId     int64
title      object
genres     object
dtype: object


###### Convert attributes to right data types

In [5]:
rating = rating.merge(movie,on='movieId',how='inner')

In [6]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,7,31,3.0,851868750,Dangerous Minds (1995),Drama
2,31,31,4.0,1273541953,Dangerous Minds (1995),Drama
3,32,31,4.0,834828440,Dangerous Minds (1995),Drama
4,36,31,3.0,847057202,Dangerous Minds (1995),Drama


In [7]:
rating.userId = rating.userId.astype("category")
rating.movieId = rating.movieId.astype("category")
rating.genres = rating.genres.astype("category")

In [8]:
rating.dtypes

userId       category
movieId      category
rating        float64
timestamp       int64
title          object
genres       category
dtype: object

###### Check for Missing Values

In [9]:
rating.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

In [10]:
np.unique(rating.movieId.values)[0:100]

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  52,  53,  54,
        55,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  68,  69,
        70,  71,  72,  73,  74,  76,  77,  78,  79,  80,  81,  82,  83,
        84,  85,  86,  87,  88,  89,  92,  93,  94,  95,  96,  97,  98,
        99, 100, 101, 102, 103, 104, 105, 107, 108])

In [11]:
np.unique(rating.genres.values)[0:10]

array(['(no genres listed)', 'Action', 'Action|Adventure',
       'Action|Adventure|Animation',
       'Action|Adventure|Animation|Children',
       'Action|Adventure|Animation|Children|Comedy',
       'Action|Adventure|Animation|Children|Comedy|Fantasy',
       'Action|Adventure|Animation|Children|Comedy|IMAX',
       'Action|Adventure|Animation|Children|Comedy|Romance',
       'Action|Adventure|Animation|Children|Comedy|Sci-Fi'], dtype=object)

In [12]:
userid = rating.userId.cat.codes.values
movieid = rating.movieId.cat.codes.values
genreid = rating.genres.cat.codes.values

In [13]:
np.unique(genreid)[0:100]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
      dtype=int16)

In [14]:
n_users = len(rating.userId.unique())
n_movies = len(rating.movieId.unique())
n_genres = len(rating.genres.unique())

###### Adding Cat Code Values to original dataframe

In [15]:
rating["userid_catcode"]=userid
rating["movieid_catcode"]=movieid
rating["genreid_catcode"]=genreid

In [16]:
embedding_dim = 50

###### Define Metrics

In [17]:
from keras import backend as K
def mape_error(y_true, y_pred): 
    return K.mean((K.abs(y_pred - y_true)/y_true), axis=0) * 100

In [18]:
def rmse_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true), axis=0)) 

In [19]:
def mse_error(y_true, y_pred):
        return K.mean(K.square(y_pred - y_true), axis=0) 

###### Split the data to Train and Test datasets

In [20]:
train_userid, test_userid, \
train_movieid, test_movieid, \
train_genreid, test_genreid, \
train_y, test_y = train_test_split(userid,
                                   movieid, 
                                   genreid,
                                   rating.rating, 
                                   test_size=0.3, random_state=2)

In [21]:
train_y.shape

(70002,)

In [22]:
train_y = train_y.reshape((-1,1))
test_y = test_y.reshape((-1,1))

  """Entry point for launching an IPython kernel.
  


In [23]:
train_y.shape
train_y[0:10]

array([[4.5],
       [3. ],
       [1. ],
       [5. ],
       [4. ],
       [2.5],
       [2. ],
       [2. ],
       [2. ],
       [5. ]])

###### Embedding UserIds

In [24]:
# encoder_UserID = Sequential()
# encoder_UserID.add(Embedding(n_users, embedding_dim, input_length=1,embeddings_regularizer=regularizers.l2(0.00001)))

###### Embedding MovieIds

In [25]:
# encoder_MovieID = Sequential()
# encoder_MovieID.add(Embedding(n_movies, embedding_dim, input_length=1,embeddings_regularizer=regularizers.l2(0.00001)))

###### Embedding Genreids

In [26]:
# encoder_genreID = Sequential()
# encoder_genreID.add(Embedding(n_genres, embedding_dim, input_length=1,embeddings_regularizer=regularizers.l2(0.00001)))


##### Define MLP 

In [27]:
n_users = 10000
embedding_dim = 50

# i = 1x10000
# w = 10000x50

# shape=(100,) --> 1D array

In [28]:
user_inp = Input(shape=(1, ))
user_mbd = Embedding(n_users, embedding_dim)(user_inp)

movie_inp = Input(shape=(1, ))
movie_mbd = Embedding(n_movies, embedding_dim)(movie_inp)

genre_inp = Input(shape=(1, ))
genre_mbd = Embedding(n_genres, embedding_dim)(genre_inp)

merged = concatenate([user_mbd, movie_mbd, genre_mbd])
fc1 = Dense(100,activation='relu')(merged)
fc2 = Dense(1)(fc1)

model = Model(inputs=[user_inp, movie_inp, genre_inp], outputs=fc2)
model.compile(optimizer='adam', loss='mse', metrics=[mse_error,rmse_error])

###### Train and fit the model

In [29]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 50)        500000      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_

In [30]:
model.fit([train_userid,train_movieid,train_genreid], train_y.reshape(-1,1,1), epochs=100, verbose=1,batch_size=500,validation_data=([test_userid,test_movieid,test_genreid], test_y.reshape(-1,1,1)))

Train on 70002 samples, validate on 30002 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
  500/70002 [..............................] - ETA: 0s - loss: 0.5701 - mse_error: 0.5701 - rmse_error: 0.7551

KeyboardInterrupt: 

###### Evaluating Scores Manually

In [31]:
scores= model.evaluate([test_userid,test_movieid,test_genreid],test_y.reshape(-1,1,1))
print(scores)

[0.8623999412294531, 0.8623999412294531, 0.9179448069719368]


In [32]:
test_pred = model.predict([test_userid,test_movieid,test_genreid])

In [33]:
mse_score = np.mean(np.square(test_pred.reshape(-1,1) - test_y))
print("mse_score:",mse_score)

('mse_score:', 0.8623999425777259)


###### Calculate MSE and RMSE Manually

In [34]:
rmse_score = np.sqrt(np.mean(np.square(test_pred.reshape(-1,1) - test_y)))
print("rmse_score:",rmse_score)


('rmse_score:', 0.928654910382606)


## Developing Recommendations

###### Extracting/Predicting user embeddings

In [35]:
encoder_UserID = Model(user_inp, user_mbd)
encoder_MovieID = Model(movie_inp, movie_mbd)

In [36]:
users_unique = np.unique(userid)
users_embeddings = encoder_UserID.predict(users_unique)

In [37]:
users_unique[0:3]
users_embeddings[0:3]

array([[[-0.02864648,  0.02404832, -0.01672194,  0.03472207,
         -0.04659946,  0.03847988,  0.04238483,  0.02280261,
         -0.03277983,  0.01710645, -0.01257738, -0.03413071,
         -0.03574416, -0.03548922,  0.05686403,  0.05906394,
         -0.00314287,  0.0086943 ,  0.03430032,  0.02617086,
         -0.01856759, -0.02931431,  0.00359461, -0.06082052,
          0.0498751 , -0.03115098,  0.03041959,  0.00075134,
         -0.03004984,  0.02628771,  0.00534042, -0.01367473,
          0.01413259, -0.02479629, -0.01653465, -0.05562741,
         -0.03990626, -0.00977885, -0.03099137,  0.01114488,
         -0.04286162, -0.00549906,  0.02666238,  0.01336491,
         -0.01424601, -0.0255978 ,  0.03819225,  0.02956324,
         -0.01353304,  0.01656003]],

       [[-0.03743275,  0.06609384, -0.04990283,  0.0969858 ,
          0.00324219,  0.02266745, -0.08097883,  0.02348179,
          0.10067439,  0.0562802 ,  0.06176184,  0.03668357,
         -0.00900587,  0.09635257, -0.01998773,

In [38]:
users_embeddings=pd.DataFrame(users_embeddings.reshape(-1,50))
users_embeddings["userid_catcode"] = users_unique

In [39]:
users_embeddings[0:3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,userid_catcode
0,-0.028646,0.024048,-0.016722,0.034722,-0.046599,0.03848,0.042385,0.022803,-0.03278,0.017106,...,-0.005499,0.026662,0.013365,-0.014246,-0.025598,0.038192,0.029563,-0.013533,0.01656,0
1,-0.037433,0.066094,-0.049903,0.096986,0.003242,0.022667,-0.080979,0.023482,0.100674,0.05628,...,0.025168,-0.055709,0.071665,-0.033236,-0.04934,-0.069306,0.024734,0.056861,0.074493,1
2,0.01256,0.107519,-0.025038,0.075286,0.018018,0.029508,-0.066265,0.068305,0.071505,-0.014561,...,0.063819,-0.025325,0.044025,-0.0286,-0.025755,-0.037604,-0.009889,0.107348,0.101142,2


In [40]:
movies_unique = np.unique(movieid)
movies_embeddings=encoder_MovieID.predict(movies_unique)

In [41]:
movies_embeddings=pd.DataFrame(movies_embeddings.reshape(-1,50))
movies_embeddings["movieid_catcode"] = movies_unique

##### Identify nearest neighbours for a movie based on k-Nearest Neighbours Algorithm

In [42]:
from sklearn.neighbors import NearestNeighbors

In [43]:
nbrs = NearestNeighbors(n_neighbors=10, algorithm='auto').fit(movies_embeddings.drop(["movieid_catcode"],axis=1))
movie_distances, movie_nbrs = nbrs.kneighbors(movies_embeddings.drop(["movieid_catcode"],axis=1))
movie_nbrs=pd.DataFrame(movie_nbrs)
movie_nbrs.columns=["movieid_catcode","NN1","NN2","NN3","NN4","NN5","NN6","NN7","NN8","NN9"]

###### Define Recommender Function

In [46]:
def recommender(rating,movie_nbrs,model,uid,n):
    
    # Mapping the userid to its Cat Code
    uid_catcode=rating[rating.userId==uid]["userid_catcode"].unique()
    
    #Subsetting the records of the userid
    movies=(rating[rating.userid_catcode==uid_catcode[0]])
    
    #Sorting the user movies based on given ratings
    if len(movies)>=n :
        movies=movies.sort_values("rating",ascending=False)
        movies=list((movies[0:n]["movieid_catcode"]))
    else :
        movies=list(movies["movieid_catcode"])
    print(len(movies))
    
   
    Movie_nbrs=movie_nbrs[movie_nbrs.movieid_catcode.isin(movies)]
    print(Movie_nbrs.shape)
    Movie_nbrs=Movie_nbrs.drop("movieid_catcode",axis=1)
    print(Movie_nbrs.shape)
    
    Movie_nbrs=pd.DataFrame(np.unique(Movie_nbrs.values.flatten()))
    Movie_nbrs.columns=["movieid_catcode"]
    Movie_nbrs=Movie_nbrs[~(Movie_nbrs["movieid_catcode"].isin(list(rating[rating.userid_catcode==uid_catcode[0]]["movieid_catcode"])))]
    genre_nbrs=rating[rating.movieid_catcode.isin(Movie_nbrs.movieid_catcode)]["genreid_catcode"]
    genre_nbrs.columns=["genre_id"]
    
    
    if len(Movie_nbrs)>0:
        usid=np.repeat(uid_catcode[0], len(Movie_nbrs))
        Movie_nbrs=Movie_nbrs.movieid_catcode.values
        genre_nbrs = genre_nbrs.values
        test_pred = model.predict([usid,Movie_nbrs,genre_nbrs]).reshape(-1)
        
        r_Df=pd.DataFrame({"movieid_catcode":Movie_nbrs})
        r_Df["rating"]=test_pred
        
        Movie_catcodes=rating[["movieId","movieid_catcode","title"]]
        Movie_catcodes=Movie_catcodes.drop_duplicates()
        r_Df=pd.merge(r_Df,Movie_catcodes,how="left",on=["movieid_catcode"])
        r_Df=r_Df.sort_values("rating",ascending=False)
        r_Df=r_Df.iloc[0:n]
        r_Df["userId"] = uid
        return(r_Df)
    else :
        print("Try with another n value")
        r_Df=None
        return(r_Df)

###### Recommend movie for a user

In [47]:
uid=605
n=5
model = model
rec=recommender(rating,movie_nbrs,model,uid,n)

5
(5, 10)
(5, 9)


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ValueError: "Buffer dtype mismatch, expected 'Python object' but got 'long'" in 'pandas._libs.lib.is_bool_array' ignored


In [48]:
rec

Unnamed: 0,movieid_catcode,rating,movieId,title,userId
31,7589,3.714301,80219,Machete (2010),605
28,7411,3.6989,73587,Soul Kitchen (2009),605
17,3918,3.69566,5062,Seconds (1966),605
42,8831,3.693542,132333,Seve (2014),605
40,8458,3.680357,107649,Borgman (2013),605
