In [1]:
%matplotlib inline

In [17]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Data loading
# Directory holding the MovieLens-1M .dat files. Set the ML1M_DIR environment
# variable to point somewhere else; the default keeps the original location.
DATA_DIR = Path(os.environ.get('ML1M_DIR', '/Users/eshasingh/Downloads/ml-1m'))

# Column names are supplied explicitly via `names`. The multi-character '::'
# separator is why the python parsing engine is requested.
ratings_df = pd.read_csv(DATA_DIR / 'ratings.dat',
                         names=['UserID', 'MovieID', 'Rating', 'Time'], sep='::', engine='python')
# movies_df = pd.read_csv(DATA_DIR / 'movies.dat',
#                         names=['MovieID','Title','Genres'], sep='::', engine='python')
# users_df = pd.read_csv(DATA_DIR / 'users.dat',
#                        names=['UserID','Gender','Age','Occupation','Zip-code'], sep='::', engine='python')

### Task at hand: 

- Learn embedding vectors for users and items (movies).
- The vectors are optimized by minimizing the squared difference between each observed rating and the dot product of the corresponding user and item vectors.

In [33]:
# LabelEncoder sanity check on toy lists: each distinct value is mapped to its
# rank in sorted order (e.g. [2, 3, 1, 5] -> [1, 2, 0, 3]).
K = [1,2,3,4,5]
M = [2,3,1,5]
print (LabelEncoder().fit_transform(K))
print (LabelEncoder().fit_transform(M))
print (ratings_df['MovieID'].nunique())

# Which IDs inside the observed UserID range never appear in the data?
# NOTE: `x in series` tests the Series *index*, not its values, so membership
# is checked against a set built from the column's values instead.
observed_user_ids = set(ratings_df['UserID'].unique())
missing_user_ids = [
    i
    for i in range(int(ratings_df['UserID'].min()), int(ratings_df['UserID'].max()) + 1)
    if i not in observed_user_ids
]

n_users = ratings_df['UserID'].nunique()
n_movies = ratings_df['MovieID'].nunique()
# Size of the full user x movie grid versus the number of ratings actually present.
print (n_users*n_movies)
print (len(ratings_df['Rating']))

[0 1 2 3 4]
[1 2 0 3]
3706
22384240
1000209


In [23]:
# Analysis (plag)
# Number of ratings per user, most active first; keep the 15 heaviest raters.
ratings_per_user = ratings_df.groupby('UserID')['Rating'].count()
top_users = ratings_per_user.sort_values(ascending=False).head(15)
# Same ranking for movies; `g` ends up bound to the per-movie counts.
g = ratings_df.groupby('MovieID')['Rating'].count()
top_movies = g.sort_values(ascending=False).head(15)

# Keep only ratings where both the user and the movie are in their top 15.
# Both count Series are named 'Rating', so the suffixes expose them as
# 'Rating_u' and 'Rating_m' next to the original 'Rating' column.
top_r = (
    ratings_df
    .join(top_users, on='UserID', how='inner', rsuffix='_u')
    .join(top_movies, on='MovieID', how='inner', rsuffix='_m')
)

In [21]:
# Analysis_2 (plag)

# Re-label the raw UserID / MovieID values as integer codes in the new 'User'
# and 'Movie' columns; the number of distinct codes is what the Embedding
# layers later receive as their vocabulary size.
user_enc = LabelEncoder()
ratings_df['User'] = user_enc.fit_transform(ratings_df['UserID'].values)
n_users = ratings_df['User'].nunique()
item_enc = LabelEncoder()
ratings_df['Movie'] = item_enc.fit_transform(ratings_df['MovieID'].values)
n_movies = ratings_df['Movie'].nunique()

# Show the element type before the cast (positional lookup, so it does not
# depend on the frame's index labels).
print (type(ratings_df['Rating'].iloc[0]))
ratings_df['Rating'] = ratings_df['Rating'].values.astype(np.float32)

# Vectorised reductions instead of the Python builtins, which would iterate
# the whole column element by element.
min_rating = ratings_df['Rating'].min()
max_rating = ratings_df['Rating'].max()

n_users, n_movies, min_rating, max_rating

<class 'numpy.int64'>


(6040, 3706, 1.0, 5.0)

In [22]:
# Analysis_3 (plag)
from sklearn.model_selection import train_test_split

# Features are the (User, Movie) index pairs; the target is the rating.
X = ratings_df[['User', 'Movie']].values
y = ratings_df['Rating'].values
# Hold out 10% of the ratings; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Sanity checks on the split (replaces a bare shape expression that had no effect).
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)
assert len(X_train) + len(X_test) == len(X)

# Split each feature matrix into a list of two 1-D arrays,
# [user indices, movie indices], one array per model input.
X_train_array = [X_train[:, 0], X_train[:, 1]]
X_test_array = [X_test[:, 0], X_test[:, 1]]

In [23]:
# Deep Learning (plag)
# computation graph creation

from keras.models import Model
from keras.layers import Input, Reshape, Dot
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.regularizers import l2

def RecommenderV1(n_users, n_movies, n_factors, l2_reg=1e-6, learning_rate=0.001):
    """Build and compile a dot-product matrix-factorization recommender.

    Each user index and each movie index is looked up in its own embedding
    table; the predicted rating is the dot product of the two vectors.

    Parameters
    ----------
    n_users : int
        Number of rows in the user embedding table.
    n_movies : int
        Number of rows in the movie embedding table.
    n_factors : int
        Length of every embedding vector.
    l2_reg : float, optional
        L2 regularization strength applied to both embedding tables.
    learning_rate : float, optional
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    Model compiled with a mean-squared-error loss, taking ``[user, movie]``
    as inputs.
    """
    user = Input(shape=(1,), name='user_input')
    u = Embedding(n_users, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(l2_reg))(user)
    # Drop the length-1 sequence axis: (None, 1, n_factors) -> (None, n_factors).
    u = Reshape((n_factors,))(u)

    movie = Input(shape=(1,))
    m = Embedding(n_movies, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(l2_reg))(movie)
    m = Reshape((n_factors,))(m)

    x = Dot(axes=1)([u, m])
    model = Model(inputs=[user, movie], outputs=x)
    # `learning_rate` is the current argument name; `lr` is the deprecated alias.
    opt = Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error', optimizer=opt)
    return model

In [24]:
# Initialization (plag)
# Length of each user / movie embedding vector.
n_factors = 50
model = RecommenderV1(
    n_users=n_users,
    n_movies=n_movies,
    n_factors=n_factors,
)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 50)        302000      user_input[0][0]                 
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 50)        185300      input_1[0][0]                    
____________________________________________________________________________________________

In [25]:
# Model fitting (plag)
# Train for 5 epochs in mini-batches of 64, evaluating on the held-out split
# after every epoch.
history = model.fit(
    x=X_train_array,
    y=y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_data=(X_test_array, y_test),
)

# Dump the output shape of every layer, in model order.
for layer in model.layers:
    print(layer.output_shape)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 900188 samples, validate on 100021 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
(None, 1)
(None, 1)
(None, 1, 50)
(None, 1, 50)
(None, 50)
(None, 50)
(None, 1)


In [27]:
# Scratch check: show the class of the object returned by model.fit.
type(history)

keras.callbacks.callbacks.History

In [34]:
# Improvement (plag)
from keras.layers import Add, Activation, Lambda, Dense
class EmbeddingLayer:
    """Callable that applies an Embedding followed by a flattening Reshape.

    Stores the table size (``n_items``) and vector length (``n_factors``);
    calling the instance on a tensor builds a fresh Embedding (he_normal
    initializer, L2 regularizer of 1e-6) and reshapes its output to
    ``(n_factors,)`` per sample.
    """

    def __init__(self, n_items, n_factors):
        self.n_items, self.n_factors = n_items, n_factors

    def __call__(self, x):
        embedded = Embedding(
            self.n_items,
            self.n_factors,
            embeddings_initializer='he_normal',
            embeddings_regularizer=l2(1e-6),
        )(x)
        return Reshape((self.n_factors,))(embedded)
    
def RecommenderV2(n_users, n_movies, n_factors, min_rating, max_rating, learning_rate=0.001):
    """Build and compile a biased matrix-factorization recommender with bounded output.

    The raw score is the dot product of the user and movie embeddings plus a
    per-user and a per-movie scalar bias. A sigmoid squashes the score to
    (0, 1) and it is then rescaled linearly to (min_rating, max_rating).

    Parameters
    ----------
    n_users : int
        Number of rows in the user embedding and user-bias tables.
    n_movies : int
        Number of rows in the movie embedding and movie-bias tables.
    n_factors : int
        Length of the user / movie embedding vectors.
    min_rating, max_rating : float
        Bounds of the interval the prediction is rescaled into.
    learning_rate : float, optional
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    Model compiled with a mean-squared-error loss, taking ``[user, movie]``
    as inputs.
    """
    user = Input(shape=(1,))
    u = EmbeddingLayer(n_users, n_factors)(user)
    # Width-1 embedding acts as a learned scalar bias per user.
    ub = EmbeddingLayer(n_users, 1)(user)

    movie = Input(shape=(1,))
    m = EmbeddingLayer(n_movies, n_factors)(movie)
    # Width-1 embedding acts as a learned scalar bias per movie.
    mb = EmbeddingLayer(n_movies, 1)(movie)

    x = Dot(axes=1)([u, m])
    x = Add()([x, ub, mb])
    x = Activation('sigmoid')(x)
    # Map the (0, 1) sigmoid output onto the rating scale.
    x = Lambda(lambda x: x * (max_rating - min_rating) + min_rating)(x)

    model = Model(inputs=[user, movie], outputs=x)
    # `learning_rate` is the current argument name; `lr` is the deprecated alias.
    opt = Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error', optimizer=opt)
    return model

In [35]:
# Initialization (plag)
# Length of each user / movie embedding vector.
n_factors = 50
model = RecommenderV2(
    n_users=n_users,
    n_movies=n_movies,
    n_factors=n_factors,
    min_rating=min_rating,
    max_rating=max_rating,
)
model.summary()

BHai :  (None, 1)
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 1, 50)        302000      input_6[0][0]                    
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, 1, 50)        185300      input_7[0][0]                    
__________________________________________________________________________

In [39]:
# Model fitting (plag)
# Train for 5 epochs in mini-batches of 64, evaluating on the held-out split
# after every epoch.
history = model.fit(
    x=X_train_array,
    y=y_train,
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_data=(X_test_array, y_test),
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 900188 samples, validate on 100021 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
