In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing

In [2]:
pip install --upgrade tensorflow

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Import everything through the `tensorflow.keras` namespace. Mixing the
# standalone `keras` package with `tensorflow.keras` can pair incompatible
# versions, and `keras.layers.embeddings` is a private module path that newer
# Keras releases no longer provide.
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Dot, Embedding, Input, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

In [4]:
# "::" is a multi-character separator, which only pandas' Python parsing engine
# supports. Naming the engine explicitly avoids the ParserWarning raised when
# pandas has to fall back from the default C engine.
data = pd.read_csv(
    './ml-1m/ratings.dat',
    sep="::",
    engine="python",
    names=["userId", "movieId", "rating", "timestamp"],
)

  data = pd.read_csv('./ml-1m/ratings.dat',delimiter="::",names=["userId","movieId","rating","timestamp"])


In [5]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
data.shape

(1000209, 4)

In [7]:
data['userId'].unique()

array([   1,    2,    3, ..., 6038, 6039, 6040], dtype=int64)

# Number of unique userIDs

In [8]:
# Count distinct user ids directly instead of materialising the unique array.
data['userId'].nunique()

6040

# Number of unique movieIDs


In [9]:
# Count distinct movie ids directly instead of materialising the unique array.
data['movieId'].nunique()

3706

# Using label encoder

In [10]:
# One encoder per id column, kept separate so user ids and movie ids each get
# their own code space. They are fitted in the next cell.
user_le = preprocessing.LabelEncoder()
movie_le = preprocessing.LabelEncoder()

In [11]:
# Fit each encoder on its raw id column and store the resulting integer codes
# in new columns (e.g. userId 1 -> user 0, movieId 1193 -> movie 1104 in the
# frame displayed below). The raw id columns are left in place.
data['user'] = user_le.fit_transform(data['userId'])
data['movie'] = movie_le.fit_transform(data['movieId'])

In [12]:
# Number of distinct encoded users / movies. These are later passed to
# RecommenderV1, where they set the number of rows of each embedding table.
n_users = data['user'].nunique()
n_movies = data['movie'].nunique()

In [13]:
n_users

6040

In [14]:
n_movies

3706

In [15]:
data

Unnamed: 0,userId,movieId,rating,timestamp,user,movie
0,1,1193,5,978300760,0,1104
1,1,661,3,978302109,0,639
2,1,914,3,978301968,0,853
3,1,3408,4,978300275,0,3177
4,1,2355,5,978824291,0,2162
...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,6039,1019
1000205,6040,1094,5,956704887,6039,1022
1000206,6040,562,5,956704746,6039,548
1000207,6040,1096,4,956715648,6039,1024


# Rank each user's ratings by timestamp (most recent first) so that the latest rating per user can be held out as the test set

In [16]:
# Rank every rating within its user by timestamp, newest first
# (ascending=False), so rank 1 is that user's most recent rating.
# method='first' breaks timestamp ties by row order, which guarantees exactly
# one rank-1 row per user.
data['rank_latest']  = data.groupby(['userId'])['timestamp'].rank(method='first',ascending=False)

In [17]:
data

Unnamed: 0,userId,movieId,rating,timestamp,user,movie,rank_latest
0,1,1193,5,978300760,0,1104,42.0
1,1,661,3,978302109,0,639,23.0
2,1,914,3,978301968,0,853,28.0
3,1,3408,4,978300275,0,3177,47.0
4,1,2355,5,978824291,0,2162,4.0
...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,6039,1019,161.0
1000205,6040,1094,5,956704887,6039,1022,293.0
1000206,6040,562,5,956704746,6039,548,305.0
1000207,6040,1096,4,956715648,6039,1024,234.0


In [18]:
# Leave-one-out split: each user's most recent rating (rank 1) becomes the test
# row, and every other rating goes to the training set.
train_ratings = data.loc[data["rank_latest"].ne(1), ["user", "movie", "rating"]]
test_ratings = data.loc[data["rank_latest"].eq(1), ["user", "movie", "rating"]]

In [19]:
test_ratings.shape

(6040, 3)

In [20]:
train_ratings.shape

(994169, 3)

In [21]:
train_ratings

Unnamed: 0,user,movie,rating
0,0,1104,5
1,0,639,3
2,0,853,3
3,0,3177,4
4,0,2162,5
...,...,...,...
1000204,6039,1019,1
1000205,6039,1022,5
1000206,6039,548,5
1000207,6039,1024,4


In [22]:
# Features are every column except the last; the target is the last column
# (the rating).
X_train = train_ratings.iloc[:, :-1]
y_train = train_ratings.iloc[:, -1]
X_test = test_ratings.iloc[:, :-1]
y_test = test_ratings.iloc[:, -1]

In [23]:
X_train

Unnamed: 0,user,movie
0,0,1104
1,0,639
2,0,853
3,0,3177
4,0,2162
...,...,...
1000204,6039,1019
1000205,6039,1022
1000206,6039,548
1000207,6039,1024


In [24]:
X_train.shape

(994169, 2)

### To use Keras, we split the DataFrame into a list of input columns (one per model input)


In [25]:
# Keras takes one array per model input, in the order [user, movie]. Convert
# each column to a plain NumPy array so the model receives positional arrays
# rather than pandas Series that still carry the filtered, non-contiguous row
# index of the original frame.
X_train_array = [X_train.iloc[:, 0].to_numpy(), X_train.iloc[:, 1].to_numpy()]
X_test_array = [X_test.iloc[:, 0].to_numpy(), X_test.iloc[:, 1].to_numpy()]

In [26]:
np.shape(X_train_array)

(2, 994169)

# Number of latent factors for the embeddings: 50

# Try different embedding sizes and see what works better

In [27]:
n_factors = 50

In [28]:
 n_users

6040

In [29]:
user = Input(shape=(1,))

In [30]:
user

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'input_1')>

In [31]:
user.shape

TensorShape([None, 1])

In [32]:
# Scratch exploration of a single embedding lookup on the `user` input, used to
# inspect the output shape. RecommenderV1 below builds its own copies of these
# layers, so this tensor is not part of the trained model.
u = Embedding(n_users, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(1e-6))(user)

In [33]:
u.shape

TensorShape([None, 1, 50])

In [34]:
 u = Reshape((n_factors,))(u)

In [35]:
u.shape

TensorShape([None, 50])

# TODO: Explain what the Adam optimizer does, what other optimizers exist, and why Adam was chosen here

In [36]:
def RecommenderV1(n_users, n_movies, n_factors, learning_rate=0.001, l2_reg=1e-6):
    """Build and compile a dot-product (matrix-factorisation) rating model.

    Each user id and each movie id is looked up in its own embedding table of
    width ``n_factors``; the prediction is the dot product of the two vectors.

    Args:
        n_users: Number of rows in the user embedding table. User ids fed to
            the model must lie in ``[0, n_users)``.
        n_movies: Number of rows in the movie embedding table. Movie ids fed
            to the model must lie in ``[0, n_movies)``.
        n_factors: Width of each embedding vector.
        learning_rate: Step size for the Adam optimizer.
        l2_reg: L2 penalty coefficient applied to both embedding tables.

    Returns:
        A Keras ``Model`` compiled with mean-squared-error loss. It takes
        ``[user_ids, movie_ids]`` (each of shape ``(batch, 1)``) and returns a
        ``(batch, 1)`` prediction.
    """
    user = Input(shape=(1,))
    u = Embedding(n_users, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(l2_reg))(user)
    # The embedding lookup yields (batch, 1, n_factors); drop the length-1 axis.
    u = Reshape((n_factors,))(u)

    movie = Input(shape=(1,))
    m = Embedding(n_movies, n_factors, embeddings_initializer='he_normal',
                  embeddings_regularizer=l2(l2_reg))(movie)
    m = Reshape((n_factors,))(m)

    # Predicted rating = <user vector, movie vector>.
    x = Dot(axes=1)([u, m])

    model = Model(inputs=[user, movie], outputs=x)
    # `lr` is a deprecated alias that newer Keras releases reject;
    # `learning_rate` is the supported argument name.
    opt = Adam(learning_rate=learning_rate)
    model.compile(loss='mean_squared_error', optimizer=opt)

    return model

## user embedding = num_users * factors  = 6040 * 50 = 302000


In [37]:
model = RecommenderV1(n_users, n_movies, n_factors)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 50)        302000      ['input_2[0][0]']                
                                                                                                  
 embedding_2 (Embedding)        (None, 1, 50)        185300      ['input_3[0][0]']                
                                                                                              

  super(Adam, self).__init__(name, **kwargs)


# TODO: Understand what batch size means; vary the hyperparameters and see whether we get better accuracy (lower validation loss)

In [38]:
# Train for 5 epochs with mini-batches of 64 rows, logging progress per epoch.
# NOTE(review): validation_data is the held-out test split, so the per-epoch
# val_loss is computed on the same rows that are later used to spot-check
# predictions; there is no separate validation set.
history = model.fit(x=X_train_array, y=y_train, batch_size=64, epochs=5,
                    verbose=1, validation_data=(X_test_array, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [39]:
# Actual rating of the test row at position 120 (positional, not label-based).
y_test.iloc[120]

5

In [40]:
# Predicted rating for the test row at position 120, to compare with the actual
# rating shown in the previous cell. This scores the whole test set and then
# picks out one row.
model.predict(X_test_array)[120]

array([4.828949], dtype=float32)