In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from math import sqrt
import tensorlayer as tl

  from ._conv import register_converters as _register_converters


# Data

Following the AutoRec paper, we use only the `ratings.dat` file from the [MovieLens](https://grouplens.org/datasets/movielens/) 1M dataset, which contains about 1 million ratings from roughly 6,000 users on roughly 4,000 movies.

The format for the data is: UserID::MovieID::Rating::Timestamp

For my example I exclude the Timestamp column.

In [2]:
# Load the raw ratings file into a DataFrame.
# The file has no header row, so the columns are numbered 0..3, and the
# fields are separated by "::" (parsed with pandas' python engine).
ratings = pd.read_csv(
    '../../data/ml-1m/ratings.dat',
    header=None,
    engine='python',
    sep="::",
)

In [3]:
# Build the dense user x movie matrix from the first three columns:
# rows are indexed by column 0 (UserID), columns by column 1 (MovieID) and
# the cells hold column 2 (Rating).  Movies a user never rated become 0.
ratings_pivot = pd.pivot_table(ratings[[0, 1, 2]],
                               values=2, index=0, columns=1).fillna(0)

print(ratings_pivot)

# Split by user: 80% of the rows for training, the rest for testing.
# A fixed random_state makes the split -- and therefore every RMSE reported
# further down -- repeatable across "Restart & Run All".
X_train, X_test = train_test_split(ratings_pivot, train_size=0.8,
                                   random_state=42)


1     1     2     3     4     5     6     7     8     9     10    ...   3943  \
0                                                                 ...          
1      5.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
2      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
3      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
4      0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
5      0.0   0.0   0.0   0.0   0.0   2.0   0.0   0.0   0.0   0.0  ...    0.0   
6      4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
7      0.0   0.0   0.0   0.0   0.0   4.0   0.0   0.0   0.0   0.0  ...    0.0   
8      4.0   0.0   0.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
9      5.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
10     5.0   5.0   0.0   0.0   0.0   0.0   4.0   0.0   0.0   0.0  ...    0.0   
11     0.0   0.0   0.0   0.0   0.0   0.0



In [4]:
# Decide number of neurons for input, hidden, and output layers.
# The input/output width is one unit per movie column; it is read from the
# data instead of being hard-coded so the network always matches the matrix
# (3706 columns for the ml-1m ratings used here).
n_nodes_inpl = X_train.shape[1]
n_nodes_hl1 = 500
n_nodes_outl = n_nodes_inpl  # the output layer reconstructs the input row
learn_rate = 0.1   # how fast the model should learn
batch_size = 100  # how many users (rows) to use together for training
hm_epochs = 120   # how many times to go through the entire dataset
tot_users = X_train.shape[0]  # number of users (rows) in the training split
# for ml-1m with the 80% split this prints (4832, 3706)
print(X_train.shape)

(4832, 3706)


In [5]:
# hidden layer weights; the extra row (+1) carries the weights of the
# constant bias node that is appended to the input below
hidden_1_layer_vals = {'weights': tf.Variable(
    tf.random_normal([n_nodes_inpl + 1, n_nodes_hl1]))}
# output layer weights; again one extra row for the hidden-layer bias node
output_layer_vals = {'weights': tf.Variable(
    tf.random_normal([n_nodes_hl1 + 1, n_nodes_outl]))}

# one row per user, one column per movie; the width comes from the same
# constant the weight matrix uses so the two can never disagree
input_layer = tf.placeholder('float', [None, n_nodes_inpl])

# add a constant node to the first layer
# it needs to have the same number of rows as the input layer for me to be
# able to concatenate it later
input_layer_const = tf.fill([tf.shape(input_layer)[0], 1], 1.0)
input_layer_concat = tf.concat([input_layer, input_layer_const], 1)
# multiply output of input_layer with a weight matrix
layer_1 = tf.nn.sigmoid(tf.matmul(input_layer_concat,
                                  hidden_1_layer_vals['weights']))
# adding one bias node to the hidden layer
layer1_const = tf.fill([tf.shape(layer_1)[0], 1], 1.0)
layer_concat = tf.concat([layer_1, layer1_const], 1)
# multiply output of hidden with a weight matrix to get final output
# (no activation is applied to the output layer)
output_layer = tf.matmul(layer_concat, output_layer_vals['weights'])
# output_true shall have the original shape for error calculations
output_true = tf.placeholder('float', [None, n_nodes_outl])
# define our cost function: mean squared error over every cell
# NOTE(review): unrated cells (stored as 0) are included in this mean, so
# the network is also trained to reproduce the zeros -- confirm whether a
# mask over observed ratings was intended
meansq = tf.reduce_mean(tf.square(output_layer - output_true))
# define our optimizer
optimizer = tf.train.AdagradOptimizer(learn_rate).minimize(meansq)

In [6]:
# initialising variables and starting the session
# (the session is intentionally left open: `sess` is reused further down
# to produce predictions)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
# train for epoch limit
for epoch in range(hm_epochs):
    epoch_loss = 0    # running sum of the per-batch mean squared errors

    # Walk through the training users in consecutive mini-batches.  Stepping
    # the start offset by batch_size (rather than counting whole batches)
    # also trains on the final, shorter batch when tot_users is not a
    # multiple of batch_size, so no user is silently left out.
    for start in range(0, tot_users, batch_size):
        epoch_x = X_train.iloc[start:start + batch_size]
        # the autoencoder's target is its own input
        _, c = sess.run([optimizer, meansq],
                        feed_dict={input_layer: epoch_x,
                                   output_true: epoch_x})
        epoch_loss += c

    # reconstruct both splits with the current weights to track progress
    output_train = sess.run(output_layer,
                            feed_dict={input_layer: X_train})
    output_test = sess.run(output_layer,
                           feed_dict={input_layer: X_test})

    print('RMSE train', sqrt(MSE(output_train, X_train)),
          'RMSE test', sqrt(MSE(output_test, X_test)))
    print('Epoch', epoch, '/', hm_epochs, 'loss:', epoch_loss)

RMSE train 10.298523265310767 RMSE test 10.313349140825943
Epoch 0 / 120 loss: 7132.146530151367
RMSE train 9.039114118720658 RMSE test 9.066486223692962
Epoch 1 / 120 loss: 4453.74641418457
RMSE train 8.245614205574212 RMSE test 8.282472099757166
Epoch 2 / 120 loss: 3590.8529510498047
RMSE train 7.603365730259653 RMSE test 7.654883615389325
Epoch 3 / 120 loss: 3026.9862022399902
RMSE train 7.058516663147201 RMSE test 7.125202093327304
Epoch 4 / 120 loss: 2590.63826751709
RMSE train 6.587856293194937 RMSE test 6.662512561843248
Epoch 5 / 120 loss: 2246.2126502990723
RMSE train 6.159734669630153 RMSE test 6.245925248388828
Epoch 6 / 120 loss: 1961.824291229248
RMSE train 5.775151278713948 RMSE test 5.8712707800536155
Epoch 7 / 120 loss: 1719.4101181030273
RMSE train 5.436558996520675 RMSE test 5.545310270427774
Epoch 8 / 120 loss: 1517.3590869903564
RMSE train 5.143503544995957 RMSE test 5.260421958834906
Epoch 9 / 120 loss: 1351.4754829406738
RMSE train 4.890538155580832 RMSE test 5.01

RMSE train 2.425276016363174 RMSE test 2.7173557237590105
Epoch 85 / 120 loss: 282.7868094444275
RMSE train 2.418883908899621 RMSE test 2.7119062410816275
Epoch 86 / 120 loss: 281.23749256134033
RMSE train 2.4123828475341824 RMSE test 2.706463629870963
Epoch 87 / 120 loss: 279.73076915740967
RMSE train 2.405820836275776 RMSE test 2.7012154655609524
Epoch 88 / 120 loss: 278.23960304260254
RMSE train 2.3997782722293985 RMSE test 2.6960013110753893
Epoch 89 / 120 loss: 276.76927375793457
RMSE train 2.393870594196531 RMSE test 2.690942100745125
Epoch 90 / 120 loss: 275.37907886505127
RMSE train 2.388132262604539 RMSE test 2.6861459582261267
Epoch 91 / 120 loss: 274.0567479133606
RMSE train 2.382626360269453 RMSE test 2.6813219180146826
Epoch 92 / 120 loss: 272.75160932540894
RMSE train 2.37694931104114 RMSE test 2.6764738453754644
Epoch 93 / 120 loss: 271.481849193573
RMSE train 2.3713769873977557 RMSE test 2.6717404480380047
Epoch 94 / 120 loss: 270.2225046157837
RMSE train 2.365910322960

In [7]:
# # Fun part to find user recommendations
# # Select titles
# titles = pd.read_csv('../../data/ml-1m/movies.dat',
#                      sep="::", header=None, engine='python')
# titles.head()

In [8]:
# pd.merge(target, titles, on='MovieID', how='left', suffixes=('_',''))
# Pick one held-out user (positional row 99 of the test split) and show
# that user's rating row.
sample_user = X_test.iloc[99]
print("Sample User:", sample_user)
# Get the predicted ratings: the single row is wrapped in a list so it is
# fed to the network as a batch of one.
sample_user_pred = sess.run(
    output_layer,
    feed_dict={input_layer: [sample_user]},
)
print("Prediction Val:", sample_user_pred[0])
# Predicted value at position 3601 of the output vector (a column position,
# which is not necessarily MovieID 3601).
print(sample_user_pred[0][3601])

Sample User: 1
1       0.0
2       0.0
3       0.0
4       0.0
5       0.0
6       4.0
7       4.0
8       0.0
9       0.0
10      0.0
11      5.0
12      0.0
13      0.0
14      4.0
15      0.0
16      4.0
17      0.0
18      0.0
19      0.0
20      4.0
21      4.0
22      0.0
23      0.0
24      0.0
25      3.0
26      0.0
27      0.0
28      0.0
29      0.0
30      0.0
       ... 
3923    0.0
3924    0.0
3925    0.0
3926    0.0
3927    0.0
3928    0.0
3929    0.0
3930    0.0
3931    0.0
3932    0.0
3933    0.0
3934    0.0
3935    0.0
3936    0.0
3937    0.0
3938    0.0
3939    0.0
3940    0.0
3941    0.0
3942    0.0
3943    0.0
3944    0.0
3945    0.0
3946    0.0
3947    0.0
3948    0.0
3949    0.0
3950    0.0
3951    0.0
3952    0.0
Name: 4250, Length: 3706, dtype: float64
Prediction Val: [ 3.5325303  -0.37356472  0.8623571  ...  0.25967836 -0.38567734
  2.234995  ]
0.7380074


In [9]:
# Display the raw prediction array for the sample user (a batch of one row).
sample_user_pred

array([[ 3.5325303 , -0.37356472,  0.8623571 , ...,  0.25967836,
        -0.38567734,  2.234995  ]], dtype=float32)

In [14]:
# Highest predicted rating for the sample user.  ndarray.max() does the
# reduction inside NumPy instead of iterating the array element by element
# with the Python builtin max().
sample_user_pred[0].max()

5.324612