# Book Recommender System in Tensorflow

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

In [2]:
k = 50

epochs = 10
display_step = 10

learning_rate = 0.3

batch_size = 25

### Reading Dataset and splitting it in a training set and a test set

In [3]:
sql = 'SELECT user_id, book_id, rating, date_created FROM public."Reviews"'

engine = create_engine('postgresql://ece651_ml:TVL3MV0mguz0DOhLbbm2@localhost:5432/ece651')

# Reading dataset

df = pd.pandas.read_sql(sql, engine)

y = df.date_created
df = df.drop('date_created', axis=1)

df.columns = ['user', 'book', 'rating']

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

train_data = X_train
test_data = X_test

num_books = df.book.nunique()
num_users = df.user.nunique()

print("USERS: {} BOOKS: {}".format(num_users, num_books))
print(df.head(5))

USERS: 148 BOOKS: 48
   user  book  rating
0  2292   360       5
1  2293   360       5
2  2294   360       5
3  2297   655       4
4  2295   360       5


### Loading training set with three columns: user, book and ratings

In [4]:
# Normalize in [0, 1]

u = df['user'].values.astype(float)

user_min = u.min()
user_range = u.max() - u.min()

min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(u.reshape(-1,1))
df_normalized = pd.DataFrame(x_scaled)
df['user'] = df_normalized


b = df['book'].values.astype(float)

book_min = b.min()
book_range = b.max() - b.min()

min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(b.reshape(-1,1))
df_normalized = pd.DataFrame(x_scaled)
df['book'] = df_normalized

r = df['rating'].values.astype(float)

rating_min = r.min()
rating_range = r.max() - r.min()

min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(r.reshape(-1,1))
df_normalized = pd.DataFrame(x_scaled)
df['rating'] = df_normalized

### Convert DataFrame in user-item matrix

In [5]:
matrix = df.pivot(index='user', columns='book', values='rating')
matrix.fillna(0, inplace=True)

### Users and items ordered as they are in matrix

In [6]:
users = matrix.index.tolist()
books = matrix.columns.tolist()

matrix = matrix.values

print("Matrix shape: {}".format(matrix.shape))

Matrix shape: (148, 48)


### Network Parameters

In [7]:
num_input = num_books   # num of items
num_hidden_1 = 10       # 1st layer num features
num_hidden_2 = 5        # 2nd layer num features (the latent dim)

X = tf.placeholder(tf.float64, [None, num_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
}

biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
}

### Building the encoder

In [8]:
def encoder(x):
    # Encoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    # Encoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    return layer_2

### Building the decoder

In [9]:
def decoder(x):
    # Decoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    # Decoder Hidden layer with sigmoid activation #2
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    return layer_2

### Construct model

In [10]:
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

### Prediction

In [11]:
y_pred = decoder_op

### Targets are the input data.

In [12]:
y_true = X

### Define loss and optimizer, minimize the squared error

In [13]:
loss = tf.losses.mean_squared_error(y_true, y_pred)
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)

predictions = pd.DataFrame()

### Define evaluation metrics

In [14]:
eval_x = tf.placeholder(tf.int32, )
eval_y = tf.placeholder(tf.int32, )
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

### Initialize the variables

In [15]:
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()

### Train the Model

In [16]:
with tf.Session() as session:
    session.run(init)
    session.run(local_init)

    num_batches = int(matrix.shape[0] / batch_size)
    matrix = np.array_split(matrix, num_batches)

    for i in range(epochs):

        avg_cost = 0

        for batch in matrix:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        avg_cost /= num_batches

        print("Epoch: {} Loss: {}".format(i + 1, avg_cost))

        # if i % display_step == 0 or i == 1:
        #     print('Step %i: Minibatch Loss: %f' % (i, l))

    print("Predictions...")

    matrix = np.concatenate(matrix, axis=0)

    preds = session.run(decoder_op, feed_dict={X: matrix})

    # print(matrix)
    # print(preds)
    
    predictions = predictions.append(pd.DataFrame(preds))

    predictions = predictions.stack().reset_index(name='rating')
    predictions.columns = ['user', 'book', 'rating']
    predictions['user'] = predictions['user'].map(lambda value: users[value])
    predictions['book'] = predictions['book'].map(lambda value: books[value])

    print(predictions)
    print(predictions.shape)
    
    keys = ['user', 'book']
    i1 = predictions.set_index(keys).index
    i2 = df.set_index(keys).index

    recs = predictions[~i1.isin(i2)]
    recs = recs.sort_values(['user', 'rating'], ascending=[True, False])
    recs = recs.groupby('user').head(k)
    recs.to_csv('prediction.csv', sep=',', index=False, header=False)

Epoch: 1 Loss: 0.3135050177574158
Epoch: 2 Loss: 0.3073942303657532
Epoch: 3 Loss: 0.2996737420558929
Epoch: 4 Loss: 0.29001798033714293
Epoch: 5 Loss: 0.27809120416641236
Epoch: 6 Loss: 0.2635666251182556
Epoch: 7 Loss: 0.246157768368721
Epoch: 8 Loss: 0.22572847902774812
Epoch: 9 Loss: 0.202572825551033
Epoch: 10 Loss: 0.1777025729417801
Predictions...
      user      book    rating
0      0.0  0.000000  0.307350
1      0.0  0.167143  0.222597
2      0.0  0.176429  0.235368
3      0.0  0.177857  0.109023
4      0.0  0.180000  0.204449
5      0.0  0.181429  0.322745
6      0.0  0.182857  0.435355
7      0.0  0.211429  0.406789
8      0.0  0.377857  0.221879
9      0.0  0.378571  0.446230
10     0.0  0.382857  0.242514
11     0.0  0.383571  0.010432
12     0.0  0.384286  0.210606
13     0.0  0.388571  0.046564
14     0.0  0.389286  0.963894
15     0.0  0.390714  0.292323
16     0.0  0.391429  0.275621
17     0.0  0.392143  0.213142
18     0.0  0.393571  0.494022
19     0.0  0.465000  0

In [17]:
recs['user'] = recs['user'] * user_range + user_min
recs['book'] = recs['book'] * book_range + book_min

recs.sort_values(['user', 'rating'], ascending=[True, False])

Unnamed: 0,user,book,rating
19,1.0,777.0,0.989830
14,1.0,671.0,0.963894
38,1.0,905.0,0.938429
21,1.0,779.0,0.514966
18,1.0,677.0,0.494022
34,1.0,831.0,0.487434
22,1.0,780.0,0.464818
9,1.0,656.0,0.446230
6,1.0,382.0,0.435355
7,1.0,422.0,0.406789


In [18]:
recs.loc[recs['user'] == 2380]

Unnamed: 0,user,book,rating
3523,2380.0,777.0,0.987779
3518,2380.0,671.0,0.973378
3542,2380.0,905.0,0.945409
3545,2380.0,1308.0,0.821209
3546,2380.0,1374.0,0.754919
3538,2380.0,831.0,0.539009
3525,2380.0,779.0,0.520358
3528,2380.0,787.0,0.441038
3513,2380.0,656.0,0.419657
3530,2380.0,789.0,0.362183


In [19]:
recs.loc[recs['user'] == 2380]['book'].shape

(43,)

In [20]:
user_2380_top = recs.loc[recs['user'] == 2380]

expected_2380_book_ids = [382,670,662,375,677];
for x in expected_2380_book_ids:
    if x not in user_2380_top['book'].values.round(): 
        print(f'Couldn\'t find {x} for user 2380')

Couldn't find 382 for user 2380
Couldn't find 670 for user 2380
Couldn't find 662 for user 2380
Couldn't find 375 for user 2380
Couldn't find 677 for user 2380


In [21]:
recs.loc[recs['user'] == 1]

Unnamed: 0,user,book,rating
19,1.0,777.0,0.98983
14,1.0,671.0,0.963894
38,1.0,905.0,0.938429
21,1.0,779.0,0.514966
18,1.0,677.0,0.494022
34,1.0,831.0,0.487434
22,1.0,780.0,0.464818
9,1.0,656.0,0.44623
6,1.0,382.0,0.435355
7,1.0,422.0,0.406789


In [22]:
recs.loc[recs['user'] == 1]['book'].shape

(40,)

In [23]:
user_1_top = recs.loc[recs['user'] == 1].head(10)

expected_1_book_ids = [1387,1374,1420,1526,1308,1384,1210,1385];
for x in expected_1_book_ids:
    if x not in user_1_top['book'].values.round(): 
        print(f'Couldn\'t find {x} for user 1')

Couldn't find 1387 for user 1
Couldn't find 1374 for user 1
Couldn't find 1420 for user 1
Couldn't find 1526 for user 1
Couldn't find 1308 for user 1
Couldn't find 1384 for user 1
Couldn't find 1210 for user 1
Couldn't find 1385 for user 1
