## Experiment 2: Explicit Feedback

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

### Ratings file
Each line contains a rated video game:

- a user
- an item
- a rating from 1 to 5

In [2]:
raw_ratings = pd.read_csv('ratings_latest.csv', sep=',')

raw_ratings.head()

Unnamed: 0,id,rate,appid
0,76561197960265729,1.0,10
1,76561197960265729,1.0,20
2,76561197960265729,1.0,30
3,76561197960265729,1.0,40
4,76561197960265729,1.0,50


### Metadata file

This file contains information about each game, specifically:
- item
- name
- genres

In [3]:
items = pd.read_csv('metadata.csv', sep=',')

items.head()

Unnamed: 0,appid,appname,genres
0,10,Counter-Strike,1/
1,20,Team Fortress Classic,1/
2,30,Day of Defeat,1/
3,40,Deathmatch Classic,1/
4,50,Half-Life: Opposing Force,1/


In [4]:
all_ratings = pd.merge(items, raw_ratings)

all_ratings.head()

Unnamed: 0,appid,appname,genres,id,rate
0,10,Counter-Strike,1/,76561197960265729,1.0
1,10,Counter-Strike,1/,76561197960265730,2.05
2,10,Counter-Strike,1/,76561197960265731,1.0
3,10,Counter-Strike,1/,76561197960265733,1.4
4,10,Counter-Strike,1/,76561197960265734,1.0


In [5]:
max_user_id = all_ratings['id'].max()
max_user_id

76561198800607700

In [6]:
max_item_id = all_ratings['appid'].max()
max_item_id

787370

In [15]:
users = all_ratings.id.unique()
user_map = {i:val for i,val in enumerate(users)}
inverse_user_map = {val:i for i,val in enumerate(users)}


games = all_ratings.appid.unique()
game_map = {i:val for i,val in enumerate(games)}
inverse_game_map = {val:i for i,val in enumerate(games)}

all_ratings["id"] = all_ratings["id"].map(inverse_user_map)

all_ratings["old_id"] = all_ratings["appid"] # copying for join with metadata
all_ratings["appid"] = all_ratings["appid"].map(inverse_game_map)

print ("A total of: ", users.shape[0], "unique users")
print ("A total of: ", games.shape[0], "unique games")

A total of:  676668 unique users
A total of:  18865 unique games


In [16]:
from sklearn.model_selection import train_test_split

ratings_train, ratings_test = train_test_split(
    all_ratings, test_size=0.2, random_state=0)

user_id_train = ratings_train['id']
item_id_train = ratings_train['appid']
rating_train = ratings_train['rate']

user_id_test = ratings_test['id']
item_id_test = ratings_test['appid']
rating_test = ratings_test['rate']

ratings_train.head()

Unnamed: 0,appid,appname,genres,id,rate,old_id
19998751,3625,Famaze,4/37/23/3/2/,164315,1.0,3625
11140094,1160,Darksiders™,1/25/,134772,1.0,1160
17517238,2269,Tropico 5,3/28/2/,249377,1.1,2269
9452590,816,Alpha Protocol™,1/3/,45636,1.1,816
6814866,333,Call of Duty®: Modern Warfare® 2,1/,145808,1.5,333


### Supervised Ratings Prediction with Explicit Feedback

In [17]:
import tensorflow as tf
def dot_mode(inputs):
    """Work around for Keras bug with merge([...], mode='dot').

    https://github.com/fchollet/keras/issues/2626

    The dot product of 2 embeddings can be used as an unnormalized
    approximation to the cosine similarity.
    """
    latent_codes_1, latent_codes_2 = inputs
    return tf.reduce_sum(latent_codes_1 * latent_codes_2, axis=-1)

In [18]:
from keras.layers import Input, Embedding, Flatten, merge, Dense, Dropout, Lambda, Dot, Reshape
from keras.models import Model
import keras.backend as K

In [19]:
# # For each sample we input the integer identifiers
# # of a single user and a single item
# user_id_input = Input(shape=[1], name='user')
# item_id_input = Input(shape=[1], name='item')

# embedding_size = 30
# user_embedding = Embedding(output_dim=embedding_size, input_dim=max_user_id + 1,
#                            input_length=1, name='user_embedding')(user_id_input)
# item_embedding = Embedding(output_dim=embedding_size, input_dim=max_item_id + 1,
#                            input_length=1, name='item_embedding')(item_id_input)

# # reshape from shape: (batch_size, input_length, embedding_size)
# # to shape: (batch_size, input_length * embedding_size) which is
# # equal to shape: (batch_size, embedding_size)
# user_vecs = Flatten()(user_embedding)
# item_vecs = Flatten()(item_embedding)

# # y = merge([user_vecs, item_vecs], mode=dot_mode, output_shape=(1,))

# y = Dot(-1, normalize=False)([user_vecs, item_vecs])

# model = Model(inputs=[user_id_input, item_id_input], outputs=y)


# #model = Model(input=[user_id_input, item_id_input], output=y)
# model.compile(optimizer='adam', loss='mae')

In [20]:
all_ratings.id.unique().shape[0]

676668

In [21]:
user_id_input = Input(shape=[1], name='user')
item_id_input = Input(shape=[1], name='item')

embedding_size = 30
user_embedding = Embedding(output_dim=embedding_size, input_dim=users.shape[0],
                           input_length=1, name='user_embedding')(user_id_input)
item_embedding = Embedding(output_dim=embedding_size, input_dim=games.shape[0],
                           input_length=1, name='item_embedding')(item_id_input)

user_vecs = Reshape([embedding_size])(user_embedding)
item_vecs = Reshape([embedding_size])(item_embedding)

y = Dot(1, normalize=False)([user_vecs, item_vecs])

model = Model(inputs=[user_id_input, item_id_input], outputs=y)

model.compile(loss='mse',
              optimizer="adam"
             )

In [22]:
initial_train_preds = model.predict([user_id_train, item_id_train])
# initial_train_preds.shape

In [23]:
initial_train_preds.shape

(20587142, 1)

In [35]:
import numpy as np

squared_differences = np.square(initial_train_preds - rating_train.values)
absolute_differences = np.abs(initial_train_preds - rating_train.values)

print("Random init MSE:", np.mean(squared_differences))
print("Random init MAE:", np.mean(absolute_differences))


MemoryError: 