<a href="https://colab.research.google.com/github/enriqueav/MetacriticUserscore/blob/master/metacritic_user_scores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the pinned TensorFlow release this notebook was tested with (1.7.0),
# not the latest one. -q reduces pip's output; -U is pip's upgrade flag.
!pip install -q -U tensorflow==1.7.0

[K    100% |████████████████████████████████| 48.0MB 831kB/s 
[K    100% |████████████████████████████████| 3.1MB 8.5MB/s 
[K    100% |████████████████████████████████| 890kB 16.7MB/s 
[?25h  Building wheel for html5lib (setup.py) ... [?25ldone
[31mmagenta 0.3.19 has requirement tensorflow>=1.12.0, but you'll have tensorflow 1.7.0 which is incompatible.[0m
[?25h

In [2]:
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
# Short alias so layer constructors can be written as `layers.Dense(...)`.
layers = keras.layers

# This code was tested with TensorFlow v1.7; print the loaded version so a
# mismatch with the pinned install is visible at the top of the run.
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 1.7.0


Now we are going to download the dataset, which was originally obtained from [this Kaggle dataset](https://www.kaggle.com/dahlia25/metacritic-video-game-comments).

In [3]:
# Fetch the zipped dataset from the project repository; -nc skips the download
# when the archive already exists in the working directory.
!wget -nc https://github.com/enriqueav/MetacriticUserscore/raw/master/metacritic-video-game-comments.zip
# Extract the archive; -o overwrites previously extracted files without prompting.
!unzip -o metacritic-video-game-comments.zip
# Grant read/write/execute to all users on every file matching metacritic*.
# NOTE(review): 777 is broader than reading a CSV should need — confirm whether
# this step is required at all.
!chmod 777 metacritic*

--2019-04-21 10:04:28--  https://github.com/enriqueav/MetacriticUserscore/raw/master/metacritic-video-game-comments.zip
Resolving github.com (github.com)... 192.30.255.113, 192.30.255.112
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/enriqueav/MetacriticUserscore/master/metacritic-video-game-comments.zip [following]
--2019-04-21 10:04:29--  https://raw.githubusercontent.com/enriqueav/MetacriticUserscore/master/metacritic-video-game-comments.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81214297 (77M) [application/zip]
Saving to: ‘metacritic-video-game-comments.zip’


2019-04-21 10:04:31 (311 MB/s) - ‘metacritic-video-game-comments.zip’ sav

In [4]:
# Build the working data frame in a single pipeline: load, shuffle, clean, filter.
comments = (
    pd.read_csv('metacritic_game_user_comments.csv')
    # Shuffle every row with a fixed random seed so the order is reproducible
    .sample(frac=1, random_state=387)
    # Discard rows that have no comment text
    .loc[lambda frame: frame['Comment'].notnull()]
    # Remove the 'Unnamed: 0' and 'Username' columns
    .drop(['Unnamed: 0', 'Username'], axis=1)
    # Keep only comments longer than 200 characters
    .loc[lambda frame: frame['Comment'].str.len() > 200]
)

# Report the non-null count of each column, then preview the first 5 rows
print(comments.count())
print(comments.head())

Title        225930
Platform     225930
Userscore    225930
Comment      225930
dtype: int64
                              Title      Platform  Userscore  \
277924  Call of Duty: Black Ops III  PlayStation4          6   
224106                      FIFA 18  PlayStation4          9   
169926      Resistance: Fall of Man  PlayStation3          9   
172446                    Titanfall       XboxOne          9   
70980   Super Smash Bros. for Wii U          WiiU          9   

                                                  Comment  
277924  I liked it. But there is some downfalls. It ai...  
224106  A welcomed evolution of  FIFA 17.Nothing seems...  
169926   I'm generally a Noob when it comes to first-p...  
172446  Wow, It is clear there is a lot of Sony fanboy...  
70980   This is the best game in the series. Mostly ev...  


In [5]:
# Use the first 80% of the (already shuffled) rows for training, the rest for testing
n_rows = len(comments)
train_size = int(n_rows * .8)
print("Train size: %d" % train_size)
print("Test size: %d" % (n_rows - train_size))

# Cut the frame once by position, then pick the columns from each part
train_rows = comments.iloc[:train_size]
test_rows = comments.iloc[train_size:]

# Features are the comment texts, labels are the user scores
comments_train = train_rows['Comment']
labels_train = train_rows['Userscore']
comments_test = test_rows['Comment']
labels_test = test_rows['Userscore']

Train size: 180744
Test size: 45186


In [0]:
# Vocabulary cap used by the tokenizer and by the model input/embedding sizes.
# This is a hyperparameter, experiment with different values for your dataset
vocab_size = 12000

# Word-level (not character-level) tokenizer for the comment texts
tokenize = keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    char_level=False,
)

# Learn the vocabulary from the training comments only
tokenize.fit_on_texts(comments_train)

In [7]:
# Define our wide model with the functional API.
# Input: one vector of width `vocab_size` per comment.
bow_inputs = layers.Input(shape=(vocab_size,))
inter = layers.Dense(64, activation='relu')(bow_inputs)
# Single output unit without activation: the score is regressed directly.
predictions = layers.Dense(1)(inter)
wide_model = keras.Model(inputs=bow_inputs, outputs=predictions)
# NOTE(review): 'accuracy' is tracked although the loss is MSE on a single
# regression output — confirm this is the intended metric.
wide_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print(), which added a stray "None" line.
wide_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 12000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                768064    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 768,129
Trainable params: 768,129
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# Fixed length of the token-id sequences fed to the deep model
max_seq_length = 2000

# Define our deep model with the Functional API
deep_inputs = layers.Input(shape=(max_seq_length,))
# Map each of the `vocab_size` token ids to an 8-dimensional vector
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
# NOTE(review): no activation is passed here, so this layer is linear —
# confirm that is intended.
embedding = layers.Dense(64)(embedding)
embedding = layers.Dropout(0.3)(embedding)
# Single output unit without activation: the score is regressed directly.
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
# NOTE(review): 'accuracy' is tracked although the loss is MSE on a single
# regression output — confirm this is the intended metric.
deep_model.compile(loss='mse',
                   optimizer='adam',
                   metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print(), which would add a stray "None" line.
deep_model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 2000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 2000, 8)           96000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                1024064   
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 1,120,129
Trainable params: 1,120,129
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Combine wide and deep into one model: concatenate the two single-unit
# outputs and learn a small head on top of them.
merged_out = layers.concatenate([wide_model.output, deep_model.output])
# NOTE(review): no activation is passed here, so this layer is linear —
# confirm that is intended.
merged_out = layers.Dense(64)(merged_out)
merged_out = layers.Dropout(0.3)(merged_out)
merged_out = layers.Dense(1)(merged_out)
# Input order matters: batches must be supplied as [wide input, deep input].
combined_model = keras.Model([wide_model.input, deep_model.input], merged_out)
# NOTE(review): 'accuracy' is tracked although the loss is MSE on a single
# regression output — confirm this is the intended metric.
combined_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print(), which would add a stray "None" line.
combined_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 2000)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 2000, 8)      96000       input_2[0][0]                    
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 16000)        0           embedding_1[0][0]                
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 12000)        0                                            
__________________________________________________________________________________________________
dense_3 (D

In [0]:
# Create the generator for fit and evaluate
def generator(comments_list, labels_list, batch_size, tokenize, max_seq_length):
    """Yield `(x, y)` batches forever, cycling over the data in order.

    Args:
        comments_list: Sliceable sequence of comment texts.
        labels_list: Sliceable sequence of labels aligned with `comments_list`.
        batch_size: Number of comments per batch; must be positive.
        tokenize: Fitted tokenizer providing `texts_to_matrix` and
            `texts_to_sequences`.
        max_seq_length: Value passed to `pad_sequences` as `maxlen`.

    Yields:
        `(x, y)` where `x` is `[bow, embed]` (the `texts_to_matrix` output and
        the post-padded `texts_to_sequences` output for the batch) and `y` is
        the matching slice of `labels_list`.

    Raises:
        ValueError: If `batch_size` is not positive or `comments_list` is empty.

    Only full batches are produced, so the trailing
    `len(comments_list) % batch_size` items are never yielded. The one
    exception is a data set smaller than a single batch: it is yielded whole
    as one short batch (this case used to fail with ZeroDivisionError).
    """
    data_set_len = len(comments_list)
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if data_set_len == 0:
        raise ValueError("comments_list must not be empty")

    # At least one batch per pass, so the wrap-around below never divides by zero.
    batches_per_epoch = max(1, data_set_len // batch_size)
    batch_number = 0

    while True:
        # batch_number < batches_per_epoch, so `initial` is always inside the data.
        initial = batch_number * batch_size
        final = initial + batch_size
        comments_to_send = comments_list[initial:final]

        # Wide-branch input followed by the deep-branch input (padded at the end).
        bow = tokenize.texts_to_matrix(comments_to_send)
        embed = tokenize.texts_to_sequences(comments_to_send)
        embed = keras.preprocessing.sequence.pad_sequences(
            embed, maxlen=max_seq_length, padding="post")

        x = [bow, embed]
        y = labels_list[initial:final]

        # Restart from the first batch once every full batch has been served.
        batch_number = (batch_number + 1) % batches_per_epoch
        yield x, y

In [11]:
# Train the combined model for 7 epochs on batches of 128 comments
train_batches = generator(comments_train, labels_train, 128, tokenize, max_seq_length)
# One epoch covers every full batch of the training split
train_steps = int(len(comments_train) / 128)

combined_model.fit_generator(train_batches, steps_per_epoch=train_steps, epochs=7)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras._impl.keras.callbacks.History at 0x7ff1509db1d0>

In [12]:
# Evaluate on every full batch of 128 comments from the test split and print
# the values returned by evaluate_generator
test_batches = generator(comments_test, labels_test, 128, tokenize, max_seq_length)
test_steps = int(len(comments_test) / 128)
test_metrics = combined_model.evaluate_generator(test_batches, steps=test_steps)
print(test_metrics)

[4.503198841813603, 0.28370662181303113]


In [13]:
# Generate predictions for the test split, one batch of 128 comments per step
predictions = combined_model.predict_generator(
    generator(comments_test, labels_test, 128, tokenize, max_seq_length),
    steps=int(len(comments_test)/128)
)

# There can be fewer predictions than test labels (the trailing partial batch
# is not generated), so align the labels by position with the predicted rows.
actual_scores = labels_test.iloc[:len(predictions)].values

# Total absolute error between predicted and actual user scores, computed in
# one vectorized step instead of a per-row Python loop.
diff = abs(predictions[:, 0] - actual_scores).sum()

# Average absolute difference between the actual and the predicted user score
print('Average prediction difference: ', diff / len(predictions))

Average prediction difference:  1.4651924450077096
