In [3]:
#load dependencies
import pandas as pd
import numpy as np
import tflearn
import tensorflow as tf
from tflearn.data_utils import to_categorical, pad_sequences, VocabularyProcessor
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import TimeseriesGenerator

In [5]:
# Read data and drop unneeded columns
dataset = pd.read_csv('ign.csv').iloc[:, 1:3]  #import relevelant columns from dataset
dataset.fillna(value='', inplace=True)         #replance any blank or nan data with empty string
#print(dataset.columns)

# Check for null or missing data
dataset.isnull().sum()

# Fill in or create missing data
# dataframe.fillna(value='', inplace=True)

score_phrase    0
title           0
dtype: int64

In [6]:
# Extract independent & dependent variables (X & Y)
trainX = dataset.title
trainY = dataset.score_phrase
trainX

0                                  LittleBigPlanet PS Vita
1        LittleBigPlanet PS Vita -- Marvel Super Hero E...
2                                     Splice: Tree of Life
3                                                   NHL 13
4                                                   NHL 13
5                                Total War Battles: Shogun
6                                      Double Dragon: Neon
7                                             Guild Wars 2
8                                      Double Dragon: Neon
9                                Total War Battles: Shogun
10                                 Tekken Tag Tournament 2
11                                 Tekken Tag Tournament 2
12                                              Wild Blood
13                                       Mark of the Ninja
14                                       Mark of the Ninja
15                         Home: A Unique Horror Adventure
16                         Home: A Unique Horror Adventu

In [7]:
trainY

0            Amazing
1            Amazing
2              Great
3              Great
4              Great
5               Good
6              Awful
7            Amazing
8              Awful
9               Good
10              Good
11              Good
12              Good
13           Amazing
14           Amazing
15              Okay
16              Okay
17             Great
18          Mediocre
19              Good
20              Good
21              Good
22              Good
23              Good
24           Amazing
25              Good
26           Amazing
27              Good
28             Great
29              Okay
            ...     
18595            Bad
18596           Okay
18597            Bad
18598           Okay
18599           Good
18600           Good
18601           Good
18602           Good
18603           Good
18604          Great
18605           Okay
18606           Okay
18607           Good
18608       Mediocre
18609          Great
18610           Okay
18611       M

In [8]:
# Convert sequence data (strings) into numeric data
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(trainX)
x = tokenizer.texts_to_sequences(trainX)

# Convert data into a matrix array and pad with zeros
totalX = tf.keras.preprocessing.sequence.pad_sequences(x, maxlen=15, padding='post', truncating='post')
totalX

array([[1259, 3428, 2741, ...,    0,    0,    0],
       [1259, 3428, 2741, ...,    0,    0,    0],
       [4734, 1718,    3, ...,    0,    0,    0],
       ...,
       [  11,  883, 8145, ...,    0,    0,    0],
       [ 843,    0,    0, ...,    0,    0,    0],
       [ 843,    0,    0, ...,    0,    0,    0]])

In [9]:
# Convert sequence data (strings) into numeric data
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(trainY)
Y = tokenizer.texts_to_sequences(trainY)
#print(Y.count([11]))

# Convert data into a matrix array
totalY = np.array(Y)
totalY

array([[ 5],
       [ 5],
       [ 1],
       ...,
       [ 4],
       [10],
       [10]])

In [10]:
# Convert the indices into 11 dimensional vectors
tocatY =  to_categorical(totalY, nb_classes=12) 
# Drop first column of zeros
totalY = np.delete(tocatY, 0, 1)
totalY

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [11]:
# Split data into training, test and validation
trainX, testX, trainY, testY = train_test_split(totalX, totalY, test_size=0.1)

# Build Model
net = tflearn.input_data([None, trainX.shape[1]])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.9)
#net = tflearn.lstm(net, 128, dropout=0.6)
net = tflearn.fully_connected(net, 11, activation='softmax') # relu or softmax
net = tflearn.regression(net, optimizer='adam', learning_rate=.0001, loss='categorical_crossentropy')
#model = tflearn.DNN(net, tensorboard_verbose=0, checkpoint_path='SavedModels/model.tfl.ckpt')
model = tflearn.DNN(net, tensorboard_verbose=0)

Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Use tf.cast instead.


In [12]:
# train the model
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True, batch_size=100, n_epoch=100)

Training Step: 16799  | total loss: [1m[32m26.37685[0m[0m | time: 10.508s
| Adam | epoch: 100 | loss: 26.37685 - acc: 0.0573 -- iter: 16700/16762
Training Step: 16800  | total loss: [1m[32m26.37685[0m[0m | time: 11.572s
| Adam | epoch: 100 | loss: 26.37685 - acc: 0.0516 | val_loss: 26.37685 - val_acc: 0.0338 -- iter: 16762/16762
--
