In [1]:
import sys
import tensorflow as tf
from termcolor import colored
print(colored('Python Version : %s' % sys.version.split()[0], 'blue'))
print(colored('Tensorflow Ver: %s' % tf.__version__, 'magenta'))

ImportError: No module named termcolor

In [None]:
n_epoch = int(input('Enter no of epochs for RNN training: '))

In [None]:
print(colored('No of epochs: %d' % n_epoch, 'red'))

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 1000)

In [None]:
# Load IGN Dataset as original_ign
original_ign = pd.read_csv('ign.csv')
original_ign.head(10)

In [None]:
print('original_ign.shape: ', original_ign.shape)

In [None]:
# Check out all the unique score_phrase as well as their counts
original_ign.score_phrase.value_counts()

In [None]:
# DAta Preprocessing: Convert score_phrase to binary sentiments and add a new column called sentiment
bad_phrases = ['Bad',  'Awful', 'Painful', 'Unbearable', 'Disaster']
original_ign['sentiment'] = original_ign.score_phrase.isin(bad_phrases).map({True: 'Negative', False: 'Positive'})

In [None]:
# Remove 'Disaster'
original_ign = original_ign[original_ign['score_phrase'] != 'Disaster']

In [None]:
original_ign.head(10)

In [None]:
# No of +ve sentiments vs no of -ve sentiments
original_ign.sentiment.value_counts(normalize=True)

In [None]:
# Check for null elements
original_ign.isnull().sum()

In [None]:
# Fill all null elements with an empty string
original_ign.fillna(value='', inplace=True)
# original_ign[original_ign['genre'] == ''].shape

In [None]:
# Create a new dataframe called ign
ign = original_ign[['sentiment', 'score_phrase', 'title', 'platform', 'genre', 'editors_choice']].copy()
ign.head(10)

In [None]:
# Create a new column called is_editors_choice
ign['is_editors_choice'] = ign['editors_choice'].map({'Y': 'editors_choice', 'N': ''})
ign.head()

In [None]:
# Create a new column called text which contains contents of several columns
ign['text'] = ign['title'].str.cat(ign['platform'], 
    sep=' ').str.cat(ign['genre'], sep=' ').str.cat(ign['is_editors_choice'], sep=' ')

In [None]:
ign.head(10)

In [None]:
print('Shape of \"ign\" DataFrame:', ign.shape)

In [None]:
# This is a multiclass problem where the labels are predicted
X = ign.text
y = ign.score_phrase
print(X.head(10))
y.head(10)

In [None]:
# Model #0 The Dummy Classifier (Always choose the most frequent class)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

vect = TfidfVectorizer(stop_words='english', token_pattern=r'\b\w{2,}\b')
dummy = DummyClassifier(strategy='most_frequent', random_state=0)
dummy_pipeline = make_pipeline(vect, dummy)

In [None]:
dummy_pipeline.named_steps

In [None]:
# Cross Validation
cv = cross_val_score(dummy_pipeline, X, y, scoring='accuracy', cv=10, n_jobs=-1)
print(colored('\nDummy Classifier\'s Accuracy: %0.5f\n' % cv.mean(), 'yellow'))

In [None]:
# Model #1, MultinomialNB Classifier
from sklearn.naive_bayes import MultinomialNB
vect = TfidfVectorizer(stop_words='english', token_pattern=r'\b\w{2,}\b', min_df=1, max_df=0.1, ngram_range=(1,2))
mnb = MultinomialNB(alpha=2)
mnb_pipeline = make_pipeline(vect, mnb)

In [None]:
mnb_pipeline.named_steps

In [None]:
# Cross validation
cv = cross_val_score(mnb_pipeline, X, y, scoring='accuracy', cv=10, n_jobs=-1)
print(colored('\nMultinomialNB Classifier\'s Accuracy: %0.5f\n' % cv.mean(), 'green'))

In [None]:
#Model #2: RNN Classifier using TFLearn
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Create the vocab (so that we can create X_words_ids from X)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1,1), token_pattern=r'\b\w{1,}\b')

In [None]:
vect.fit(X_train)
vocab = vect.vocabulary_

In [None]:
def convert_X_to_X_word_ids(X):
    return X.apply(lambda x: [vocab[w] for w in [w.lower().strip() for w in x.split()] if w in vocab])

In [None]:
X_train_word_ids = convert_X_to_X_word_ids(X_train)
X_test_word_ids = convert_X_to_X_word_ids(X_test)

X_train.head()

In [None]:
X_train_word_ids.head()

In [None]:
print('X_train_word_ids.shape', X_train_word_ids.shape)
print('X_test_word_ids.shape', X_test_word_ids.shape)

In [None]:
# Sequence Padding
X_train_padded_seq = pad_sequences(X_train_word_ids, maxlen=20, value=0)
X_test_padded_seq = pad_sequences(X_test_word_ids, maxlen=20, value=0)
print('X_train_padded_seq.shape', X_train_padded_seq.shape)
print('X_test_padded_seq.shape', X_test_padded_seq.shape)

In [None]:
pd.DataFrame(X_train_padded_seq).head()


In [None]:
pd.DataFrame(X_test_padded_seq).head()

In [None]:
# Convert (y) labels to vectors
unique_y_labels = list(y_train.value_counts().index)
unique_y_labels

In [None]:
len(unique_y_labels)

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(unique_y_labels)

In [None]:
print('')
for label_id, label_name in zip(le.transform(unique_y_labels), unique_y_labels):
    print('%d: %s' % (label_id, label_name))
print('')

In [None]:
y_train = to_categorical(y_train.map(lambda x: le.transform([x])[0]), nb_classes=len(unique_y_labels))
y_test = to_categorical(y_test.map(lambda x: le.transform([x])[0]), nb_classes=len(unique_y_labels))

In [None]:
y_train[0:3]

In [None]:
print('y_train.shape', y_train.shape)
print('y_test.shape', y_test.shape)

In [None]:
# Network Building
size_of_each_vector = X_train_padded_seq.shape[1]
vocab_size = len(vocab)
no_of_unique_y_labels = len(unique_y_labels)

In [None]:
print('size_of_each_vector:', size_of_each_vector)
print('vocab_size:', vocab_size)
print('no_of_unique_y_labels:', no_of_unique_y_labels)

In [None]:
# sgd = tflearn.SGD(learning_rate=1e-4, lr_decay=0.96, decay_step=1000)

net = tflearn.input_data([None, size_of_each_vector]) # The first element is the batch size which we set to None
net = tflearn.embedding(net, input_dim=vocab_size, output_dim=128) # input_dim: vocabulary size
net = tflearn.lstm(net, 128, dropout=0.6) # Set the dropout to 0.6
net = tflearn.fully_connected(net, no_of_unique_y_labels, activation='softmax') # relu or softmax
net = tflearn.regression(net,
                        optimizer='adam',
                        learning_rate=1e-4,
                        loss='categorical_crossentropy')

In [None]:
# model = tflearn.DNN(net, tensorboard_verbose=0, checkpoint_path='SavedModels/model.tfl.ckpt')
model = tflearn.DNN(net, tensorboard_verbose=0)

In [None]:
model.fit(X_train_padded_seq, y_train, validation_set=(X_test_padded_seq, y_test),
         n_epoch=n_epoch,
         show_metric=True,
         batch_size=100)

In [None]:
# Manually Save the model
model.save('SavedModels/ign_videogame_sentiment_analysis.tfl')
print(colored('Model Saves!', 'red'))

In [None]:
# Manually load the model
model.load('SavedModels/ign_videogame_sentiment_analysis.tfl')
print(colored('Model Loaded!', 'red'))

In [None]:
# RNN's accuracy
import numpy as np
from sklearn import metrics

pred_classes = [np.argmax(i) for i in model.predict(X_test_padded_seq)]
true_classes = [np.argmax(i) for i in y_test]

print(colored('\nRNN Classifier\'s Accuracy: %0.5f\n' % metrics.accuracy_score(true_classes, pred_classes), 'cyan'))

In [None]:
# Show some predicted samples
ids_of_titles = range(0, 21)

for i in ids_of_titles:
    pred_class = np.argmax(model.predict([X_test_padded_seq[i]]))
    true_class = np.argmax(y_test[i])
    
    print(X_test.values[i])
    print('pred_class:', le.inverse_transform(pred_class))
    print('true_class:', le.inverse_transform(true_class))
    print('')