# Sentiment classifier for tweets about GOT Characters

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import bcolz
import pickle
from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.utils.data_utils import get_file

from keras.layers import Dense, Activation, Dropout, Embedding, Input, LSTM 
from keras.layers.normalization import BatchNormalization

from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2
from keras.datasets import imdb

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 1. Dataset Creation

The imdb film reviwes dataset can be directly imported in keras. It contains positive and negative reviews and is the most commen datset used for sentiment analysis.

In [2]:

idx = imdb.get_word_index()

In [3]:
idx_arr = sorted(idx, key=idx.get)
idx_arr[:10]
vocab = list(idx_arr)

We have contructed the vocabulary; now, since any model uses numbers, we need two dictionaries, one mapping each word to an index and the second, the inverse of this dictionary.

In [4]:
def invert_dict(d):
    return dict([(v, k) for k, v in d.items()])

In [5]:
idx2word = {v: k for k, v in idx.items()}
word2idx = invert_dict(idx2word)

In [6]:
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')
f = open(path, 'rb')
(x_train, y_train), (x_test, y_test) = pickle.load(f)

we want to predict on the dataset we have created with tweets about Game of Thrones characters

In [7]:
dfPred = pd.read_csv('./data/fakenewsprediction.csv')
dfPred.head()

Unnamed: 0,text,length,euron,sansa,jaime,arya,tyrion,bran,night,daenerys,cersei,jon,fakeOrNot
0,this new game of thrones night king theory is ...,78,0,0,0,0,0,0,1,0,0,0,0.080296
1,congratulations for winning awards tonight wor...,96,0,0,0,0,0,0,1,0,0,0,0.02436
2,more breaking news jinyoung of got and his vis...,109,0,0,0,0,0,0,1,0,0,0,0.47085
3,congratulations for winning awards tonight wor...,96,0,0,0,0,0,0,1,0,0,0,0.02436
4,congratulations for winning awards tonight wor...,96,0,0,0,0,0,0,1,0,0,0,0.02436


In [8]:
def normalize(stri):
    stri = re.sub(r'[^\w\s\d]','',stri)
    stri = stri.replace('/n', ' ')
    stri = stri.replace('\n', ' ')
    return stri.lower()

In [9]:
dfPred['text'] = dfPred['text'].map(normalize)

In [10]:
idx2word[23022]

'bromwell'

In [11]:
' '.join([idx2word[o] for o in x_train[0]])

"bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"

Let's limit to a vocab size of 5000 words. it is already sorted, so we will replace all the rare words with the 5000-th

In [12]:
vocab_size = 5000

trn = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_train]
test = [np.array([i if i<vocab_size-1 else vocab_size-1 for i in s]) for s in x_test]

In [13]:
from itertools import chain
lens = np.array(list(map(len, trn)))
(lens.max(), lens.min(), lens.mean())

(2493, 10, 237.71364)

We need to pad shorter sentences with a given word, such that in the end we will have a matrixfor which each row represents a review:

In [14]:
seq_len = 500

X_train = pad_sequences(trn, maxlen=seq_len, value=0)
X_test = pad_sequences(test, maxlen=seq_len, value=0)

In [15]:
def transform(text):
    lista = text.split(' ')
    return [word2idx[s] if s in vocab else 0 for s in lista]

In [16]:
dfPred['indices'] = dfPred['text'].map(transform)
X_pred = dfPred['indices'].tolist()
X_pred = [np.array(el) for el in X_pred]

In [17]:
X_pred = pad_sequences(X_pred, maxlen=seq_len, value=0)

## 2. Use GloVe Word Embedding

Even if keras has an embedding layer, to capture semantics, we will use a pretrained GloVe embedding.

In [18]:


def load_glove(loc):
    with open(loc+'_words.pkl', 'rb') as f1:
            r1 = pickle.load(f1, encoding='latin1')
    with open(loc+'_idx.pkl', 'rb') as f2:
            r2 = pickle.load(f2, encoding='latin1')
    return (bcolz.open(loc+'.dat')[:], r1, r2)


vecs, words, wordidx = load_glove(r'./6B.300d/6B.300d')

In [19]:
n_fact = 300

def create_emb():
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i in range(1,len(emb)):
        word = idx2word[i]
        if word in words and re.match(r"^[a-zA-Z\-]*$", word):
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx] #vecs[wordidx[w]]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = np.random.normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = np.random.normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [20]:
emb = create_emb()

## Create the model. Fit. Predict.

We will use one LSTM followed by two dense layers

In [21]:
def model(vocab_size, n_fact, seq_len):
    """
    Arguments:
    vocab_size, vocabulary size
    n_fact, no of embedding factors
    seq_len, length of the sequence
    Returns:
    model -- Keras model instance
    """
    
    input_X = Input(shape=(seq_len, ))
    embed = Embedding(vocab_size, 300, input_length=seq_len, mask_zero=True,
              embeddings_regularizer=l2(1e-6), weights = [emb])
    X = embed(input_X)
    X = BatchNormalization()(X)
    X = LSTM(128, return_sequences=False)(X)
    X = BatchNormalization()(X)
    X = Dropout(0.5)(X)
    X = Dense(32, activation='relu',kernel_regularizer=l2(0.05),
             activity_regularizer=l2(0.01))(X)
    X = BatchNormalization()(X)
    X = Dropout(0.7)(X)
    out = Dense(1, activation='sigmoid',kernel_regularizer=l2(0.05),
               activity_regularizer=l2(0.005))(X)
    # Create model instance 
    model = Model(inputs=input_X, outputs=out)
    
    return model

In [22]:
model = model(vocab_size, n_fact=300, seq_len=seq_len)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 500, 300)          1500000   
_________________________________________________________________
batch_normalization_1 (Batch (None, 500, 300)          1200      
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
batch_normalization_2 (Batch (None, 128)               512       
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                4128      
__________

In [23]:
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])



In [24]:
model.optimizer.lr=0.01
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=4, batch_size=16, verbose=2)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
2051s - loss: 0.8482 - acc: 0.4998 - val_loss: 0.7215 - val_acc: 0.5000
Epoch 2/4
1887s - loss: 0.7175 - acc: 0.5038 - val_loss: 0.7159 - val_acc: 0.5000
Epoch 3/4
1878s - loss: 0.7156 - acc: 0.4985 - val_loss: 0.7160 - val_acc: 0.5000
Epoch 4/4
1912s - loss: 0.7148 - acc: 0.4988 - val_loss: 0.7151 - val_acc: 0.5000


<keras.callbacks.History at 0x7faf4c2577b8>

In [25]:
y_pred = model.predict(X_pred)

In [26]:
dfPred['Sentiment'] = y_pred

In [27]:
dfPred.head()

Unnamed: 0,text,length,euron,sansa,jaime,arya,tyrion,bran,night,daenerys,cersei,jon,fakeOrNot,indices,Sentiment
0,this new game of thrones night king theory is ...,78,0,0,0,0,0,0,1,0,0,0,0.080296,"[11, 159, 497, 4, 60913, 311, 708, 2601, 6, 18...",0.466841
1,congratulations for winning awards tonight wor...,96,0,0,0,0,0,0,1,0,0,0,0.02436,"[10308, 15, 1573, 2125, 4484, 179, 320, 890, 2...",0.466841
2,more breaking news jinyoung of got and his vis...,109,0,0,0,0,0,0,1,0,0,0,0.47085,"[50, 2241, 1633, 0, 4, 185, 2, 24, 2054, 6, 29...",0.466841
3,congratulations for winning awards tonight wor...,96,0,0,0,0,0,0,1,0,0,0,0.02436,"[10308, 15, 1573, 2125, 4484, 179, 320, 890, 2...",0.466841
4,congratulations for winning awards tonight wor...,96,0,0,0,0,0,0,1,0,0,0,0.02436,"[10308, 15, 1573, 2125, 4484, 179, 320, 890, 2...",0.466841
