In [1]:
import numpy as np
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from silence_tensorflow import silence_tensorflow
silence_tensorflow()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
dataset_path = '/Users/harikrishnanagarajan/Downloads/romney-tweets.csv'
df = pd.read_csv(dataset_path, usecols= ['Anootated tweet', 'Class'])

In [3]:
df.head(10)

Unnamed: 0,Anootated tweet,Class
0,Insidious!<e>Mitt Romney</e>'s Bain Helped Phi...,-1
1,Senior <e>Romney</e> Advisor Claims <e>Obama</...,2
2,.@WardBrenda @shortwave8669 @allanbourdius you...,-1
3,<e>Mitt Romney</e> still doesn't <a>believe</a...,-1
4,<e>Romney</e>'s <a>tax plan</a> deserves a 2nd...,-1
5,Hope <e>Romney</e> debate prepped w/ the same ...,1
6,Want to know how <e>Mitt Romney</e> is going t...,-1
7,If <e>Romney</e> wins the <a>presidential elec...,-1
8,Presidential debate round 2: <e>Romney</e> wan...,2
9,Someone on the <e>mitt Romney</e> <a>Facebook ...,!!!!


### PREPROCESSING

In [4]:
stop_words = stopwords.words('english')
stemmer = PorterStemmer()

TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [5]:
def preprocess(tweet):
    # Remove link,user and special characters
    tweet = str(tweet).replace('<e>','')
    tweet = str(tweet).replace('</e>', '')
    tweet = re.sub(TEXT_CLEANING_RE, ' ', str(tweet).lower()).strip()
    
    tokens = []
    for token in tweet.split():
        if token not in stop_words:
            tokens.append(stemmer.stem(token))
    return " ".join(tokens)

In [6]:
df['Anootated tweet'] = df['Anootated tweet'].apply(lambda x: preprocess(x))

In [7]:
def oneHotEncode(data):
    data = np.asarray(data)
    temp = np.zeros((len(data),3))
    for i in range(len(temp)):
        if data[i] == '1':
            temp[i][0] = 1
        elif data[i] == '0':
            temp[i][1] = 1
        elif data[i] == '2':
            temp[i][2] = 1

    return temp     

In [8]:
def pred(x):
    temp = []
    for i in x:
        m = np.argmax(i)
        if m == 0:
            temp.append('1')
        elif m == 1:
            temp.append('0')
        else:
            temp.append('2')
    return temp

In [12]:
df = df.dropna(subset= ['Class', 'Anootated tweet'])
df = df[df.Class != '2']
df = df[df.Class != 'irrelevant']
df = df[df.Class != 'irrevelant']
df = df[df.Class != '!!!!']
df = df.replace('-1', '2')

In [13]:
df = df.reset_index()
df = df.drop(['index'], axis= 1)
df.head(10)

Unnamed: 0,Anootated tweet,Class
0,hope romney debat prep w peopl last time,1
1,pleas mitt romney huffingtonpost honey boo boo...,0
2,women poll show truli vote romney want know tr...,0
3,good luck mittromney tonight alreadi vote noth...,1
4,debat tonight go pointless know romney win debat,1
5,mitt romney alway worship father must hurt lea...,0
6,hope debat goe well romney tonight first time ...,1
7,yanke send arod debat romney amp start barack ...,0
8,bo come fight mitt simpli take advantag obviou...,1
9,oh yeah rt fwwak romney pray babi jesu assault...,0


In [14]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size= 0.15, random_state= 5) 
print('Train_size:', len(df_train))
print('\nTest_size:', len(df_test))

Train_size: 2344

Test_size: 414


In [15]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['Anootated tweet'])

vocab_size = len(tokenizer.word_index) + 1
print('Total words in the vocab:', vocab_size)

SEQ_LENGTH = 50

Total words in the vocab: 4429


In [16]:
x_train = pad_sequences(tokenizer.texts_to_sequences(df_train['Anootated tweet']), maxlen= SEQ_LENGTH, padding='post', truncating = 'pre')

x_test = pad_sequences(tokenizer.texts_to_sequences(df_test['Anootated tweet']), maxlen= SEQ_LENGTH, padding='post', truncating = 'pre')

In [17]:
y_train = np.array(oneHotEncode(df_train.Class))

y_test = np.array(oneHotEncode(df_test.Class))

print('Shape of x_train:', x_train.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of x_test:', x_test.shape)
print('Shape of y_test:', y_test.shape)

Shape of x_train: (2344, 50)
Shape of y_train: (2344, 3)
Shape of x_test: (414, 50)
Shape of y_test: (414, 3)


In [18]:
embedding_dict = {}

with open('/Users/harikrishnanagarajan/Downloads/glove/glove.twitter.27B.200d.txt', 'r') as f:
    
    for line in f:
        
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], "float32")
        embedding_dict[word] = vectors
        
f.close()

In [19]:
embedding_matrix = np.zeros((vocab_size, 200))

for word, i in tokenizer.word_index.items():
    
    if i < vocab_size:
        
        emb_vec = embedding_dict.get(word)
        if emb_vec is not None:
            
            embedding_matrix[i] = emb_vec

In [20]:
import tensorflow.keras.backend as K

def f1_value(y_true, y_pred): #taken from old keras source code
    
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    
    precision = true_positives / (predicted_positives + K.epsilon())
    
    recall = true_positives / (possible_positives + K.epsilon())
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    
    return f1_val

### BUILDING MODEL

In [17]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM, Bidirectional, SimpleRNN, GRU
from keras import utils
import tensorflow as tf
from keras.optimizers import adam

In [21]:
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, peoch, logs= {}):
        if logs.get('val_f1_value') > 0.5875:
            print("Ending Training")
            self.model.stop_training = True
            
callback = myCallback()

embedding_layer = Embedding(vocab_size, 200, weights= [embedding_matrix], input_length= SEQ_LENGTH, trainable= False)


model_LSTM = Sequential()

model_LSTM.add(embedding_layer)
model_LSTM.add(Bidirectional(LSTM(75, activation= 'tanh')))
model_LSTM.add(Dropout(0.2))
model_LSTM.add(Dense(3, activation= 'softmax'))

model_LSTM.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 200)           1334800   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 150)               165600    
_________________________________________________________________
dropout_2 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 453       
Total params: 1,500,853
Trainable params: 166,053
Non-trainable params: 1,334,800
_________________________________________________________________


In [22]:
opt = adam(learning_rate= 0.001)
model_LSTM.compile(loss= 'categorical_crossentropy', optimizer= opt, metrics= [f1_value])

history = model_LSTM.fit(x_train, y_train, epochs= 15, validation_split= 0.1, batch_size= 64, verbose= 1, callbacks= [callback])

Train on 4322 samples, validate on 481 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Ending Training


In [50]:
model_LSTM.save('BEST_MODEL_LSTM')

In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = model_LSTM.predict(x_test)
f1_list= np.round(f1_score(pred(y_test), pred(y_pred), average = None),3)
accuracy = accuracy_score(pred(y_test), pred(y_pred))
f1_dict = {'f1_pos': f1_list[0], 'f1_neu': f1_list[1], 'f1_neg': f1_list[2]}

In [40]:
f1_dict

{'f1_pos': 0.356, 'f1_neu': 0.401, 'f1_neg': 0.673}

In [41]:
accuracy

0.5459905660377359

In [47]:
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, peoch, logs= {}):
        if logs.get('val_f1_value') > 0.60:
            print("Ending Training")
            self.model.stop_training = True
            
callback = myCallback()
embedding_layer = Embedding(vocab_size, 200, weights= [embedding_matrix], input_length= SEQ_LENGTH, trainable= False)


model_GRU = Sequential()

model_GRU.add(embedding_layer)
model_GRU.add(Bidirectional(GRU(75, activation= 'tanh')))
model_GRU.add(Dropout(0.2))
model_GRU.add(Dense(3, activation= 'softmax'))

model_GRU.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 50, 200)           1334800   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 150)               124200    
_________________________________________________________________
dropout_6 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 453       
Total params: 1,459,453
Trainable params: 124,653
Non-trainable params: 1,334,800
_________________________________________________________________


In [48]:
opt2 = adam(learning_rate= 0.002)
model_GRU.compile(loss= 'categorical_crossentropy', optimizer= opt2, metrics= [f1_value])

history2 = model_GRU.fit(x_train, y_train, epochs= 20, validation_split= 0.1, verbose= 1, batch_size=128, callbacks= [callback])

Train on 4322 samples, validate on 481 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Ending Training


In [49]:
model_GRU.save('BEST_MODEL_LS')

In [54]:
y_pred = model_GRU.predict(x_test)
f1_list= np.round(f1_score(pred(y_test), pred(y_pred), average = None),3)
accuracy = accuracy_score(pred(y_test), pred(y_pred))
f1_dict = {'f1_pos': f1_list[0], 'f1_neu': f1_list[1], 'f1_neg': f1_list[2]}

In [55]:
f1_dict

{'f1_pos': 0.436, 'f1_neu': 0.34, 'f1_neg': 0.656}

In [56]:
accuracy

0.5389150943396226

In [22]:
from tensorflow import keras

loaded_model = keras.models.load_model('BEST_MODEL_GRU', custom_objects= {'f1_value': f1_value})

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = loaded_model.predict(x_test)
f1_list= np.round(f1_score(pred(y_test), pred(y_pred), average = None),3)
accuracy = accuracy_score(pred(y_test), pred(y_pred))
f1_dict = {'f1_pos': f1_list[0], 'f1_neu': f1_list[1], 'f1_neg': f1_list[2]}

In [24]:
f1_dict

{'f1_pos': 0.441, 'f1_neu': 0.098, 'f1_neg': 0.0}

In [25]:
accuracy

0.23671497584541062