In [2]:
import numpy as np
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from silence_tensorflow import silence_tensorflow
silence_tensorflow()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

### DATASET

In [3]:
dataset_path = '/Users/harikrishnanagarajan/Downloads/obama-tweets.csv'
df = pd.read_csv(dataset_path, usecols= ['Anootated tweet', 'Class'])

In [4]:
df.head(10)

Unnamed: 0,Anootated tweet,Class
0,"Kirkpatrick, who wore a baseball cap embroider...",0
1,Question: If <e>Romney</e> and <e>Obama</e> ha...,2
2,#<e>obama</e> debates that Cracker Ass Cracker...,1
3,RT @davewiner Slate: Blame <e>Obama</e> for fo...,2
4,@Hollivan @hereistheanswer Youre missing the ...,0
5,<e>Mitt Romney</e> made all of his money himse...,2
6,I was raised as a Democrat left the party yea...,-1
7,The <e>Obama camp</e> can't afford to lower ex...,0
8,"Tonight's debate has that ""Game 7"" feel! This ...",2
9,<e>Obama</e> pot <a>policy</a> disappointing -...,-1


### PREPROCESSING

In [5]:
stop_words = stopwords.words('english')
stemmer = PorterStemmer()

TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [6]:
def preprocess(tweet):
    # Remove link,user and special characters
    tweet = str(tweet).replace('<e>','')
    tweet = str(tweet).replace('</e>', '')
    tweet = re.sub(TEXT_CLEANING_RE, ' ', str(tweet).lower()).strip()
    
    tokens = []
    for token in tweet.split():
        if token not in stop_words:
            tokens.append(stemmer.stem(token))
    return " ".join(tokens)

In [7]:
df['Anootated tweet'] = df['Anootated tweet'].apply(lambda x: preprocess(x))

In [8]:
def oneHotEncode(data):
    data = np.asarray(data)
    temp = np.zeros((len(data),3))
    for i in range(len(temp)):
        if data[i] == '1':
            temp[i][0] = 1
        elif data[i] == '0':
            temp[i][1] = 1
        elif data[i] == '2':
            temp[i][2] = 1

    return temp        

In [27]:
def pred(x):
    temp = []
    for i in x:
        m = np.argmax(i)
        if m == 0:
            temp.append('1')
        elif m == 1:
            temp.append('0')
        else:
            temp.append('2')
    return temp

In [9]:
df = df.dropna(subset= ['Class', 'Anootated tweet'])
df = df[df.Class != '2']
df = df[df.Class != 'irrelevant']
df = df[df.Class != 'irrevelant']
df = df.replace('-1', '2')

In [10]:
df = df.reset_index()
df = df.drop(['index'], axis= 1)
df.head(10)

Unnamed: 0,Anootated tweet,Class
0,kirkpatrick wore basebal cap embroid barack ob...,0
1,obama debat cracker ass cracker tonight tune t...,1
2,hereistheansw your miss point im afraid unders...,0
3,rais democrat left parti year ago 1980 lifetim...,2
4,obama camp afford lower expect tonight debat p...,0
5,obama pot polici disappoint say least 420 lega...,2
6,hollywood back rt redalert gene simmon yank ob...,2
7,obama expedi speak fair order slender biscuit ...,0
8,dream smoke obama,0
9,washington time presid popular bubbl burst bar...,2


In [11]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size= 0.15, random_state= 5) 
print('Train_size:', len(df_train))
print('\nTest_size:', len(df_test))

Train_size: 4781

Test_size: 844


In [33]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['Anootated tweet'])

vocab_size = len(tokenizer.word_index) + 1
print('Total words in the vocab:', vocab_size)

SEQ_LENGTH = 100

Total words in the vocab: 7161


In [34]:
x_train = pad_sequences(tokenizer.texts_to_sequences(df_train['Anootated tweet']), maxlen= SEQ_LENGTH, padding='post', truncating = 'pre')

x_test = pad_sequences(tokenizer.texts_to_sequences(df_test['Anootated tweet']), maxlen= SEQ_LENGTH, padding='post', truncating = 'pre')

In [35]:
y_train = np.array(oneHotEncode(df_train.Class))

y_test = np.array(oneHotEncode(df_test.Class))

print('Shape of x_train:', x_train.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of x_test:', x_test.shape)
print('Shape of y_test:', y_test.shape)

Shape of x_train: (4781, 100)
Shape of y_train: (4781, 3)
Shape of x_test: (844, 100)
Shape of y_test: (844, 3)


In [15]:
embedding_dict = {}

with open('/Users/harikrishnanagarajan/Downloads/glove/glove.twitter.27B.200d.txt', 'r') as f:
    
    for line in f:
        
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], "float32")
        embedding_dict[word] = vectors
        
f.close()

In [36]:
embedding_matrix = np.zeros((vocab_size, 200))

for word, i in tokenizer.word_index.items():
    
    if i < vocab_size:
        
        emb_vec = embedding_dict.get(word)
        if emb_vec is not None:
            
            embedding_matrix[i] = emb_vec

In [25]:
import tensorflow.keras.backend as K

def f1_value(y_true, y_pred): #taken from old keras source code
    
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    
    precision = true_positives / (predicted_positives + K.epsilon())
    
    recall = true_positives / (possible_positives + K.epsilon())
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    
    return f1_val

### BUILDING THE MODEL

In [18]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM, Bidirectional, SimpleRNN, GRU
from keras import utils
import tensorflow as tf
from keras.optimizers import adam

In [21]:
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, peoch, logs= {}):
        if logs.get('val_f1_value') > 0.58:
            print("Ending Training")
            self.model.stop_training = True
            
callback = myCallback()

embedding_layer = Embedding(vocab_size, 200, weights= [embedding_matrix], input_length= SEQ_LENGTH, trainable= False)


model_LSTM = Sequential()

model_LSTM.add(embedding_layer)
model_LSTM.add(Bidirectional(LSTM(100, activation= 'tanh')))
model_LSTM.add(Dropout(0.2))
model_LSTM.add(Dense(3, activation= 'softmax'))

model_LSTM.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 75, 200)           1432200   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               240800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 603       
Total params: 1,673,603
Trainable params: 241,403
Non-trainable params: 1,432,200
_________________________________________________________________


In [22]:
opt = adam(learning_rate= 0.007)
model_LSTM.compile(loss= 'categorical_crossentropy', optimizer= opt, metrics= [f1_value])

history = model_LSTM.fit(x_train, y_train, epochs= 20, validation_split= 0.1, batch_size= 64 ,verbose= 1, callbacks= [callback])

Train on 4302 samples, validate on 479 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Ending Training


In [24]:
model_LSTM.save('BEST_LSTM_MODEL')

In [47]:
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, peoch, logs= {}):
        if logs.get('val_f1_value') > 0.55:
            print("Ending Training")
            self.model.stop_training = True
            
callback = myCallback()
            
embedding_layer = Embedding(vocab_size, 200, weights= [embedding_matrix], input_length= SEQ_LENGTH, trainable= False)


model_Vanilla = Sequential()

model_Vanilla.add(embedding_layer)
model_Vanilla.add(Bidirectional(SimpleRNN(80, activation= 'tanh')))
model_Vanilla.add(Dropout(0.2))
model_Vanilla.add(Dense(3, activation= 'softmax'))

model_Vanilla.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 100, 200)          1432200   
_________________________________________________________________
bidirectional_9 (Bidirection (None, 160)               44960     
_________________________________________________________________
dropout_9 (Dropout)          (None, 160)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 3)                 483       
Total params: 1,477,643
Trainable params: 45,443
Non-trainable params: 1,432,200
_________________________________________________________________


In [48]:
opt1 = adam(learning_rate= 0.007)
model_Vanilla.compile(loss= 'categorical_crossentropy', optimizer= opt1, metrics= [f1_value])

history1 = model_Vanilla.fit(x_train, y_train, epochs= 20, validation_split= 0.1, batch_size= 128, verbose= 1, callbacks= [callback])

Train on 4302 samples, validate on 479 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [49]:
model_Vanilla.save('BEST_VANILLA_MODEL')

In [56]:
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, peoch, logs= {}):
        if logs.get('val_f1_value') > 0.57:
            print("Ending Training")
            self.model.stop_training = True
            
callback = myCallback()
embedding_layer = Embedding(vocab_size, 200, weights= [embedding_matrix], input_length= SEQ_LENGTH, trainable= False)


model_GRU = Sequential()

model_GRU.add(embedding_layer)
model_GRU.add(Bidirectional(GRU(100, activation= 'tanh')))
model_GRU.add(Dropout(0.2))
model_GRU.add(Dense(3, activation= 'softmax'))

model_GRU.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 100, 200)          1432200   
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 200)               180600    
_________________________________________________________________
dropout_13 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 3)                 603       
Total params: 1,613,403
Trainable params: 181,203
Non-trainable params: 1,432,200
_________________________________________________________________


In [57]:
opt2 = adam(learning_rate= 0.005)
model_GRU.compile(loss= 'categorical_crossentropy', optimizer= opt2, metrics= [f1_value])

history2 = model_GRU.fit(x_train, y_train, epochs= 20, validation_split= 0.1, verbose= 1, batch_size=128, callbacks= [callback])

Train on 4302 samples, validate on 479 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Ending Training


In [58]:
model_GRU.save('BEST_GRU_MODEL')

In [40]:
from tensorflow import keras

loaded_model = keras.models.load_model('BEST_GRU_MODEL', custom_objects= {'f1_value': f1_value})

In [41]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = loaded_model.predict(x_test)
f1_list= np.round(f1_score(pred(y_test), pred(y_pred), average = None),3)
accuracy = accuracy_score(pred(y_test), pred(y_pred))
f1_dict = {'f1_pos': f1_list[0], 'f1_neu': f1_list[1], 'f1_neg': f1_list[2]}

In [42]:
f1_dict

{'f1_pos': 0.565, 'f1_neu': 0.529, 'f1_neg': 0.579}

In [43]:
accuracy

0.5604265402843602