In [116]:
import numpy as np
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from silence_tensorflow import silence_tensorflow
silence_tensorflow()

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [132]:
# Convert the Excel test set into a CSV file named 'DATASET' for the read_csv call that follows.
# NOTE(review): absolute, machine-specific path — TODO make it relative/configurable.
dataset_path = '/Users/harikrishnanagarajan/Downloads/final-testData-no-label-Romney-tweets(1).xlsx'
read_file = pd.read_excel(dataset_path)
# No index= argument is given, so pandas' default applies (the DataFrame index is
# written as an extra leading column).
read_file.to_csv('DATASET')

In [133]:
# Keep only the CSV column at position 2 and label it 'Anootated tweet'.
# NOTE(review): because `names` is passed, pandas presumably treats the first CSV
# line as data rather than as a header — confirm that row is really a tweet.
df = pd.read_csv('DATASET', usecols= [2], names= ['Anootated tweet'])

In [134]:
# Preview the first ten rows before any cleaning is applied.
df.head(10)

Unnamed: 0,Anootated tweet
0,<e>Romney</e> got 3 less minutes and had to de...
1,<e>Mitt </e>is beating him UP! on his record...
2,I actually like <e>Romney </e>'s response to ...
3,Just for that <a>immigration statement </a>tha...
4,This man <e>Romney </e>is tearing this dude ...
5,"<e>Romney </e>had less to prove, given the la..."
6,"<e>Romney </e>- he comes off as a dumbass, but..."
7,Great <a>closing</a> by <e>Romney </e>. Summe...
8,Just ready for <e>Romney </e> to become presi...
9,"Get him on his trampling on the Constitution, ..."


In [135]:
# English stop-word list and Porter stemmer used by preprocess().
stop_words = stopwords.words('english')
stemmer = PorterStemmer()

# Matches @mentions, http(s) links and any run of non-alphanumeric characters.
# Written as a raw string so that `\S` reaches the regex engine unchanged instead of
# relying on Python's handling of unrecognised string escapes (which emits a warning
# on recent Python versions). The pattern value itself is unchanged.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [136]:
def preprocess(tweet):
    """Clean one tweet and return its stemmed, stop-word-free tokens joined by spaces.

    Steps: strip the literal ``<e>`` / ``</e>`` markers, lowercase, replace every
    match of ``TEXT_CLEANING_RE`` with a space, drop tokens found in ``stop_words``
    and stem the remainder with ``stemmer``.

    Args:
        tweet: Raw cell value. Anything that is not missing is converted with ``str``.

    Returns:
        str: The cleaned tweet. Missing values (``None`` or a float NaN) yield ``""``
        rather than being stringified into the literal words "none"/"nan". A string
        is always returned, so the row is kept and row numbering stays intact.
    """
    # NaN is the only float that is not equal to itself; this also covers numpy.float64.
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ""

    # Remove the entity markers first so the letter "e" is not left behind as a token.
    text = str(tweet).replace('<e>', '').replace('</e>', '')
    # Remove links, user mentions and special characters.
    text = re.sub(TEXT_CLEANING_RE, ' ', text.lower()).strip()

    return " ".join(
        stemmer.stem(token) for token in text.split() if token not in stop_words
    )

In [137]:
# Clean every tweet in place; preprocess is passed directly rather than through a lambda.
df['Anootated tweet'] = df['Anootated tweet'].apply(preprocess)

In [138]:
def pred(x):
    """Turn rows of scores into label strings.

    For each row the position of the largest value is looked up: position 0 gives
    '1', position 1 gives '0', and any other position gives '2'.

    Args:
        x: Iterable of score rows (anything ``np.argmax`` accepts per row).

    Returns:
        list[str]: One label per row, in input order.
    """
    label_by_position = {0: '1', 1: '0'}
    return [label_by_position.get(int(np.argmax(scores)), '2') for scores in x]

In [139]:
# Drop rows whose tweet is missing, then renumber the rows from 0 without keeping
# the old index as a column.
# NOTE(review): preprocess() returns a string for every row, so this dropna
# presumably never removes anything at this point — confirm.
df = df.dropna(subset=['Anootated tweet']).reset_index(drop=True)
df.head(10)

Unnamed: 0,Anootated tweet
0,romney got 3 less minut debat candi crowley st...
1,mitt beat record credibl charact
2,actual like romney respons immigr
3,immigr statement romney answer 18 enough get vote
4,man romney tear dude econom
5,romney less prove given last debat back inch p...
6,romney come dumbass love plan small larg busi ...
7,great close romney sum well debat
8,readi romney becom presid lol
9,get trampl constitut mitt pleas


In [141]:
# Build a word index from the preprocessed tweets.
# NOTE(review): this fits a brand-new Tokenizer on the data being scored. The loaded
# model presumably expects the word indices of the tokenizer used during training —
# TODO confirm, and load that tokenizer here instead if so.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Anootated tweet'])

# One more than the number of entries in word_index (presumably index 0 is reserved
# for padding — verify).
vocab_size = len(tokenizer.word_index) + 1
print('Total words in the vocab:', vocab_size)

# Length used as `maxlen` when the sequences are padded/truncated.
SEQ_LENGTH = 50

Total words in the vocab: 3845


In [142]:
# Encode each cleaned tweet as a list of word indices, then pad at the end /
# truncate from the front so that every row has SEQ_LENGTH entries.
encoded_tweets = tokenizer.texts_to_sequences(df['Anootated tweet'])
test = pad_sequences(
    encoded_tweets,
    maxlen=SEQ_LENGTH,
    padding='post',
    truncating='pre',
)
print('Shape of test data:', test.shape)

Shape of test data: (1900, 50)


In [143]:
import tensorflow.keras.backend as K

def f1_value(y_true, y_pred):
    """F1 score computed with Keras backend ops (adapted from old Keras source code).

    Values are clipped to [0, 1] and rounded before counting, so each entry
    contributes either 0 or 1 to the totals.

    Args:
        y_true: Ground-truth tensor (presumably one-hot labels — verify against the
            training code).
        y_pred: Prediction tensor that can be multiplied element-wise with ``y_true``.

    Returns:
        Backend tensor holding ``2 * P * R / (P + R + eps)``, where ``eps`` is
        ``K.epsilon()`` and guards every division against a zero denominator.
    """
    def _count_ones(tensor):
        # Clip into [0, 1], round to 0/1, and add everything up.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _count_ones(y_true * y_pred)
    actual_positives = _count_ones(y_true)
    flagged_positives = _count_ones(y_pred)

    eps = K.epsilon()
    precision = hits / (flagged_positives + eps)
    recall = hits / (actual_positives + eps)

    return 2 * (precision * recall) / (precision + recall + eps)

In [144]:
from tensorflow import keras

# Load the saved model from 'BEST_MODEL_GRU'; the custom f1_value function is supplied
# through custom_objects under the name 'f1_value'.
loaded_model = keras.models.load_model('BEST_MODEL_GRU', custom_objects= {'f1_value': f1_value})

In [145]:
# Run the model on the padded sequences, then convert each output row into a
# label string with pred().
predict = loaded_model.predict(test)
true_pred = pred(predict)

In [146]:
# Write the predictions to 'romney.txt': a fixed first line, then one
# "<row number>;;<label>" line per tweet, with row numbers starting at 1.
# The context manager closes the file even if a write raises part-way through.
# NOTE(review): the meaning of the '78 74' first line is not visible here — confirm.
with open('romney.txt', 'w') as file1:
    file1.write('78 74\n')

    for row_number, label in enumerate(true_pred, start=1):
        # The label '2' is reported as -1 in the output file; others are written as-is.
        file1.write(str(row_number) + ';;' + ('-1' if label == '2' else label) + '\n')

In [115]:
# Print the complete list of predicted label strings.
print(true_pred)

['1', '0', '2', '1', '0', '2', '0', '2', '1', '0', '0', '0', '2', '0', '0', '2', '1', '2', '0', '2', '0', '2', '0', '1', '1', '0', '1', '2', '0', '1', '1', '0', '1', '1', '1', '0', '0', '2', '1', '2', '1', '1', '0', '2', '0', '1', '0', '0', '0', '1', '2', '2', '1', '2', '0', '0', '1', '2', '0', '1', '0', '1', '1', '0', '0', '2', '0', '1', '1', '1', '0', '0', '0', '1', '2', '2', '2', '2', '2', '2', '1', '1', '0', '0', '2', '2', '0', '2', '1', '1', '2', '1', '1', '2', '1', '1', '2', '1', '1', '1', '1', '1', '0', '0', '0', '0', '2', '0', '2', '0', '1', '2', '0', '2', '2', '0', '1', '0', '1', '2', '2', '2', '0', '2', '1', '1', '0', '2', '0', '0', '0', '0', '0', '1', '0', '2', '2', '0', '1', '0', '2', '0', '0', '1', '1', '2', '2', '2', '2', '2', '1', '1', '2', '2', '0', '1', '1', '1', '1', '2', '2', '2', '2', '0', '0', '0', '2', '0', '2', '2', '2', '1', '2', '0', '1', '2', '2', '0', '0', '0', '0', '2', '1', '0', '1', '1', '2', '0', '1', '0', '0', '0', '1', '0', '1', '0', '2', '1', '2', '1',