In [3]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.cm as cm
from matplotlib import rcParams
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import re
import string
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [4]:
data = pd.read_csv("../data/data.csv", encoding = "ISO-8859-1", engine="python")
data.columns = ["label", "time", "date", "query", "username", "text"]

In [5]:
data = data[799900:800100]
N=len(data)
print('Number of tweets: ', N)
data.head()

Number of tweets:  200


Unnamed: 0,label,time,date,query,username,text
799900,0,2329170385,Thu Jun 25 10:26:00 PDT 2009,NO_QUERY,celhouston855,has a feeling the rest of the day is going to ...
799901,0,2329170393,Thu Jun 25 10:26:00 PDT 2009,NO_QUERY,maryag,@JapanNewbie woohoo I'm all for slack time.. I...
799902,0,2329170631,Thu Jun 25 10:26:02 PDT 2009,NO_QUERY,ianbicknell,Scratch that.. No pool today
799903,0,2329170797,Thu Jun 25 10:26:02 PDT 2009,NO_QUERY,44ava_182,"@markhoppus my girlfriend just dumped me, got..."
799904,0,2329171365,Thu Jun 25 10:26:04 PDT 2009,NO_QUERY,MasterAbbott,It's 3:30 am and I can't sleep I'm sad


Selecting the text and label coloumn

In [6]:
data=data[['text','label']]

Assigning 1 to Positive sentment 4

In [7]:
data['label'][data['label']==4]=1

Separating positive and negative tweets

In [8]:
data_pos = data[data['label'] == 1]
data_neg = data[data['label'] == 0]

taking one fourth data so we can run on our machine easily

In [9]:
data_pos = data_pos.iloc[:int(20000)]
data_neg = data_neg.iloc[:int(20000)]

Combining positive and negative tweets

In [11]:
data = pd.concat([data_pos, data_neg])

Making statement text in lower case

In [12]:
data['text']=data['text'].str.lower()

In [13]:
data['text'].tail()

799994    sick  spending my day laying in bed listening ...
799995                                      gmail is down? 
799996                        rest in peace farrah! so sad 
799997    @eric_urbane sounds like a rival is flagging y...
799998    has to resit exams over summer...  wishes he w...
Name: text, dtype: object

Cleaning and removing Stop words of english

In [14]:
stopwords_list = stopwords.words('english')

In [15]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

Cleaning and removing the above stop words list from the tweet text

In [16]:
STOPWORDS = set(stopwords.words('english'))
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
data['text'] = data['text'].apply(lambda text: cleaning_stopwords(text))
data['text'].head()

799999                love @health4uandpets u guys r best!!
800000    im meeting one besties tonight! cant wait!! - ...
800001    @darealsunisakim thanks twitter add, sunisa! g...
800002    sick really cheap hurts much eat real food plu...
800003                      @lovesbrooklyn2 effect everyone
Name: text, dtype: object

In [17]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

In [18]:
data['text']= data['text'].apply(lambda x: cleaning_punctuations(x))
data['text'].tail()

799994    sick spending day laying bed listening taylors...
799995                                           gmail down
799996                                rest peace farrah sad
799997    ericurbane sounds like rival flagging ads much...
799998    resit exams summer wishes worked harder first ...
Name: text, dtype: object

In [19]:
def cleaning_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1', text)

In [20]:
data['text'] = data['text'].apply(lambda x: cleaning_repeating_char(x))
data['text'].tail()

799994    sick spending day laying bed listening taylors...
799995                                           gmail down
799996                                 rest peace farah sad
799997    ericurbane sounds like rival flaging ads much ...
799998    resit exams sumer wishes worked harder first y...
Name: text, dtype: object

Cleaning and removing email

In [21]:
def cleaning_email(data):
    return re.sub('@[^\s]+', ' ', data)

In [22]:
data['text']= data['text'].apply(lambda x: cleaning_email(x))
data['text'].tail()

799994    sick spending day laying bed listening taylors...
799995                                           gmail down
799996                                 rest peace farah sad
799997    ericurbane sounds like rival flaging ads much ...
799998    resit exams sumer wishes worked harder first y...
Name: text, dtype: object

Cleaning and removing URL's

In [23]:
def cleaning_URLs(data):
    return re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',data)

In [24]:
data['text'] = data['text'].apply(lambda x: cleaning_URLs(x))
data['text'].tail()

799994    sick spending day laying bed listening taylors...
799995                                           gmail down
799996                                 rest peace farah sad
799997    ericurbane sounds like rival flaging ads much ...
799998    resit exams sumer wishes worked harder first y...
Name: text, dtype: object

Cleaning and removing Numeric numbers

In [26]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)

In [27]:
data['text'] = data['text'].apply(lambda x: cleaning_numbers(x))
data['text'].tail()

799994    sick spending day laying bed listening taylors...
799995                                           gmail down
799996                                 rest peace farah sad
799997    ericurbane sounds like rival flaging ads much ...
799998    resit exams sumer wishes worked harder first y...
Name: text, dtype: object

Getting tokenization of tweet text

In [29]:
tokenizer = RegexpTokenizer(r'\w+')
data['text'] = data['text'].apply(tokenizer.tokenize)

In [30]:
data['text'].head()

799999             [love, healthuandpets, u, guys, r, best]
800000    [im, meting, one, besties, tonight, cant, wait...
800001    [darealsunisakim, thanks, twiter, ad, sunisa, ...
800002    [sick, realy, cheap, hurts, much, eat, real, f...
800003                      [lovesbroklyn, efect, everyone]
Name: text, dtype: object

Applying Stemming

In [31]:
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data

data['text']= data['text'].apply(lambda x: stemming_on_text(x))

In [32]:
data['text'].head()

799999             [love, healthuandpets, u, guys, r, best]
800000    [im, meting, one, besties, tonight, cant, wait...
800001    [darealsunisakim, thanks, twiter, ad, sunisa, ...
800002    [sick, realy, cheap, hurts, much, eat, real, f...
800003                      [lovesbroklyn, efect, everyone]
Name: text, dtype: object

Applying Lemmatizer

In [33]:
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data

data['text'] = data['text'].apply(lambda x: lemmatizer_on_text(x))

In [34]:
data['text'].head()

799999             [love, healthuandpets, u, guys, r, best]
800000    [im, meting, one, besties, tonight, cant, wait...
800001    [darealsunisakim, thanks, twiter, ad, sunisa, ...
800002    [sick, realy, cheap, hurts, much, eat, real, f...
800003                      [lovesbroklyn, efect, everyone]
Name: text, dtype: object

Separating input feature and label

In [35]:
X=data.text
y=data.label

Preparing the input features for training
We converting the text words into arrays form.
Maximum 500 features/words selected for training. These 500 words will be selected on the importance that will distinguish between the positive tweets and negative tweets.

In [36]:
max_len = 500
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(X)
sequences = tok.texts_to_sequences(X)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [37]:
sequences_matrix.shape

(200, 500)

In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(sequences_matrix, y, test_size=0.3, random_state=2)

In [39]:
def tensorflow_based_model(): #Defined tensorflow_based_model function for training tenforflow based model
    inputs = Input(name='inputs',shape=[max_len])#step1
    layer = Embedding(2000,50,input_length=max_len)(inputs) #step2
    layer = LSTM(64)(layer) #step3
    layer = Dense(256,name='FC1')(layer) #step4
    layer = Activation('relu')(layer) # step5
    layer = Dropout(0.5)(layer) # step6
    layer = Dense(1,name='out_layer')(layer) #step4 again but this time its giving only one output as because we need to classify the tweet as positive or negative
    layer = Activation('sigmoid')(layer) #step5 but this time activation function is sigmoid for only one output.
    model = Model(inputs=inputs,outputs=layer) #here we are getting the final output value in the model for classification
    return model #function returning the value when we call it

# Model compilation

First we are calling the model
We are using 2 classes so we set "binary_crossentropy" and if we use more than two classes then we use "categorical_crossentropy"
Optimizer is a function that used to change the features of neural network such as learning rate (how the model learn with features) in order to reduce the losses. So the learning rate of neural network to reduce the losses is defined by optimizer.
We are setting metrics=accuracy because we are going to caluclate the percentage of correct predictions over all predictions on the validation set

In [40]:
model = tensorflow_based_model() # here we are calling the function of created model
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

Training and validating with parameter tuning

In [41]:
history=model.fit(X_train,Y_train,batch_size=80,epochs=6, validation_split=0.1)# here we are starting the training of model by feeding the training data
print('Training finished !!')

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Training finished !!


Testing the Trained model on test data

In [42]:
accr1 = model.evaluate(X_test,Y_test) #we are starting to test the model here



Accuracy

In [43]:
print('Test set\n  Accuracy: {:0.2f}'.format(accr1[1])) #the accuracy of the model on test data is given below

Test set
  Accuracy: 0.47


Getting prediction of the test data and then we will compare the true labels/classes of the data with predictions

In [44]:
y_pred = model.predict(X_test) #getting predictions on the trained model
y_pred = (y_pred > 0.5) 