## Twitter Sentiment Analysis (Text classification)

This dataset is taken from Kaggle:
https://www.kaggle.com/crowdflower/twitter-airline-sentiment

In [1]:
# Imported necessary libraries

import re
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import models

In [2]:
# Read the tweets CSV into a DataFrame (comma is already read_csv's default
# separator) and preview the first rows.

df = pd.read_csv('Tweets.csv')
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [3]:
# Report the dataset dimensions: df.shape is (rows, columns)

print(
    f"Number of Rows: {df.shape[0]}",
    f"Number of Columns: {df.shape[1]}",
    sep="\n",
)

Number of Rows: 14640
Number of Columns: 15


In [4]:

# Keep only the tweet text and its sentiment label; all other columns are dropped

tweet_df = df.loc[:, ['text', 'airline_sentiment']]

In [5]:
# Preview the first rows of the two-column (text, airline_sentiment) frame
tweet_df.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [6]:
# Drop the 'neutral' rows so every remaining tweet has a non-neutral sentiment

is_not_neutral = tweet_df['airline_sentiment'].ne('neutral')
tweet_df = tweet_df.loc[is_not_neutral]

In [7]:
# Features (raw tweet text) and target (sentiment label)
X = tweet_df['text']
y = tweet_df['airline_sentiment']

In [8]:
# Hold out 20% of the tweets for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Sanity check: features and labels of the training split must have equal length

print(f"Training Set X Items: {len(X_train)}")
print(f"Training Set y Items: {len(y_train)}")

Training Set X Items: 9232
Training Set y Items: 9232


In [10]:
# Sanity check: features and labels of the test split must have equal length

print(f"Test Set X Items: {len(X_test)}")
print(f"Test Set y Items: {len(y_test)}")

Test Set X Items: 2309
Test Set y Items: 2309


In [11]:
# Encode the training labels as integers.
# factorize() returns a (codes, uniques) tuple. Without sort=True the codes are
# assigned by order of first appearance, so which sentiment becomes 0 would
# depend on which tweet happens to come first in the shuffled training split.
# sort=True makes the mapping deterministic (alphabetical: negative=0,
# positive=1), which is what the final report cell assumes when it prints
# "Probability Closer to 0 == Negative Sentiment".

review_labels_train = y_train.factorize(sort=True)

In [12]:
# First element of the factorize() result: the integer code of each training label
review_labels_train[0]

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [13]:
# Check Review Labels
# Second element of the factorize() result: the unique label names; the position
# of a name in this Index is the integer code used in review_labels_train[0].
review_labels_train[1]

Index(['negative', 'positive'], dtype='object')

## Model Building

In [14]:
# NOTE(review): despite its name, `vocab_size` holds the raw training tweets
# (a NumPy object array of strings) at this point, not a count. The name is
# rebound to the integer vocabulary size in the tokenizer cell further down.
vocab_size = X_train.values
# Display the array to preview the raw training texts
vocab_size

array(['@USAirways Another dead end.  They only handle AA L&amp;F.  They gave me the same failed # I already had. 610-362-7498(99) VM full.  #lost',
       '@USAirways #2066. Was on plane from PBI to CLT and knew about the frozen water. Also saw a plane to NYC take off at the gate next door!',
       '@USAirways waiting for bags now over 25min in Phl bag claim!',
       ...,
       'Lovely! RT @JetBlue: Our fleet’s on fleek. http://t.co/Hi6Fl1AX9E',
       "@united Okay thanks if you could please update me. I was told at the airport someone would call me today but they haven't.",
       '@USAirways IS THIS RINGLING BROTHERS BARNUM AND BAILEY???  SHOULD I KEEP MY EYES PEELED FOR THE CLOWN CAR???'],
      dtype=object)

In [15]:


# Vectorize a text corpus, by turning each text into sequence of integers.
# The tokenizer is fitted directly on the training tweets (X_train.values)
# instead of on a variable that this same cell later rebinds to an integer;
# that makes the cell safe to re-run.

tokenizer = Tokenizer(num_words=8000, oov_token='OOV')
tokenizer.fit_on_texts(X_train.values)

# One more than the number of distinct tokens: index 0 is the value used for
# padding (see the padded sequences below), so it needs its own embedding row.
vocab_size = len(tokenizer.word_index) + 1

print(tokenizer)
print(vocab_size)

<keras_preprocessing.text.Tokenizer object at 0x0000027409E8F5E0>
11636


In [16]:
# Number of distinct tokens recorded in the fitted tokenizer's word index
len(tokenizer.word_index)

11635

In [17]:
# To save the tokenized vocab for Web app
# (disabled by default; uncomment to write the fitted tokenizer to 'tokenizer.pickle')
#import pickle
#with open('tokenizer.pickle', 'wb') as handle:
#        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# Turn every training tweet into a list of token ids, then pad each list to a
# fixed length of 200 so the whole set forms one rectangular array.
tweet = X_train.values
tweet_seqs = tokenizer.texts_to_sequences(tweet)

padded_sequence_train = pad_sequences(
    tweet_seqs,
    maxlen=200,
)

print(padded_sequence_train)

[[   0    0    0 ... 3560  364  141]
 [   0    0    0 ...   76  196  754]
 [   0    0    0 ...  304   79  354]
 ...
 [   0    0    0 ...   48   50    1]
 [   0    0    0 ...   32   57  353]
 [   0    0    0 ...    3 4197  569]]


In [19]:
# Check padded sequence element
# Prints the first training tweet as a length-200 row of token ids; the zeros
# at the front are padding added by pad_sequences.

print(padded_sequence_train[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0   13  143 1590  549   57  122  660  221 2105
   59  752   57  428   20    3  257  855    4  224   81 5034 3559 5035
 2830 

In [20]:
# Size of the dense vector learned for each token
embedding_vector_length = 32

# Binary sentiment classifier: embedding -> LSTM -> sigmoid probability
model = Sequential()
# Maps each of the 200 token ids in a padded tweet to a 32-dim vector
model.add(Embedding(vocab_size, embedding_vector_length,
                    input_length=200))
# Regularisation: drops whole embedding channels rather than single values
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
# Single sigmoid unit: output is a probability for the label encoded as 1
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 32)           372352    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 50)                16600     
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 389,003
Trainable params: 389,003
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
# Used to save trained model
# NOTE(review): this cell sits before the training cell (model.fit), so
# uncommenting it here and running top-to-bottom would save the untrained
# network. Run it after training if the saved file is meant to hold trained weights.

#model.save("sentiment_analysis.h5")
#print("Model Saved")

Model Saved


In [22]:
# Fit the network on the padded training tweets; 20% of the training data is
# held back by Keras for validation. The returned History object is kept in `trained`.

trained = model.fit(
    padded_sequence_train,
    review_labels_train[0],
    validation_split=0.2,
    epochs=5,
    batch_size=32,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
# Apply the tokenizer fitted on the training set to the test tweets and pad
# them to the same length (200) as the training sequences.

encoded_docs = tokenizer.texts_to_sequences(X_test)
padded_sequence_test = pad_sequences(
    encoded_docs,
    maxlen=200,
)

print(padded_sequence_test)

[[   0    0    0 ...   10    1   18]
 [   0    0    0 ...   48   50    1]
 [   0    0    0 ...    2 6450  529]
 ...
 [   0    0    0 ...   99    3  314]
 [   0    0    0 ... 1542   91    1]
 [   0    0    0 ...  200  106   30]]


In [24]:
# Encode the test labels with the SAME label -> integer mapping that was
# learned on the training labels (review_labels_train[1]).
# Calling y_test.factorize() on its own numbers the classes by order of first
# appearance in the test split, so the codes could be flipped relative to the
# ones the model was trained on and silently invert the evaluation.
test_codes = review_labels_train[1].get_indexer(y_test)

# get_indexer marks unknown values with -1; fail loudly instead of evaluating on them
assert (test_codes >= 0).all(), "y_test contains a label that is absent from the training labels"

# Keep the (codes, uniques) tuple layout so sentiment_label_test[0] keeps working
sentiment_label_test = (test_codes, review_labels_train[1])
sentiment_label_test[0]

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [25]:
# Evaluate on the held-out test set; `score` holds [loss, accuracy]
score = model.evaluate(
    padded_sequence_test,
    sentiment_label_test[0],
    verbose=0,
)

In [26]:
# score[1] is the accuracy metric requested in model.compile
print(f"Accuracy: {score[1]}")

Accuracy: 0.9242095947265625


### Loading model

In [34]:
# Code to load the saved model.
# The cell that writes 'sentiment_analysis.h5' is commented out in this
# notebook, so on a fresh "Restart & Run All" the file may not exist.
# Load it only when it is present; otherwise keep using the model already in
# memory, so the evaluation and prediction cells below still run.
saved_model_path = Path('sentiment_analysis.h5')

if saved_model_path.exists():
    model = models.load_model(str(saved_model_path))
    print("Model Loaded")
else:
    print("Saved model '" + str(saved_model_path) + "' not found; using the in-memory model")

model.summary()

Model Loaded
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 32)           372352    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                16600     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51        
Total params: 389,003
Trainable params: 389,003
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Re-evaluate on the test set with whatever `model` is currently bound to;
# `score` holds [loss, accuracy]
score = model.evaluate(
    padded_sequence_test,
    sentiment_label_test[0],
    verbose=0,
)

In [36]:
# score[1] is the accuracy metric requested in model.compile
print(f"Accuracy: {score[1]}")

Accuracy: 0.9268081188201904


In [39]:
# Optional: restore a tokenizer previously written to 'tokenizer.pickle'
# (disabled by default). Only unpickle files you trust: pickle.load can
# execute arbitrary code.
#import pickle
#with open('tokenizer.pickle', 'rb') as handle:
 #   tokenizer = pickle.load(handle)

In [40]:
# Sanity check: number of distinct tokens in the tokenizer currently in scope
len(tokenizer.word_index)

11635

In [41]:
# Sample review used to try the model on a piece of unseen text

test_word ="""
These masks were a steal! 50 for 12$!! They are soft, breathable, light, comfortable and professional. It doesn’t hurt or irritate my ears and where they glue the straps to the mask isn’t hard and crusty like the basic blue ones. These are great!
"""

# Tokenize the review (wrapped in a one-element list, i.e. a batch of one)
# and pad it to the 200-token length the model expects
tw = pad_sequences(
    tokenizer.texts_to_sequences([test_word]),
    maxlen=200,
)

tw

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [42]:
# Run inference once and derive both the probability and the predicted label
# from that single result (previously model.predict was called twice and the
# label expression was recomputed inside the print statement).
prob = model.predict(tw)[0][0]

# Round the sigmoid output to the nearest class code (0 or 1), then look the
# code up in the label names produced when the training labels were encoded.
prediction = int(np.round(prob))
outcome = (review_labels_train[1][prediction]).capitalize()

print("Actual Review: " + test_word)
print("\nSentiment Analysis Outcome ==> The review shows " + outcome + " sentiment.")
print("\n======================================================================================")

# NOTE: the wording below assumes code 0 is 'negative' and code 1 is 'positive'
# in review_labels_train[1].
print("\nAccuracy Criteria \n\nProbability Closer to 0 == Negative Sentiment\nProbability Closer to 1 == Positive Sentiment")

print("\n ==> Probability is " + str(prob)+ " (" + outcome + ")")

Actual Review: 
These masks were a steal! 50 for 12$!! They are soft, breathable, light, comfortable and professional. It doesn’t hurt or irritate my ears and where they glue the straps to the mask isn’t hard and crusty like the basic blue ones. These are great!


Sentiment Analysis Outcome ==> The review shows Positive sentiment.


Accuracy Criteria 

Probability Closer to 0 == Negative Sentiment
Probability Closer to 1 == Positive Sentiment

 ==> Probability is 0.83420324 (Positive)
