# Importing all the necessary librarries .

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN, SpatialDropout1D

from sklearn.model_selection import train_test_split
from sklearn import metrics


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
df=pd.read_csv('Usecase3_Dataset.csv')
df.head()

Unnamed: 0,airline_sentiment,airline,text
0,neutral,Virgin America,@VirginAmerica What @dhepburn said.
1,positive,Virgin America,@VirginAmerica plus you've added commercials t...
2,neutral,Virgin America,@VirginAmerica I didn't today... Must mean I n...
3,negative,Virgin America,@VirginAmerica it's really aggressive to blast...
4,negative,Virgin America,@VirginAmerica and it's a really big bad thing...


In [3]:
df.shape

(14640, 3)

In [4]:
df = df[['text', 'airline_sentiment']]
df.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


Created a function for data cleaning which removes brackets , punctuations, words with numbers and return clean text.

In [5]:
def clean_train_data(x):
    text = x
    text = text.lower()
    text = re.sub('\[.*?\]', '', text) # remove square brackets
    text = re.sub(r'[^\w\s]','',text) # remove punctuation
    text = re.sub('\w*\d\w*', '', text) # remove words containing numbers
    text = re.sub('\n', '', text)
    return text

In [6]:
df['text'] = df.text.apply(lambda x : clean_train_data(x))
df.head()

Unnamed: 0,text,airline_sentiment
0,virginamerica what dhepburn said,neutral
1,virginamerica plus youve added commercials to ...,positive
2,virginamerica i didnt today must mean i need t...,neutral
3,virginamerica its really aggressive to blast o...,negative
4,virginamerica and its a really big bad thing a...,negative


In [7]:
ms_data = df.copy()

Shuffling the data.

In [8]:
num_of_rows = 4000
shuffled = ms_data.reindex(np.random.permutation(ms_data.index))
nt = shuffled[shuffled['airline_sentiment'] == 'neutral'][:num_of_rows]
ng = shuffled[shuffled['airline_sentiment'] == 'negative'][:num_of_rows]
ps = shuffled[shuffled['airline_sentiment'] == 'positive'][:num_of_rows]
combine_data = pd.concat([nt, ng, ps], ignore_index=True)
combine_data = combine_data.reindex(np.random.permutation(combine_data.index))
combine_data['label'] = 0
combine_data.head()

Unnamed: 0,text,airline_sentiment,label
4062,usairways you guys are screwing up my trip tha...,negative,0
4866,jetblue our fleets on fleek plz stop,negative,0
4593,americanair all right but can you give me an e...,negative,0
6713,usairways americanair gma could not have had a...,negative,0
3269,usairways needs to hire more people,negative,0


In [9]:
print(len(combine_data[combine_data['airline_sentiment'] == 'neutral']))
print(len(combine_data[combine_data['airline_sentiment'] == 'negative']))
print(len(combine_data[combine_data['airline_sentiment'] == 'positive']))

3099
4000
2363


In [10]:
print(len(ms_data[ms_data['airline_sentiment'] == 'neutral']))
print(len(ms_data[ms_data['airline_sentiment'] == 'negative']))
print(len(ms_data[ms_data['airline_sentiment'] == 'positive']))

3099
9178
2363


In [11]:
ms_data.loc[ms_data['airline_sentiment'] == 'neutral', 'label'] = 0
ms_data.loc[ms_data['airline_sentiment'] == 'negative', 'label'] = 1
ms_data.loc[ms_data['airline_sentiment'] == 'positive', 'label'] = 2

In [12]:
ms_data.head(10)

Unnamed: 0,text,airline_sentiment,label
0,virginamerica what dhepburn said,neutral,0.0
1,virginamerica plus youve added commercials to ...,positive,2.0
2,virginamerica i didnt today must mean i need t...,neutral,0.0
3,virginamerica its really aggressive to blast o...,negative,1.0
4,virginamerica and its a really big bad thing a...,negative,1.0
5,virginamerica seriously would pay a flight fo...,negative,1.0
6,virginamerica yes nearly every time i fly vx t...,positive,2.0
7,virginamerica really missed a prime opportunit...,neutral,0.0
8,virginamerica well i didntbut now i do d,positive,2.0
9,virginamerica it was amazing and arrived an ho...,positive,2.0


Importing Keras , also assigning labels as 0 for neutral , 1 for negative and 2 for positive.

In [13]:
from keras.utils import to_categorical

In [14]:
labels = to_categorical(ms_data['label'], num_classes=3)

In [15]:
max_features = 3000
max_len = 130
ms_token = Tokenizer(num_words=max_features)
ms_token.fit_on_texts(ms_data['text'].values)
ms_sequences = ms_token.texts_to_sequences(ms_data['text'].values)
X = pad_sequences(ms_sequences, maxlen=max_len)

In [16]:
word_index = ms_token.word_index
print('Found %s unique tokens.' % len(word_index))

Found 13277 unique tokens.


In [17]:
y = labels

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

In [19]:
embed_dim = 128
lstm_out = 96

LSTM model creation, sequential model is used with softmax activation function. 

In [20]:
ms_model = Sequential()
ms_model.add(Embedding(max_features, embed_dim, input_length = X.shape[1]))
ms_model.add(SpatialDropout1D(0.7))
ms_model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
ms_model.add(Dense(3, activation='softmax'))
ms_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

ms_model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 130, 128)          384000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 130, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 433,603
Trainable params: 433,603
Non-trainable params: 0
_________________________________________________________________


Used Earlystopping in Keras. Trained the model .

In [21]:
from keras.callbacks import EarlyStopping

In [22]:
batch_size = 50
ms_history = ms_model.fit(X_train, y_train, epochs=20, batch_size=batch_size,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',patience=7, min_delta=0.0001)])

Instructions for updating:
Use tf.cast instead.
Train on 8784 samples, validate on 2196 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


In [23]:
loss, accuracy = ms_model.evaluate(X_test, y_test)
print("loss", loss)
print("accuracy", accuracy)

loss 0.5578573196312118
accuracy 0.7961748838424683


Model testing and prediction.

In [24]:
text = ['i would recommend it if you have no other options']
text = ms_token.texts_to_sequences(text)
text = pad_sequences(text, maxlen=max_len)
res = ms_model.predict(text)
res

array([[0.13789089, 0.8530562 , 0.00905294]], dtype=float32)

In [25]:
if np.argmax(res) == 0:
    print("neutral Comment")
elif np.argmax(res) == 1:
    print("Negetive Comment")
elif np.argmax(res) == 2:
    print("positive Comment")

Negetive Comment
