### Sentiment Analysis using LSTM

<b>Import libraries</b>

In [49]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split

import re

<b>Loading the data</b>

In [50]:
# Read the raw tweet-sentiment CSV from the working directory and preview it.
csv_path = 'sentiment.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,tweetID,entity,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [51]:
# Only the tweet text and its sentiment label are needed downstream.
selected_columns = ['text', 'sentiment']
data = df[selected_columns]
data.head()

Unnamed: 0,text,sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


In [52]:
# (rows, columns) of the working frame before any class filtering
data.shape

(74682, 2)

In [53]:
# Keep only the polarized classes: rows labelled 'Neutral' or 'Irrelevant' are dropped.
# A single boolean mask replaces the two chained filters, and .copy() makes `data`
# an independent frame so that later column assignments (string cast, text cleaning)
# do not write into a slice of `df` and trigger SettingWithCopyWarning.
excluded_labels = ['Neutral', 'Irrelevant']
data = data[~data['sentiment'].isin(excluded_labels)].copy()
data.shape

(43374, 2)

In [54]:
# Force every tweet to a Python string so the string operations below cannot fail.
# Any missing tweet (NaN) is blanked out first: casting NaN directly with astype(str)
# would produce the literal word "nan", which the tokenizer would then learn as a token.
# Bracket assignment is used instead of attribute assignment to make the column write explicit.
data['text'] = data['text'].fillna('').astype(str)

In [55]:
# Preview the frame after the class filter and the string cast of `text`
data.head()

Unnamed: 0,text,sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


In [56]:
# Normalise the tweets: lowercase, then strip everything that is not a lowercase
# letter, a digit, or whitespace.
#
# The previous pattern used the range `A-z`, which also spans the ASCII characters
# between 'Z' and 'a' ( [ \ ] ^ _ ` ), so those were silently kept. Because the text
# is lowercased first, `a-z` is the intended range. The pattern is a raw string so
# that `\s` is a regex escape rather than an invalid Python string escape.
data['text'] = (
    data['text']
    .str.lower()
    .str.replace(r'[^a-z0-9\s]', '', regex=True)
)
data.head()

Unnamed: 0,text,sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,i am coming to the borders and i will kill you...,Positive
2,im getting on borderlands and i will kill you all,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


In [57]:
# Number of tweets per remaining sentiment label (class balance check)
data['sentiment'].value_counts()

Negative    22542
Positive    20832
Name: sentiment, dtype: int64

In [58]:
# Passed to the Tokenizer as num_words and, in the model definition, to the
# Embedding layer as its first argument.
max_features = 2000
# Tokens are split on single spaces.
tokenizer = Tokenizer(num_words=max_features, split=' ')

In [59]:
# Fit the tokenizer on the tweet text column
tokenizer.fit_on_texts(data['text'].values)

In [60]:
# Re-display the first rows of the working frame
data.head()

Unnamed: 0,text,sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,i am coming to the borders and i will kill you...,Positive
2,im getting on borderlands and i will kill you all,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


In [62]:
# Encode every tweet as a list of integer token ids using the fitted tokenizer
X = tokenizer.texts_to_sequences(data['text'].values)
# Inspect the encoding of the first tweet
X[0]

[29, 130, 14, 107, 4, 2, 58, 1451, 13, 26]

In [63]:
# Pad the variable-length id lists to a common width; X is rebound from a list of
# lists to a 2-D array (its second dimension is later used as the model input length).
X = pad_sequences(X)
# First row after padding: zeros are added in front of the token ids
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

<b>Model Definition</b>

In [64]:
embed_dim = 128  # width of each token embedding vector
lstm_out = 196   # number of units in the LSTM layer

# Embedding -> spatial dropout -> single LSTM -> 2-way softmax classifier.
# The Embedding input size matches the tokenizer's num_words (max_features) and the
# input length is the padded sequence width of X.
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))

# Two output units with softmax pair with one-hot targets and categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 166, 128)          256000    
                                                                 
 spatial_dropout1d (Spatial  (None, 166, 128)          0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 2)                 394       
                                                                 
Total params: 511194 (1.95 MB)
Trainable params: 511194 (1.95 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


<b>One-hot encoding of the target labels</b>

In [65]:
# One indicator column per sentiment label, taken as a plain NumPy array.
sentiment_dummies = pd.get_dummies(data['sentiment'])
Y = sentiment_dummies.values
print(Y[:5])

[[0 1]
 [0 1]
 [0 1]
 [0 1]
 [0 1]]


<b>Train/test split</b>

In [71]:
# Hold out 33% of the samples for testing; the fixed random_state keeps the split repeatable.
splits = train_test_split(X, Y, test_size=0.33, random_state=42)
x_train, x_test, y_train, y_test = splits

# Report the shapes of each (features, labels) pair: train first, then test.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

(29060, 166) (29060, 2)
(14314, 166) (14314, 2)


In [72]:
batch_size = 32

# Train for 7 epochs with one log line per epoch (verbose=2).
# The returned History object is kept in `history` so the per-epoch loss/accuracy
# remain accessible after training, and so the cell no longer ends by echoing a
# bare `<History ...>` repr.
history = model.fit(x_train, y_train, epochs=7, batch_size=batch_size, verbose=2)

Epoch 1/7


909/909 - 765s - loss: 0.4436 - accuracy: 0.7880 - 765s/epoch - 841ms/step
Epoch 2/7
909/909 - 636s - loss: 0.3202 - accuracy: 0.8604 - 636s/epoch - 700ms/step
Epoch 3/7
909/909 - 625s - loss: 0.2757 - accuracy: 0.8784 - 625s/epoch - 688ms/step
Epoch 4/7
909/909 - 630s - loss: 0.2422 - accuracy: 0.8931 - 630s/epoch - 693ms/step
Epoch 5/7
909/909 - 642s - loss: 0.2171 - accuracy: 0.9056 - 642s/epoch - 706ms/step
Epoch 6/7
909/909 - 752s - loss: 0.1956 - accuracy: 0.9124 - 752s/epoch - 827ms/step
Epoch 7/7
909/909 - 626s - loss: 0.1798 - accuracy: 0.9200 - 626s/epoch - 689ms/step


<keras.src.callbacks.History at 0x2444e5f2c50>

<b>Evaluation</b>

In [73]:
validation_size = 6500

# Split the held-out data: the last `validation_size` rows become the validation
# slice, the rows before them form the reduced test set. The validation slice is
# created here but not evaluated in this cell.
x_test1, x_validate = x_test[:-validation_size], x_test[-validation_size:]
y_test1, y_validate = y_test[:-validation_size], y_test[-validation_size:]

# Loss and accuracy on the full held-out set, then on the reduced test set.
score, acc = model.evaluate(x_test, y_test, verbose=2, batch_size=batch_size)
score1, acc1 = model.evaluate(x_test1, y_test1, verbose=2, batch_size=batch_size)

print(score, acc)
print(score1, acc1)

448/448 - 41s - loss: 0.2586 - accuracy: 0.8912 - 41s/epoch - 92ms/step
245/245 - 22s - loss: 0.2565 - accuracy: 0.8925 - 22s/epoch - 88ms/step
0.25862860679626465 0.8912253975868225
0.25650474429130554 0.892500638961792


In [77]:
twt = ['good boys are awesome']

# Encode with the fitted tokenizer and pad to the same width the model was built
# with (X.shape[1]) instead of a hard-coded 166, so this cell keeps working if the
# padded width of the training data changes.
# NOTE(review): the raw text is not passed through the lowercase/regex cleaning
# applied to the training text; harmless for this example, worth confirming for
# inputs containing punctuation.
twt = tokenizer.texts_to_sequences(twt)
twt = pad_sequences(twt, maxlen=X.shape[1], dtype='int32', value=0)

# Class probabilities for the single input row.
sentiment = model.predict(twt, batch_size=1, verbose=2)[0]
print(sentiment)

# Output index 0 is Negative and index 1 is Positive (Positive rows are encoded
# as [0 1] in the one-hot labels). A lookup replaces the if/elif chain that
# computed argmax twice.
sentiment_labels = ("Negative", "Positive")
print(sentiment_labels[int(np.argmax(sentiment))])

1/1 - 0s - 126ms/epoch - 126ms/step
[0.00446193 0.9955381 ]
Positive


In [78]:
twt = ['this is not a right post']

# Encode with the fitted tokenizer and pad to the same width the model was built
# with (X.shape[1]) instead of a hard-coded 166, so this cell keeps working if the
# padded width of the training data changes.
# NOTE(review): the raw text is not passed through the lowercase/regex cleaning
# applied to the training text; harmless for this example, worth confirming for
# inputs containing punctuation.
twt = tokenizer.texts_to_sequences(twt)
twt = pad_sequences(twt, maxlen=X.shape[1], dtype='int32', value=0)

# Class probabilities for the single input row.
sentiment = model.predict(twt, batch_size=1, verbose=2)[0]
print(sentiment)

# Output index 0 is Negative and index 1 is Positive (Positive rows are encoded
# as [0 1] in the one-hot labels). A lookup replaces the if/elif chain that
# computed argmax twice.
sentiment_labels = ("Negative", "Positive")
print(sentiment_labels[int(np.argmax(sentiment))])

1/1 - 0s - 111ms/epoch - 111ms/step
[0.9414208  0.05857926]
Negative
