### NLP - Sentiment Analysis

#### Long Short-Term Memory (LSTM) Neural Network

##### Dataset - Sentiment140

In [1]:
import numpy as np
import pandas as pd
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM

# Seed every random number generator the notebook relies on. NumPy alone is not enough:
# Keras draws weight initialisation and dropout masks from TensorFlow's generator.
np.random.seed(7)
tf.random.set_seed(7)

In [2]:
# Show the GPUs TensorFlow has detected; an empty list means none were found.
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


##### Read Dataset

###### Target/label after encoding: 0 -> negative, 1 -> positive

In [3]:
# Column names are supplied explicitly, so pandas reads the first row of the file as data.
column_names = ['Target', 'ID', 'Date', 'Query', 'User', 'Tweet']
twitter = pd.read_csv('training.1600000.processed.noemoticon.csv',
                      names=column_names,
                      encoding='latin1')

# Re-encode the raw target values as consecutive integer class ids (0, 1, ...).
le = LabelEncoder()
le.fit(twitter['Target'])
twitter['Target'] = le.transform(twitter['Target'])

# Class balance check: number of tweets per encoded label.
print(twitter['Target'].value_counts())

0    800000
1    800000
Name: Target, dtype: int64


In [4]:
# Preview the first five rows of the raw data.
twitter.head(5)

Unnamed: 0,Target,ID,Date,Query,User,Tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
# Preview the last five rows of the raw data.
twitter.tail(5)

Unnamed: 0,Target,ID,Date,Query,User,Tweet
1599995,1,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,1,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,1,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,1,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,1,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [6]:
# Preprocess: keep only ASCII letters, digits and whitespace in every tweet.
# The character class must be spelled `A-Z`: the range `A-z` also spans the ASCII
# characters between 'Z' and 'a' ([ \ ] ^ _ `), which would silently survive the cleanup.
# A raw string is used so `\s` reaches the regex engine untouched.
NON_ALNUM_PATTERN = r'[^a-zA-Z0-9\s]'
twitter['Tweet'] = twitter['Tweet'].str.replace(NON_ALNUM_PATTERN, '', regex=True)

In [7]:
# Labels: the encoded sentiment as a NumPy array. Features: the tweet text column.
y = twitter['Target'].values
X = twitter['Tweet']
print(X.shape)

(1600000,)


In [8]:
# Build the vocabulary from the tweets and encode each tweet as a fixed-length sequence
# of word indices.
# The texts are read from `twitter['Tweet']` rather than from `X` so this cell can be
# re-run: `X` is rebound to the padded array below, and an array has no `.values`
# attribute, so a second execution of `X.values` would raise AttributeError.
tweet_texts = twitter['Tweet'].values

# num_words=5000 caps the word indices produced by texts_to_sequences; the Embedding
# layer of the model must be sized to match.
tokenizer = Tokenizer(num_words=5000, lower=True, split=' ')
tokenizer.fit_on_texts(tweet_texts)
X = tokenizer.texts_to_sequences(tweet_texts)

# Bring every sequence to exactly 40 positions, padding short ones at the end.
X = pad_sequences(X, padding='post', maxlen=40)

In [9]:
# Sanity check: one row per tweet, one column per padded token position.
X.shape

(1600000, 40)

In [10]:
# Hyperparameters: embedding width, LSTM units, and the mini-batch size used for
# training and evaluation.
embed_dim, lstm_out, batch_size = 128, 128, 32

# Convolutional + LSTM network for binary sentiment classification.
# Embedding -> two Conv1D/MaxPooling1D stages (each pooling halves the sequence length)
# -> LSTM -> two dense layers -> a single sigmoid unit.
# The Embedding input size (5000) matches the `num_words` given to the Tokenizer.
model = Sequential([
    Embedding(5000, embed_dim, input_length=X.shape[1]),
    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=16, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(lstm_out, dropout=0.2),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as a metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [11]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 128)           640000    
                                                                 
 conv1d (Conv1D)             (None, 40, 32)            12320     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 20, 32)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 20, 16)            1552      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 10, 16)           0         
 1D)                                                             
                                                                 
 lstm (LSTM)                 (None, 128)               7

In [12]:
# Hold out 20% of the tweets for testing; `stratify=y` splits according to the label
# distribution and `random_state` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=7,
    stratify=y,
)

# Fit the network on the training split for three epochs.
model.fit(X_train, y_train, epochs=3, batch_size=batch_size, verbose=1)

# Loss and accuracy on the held-out test split.
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=0)
print("Loss score: %.2f" % (score))
print("Test Accuracy: %.2f" % (acc * 100))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Loss score: 0.40
Test Accuracy: 81.77


In [13]:
# Model outputs for every held-out tweet (one sigmoid value per row).
y_pred = model.predict(X_test)

In [14]:
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 threshold, then report
# per-class precision, recall and F1 on the test split.
y_pred_labels = (y_pred > 0.5).astype(int)
print(classification_report(y_test, y_pred_labels))

              precision    recall  f1-score   support

           0       0.81      0.84      0.82    160000
           1       0.83      0.80      0.81    160000

    accuracy                           0.82    320000
   macro avg       0.82      0.82      0.82    320000
weighted avg       0.82      0.82      0.82    320000



In [15]:
# Persist the trained model to disk so it can be reloaded without retraining.
model.save('model_lstm.h5')

##### Predict Sentiment

In [21]:
# Reload the model from 'model_lstm.h5' and rebind `model` to the loaded copy.
# NOTE(review): the execution counts show this cell (In [21]) was last run after the
# prediction cells (In [19], In [20]) -- Restart & Run All to confirm the notebook
# reproduces in top-to-bottom order.
model = tf.keras.models.load_model('model_lstm.h5')

In [19]:

negative = ['i hate having to wait this long for the model to train']

# Encode the sentence with the tokenizer exactly as it was fitted on the training tweets.
# The tokenizer must NOT be re-fitted here: fit_on_texts updates the word counts and
# rebuilds the word index, which can change the word -> index mapping the Embedding
# layer was trained with.
negative_seq = pad_sequences(tokenizer.texts_to_sequences(negative), padding='post', maxlen=40)

# The model emits one sigmoid value per input: the probability of the positive class.
prediction = model.predict(negative_seq)
print('Probability of positive sentiment: ', prediction[0][0] * 100, '%')
print('Probability of negative sentiment: ', (1 - prediction[0][0]) * 100, '%')

Probability of positive sentiment:  3.028212860226631 %
Probability of negative sentiment:  96.97178713977337 %


In [20]:

positive = ['but lstm networks are awesome']

# Encode the sentence with the tokenizer exactly as it was fitted on the training tweets.
# The tokenizer must NOT be re-fitted here: fit_on_texts updates the word counts and
# rebuilds the word index, which can change the word -> index mapping the Embedding
# layer was trained with.
positive_seq = pad_sequences(tokenizer.texts_to_sequences(positive), padding='post', maxlen=40)

# The model emits one sigmoid value per input: the probability of the positive class.
prediction = model.predict(positive_seq)
print('Probability of positive sentiment: ', prediction[0][0] * 100, '%')
print('Probability of negative sentiment: ', (1 - prediction[0][0]) * 100, '%')

Probability of positive sentiment:  90.9493625164032 %
Probability of negative sentiment:  9.050637483596802 %
