In [1]:
import re
import warnings
import numpy as np
import pandas as pd 
from tqdm import tqdm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from tensorflow import keras
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences  
from keras.layers import Dense, LSTM, Embedding, Bidirectional, SpatialDropout1D

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("data.csv")
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [3]:
df.columns = ['Phrase', 'Sentiment']

encoded_dict = {'negative': 0, 'neutral': 1, 'positive': 2}
df['Sentiment'] = df.Sentiment.map(encoded_dict)

sentiments = df.Sentiment.values
sentiments = to_categorical(sentiments)

In [4]:
df[df.Sentiment == 0].shape[0]

860

In [5]:
unique_words = set()
len_max = 0

for sent in tqdm(df.Phrase.values):
  unique_words.update(sent)
  if(len_max < len(sent)):
    len_max = len(sent)

100%|██████████| 5842/5842 [00:00<00:00, 400823.20it/s]


In [6]:
tokenizer = Tokenizer(num_words = 8000, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower = True)
tokenizer.fit_on_texts(df.Phrase.values)
X = tokenizer.texts_to_sequences(df.Phrase.values)
word_index = tokenizer.word_index

X = pad_sequences(X, maxlen = len_max)
X.shape

(5842, 315)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, sentiments, test_size = 0.25, random_state = 1234, shuffle = True)

In [8]:
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 2)

In [9]:
model = Sequential([Embedding(8000, 300, input_length = len_max),
                   SpatialDropout1D(0.7),
                   Bidirectional(LSTM(128, dropout = 0.7)),
                   Dense(3, activation = 'softmax')])

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 315, 300)          2400000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 315, 300)         0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 256)              439296    
 l)                                                              
                                                                 
 dense (Dense)               (None, 3)                 771       
                                                                 
Total params: 2,840,067
Trainable params: 2,840,067
Non-trainable params: 0
_________________________________________________________________


In [10]:
history = model.fit(X_train, y_train, epochs = 20, batch_size = 256, validation_split = 0.25, callbacks = [es])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 11: early stopping


In [12]:
Y_test = (np.argmax(y_test, axis = 1)).reshape(-1, 1)
Y_pred = (np.argmax(model.predict(X_test), axis = 1)).reshape(-1, 1)

print(np.concatenate((Y_test, Y_pred), axis = 1))

cm = confusion_matrix(Y_test, Y_pred)
print(f'\nConfusion Matrix: \n{cm}')

[[1 0]
 [2 2]
 [2 2]
 ...
 [0 1]
 [1 0]
 [2 2]]

Confusion Matrix: 
[[ 50 124  55]
 [ 58 647  69]
 [ 18 129 311]]


In [36]:
df.sample(20)

Unnamed: 0,Phrase,Sentiment
1301,The Department Store Division reported an incr...,2
2812,Motorola accounted for 11.5 percent of the Sou...,1
4168,The Group 's consolidated net sales for 2008 t...,1
615,$FB slight green... May be runner later,2
79,Operating profit totaled EUR 18.6 mn or 8.3 % ...,1
3271,The company slipped to an operating loss of EU...,1
4531,The energy shot is packed in a 100-millilitre ...,1
5604,`` This is a significant milestone for Benefon...,2
1478,Estonia 's beer market overall grew three perc...,2
1856,"On the other hand , Finnish sawmills ' product...",1


In [28]:
labels = ['Negative', 'Neutral', 'Positive']

In [45]:
sentence = [df['Phrase'][1478]]
seq = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(seq, maxlen = len_max)
pred = np.argmax(model.predict(padded))

print(labels[pred])

Positive


In [48]:
sentence = [df['Phrase'][3068]]
seq = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(seq, maxlen = len_max)
pred = np.argmax(model.predict(padded))

print(labels[pred])

Neutral


In [40]:
sentence = [df['Phrase'][1856]]
seq = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(seq, maxlen = len_max)
pred = np.argmax(model.predict(padded))

print(labels[pred])

Neutral
