In [1]:
%matplotlib inline
from keras.layers import LSTM, Dense, Embedding, Input, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras import optimizers
import os
import io
import re
import sklearn
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import cohen_kappa_score
from sklearn.utils import shuffle
import numpy as np

Using TensorFlow backend.


In [2]:
path = 'training_data_5151/'
text_list = os.listdir(path)
text_list

['pos', 'training_data_5151.arff', 'neu', 'neg']

In [3]:
reviews = []
labels = []
true = []
for i in text_list:
    if i == 'neg':
        list_of_text = os.listdir(path+i)
        for ltext in list_of_text:
            with io.open(path+i+'/'+ltext, encoding='utf-8') as f:
                review = f.read()
                reviews.append(review)
                labels.append([1,0,0])
                true.append(0)
    if i == 'neu':
        list_of_text = os.listdir(path+i)
        for ltext in list_of_text:
            with io.open(path+i+'/'+ltext, encoding='utf-8') as f:
                review = f.read()
                reviews.append(review)
                labels.append([0,1,0])
                true.append(1)
    if i == 'pos':
        list_of_text = os.listdir(path+i)
        for ltext in list_of_text:
            with io.open(path+i+'/'+ltext, encoding='utf-8') as f:
                review = f.read()
                reviews.append(review)
                labels.append([0,0,1])
                true.append(2)

In [4]:
labels = np.array(labels)

In [5]:
labels

array([[0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       ..., 
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0]])

In [6]:
final_reviews = []
for review in reviews:
    x = re.sub(r'\n', ' ', review)
    final_reviews.append(x)

In [7]:
vocab = set()
for review in final_reviews:
    for word in review.split(' '):
        vocab.add(word)

In [8]:
len(vocab)

10286

In [9]:
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i

In [10]:
import pickle
with open('word2index.pkl','wb') as f:
    pickle.dump(word2index, f)

In [11]:
def encode(text):
    vector = []
    for word in text.split(' '):
        vector.append(word2index[word])
    return vector

In [12]:
encode('कहानी को ठीक से समेटा नहीं गया है')

[3073, 2428, 2886, 5292, 5828, 6840, 4293, 6521]

In [13]:
final_reviews, labels, true = shuffle(final_reviews, labels, true)

In [14]:
reviews = []
for review in final_reviews:
    reviews.append(encode(review))

In [15]:
reviews = pad_sequences(reviews, maxlen=100, value=0.)

In [18]:
trainX = reviews[:3729]
trainY = labels[:3729]
testX = reviews[:-3729]
testY = labels[:-3729]
true = true[:-3729]
print(len(trainX))
print(len(testX))

3729
1422


# LSTM network

In [20]:
adam = optimizers.Adam(lr=0.001)

model = Sequential()
model.add(Embedding(10286, 128, input_length=100))
model.add(LSTM(128))
model.add(Dense(3, activation='softmax'))

model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())
model.fit(trainX, trainY, batch_size=64, verbose=1, validation_data=(testX, testY), epochs=10)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 128)          1316608   
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 387       
Total params: 1,448,579
Trainable params: 1,448,579
Non-trainable params: 0
_________________________________________________________________
None
Train on 3729 samples, validate on 1422 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fcbcabe75f8>

In [21]:
model.evaluate(testX, testY, batch_size=32)



[0.033765795831889461, 0.98874824166130248]

In [30]:
pred = []
for i in testX:
    pred.append(np.argmax(model.predict(i[None,:])))

In [26]:
mat = confusion_matrix(pred, true)

ValueError: Found input variables with inconsistent numbers of samples: [1422, 0]

In [24]:
plot_confusion_matrix(mat)

NameError: name 'mat' is not defined

In [28]:
cohen_kappa_score(pred, true)

ValueError: Found input variables with inconsistent numbers of samples: [1422, 0]

In [31]:
mean_absolute_error(pred, true)

ValueError: Found input variables with inconsistent numbers of samples: [1422, 0]

In [130]:
sklearn.metrics.recall_score(true, pred, average='weighted')

0.99759036144578317

In [131]:
sklearn.metrics.f1_score(true, pred, average='weighted')

0.99759105980498819

In [132]:
np.sqrt(sklearn.metrics.mean_squared_error(true, pred))

0.049088069367381595

In [133]:
np.sqrt(sklearn.metrics.mean_squared_error(true, pred))

0.049088069367381595

In [134]:
sklearn.metrics.r2_score(true, pred)

0.99626778423296225

In [135]:
sklearn.metrics.precision_score(true, pred, average='weighted')

0.9976096385542168