# Negation detection - Using RNN

In [53]:
import warnings
warnings.simplefilter('ignore')

import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re


In [54]:
data = pd.read_csv('Sentiment.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]

In [55]:
data.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [56]:
len(data)

13871

In [57]:
data["sentiment"].value_counts()

Negative    8493
Neutral     3142
Positive    2236
Name: sentiment, dtype: int64

In [58]:
len(data[data['sentiment'] == 'Negative'])

8493

In [59]:
data[ data['sentiment'] == 'Negative'].size

16986

In [60]:
data = data[data.sentiment != "Neutral"]

In [61]:
data["sentiment"].value_counts()

Negative    8493
Positive    2236
Name: sentiment, dtype: int64

In [62]:
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

In [63]:
for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')

In [64]:
data.head()

Unnamed: 0,text,sentiment
1,scottwalker didnt catch the full gopdebate l...,Positive
3,robgeorge that carly fiorina is trending ho...,Positive
4,danscavino gopdebate w realdonaldtrump deliv...,Positive
5,gregabbott_tx tedcruz on my first day i will...,Positive
6,warriorwoman91 i liked her and was happy whe...,Negative


In [65]:
print(data[ data['sentiment'] == 'Positive'].size)
print(data[ data['sentiment'] == 'Negative'].size)

4472
16986


In [66]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)


In [68]:
len(X)

10729

In [71]:
len(X[100])

12

In [73]:
len(X[0])

17

In [74]:
X = pad_sequences(X)

In [75]:
len(X)

10729

In [76]:
len(X[0])

28

In [77]:
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        363,  122,    1,  703,    2,   39,   58,  237,   37,  210,    6,
        174, 1761,   12, 1324, 1409,  743], dtype=int32)

In [50]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 28, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 28, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [78]:
Y = pd.get_dummies(data['sentiment']).values

In [79]:
Y

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [0, 1],
       [1, 0],
       [0, 1]], dtype=uint8)

In [80]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(7188, 28) (7188, 2)
(3541, 28) (3541, 2)


In [81]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)


Epoch 1/7
 - 13s - loss: 0.4472 - accuracy: 0.8151
Epoch 2/7
 - 11s - loss: 0.3277 - accuracy: 0.8625
Epoch 3/7
 - 11s - loss: 0.2846 - accuracy: 0.8816
Epoch 4/7
 - 12s - loss: 0.2562 - accuracy: 0.8954
Epoch 5/7
 - 11s - loss: 0.2296 - accuracy: 0.9041
Epoch 6/7
 - 11s - loss: 0.2108 - accuracy: 0.9182
Epoch 7/7
 - 11s - loss: 0.1885 - accuracy: 0.9239


<keras.callbacks.callbacks.History at 0x7fc5956b4cf8>

In [82]:
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.44
acc: 0.83


In [102]:
Y_test

array([[0, 1],
       [0, 1],
       [1, 0],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], dtype=uint8)

In [90]:
pred = pd.read_csv("Dataset_for_Prediction.csv")

In [91]:
pred["target"][pred["target"]==4]=1

In [92]:
pred.head()

Unnamed: 0,text,target
0,I am looking for the eligibility criteria of N...,1
1,Am i eligible for opening a fed savings account,1
2,can i open a salary account,1
3,I would like to know about procedure to open a...,1
4,I want to start federal salary account,1


In [93]:
preddata = pred["text"]

In [103]:
predy = pred["target"]

In [105]:
Y = pd.get_dummies(predy).values

In [106]:
Y

array([[0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1,

In [94]:
preddata

0     I am looking for the eligibility criteria of N...
1       Am i eligible for opening a fed savings account
2                           can i open a salary account
3     I would like to know about procedure to open a...
4                I want to start federal salary account
                            ...                        
95    irrelevant documents need to apply for agricul...
96                what are the disadvantages of fedfast
97    What are not Federal Agricultural Development ...
98           nobody can help me to find my mobile phone
99                  recharge not possible for my mobile
Name: text, Length: 100, dtype: object

In [100]:
preddata = preddata.apply(lambda x: x.lower())
preddata = preddata.apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
    
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(preddata)
X = tokenizer.texts_to_sequences(preddata)
X = pad_sequences(X, maxlen=28)

In [101]:
X.shape

(100, 28)

In [107]:
score,acc = model.evaluate(X, Y, verbose = 2, batch_size = batch_size)

In [108]:
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 1.22
acc: 0.47
