In [16]:
import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import keras
import re

In [3]:
data = pd.read_csv('sentiment/Sentiment.csv')
data = data[['text','sentiment']]

In [4]:
data.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [6]:
data = data[data.sentiment != "Neutral"]
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(data[ data['sentiment'] == 'Positive'].size)
print(data[ data['sentiment'] == 'Negative'].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

4472
16986


In [7]:
X.shape

(10729, 28)

In [8]:
X[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         363,  122,    1,  703,    2,   39,   58,  237,   37,  210,    6,
         174, 1761,   12, 1324, 1409,  743],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          16,  284,  252,    5,  821,  102,  167,   26,  136,    6,    1,
         173,   12,    2,  233,  724,   17]])

In [32]:
embed_dim = 128

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(Dropout(0.20))

In [33]:
model.add(keras.layers.recurrent.SimpleRNN(units = 100, activation='relu',use_bias=True))
model.add(keras.layers.Dense(units=1000, input_dim = 2000, activation='sigmoid'))
model.add(keras.layers.Dense(units=500, input_dim=1000, activation='relu'))
model.add(keras.layers.Dense(units=2, input_dim=500,activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(7188, 28) (7188, 2)
(3541, 28) (3541, 2)


In [34]:
batch_size = 32
model.fit(X_train, Y_train, nb_epoch = 17, batch_size=batch_size, verbose = 2)



Epoch 1/17
 - 23s - loss: 0.5318 - acc: 0.7924
Epoch 2/17
 - 17s - loss: 0.3423 - acc: 0.8582
Epoch 3/17
 - 17s - loss: 0.2440 - acc: 0.9060
Epoch 4/17
 - 17s - loss: 0.1752 - acc: 0.9350
Epoch 5/17
 - 17s - loss: 0.1627 - acc: 0.9393
Epoch 6/17
 - 17s - loss: 0.1246 - acc: 0.9540
Epoch 7/17
 - 17s - loss: 0.0987 - acc: 0.9626
Epoch 8/17
 - 17s - loss: 0.0953 - acc: 0.9634
Epoch 9/17
 - 17s - loss: 0.0866 - acc: 0.9655
Epoch 10/17
 - 17s - loss: 0.0858 - acc: 0.9665
Epoch 11/17
 - 17s - loss: 0.0765 - acc: 0.9687
Epoch 12/17
 - 17s - loss: 0.0729 - acc: 0.9681
Epoch 13/17
 - 17s - loss: 0.0793 - acc: 0.9683
Epoch 14/17
 - 18s - loss: 0.0753 - acc: 0.9697
Epoch 15/17
 - 17s - loss: 0.0708 - acc: 0.9711
Epoch 16/17
 - 17s - loss: 0.0673 - acc: 0.9702
Epoch 17/17
 - 17s - loss: 0.0802 - acc: 0.9687


<keras.callbacks.History at 0x23635391b70>

In [25]:
model.predict(X_test[0:10])

array([[  9.21380162e-01,   7.86197856e-02],
       [  9.87706840e-01,   1.22931534e-02],
       [  8.12505722e-01,   1.87494248e-01],
       [  9.98835862e-01,   1.16410200e-03],
       [  9.97321069e-01,   2.67897709e-03],
       [  8.84306431e-02,   9.11569417e-01],
       [  9.99960661e-01,   3.93490773e-05],
       [  9.99059260e-01,   9.40774451e-04],
       [  5.32969832e-01,   4.67030227e-01],
       [  9.98835266e-01,   1.16479525e-03]], dtype=float32)

In [26]:
Y_test[0:10]

array([[0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0]], dtype=uint8)

In [27]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 28, 128)           256000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 28, 128)           0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 100)               22900     
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              101000    
_________________________________________________________________
dense_2 (Dense)              (None, 500)               500500    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 1002      
Total params: 881,402
Trainable params: 881,402
Non-trainable params: 0
_________________________________________________________________


In [38]:
model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(Dropout(0.20))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [39]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 28, 128)           256000    
_________________________________________________________________
dropout_4 (Dropout)          (None, 28, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 130       
Total params: 305,538
Trainable params: 305,538
Non-trainable params: 0
_________________________________________________________________


In [40]:
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)

Epoch 1/7
 - 43s - loss: 0.4393 - acc: 0.8141
Epoch 2/7
 - 46s - loss: 0.3156 - acc: 0.8699
Epoch 3/7
 - 41s - loss: 0.2710 - acc: 0.8900
Epoch 4/7
 - 40s - loss: 0.2401 - acc: 0.9033
Epoch 5/7
 - 32s - loss: 0.2169 - acc: 0.9144
Epoch 6/7
 - 31s - loss: 0.1914 - acc: 0.9229
Epoch 7/7
 - 32s - loss: 0.1731 - acc: 0.9285


<keras.callbacks.History at 0x236389c98d0>

In [41]:
model.get_config()

[{'class_name': 'Embedding',
  'config': {'activity_regularizer': None,
   'batch_input_shape': (None, 28),
   'dtype': 'float32',
   'embeddings_constraint': None,
   'embeddings_initializer': {'class_name': 'RandomUniform',
    'config': {'maxval': 0.05, 'minval': -0.05, 'seed': None}},
   'embeddings_regularizer': None,
   'input_dim': 2000,
   'input_length': 28,
   'mask_zero': False,
   'name': 'embedding_7',
   'output_dim': 128,
   'trainable': True}},
 {'class_name': 'Dropout',
  'config': {'name': 'dropout_4',
   'noise_shape': None,
   'rate': 0.2,
   'seed': None,
   'trainable': True}},
 {'class_name': 'LSTM',
  'config': {'activation': 'tanh',
   'activity_regularizer': None,
   'bias_constraint': None,
   'bias_initializer': {'class_name': 'Zeros', 'config': {}},
   'bias_regularizer': None,
   'dropout': 0.2,
   'go_backwards': False,
   'implementation': 1,
   'kernel_constraint': None,
   'kernel_initializer': {'class_name': 'VarianceScaling',
    'config': {'distribu

In [42]:
Y

array([[0, 1],
       [0, 1],
       [0, 1],
       ..., 
       [0, 1],
       [1, 0],
       [0, 1]], dtype=uint8)

In [43]:
data['sentiment']

1        Positive
3        Positive
4        Positive
5        Positive
6        Negative
8        Negative
9        Negative
10       Negative
11       Positive
12       Negative
14       Positive
15       Negative
16       Negative
17       Negative
18       Negative
19       Negative
20       Positive
21       Negative
22       Negative
24       Negative
25       Negative
26       Negative
27       Negative
28       Negative
29       Negative
30       Negative
31       Negative
34       Negative
35       Negative
36       Negative
           ...   
13839    Negative
13840    Negative
13841    Negative
13843    Negative
13844    Negative
13845    Negative
13847    Positive
13848    Negative
13849    Positive
13850    Negative
13851    Negative
13852    Negative
13853    Negative
13854    Negative
13855    Negative
13856    Negative
13857    Negative
13858    Positive
13859    Positive
13860    Negative
13861    Negative
13862    Positive
13863    Negative
13864    Negative
13865    N

In [51]:
Y_train

array([[1, 0],
       [0, 1],
       [1, 0],
       ..., 
       [1, 0],
       [1, 0],
       [1, 0]], dtype=uint8)

In [50]:
model.predict_classes(X_test)

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

In [48]:
model.evaluate(X_test,Y_test)



[0.43480763399846739, 0.83168596445049681]

In [52]:
Y

array([[0, 1],
       [0, 1],
       [0, 1],
       ..., 
       [0, 1],
       [1, 0],
       [0, 1]], dtype=uint8)