# Sentiment Analysis using LSTM Recurrent Neural Networks

Implemented using the <a href="https://www.crowdflower.com/data-for-everyone/">Disasters on social media</a> dataset from Crowdflower. 

### Import Libraries

In [71]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical




### Read Data

In [30]:
tweets = pd.read_csv('socialmedia-disaster-tweets-DFE.csv')[['text','choose_one']]
tweets.columns = ['tweet','class']
tweets = tweets[(tweets['class'] == 'Relevant') | (tweets['class'] == 'Not Relevant')]
tweets = tweets.reset_index(drop=True)
tweets.tail()

Unnamed: 0,tweet,class
10855,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,Relevant
10856,Police investigating after an e-bike collided ...,Relevant
10857,The Latest: More Homes Razed by Northern Calif...,Relevant
10858,MEG issues Hazardous Weather Outlook (HWO) htt...,Relevant
10859,#CityofCalgary has activated its Municipal Eme...,Relevant


### Remove any HTML and Emoticons from the tweets

In [31]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [32]:
tweets['tweet'] = tweets['tweet'].apply(preprocessor)

### Reindex the tweets dataframe so the rows are in random order

In [33]:
tweets = tweets.reindex(np.random.permutation(tweets.index))

print tweets.head()
print tweets.tail()

                                                  tweet         class
1788  just in kenya several buildings are reported t...      Relevant
3623  i really liked the first hobbit movie i saw it...  Not Relevant
2909  my take away preservation parks r an impositio...  Not Relevant
6075  hellfire is surrounded by desires so be carefu...  Not Relevant
578    kisii police in kisii hunt for students over ...      Relevant
                                                  tweet         class
4017              i m the architect of my own disaster   Not Relevant
7683   underrrtow molly send help im panicking over ...  Not Relevant
2400   maybe someday we ll find the place where our ...  Not Relevant
3646  overall the english lads did very well today t...  Not Relevant
1749   travdave kornbread_icu especially where it ha...  Not Relevant


### Download stopwords from the NLTK, remove them from the tweets

In [34]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [35]:
stop = stopwords.words('english')
stop = stop + [u'a',u'b',u'c',u'd',u'e',u'f',u'g',u'h',u'i',u'j',u'k',u'l',u'm',u'n',u'o',u'p',u'q',u'r',u's',u't',u'v',u'w',u'x',u'y',u'z']

In [42]:
def remove_stop_words(tweet):
    #tweet = unicode(tweet, 'utf8')  # convert bytes into proper unicode
    return str(' '.join([word for word in TextBlob(tweet).words if word not in stop]))

In [43]:
tweets.tweet.head()

1788    kenya several buildings reported fire close da...
3623    really liked first hobbit movie saw three time...
2909    take away preservation parks imposition amp da...
6075    hellfire surrounded desires careful let desire...
578     kisii police kisii hunt students failed arson ...
Name: tweet, dtype: object

In [44]:
tweets.tweet = tweets.tweet.apply(remove_stop_words)

In [45]:
tweets.tweet.head()

1788    kenya several buildings reported fire close da...
3623    really liked first hobbit movie saw three time...
2909    take away preservation parks imposition amp da...
6075    hellfire surrounded desires careful let desire...
578     kisii police kisii hunt students failed arson ...
Name: tweet, dtype: object

### Tokenize words

In [81]:
MAX_NB_WORDS=5000
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
print('Tokenizing...')
tokenizer.fit_on_texts(tweets.tweet)

Tokenizing...


In [82]:
word_index = tokenizer.word_index
len(word_index)

27637

### Create word sequences

In [83]:
sequences_train = tokenizer.texts_to_sequences(tweets.tweet)

In [84]:
MAXLEN = 500
sequences_train_pad = sequence.pad_sequences(sequences_train, maxlen=MAXLEN)

In [85]:
x = sequences_train_pad
y = np.array([1 if c == 'Relevant' else 0 for c in tweets['class']])

### Generate training and validation datasets

In [86]:
rng = np.random.RandomState(42)
n_samples = len(x)
indices = np.arange(n_samples)
rng.shuffle(indices)
x_shuffled = x[indices]
y_shuffled = y[indices]

x_train = x_shuffled[:int(n_samples*0.8)]
x_test = x_shuffled[int(n_samples*0.8):]

y_train = y_shuffled[:int(n_samples*0.8)]
y_test = y_shuffled[int(n_samples*0.8):]

###  Build and train the model

In [87]:
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, embedding_vecor_length, input_length=MAXLEN))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.8))
model.add(LSTM(100))
model.add(Dropout(0.8))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(x_train, y_train, validation_data=(x_test, y_test),nb_epoch=3, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 250, 32)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total para

<keras.callbacks.History at 0x7f05a3ed9d90>

In [88]:
predictions = model.predict_classes(x_test)



In [89]:
print 'accuracy', accuracy_score(y_test, predictions)
print 'confusion matrix\n', confusion_matrix(y_test, predictions)
print '(row=expected, col=predicted)'
print classification_report(y_test, predictions)

accuracy 0.810313075506
confusion matrix
[[1107  148]
 [ 264  653]]
(row=expected, col=predicted)
             precision    recall  f1-score   support

          0       0.81      0.88      0.84      1255
          1       0.82      0.71      0.76       917

avg / total       0.81      0.81      0.81      2172

