# Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np
import re
import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Dropout
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


### Getting data

In [2]:
data = pd.read_csv('shuffled_movie_data.csv')
data.tail()

Unnamed: 0,review,sentiment
49995,"OK, lets start with the best. the building. al...",0
49996,The British 'heritage film' industry is out of...,0
49997,I don't even know where to begin on this one. ...,0
49998,Richard Tyler is a little boy who is scared of...,0
49999,I waited long to watch this movie. Also becaus...,1


### Prepare data

Getting rid of tags

In [3]:
def ridof_sc(text):
    text = re.sub('<[^>]*>', '', text) 
    return text

In [4]:
data['review'] = data['review'].apply(lambda x: ridof_sc(x))
data.at[49999,'review']

'I waited long to watch this movie. Also because I like Bruce Willis. The plot was quite different from what I had expected but still quite good. Its a good mix of emotions, humor and drama.Left me thinking over and again :)'

In [5]:
tokenizer = Tokenizer(num_words = None, 
                      filters = '#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                      lower = False, split = ' ')

In [6]:
tokenizer.fit_on_texts(data['review'])

#Convert lists of strings into list of lists of integer
sequences = tokenizer.texts_to_sequences(data['review'])

In [7]:
sequences[49999][:10]

[8, 5117, 222, 5, 115, 10, 116, 929, 76, 8]

Getting indexes for words

In [8]:
idx_word = tokenizer.index_word

#Total amount of words
num_words = len(idx_word) + 1
print(num_words)

383444


### Setting X and y

In [9]:
#pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre', truncating='pre', value=0.0)
X = pad_sequences(sequences, maxlen=2000)
y = data['sentiment']

In [10]:
print(X.shape, y.shape)

(50000, 2000) (50000,)


Set train and test sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)

### Building the model

In [23]:
model = Sequential()

model.add( Embedding(num_words, output_dim = 100, input_length = X.shape[1]))
model.add( Dropout(0.5))          
model.add( Bidirectional( LSTM(20, dropout = 0.2, recurrent_dropout = 0.2)))
model.add( Dense(2, activation = 'softmax'))

model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])


In [24]:
model.fit(X_train, y_train, epochs = 5, batch_size = 64, verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4dca4166a0>

In [27]:
yhat_train = model.predict_classes(X_train, verbose=1)



In [28]:
print("train accuracy: {} %".format(100 - np.mean(np.abs(yhat_train - y_train)) * 100))

train accuracy: 99.53866666666667 %


### Get accuracy for test set

In [25]:
yhat_test = model.predict_classes(X_test, verbose=1)



In [26]:
print("test accuracy: {} %".format(100 - np.mean(np.abs(yhat_test - y_test)) * 100))

test accuracy: 85.272 %
