## Sentiment Analysis

* Sentiment analysis aims to determine the attitude of a speaker or a writer with respect to some topic or the overall contextual polarity of a document.

* Most common classes: positive, negative, and neutral.



### Dataset: IMDB Movie reviews sentiment classification

* Dataset of 25,000 movies reviews from IMDB, labeled by sentiment (positive/negative). 
* Reviews have been preprocessed, and each review is encoded as a sequence of word indexes (integers). 
* Words are indexed by overall frequency in the dataset

[Source: https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification]

In [1]:
from keras.datasets import imdb

# Loading the IMBD dataset
# Selecting the 2000 most frequent words
(x_train_org, y_train), (x_test_org, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=2000,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=1,
                                                      oov_char=2,
                                                      index_from=2)

print(x_train_org.shape)
print(x_test_org.shape)

print(x_train_org[0][:10])
print(x_test_org[0][:10])

# Reducing the data size to be run on CPUs
x_train_org = x_train_org[:5000]
y_train = y_train[:5000]
x_test_org = x_test_org[:500]
y_test= y_test[:500]

Using TensorFlow backend.


(25000,)
(25000,)
[1, 13, 21, 15, 42, 529, 972, 1621, 1384, 64]
[1, 590, 201, 13, 30, 5, 716, 9, 9, 2]


In [2]:
x_train_org.shape

(5000,)

In [3]:
# Loading the vocabulary
import numpy as np

vocab = imdb.get_word_index(path="./imdb_word_index.json")
print("Number of unique words: %d" % len(vocab))

INDEX_FROM = 2 

# Dict {word:id}
word_to_id = {x:vocab[x]+INDEX_FROM for x in vocab if vocab[x]<=2000}
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2


# Dict {id:word}
id_to_word = {word_to_id[x]:x for x in word_to_id}

# Array of ordered words by their frequency + special characters
vocab_list = np.array(["<PAD>"]+[id_to_word[x] for x in range(1,2001)])


Number of unique words: 88584


In [4]:
vocab_list.size

2001

In [5]:
# Summarize number of classes
print("Classes: %s"% np.unique(y_train))
# y = {0:negative | 1:positive}

i = 500
print("Review: %s" % " ".join(vocab_list[np.array(x_train_org[i])]))
print("Class: %s" % "positive" if y_train[i] == 1 else "negative")

Classes: [0 1]
Review: <START> and that's why hard to rate br br from the adult point of view <UNK> student point of view i must say i fell nearly <UNK> here sure there is some laughing scene all the credit takes here eddie but that can't save the disney type of script and whole movie that's why br br 2 out of 10
negative


In [6]:
def get_W(word_vecs, k=300):
    """
    Get word matrix. W[i] is the vector for word indexed by i
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size + 1, k), dtype='float32')
    W[0] = np.zeros(k, dtype='float32')
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map


def load_bin_vec(fname, vocab):
    """
    Loads 300x1 word vecs from Google (Mikolov) word2vec
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        # ~ print(header)
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        # print(vocab_size)
        for line in range(vocab_size):
            # print(line)
            word = []
            while True:
                ch = f.read(1).decode('iso-8859-1')
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            # print(word)
            if word in vocab:
                # print(word)
                word_vecs[word] = np.frombuffer(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)

    return word_vecs

def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    For words that occur in at least min_df documents, create a separate word vector.    
    0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
            

In [7]:
import time
start = time.time()
w2v_file = "./GoogleNews-vectors-negative300.bin"
w2v = load_bin_vec(w2v_file, word_to_id)
print("num words found: %d" % len(w2v))
add_unknown_words(w2v, word_to_id, k=300)
W, word_idx_map = get_W(w2v, k=300)

print("W shape: %s" % str(W.shape))

print("%d seconds to get the embeddings" % (time.time()-start))

num words found: 1966
W shape: (2003, 300)
59 seconds to get the embeddings


In [8]:
# Max and avg number of word
lengths = [len(x) for x in x_train_org]
print("max %d" % max(lengths))
print("mean %d" % np.mean(lengths))
print(lengths)

max 1851
mean 243
[218, 189, 141, 550, 147, 43, 123, 562, 233, 130, 450, 99, 117, 238, 109, 129, 163, 752, 212, 177, 129, 140, 256, 888, 93, 142, 220, 193, 171, 221, 174, 647, 233, 162, 597, 234, 51, 336, 139, 231, 704, 142, 861, 132, 122, 570, 55, 214, 103, 186, 113, 169, 469, 138, 302, 766, 351, 146, 59, 206, 107, 152, 186, 431, 147, 684, 383, 324, 252, 263, 787, 211, 314, 118, 390, 132, 710, 306, 167, 115, 95, 158, 156, 82, 502, 314, 190, 174, 60, 145, 214, 659, 408, 515, 461, 202, 238, 170, 107, 171, 158, 145, 790, 258, 287, 67, 123, 975, 775, 236, 195, 274, 214, 91, 1038, 815, 183, 206, 50, 118, 147, 141, 60, 56, 439, 439, 213, 144, 533, 303, 203, 563, 129, 153, 55, 92, 174, 187, 183, 165, 78, 198, 156, 223, 127, 61, 362, 84, 57, 176, 159, 57, 159, 165, 213, 194, 149, 130, 203, 19, 98, 466, 525, 130, 322, 153, 408, 215, 472, 143, 136, 354, 260, 319, 125, 209, 282, 810, 142, 240, 148, 198, 193, 123, 128, 103, 479, 345, 263, 165, 205, 333, 184, 92, 177, 335, 120, 121, 259, 180, 160,

In [9]:
# Padding the input data
from keras.preprocessing import sequence
input_length = 350 # average length 

x_train = sequence.pad_sequences(x_train_org, maxlen=input_length, padding='post', truncating='post')
x_test = sequence.pad_sequences(x_test_org, maxlen=input_length, padding='post', truncating='post')
print(x_train)


[[   1   13   21 ...,    0    0    0]
 [   1  193 1152 ...,    0    0    0]
 [   1   13   46 ...,    0    0    0]
 ..., 
 [   1   13   15 ...,   20    2  599]
 [   1   12  331 ...,    0    0    0]
 [   1  206  114 ...,    0    0    0]]


In [23]:
abc=[1,2,3,4,5]
input_length= 10
def pad(ut):
    return ut[:10]+['' for i in range(input_length-len(ut))]

In [24]:
pad(abc)

[1, 2, 3, 4, 5, '', '', '', '', '']

### Two dense layers for Sentiment Analysis

In [11]:
# create the model - Two dense layer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

model = Sequential()
model.add(Embedding(W.shape[0], W.shape[1], input_length=input_length, weights=[W]))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
# plot_model(model, to_file='Sent_FF.png', show_shapes=False, show_layer_names=True, rankdir='TB')

# loss function = binary_crossentropy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 350, 300)          600900    
_________________________________________________________________
flatten_1 (Flatten)          (None, 105000)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1050010   
_________________________________________________________________
dense_2 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 1,651,031
Trainable params: 1,651,031
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Train on 5000 samples, validate on 500 samples
Epoch 1/5
 - 11s - loss: 0.6972 - acc: 0.5064 - val_loss: 0.6936 - val_acc: 0.4760
Epoch 2/5
 - 7s - loss: 0.6928 - acc: 0.5082 - val_loss: 0.6946 - val_acc: 0.4760
Epoch 3/5
 - 6s - loss: 0.6802 - acc: 0.5202 - val_loss: 0.6882 - val_acc: 0.5680
Epoch 4/5
 - 5s - loss: 0.5131 - acc: 0.7572 - val_loss: 0.6739 - val_acc: 0.6380
Epoch 5/5
 - 6s - loss: 0.1444 - acc: 0.9584 - val_loss: 0.6478 - val_acc: 0.7100
Accuracy: 71.00%


### Simple CNN for Sentiment Analysis

In [13]:
# create the model - CNN
from keras.layers import Conv2D, Reshape, MaxPooling2D

vocab_size = W.shape[0]
embedding_size = W.shape[1]

model = Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=input_length, weights=[W]))
model.add(Reshape((input_length, embedding_size, 1)))

# CNN hyperparameters
num_filters = 100
n_gram = 5
filter_size = (n_gram, embedding_size)
          
model.add(Conv2D(num_filters, filter_size, activation='relu'))
print(model.output_shape)
model.add(MaxPooling2D(pool_size=(model.output_shape[1], 1)))
          
model.add(Flatten())
model.add(Dense(250, activation='relu'))

model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# plot_model(model, to_file='Sent_simple_CNN.png', show_shapes=False, show_layer_names=True, rankdir='TB')

(None, 346, 1, 100)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 350, 300)          600900    
_________________________________________________________________
reshape_1 (Reshape)          (None, 350, 300, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 346, 1, 100)       150100    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 1, 100)         0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 250)               25250     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 2

In [14]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Train on 5000 samples, validate on 500 samples
Epoch 1/5


KeyboardInterrupt: 

###  Non-sequential CNN for Sentiment Analysis

In [None]:
from keras.models import Model
from keras.layers import Input, Concatenate

# Input layer
input_x = Input(shape=(input_length,), dtype=np.int32)

# Embedding layer
embeddings = Embedding(vocab_size, embedding_size, input_length=input_length,
                           weights=[W])(input_x)

# Reshape to 3D
conv_input = Reshape((input_length, embedding_size, 1))(embeddings)

# CNN Hyperparameters
# 3 filters shapes
filter_widths = [3, 4, 5]

num_filters = 100
filter_height = embedding_size

pooled_outputs = []

# Convolution layers
for width in filter_widths:
    conv = Conv2D(num_filters, (width, filter_height), activation='relu')(conv_input)
    pooling = MaxPooling2D(pool_size=(int(conv.shape[1]), 1))(conv)
    pooled_outputs.append(pooling)
    
# Concatenation of maxpooling outputs
h_pool = Concatenate(axis=-1)(pooled_outputs)
h_pool = Reshape((int(h_pool.shape[-1]), ))(h_pool)

# Sigmoid
dense = Dense(1, activation='sigmoid')(h_pool)

model = Model(input_x, dense)


# Model compilation, Adam optimizer
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()    
# plot_model(model, to_file='Sent_non_seq_CNN.png', show_shapes=False, show_layer_names=True, rankdir='TB')

In [None]:
# Fit the model
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=512, verbose=2)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test,  batch_size=512, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

###  LSTM-RNN for Sentiment Analysis

In [25]:
from keras.models import Model
from keras.layers import LSTM, Input

# Input layer
input_x = Input(shape=(input_length,), dtype=np.int32)

# Embedding layer
embeddings = Embedding(2002, 32, input_length=input_length, 
                       mask_zero=True)(input_x)

# Reshape to 3D
lstm = LSTM(units=100)(embeddings)


# Sigmoid
dense = Dense(1, activation='sigmoid')(lstm)

# Defining model
model = Model(input_x, dense)


# Model compilation, Adam optimizer
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()    
# plot_model(model, to_file='Sent_lstm.png', show_shapes=False, show_layer_names=True, rankdir='TB')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 10, 32)            64064     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total params: 117,365
Trainable params: 117,365
Non-trainable params: 0
_________________________________________________________________


TensorShape([Dimension(None), Dimension(10)])

In [None]:
# Fit the model
history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=512)
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, batch_size=512, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
x_train.shape

In [None]:
# Plotting some items in gray scale 
import matplotlib.pyplot as plt

# Plot training & validation accuracy values
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()


### Saving and loading the model



In [None]:
from keras.models import load_model

model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'
del model  # deletes the existing model

# returns a compiled model
# identical to the previous one
model = load_model('my_model.h5')

scores = model.evaluate(x_test, y_test, batch_size=512, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))