# Sentiment Classification


#### Import the packages

In [0]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score,classification_report
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv1D,MaxPooling1D,MaxPool1D
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional,Flatten,BatchNormalization

In [0]:
# vocab_size is no.of words to consider from the dataset, ordering based on frequency.
vocab_size = 10000 #vocab size
maxlen = 300  #number of word used from each review

#### Load the test and train data

In [0]:
#load dataset as a list of ints
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

In [4]:
print("train_data ", x_train.shape)
print("train_labels ", y_train.shape)
print("_"*100)
print("test_data ", x_test.shape)
print("test_labels ", y_test.shape)
print("_"*100)
print("Maximum value of a word index ")
print(max([max(sequence) for sequence in x_train]))
print("Maximum length num words of review in train ")
print(max([len(sequence) for sequence in x_train]))

train_data  (25000,)
train_labels  (25000,)
____________________________________________________________________________________________________
test_data  (25000,)
test_labels  (25000,)
____________________________________________________________________________________________________
Maximum value of a word index 
9999
Maximum length num words of review in train 
2494


#### Get the word index and then Create a key-value pair for word and word_id

In [5]:
# See an actual review in words
# Reverse from integers to words using the DICTIONARY

word_index = imdb.get_word_index()

reverse_word_index = dict(
[(value, key) for (key, value) in word_index.items()])

decoded_review = ' '.join(
[reverse_word_index.get(i - 3, '?') for i in x_train[345]])

print(decoded_review)

? as romantic comedies go this was a cute and winning one i thought that the writing could have been stronger to build up the final connection a bit better but that is not a huge ? point but amanda ? and scott wolf give nice performances and are as charming as ever these are two of my favorite actors and i was just glad to see them cast as romantic leads i hope to see them cast in more projects soon br br overall this movie won't change your life but is is sweet warm and winning not a bad thing to be at all


In [0]:
#make all sequences of the same length
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test =  pad_sequences(x_test, maxlen=maxlen)

In [7]:
print("train_data ", x_train.shape)
print("test_data ", x_test.shape)

train_data  (25000, 300)
test_data  (25000, 300)


## Build Keras Embedding Layer Model
We can think of the Embedding layer as a dicionary that maps a index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* Also we could load pre-train word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The keras embedding layer doesn't require us to onehot encode our words, instead we have to give each word a unqiue intger number as an id. For the imdb dataset we've loaded this has already been done, but if this wasn't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

#### Build a Sequential Model using Keras for the Sentiment Classification task

In [8]:
print('Build model...')
model = Sequential()
model.add(Embedding(input_dim = vocab_size, output_dim = 32, input_length = maxlen))
model.add(Dropout(0.2))
model.add(Conv1D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu'))
model.add(MaxPool1D(pool_size = 2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation = 'sigmoid'))             
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()

Build model...
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 32)           320000    
_________________________________________________________________
dropout (Dropout)            (None, 300, 32)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 300, 32)           3104      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 150, 32)           0         
_________________________________________________________________
lstm (LSTM) 

In [9]:
print('Train...')
history = model.fit(x_train, y_train,batch_size=64,epochs=3,validation_data=(x_test, y_test))

Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


#### the Accuracy of the model

In [0]:
pred = model.predict(x_test)

In [11]:
# Changing the shape of pred to 1-Dimensional
ytest_prediction = np.array(pred).reshape((25000, ))
for i in range(len(ytest_prediction)):
    ytest_prediction[i] = round(ytest_prediction[i])
ytest_prediction = ytest_prediction.astype(int)
print("accuracy:",accuracy_score(y_test,ytest_prediction))
print("classification_report:\n",classification_report(y_test,ytest_prediction))

accuracy: 0.8736
classification_report:
               precision    recall  f1-score   support

           0       0.85      0.91      0.88     12500
           1       0.91      0.83      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.88      0.87      0.87     25000
weighted avg       0.88      0.87      0.87     25000



In [12]:
from sklearn.metrics import  roc_auc_score
print("The roc AUC socre is : %.4f." %roc_auc_score(y_test, pred)) 

The roc AUC socre is : 0.9481.


## Retrive the output of each layer in keras for a given single test sample from the trained model


In [13]:
for layer in model.layers:

    intermediate_layer_model = Model(inputs=model.input,
                                    outputs=model.get_layer(layer.name).output)
    intermediate_output = intermediate_layer_model.predict(x_test[np.random.randint(25000)].reshape(1,-1))
    print("\n",10*"-",layer.name," layer",10*"-","\n")
    print(intermediate_output)


 ---------- embedding  layer ---------- 

[[[-0.01104451 -0.07824421 -0.02044006 ... -0.01431841 -0.0074472
    0.03292025]
  [-0.01104451 -0.07824421 -0.02044006 ... -0.01431841 -0.0074472
    0.03292025]
  [-0.01104451 -0.07824421 -0.02044006 ... -0.01431841 -0.0074472
    0.03292025]
  ...
  [-0.01511567 -0.09015582 -0.05512023 ... -0.03145565  0.03647319
   -0.00206182]
  [-0.00131926  0.03686389  0.00565225 ... -0.01578274  0.00599393
    0.00061738]
  [ 0.03404765  0.0072722  -0.01393873 ... -0.05090065  0.04384007
    0.04650633]]]

 ---------- dropout  layer ---------- 

[[[-0.01104451 -0.07824421 -0.02044006 ... -0.01431841 -0.0074472
    0.03292025]
  [-0.01104451 -0.07824421 -0.02044006 ... -0.01431841 -0.0074472
    0.03292025]
  [-0.01104451 -0.07824421 -0.02044006 ... -0.01431841 -0.0074472
    0.03292025]
  ...
  [ 0.03040791 -0.04417368  0.02203849 ... -0.01377411 -0.00967952
    0.02545103]
  [-0.01014464 -0.00710064 -0.02316453 ...  0.03871964  0.03896904
   -0.03615