# Deep Learning Methods for Text Sentiment Analysis

#### Description:

This codebook covers how to use deep learning methods for text sentiment analysis.

#### Skill level:

- Advanced

### Import the required libraries
-------------------------

In [1]:
import os
import sys

platform_path = os.path.abspath(os.path.join(os.path.abspath(''), '../../../'))
sys.path.append(platform_path)

In [2]:
#!pip install tensorflow

In [3]:
#!pip install gensim

In [4]:
import numpy as np
import pandas as pd
import HELPERS.data_preprocessing.text_normalizer as tn
import HELPERS.machine_learning.model_evaluation as me
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import gensim
import keras
from keras.models import Sequential
from keras.layers import Dropout, Activation, Dense

### Read data into a Pandas dataframe
-------------------------

In [5]:
df_raw = pd.read_csv(os.path.join(platform_path, 'DATA/movie_reviews.csv'))

### Check the shape and head of the dataframe
-------------------------

In [6]:
df_raw.shape

(50000, 2)

In [7]:
df_raw.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
#optional: use a subset to improve performance
df_raw = df_raw[:10000]

### Separate features from the label
-------------------------

In [9]:
X_all = np.array(df_raw['review'])
y_true_all = np.array(df_raw['sentiment'])

### Make a split between training and test sets of data
-------------------------

In [10]:
def shuffle_split_data(y_true_all, X_all, test_size):
    X_train, X_test, y_true_train, y_true_test = train_test_split(X_all, y_true_all, test_size=test_size)

    return X_train, y_true_train, X_test, y_true_test

In [11]:
X_train, y_true_train, X_test, y_true_test = shuffle_split_data(y_true_all, X_all, test_size=0.3)

### One-hot-encode the label data
-------------------------

In [12]:
y_true_train[:5]

array(['positive', 'positive', 'negative', 'negative', 'negative'],
      dtype=object)

In [13]:
le = LabelEncoder()

y_true_train_enc = le.fit_transform(y_true_train)

num_classes = 2
y_true_train_enc = keras.utils.to_categorical(y_true_train_enc, num_classes)

In [14]:
y_true_train_enc

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

### Normalize the feature data
-------------------------

In [15]:
X_train_norm = tn.normalize_corpus(X_train)
X_test_norm = tn.normalize_corpus(X_test)

### Tokenize the feature data
-------------------------

In [16]:
X_train_norm_token = [tn.tokenizer.tokenize(text) for text in X_train_norm]
X_test_norm_token = [tn.tokenizer.tokenize(text) for text in X_test_norm]

### Perform feature engineering with word embeddings
-------------------------

In [17]:
num_features = 500

w2v = gensim.models.Word2Vec(X_train_norm_token, size=num_features, window=150,
                             min_count=10, sample=1e-3)   

In [18]:
def averaged_word2vec_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    
    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype='float64')
        nwords = 0.
        
        for word in words:
            if word in vocabulary: 
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    
    return np.array(features)

In [19]:
X_train_norm_awv = averaged_word2vec_vectorizer(corpus=X_train_norm_token, model=w2v, num_features=500)
X_test_norm_awv = averaged_word2vec_vectorizer(corpus=X_test_norm_token, model=w2v, num_features=500)

  feature_vector = np.add(feature_vector, model[word])


In [20]:
X_train_norm_awv.shape

(7000, 500)

In [21]:
X_test_norm_awv.shape

(3000, 500)

### Fit a deep learning neural network model using the averaged word vector data
-------------------------

In [22]:
def create_deep_nn_architecture(num_input_features):
    dnn = Sequential()
    
    dnn.add(Dense(512, activation='relu', input_shape=(num_input_features,)))
    dnn.add(Dropout(0.2))
    dnn.add(Dense(512, activation='relu'))
    dnn.add(Dropout(0.2))
    dnn.add(Dense(512, activation='relu'))
    dnn.add(Dropout(0.2))
    dnn.add(Dense(2))
    dnn.add(Activation('softmax'))

    dnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return dnn

In [23]:
dnn = create_deep_nn_architecture(num_input_features=500)

### Inspect the network structure
-------------------------

In [24]:
dnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               256512    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 1

In [25]:
batch_size = 100

dnn.fit(X_train_norm_awv, y_true_train_enc, epochs=5, batch_size=batch_size, 
        shuffle=True, validation_split=0.1, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x19e0f1f0f40>

### Generate predictions using the fitted model
-------------------------

In [26]:
y_pred_train_enc = np.argmax(dnn.predict(X_train_norm_awv), axis=-1)
y_pred_test_enc = np.argmax(dnn.predict(X_test_norm_awv), axis=-1)

In [27]:
y_pred_train = le.inverse_transform(y_pred_train_enc)
y_pred_test = le.inverse_transform(y_pred_test_enc)

### Check common error metrics for training and test sets of data
-------------------------

In [28]:
me.get_classification_metrics(y_true_train, y_pred_train)

accuracy_score: 0.8076
precision_score: 0.8076
recall_score: 0.8076
f1_score: 0.8076
