<a href="https://colab.research.google.com/github/dvrakeshreddy/NLP/blob/master/Sentiment_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Deep Sentiment Analysis Tutorial

### Setup Environment
- On Google Colab make sure you select Python 3/GPU runtime before running the code

#### Choose Python 3 + GPU/CPU

<img src="https://i.stack.imgur.com/khwGc.png" width="400"></img>
<img src="https://i.stack.imgur.com/5iL6w.png" width="400"></img>

In [1]:
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


### Download Data

In [2]:
![ ! -d data ] && mkdir data/
![ -f data/aclImdb_v1.tar.gz ] && echo "Skip Download"
![ ! -f data/aclImdb_v1.tar.gz ] && wget -N https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -P data/

--2019-09-26 13:28:26--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘data/aclImdb_v1.tar.gz’


2019-09-26 13:28:33 (11.8 MB/s) - ‘data/aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [3]:
%%time
![ -d data/aclImdb/ ] && echo "Data already extracted"
![ ! -d data/aclImdb/ ] && tar -xzf data/aclImdb_v1.tar.gz -C data/

CPU times: user 72.7 ms, sys: 18 ms, total: 90.7 ms
Wall time: 11.2 s


### Imports

In [0]:
import os
import re
import nltk
from collections import Counter
from tqdm import tqdm_notebook
import numpy as np
import tensorflow as tf
from tensorflow.contrib import seq2seq
from tensorflow.contrib.rnn import DropoutWrapper
import random

In [0]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
MAX_SEQ_LEN = 100
BATCH_SIZE = 50

In [0]:
class Lang:
    def __init__(self, counter, vocab_size):
        self.word2id = {}
        self.id2word = {}
        self.pad = "<PAD>"
        self.sos = "<SOS>"
        self.eos = "<EOS>"
        self.unk = "<UNK>"
        
        self.ipad = 0
        self.isos = 1
        self.ieos = 2
        self.iunk = 3
        
        self.word2id[self.pad] = 0
        self.word2id[self.sos] = 1
        self.word2id[self.eos] = 2
        self.word2id[self.unk] = 3
        
        self.id2word[0] = self.pad
        self.id2word[1] = self.sos
        self.id2word[2] = self.eos
        self.id2word[3] = self.unk
        
        curr_id = 4
        for w, c in counter.most_common(vocab_size-curr_id):
            self.word2id[w] = curr_id
            self.id2word[curr_id] = w
            curr_id += 1
    
    def encodeSentence(self, wseq, max_len=-1):
        # wseq = nltk.tokenize.word_tokenize(s.lower().strip())
        if max_len == -1:
            return [self.word2id[w] if w in self.word2id else self.iunk for w in wseq]
        else:
            return ([self.word2id[w] if w in self.word2id else self.iunk for w in wseq] + [self.ieos] + [self.ipad]*max_len)[:max_len]
        
    def encodeSentence2(self, wseq, max_len=-1):
        # wseq = nltk.tokenize.word_tokenize(s.lower().strip()) 
        return min(max_len, len(wseq)+1), \
            ([self.word2id[w] if w in self.word2id else self.iunk for w in wseq] + \
                [self.ieos] + [self.ipad]*max_len)[:max_len]
    
    def decodeSentence(self, id_seq):
        id_seq = np.array(id_seq + [self.ieos])
        j = np.argmax(id_seq==self.ieos)
        s = ' '.join([self.id2word[x] for x in id_seq[:j]])
        s = s.replace(self.unk, "UNK")
        return s

### Let's read in the data

In [0]:
data_folder = 'data/aclImdb/'

In [0]:
rp = os.path.join(data_folder, 'train/pos')
train_positive = [os.path.join(rp, f) for f in os.listdir(rp)]
rp = os.path.join(data_folder, 'train/neg')
train_negative = [os.path.join(rp, f) for f in os.listdir(rp)]

rp = os.path.join(data_folder, 'test/pos')
test_positive = [os.path.join(rp, f) for f in os.listdir(rp)]
rp = os.path.join(data_folder, 'test/neg')
test_negative = [os.path.join(rp, f) for f in os.listdir(rp)]

In [0]:
re_html_cleaner = re.compile(r"<.*?>")

In [13]:
en_counter = Counter()
train_data = []
for _fname in tqdm_notebook(train_positive[:], desc="Crunching +ve samples: "):
    with open(_fname) as f:
        text = f.read().strip()
        text = re_html_cleaner.sub(" ", text)
        wseq = nltk.tokenize.word_tokenize(text.lower())
        en_counter += Counter(wseq)
        train_data.append((wseq, 1))
        
for _fname in tqdm_notebook(train_negative[:], desc="Crunching -ve samples: "):
    with open(_fname) as f:
        text = f.read().strip()
        text = re_html_cleaner.sub(" ", text)
        wseq = nltk.tokenize.word_tokenize(text.lower())
        en_counter += Counter(wseq)
        train_data.append((wseq, 0))

HBox(children=(IntProgress(value=0, description='Crunching +ve samples: ', max=12500, style=ProgressStyle(desc…




HBox(children=(IntProgress(value=0, description='Crunching -ve samples: ', max=12500, style=ProgressStyle(desc…




In [14]:
test_data = []
for _fname in tqdm_notebook(test_positive, desc="Crunching +ve samples: "):
    with open(_fname) as f:
        text = f.read().strip()
        text = re_html_cleaner.sub(" ", text)
        wseq = nltk.tokenize.word_tokenize(text.lower())
        test_data.append((wseq, 1,_fname))
        
for _fname in tqdm_notebook(test_negative, desc="Crunching -ve samples: "):
    with open(_fname) as f:
        text = f.read().strip()
        text = re_html_cleaner.sub(" ", text)
        wseq = nltk.tokenize.word_tokenize(text.lower())
        test_data.append((wseq, 0,_fname))

HBox(children=(IntProgress(value=0, description='Crunching +ve samples: ', max=12500, style=ProgressStyle(desc…




HBox(children=(IntProgress(value=0, description='Crunching -ve samples: ', max=12500, style=ProgressStyle(desc…




In [15]:
print(len(train_data),len(test_data))

25000 25000


In [16]:
# A few sample english words
print("\nMost common en words in dataset:\n", en_counter.most_common(10))

print("\nTotal (en)words gathered from dataset:", len(en_counter))


Most common en words in dataset:
 [('the', 334679), (',', 275881), ('.', 271448), ('and', 163327), ('a', 162141), ('of', 145428), ('to', 135194), ('is', 110395), ('it', 95707), ('in', 93248)]

Total (en)words gathered from dataset: 106415


In [0]:
V = 10000

In [0]:
en_lang = Lang(en_counter, V)

In [19]:
wseq = nltk.tokenize.word_tokenize("Where are you going?".lower())
print("Test en encoding:", en_lang.encodeSentence(wseq))
print("Test en decoding:", en_lang.decodeSentence(en_lang.encodeSentence(wseq, 10)))

Test en encoding: [131, 33, 27, 182, 58]
Test en decoding: where are you going ?


### The RNN based Sentence Classifier architecture
- We will implement a RNN based classifier architecture for sentiment analysis in Tensorflow r1.13.1 / r1.14
- Debugging Tip: Always keep track of tensor dimensions!
- **Tensorflow Computation Graph** - We will build a tf computation graph first. This is the representation used by tf for any neural network architecture. Once the computation graph is built, you can feed data to it for training or inference

#### Word Embedding Matrix

In [20]:
en_word_emb_matrix = tf.get_variable("en_word_emb_matrix", (V, 300), dtype=tf.float32)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


#### Placeholders

In [0]:
keep_prob = tf.placeholder(tf.float32)

In [0]:
input_ids = tf.placeholder(tf.int32, (None, MAX_SEQ_LEN))
input_lens = tf.placeholder(tf.int32, (None, ))

In [0]:
y_placeholder = tf.placeholder(tf.int32, (None,))

#### Tensorflow Graphs

In [0]:
input_emb = tf.nn.embedding_lookup(en_word_emb_matrix, input_ids)

In [25]:
input_emb.shape

TensorShape([Dimension(None), Dimension(100), Dimension(300)])

#### Encoder

##### RNN Units

In [26]:
# Create a single GRU cell
encoder_cell = tf.nn.rnn_cell.GRUCell(128)
# Add dropout : Dropout is applied to the hidden state output at every time step
encoder_cell = DropoutWrapper(encoder_cell, output_keep_prob=keep_prob)

Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.


In [27]:
# Unrolling of time-sequence
# Apply the encoder cell on input sequence and unroll computation upto
# max sequence length
enc_outputs, enc_state = tf.nn.dynamic_rnn(
    encoder_cell, input_emb, sequence_length=input_lens, initial_state=encoder_cell.zero_state(BATCH_SIZE, dtype=tf.float32)
)

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [28]:
enc_outputs.shape

TensorShape([Dimension(50), Dimension(100), Dimension(128)])

In [29]:
enc_state.shape

TensorShape([Dimension(50), Dimension(128)])

### Classifier Layer

In [30]:
# A simple fully connected linear layer
# W^T*X + b

layer1 = tf.layers.dense(inputs=enc_state, units=64, activation=tf.nn.relu)
layer2 = tf.layers.dense(inputs=layer1, units=16, activation=tf.nn.relu)
dense_layer = tf.layers.dense(inputs=layer2, units=1)

Instructions for updating:
Use keras.layers.dense instead.


#### Approaches:
As input to the final linear layers use mean of the hidden states?

or

As input to the final linear layers use the last hidden state?

##### Approch 1: Take mean of enc_outputs across dimension 1
- **IMPORTANT:** Need to **mask** the positions in input sentence that doesn't contain any inputs

In [0]:
# masks = tf.sequence_mask(input_lens, MAX_SEQ_LEN, dtype=tf.float32, name='masks')
# class_prob = tf.nn.sigmoid(
#                 dense_layer(
#                     tf.reduce_mean(
#                         enc_outputs*masks[:, :, None], 1)
#                 )
# ) 

# print(class_prob.shape)

##### Approch 2: Use enc_state (final hidden state)

In [32]:
class_prob = tf.nn.sigmoid(dense_layer)
print(class_prob.shape)

(50, 1)


#### Loss and Optimizers [softmax_cross_entropy]
Note that `onehot_labels` and `logits` must have the same shape, e.g. `[batch_size, num_classes]`

In [33]:
print(y_placeholder.shape)
print(class_prob.shape)

(?,)
(50, 1)


In [0]:
# Loss function - softmax cross entropy
y_ = tf.cast(y_placeholder[:, None], dtype=tf.float32)
cost = -y_*tf.log(class_prob + 1e-12) - (1-y_)*tf.log(1-class_prob + 1e-12)
cost = tf.reduce_mean(cost)

# Optimizer
optimizer = tf.train.AdamOptimizer(0.001)

In [0]:
train_op = optimizer.minimize(cost)

In [0]:
init = tf.global_variables_initializer()

#### Tensorflow Sessions

In [0]:
sess_config = tf.ConfigProto()
sess_config.gpu_options.allow_growth = True

In [0]:
sess = tf.InteractiveSession(config=sess_config)
sess.run(init)

#### Minibatch Training

In [0]:
random.seed(41)

In [0]:
random.shuffle(train_data)

In [41]:
train_n = len(train_data)
train_n

25000

In [42]:
test_n = len(test_data)
test_n

25000

In [44]:
for _e in range(2):
    # Mix things up a bit.
    random.shuffle(train_data)
    pbar = tqdm_notebook(range(0, train_n, BATCH_SIZE))
    batch_loss = 0
    bxi = 0
    for m in pbar:
        n = m + BATCH_SIZE
        if n <= train_n:
            # print("Epoch Complete... \n")

            input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
            input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
            true_class_batch = np.zeros((BATCH_SIZE))
            for i in range(m, n):
                b,a = en_lang.encodeSentence2(train_data[i][0], MAX_SEQ_LEN)
                input_batch[i-m,:] = a
                input_lens_batch[i-m] = b
                true_class_batch[i-m] = train_data[i][1]

            feed_dict={
                input_ids: input_batch,
                input_lens: input_lens_batch,
                y_placeholder: true_class_batch,
                keep_prob: 0.6
            }
            sess.run(train_op, feed_dict=feed_dict)
            batch_loss += sess.run(cost, feed_dict=feed_dict)
            pbar.set_description(f"Epoch: {_e} >> Loss: {batch_loss/(bxi+1):2.2F}:")
            bxi += 1
            if (1 + n//BATCH_SIZE) % 100 == 0:
                small_test()

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

Precision: 75.02, Recall: 80.98, F1-Score: 77.89
Precision: 80.71, Recall: 76.70, F1-Score: 78.66
Precision: 84.36, Recall: 68.81, F1-Score: 75.80
Precision: 79.34, Recall: 82.32, F1-Score: 80.80
Precision: 77.04, Recall: 87.30, F1-Score: 81.85



HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

Precision: 79.46, Recall: 84.27, F1-Score: 81.80
Precision: 84.78, Recall: 73.07, F1-Score: 78.49
Precision: 82.32, Recall: 79.57, F1-Score: 80.92
Precision: 80.07, Recall: 83.76, F1-Score: 81.87
Precision: 76.02, Recall: 88.31, F1-Score: 81.70



In [45]:
print(BATCH_SIZE,test_n)

50 25000


In [0]:

def small_test_modified():
    all_true = []
    all_preds = []
    name = []
    for m in range(0, test_n, BATCH_SIZE):
        n = m + BATCH_SIZE
        if n > test_n:
            break

        input_batch = np.zeros((BATCH_SIZE, MAX_SEQ_LEN), dtype=np.int32)
        input_lens_batch = np.zeros((BATCH_SIZE,), dtype=np.int32)
        true_class_batch = np.zeros((BATCH_SIZE))
        for i in range(m, n):
            b,a = en_lang.encodeSentence2(test_data[i][0], MAX_SEQ_LEN)
            input_batch[i-m,:] = a
            input_lens_batch[i-m] = b
            true_class_batch[i-m] = test_data[i][1]
            name.append(test_data[i][2])

        feed_dict={
            input_ids: input_batch,
            input_lens: input_lens_batch,
            keep_prob: 1.0
        }
        pred_batch = sess.run(class_prob, feed_dict=feed_dict)
        # acc = accuracy_score(true_class_batch, pred_batch > 0.5)
        all_true.extend(list(true_class_batch))
        all_preds.extend(list(pred_batch[:,0]))
    
    all_true = np.array(all_true)
    all_preds = np.array(all_preds)
    prec = precision_score(all_true, all_preds > 0.5)*100
    rec = recall_score(all_true, all_preds > 0.5)*100
    f1 = f1_score(all_true, all_preds > 0.5)*100
    print(f"Precision: {prec:2.2F}, Recall: {rec:2.2F}, F1-Score: {f1:2.2F}")
    return all_preds,all_true,name

In [47]:
pred,tru,name = small_test_modified()

Precision: 75.70, Recall: 88.77, F1-Score: 81.72


In [48]:
lis =[]
for i in  range(0,len(name)):
    lis.append(str(name[i]+','+ str(pred[i])))
len(lis)

25000

In [0]:
file = open("output.csv","w")
file.write('filename,prob_positive')
for i in range(0,len(name)):
    file.write('\n')
    file.write(lis[i])
file.close()

In [50]:
def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

file_len('output.csv')

25001

References: https://github.com/bsantraigi/Tensorflow-RNN-Tutorials/blob/master/Sentiment%20Analysis%20v2.ipynb

### Improvemetns over Vanila RNN

Ahieved improvement in f1-score:
from ~77% to 81.72%

Made modifications to:
 - lstm size 
 - dropout
 - Added more hidden layers to the classifier