<a href="https://colab.research.google.com/github/cs145442/nlp-projects-with-tf2/blob/master/sentiment_classification_with_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0. Handling the dependencies

In [110]:
# install all the dependencies here
! pip install bert-for-tf2



In [111]:
# import all the dependencies here
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

from tensorflow.keras import layers

import tensorflow_hub as hub
import pandas as pd
import bert

import math
import random
import spacy

In [245]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder

import os
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Bidirectional
from keras.preprocessing import sequence
from keras.layers import Dropout


## 1. Gathering the data

### 1.1 Getting the dataset

In [112]:
# add and unzip the dataset here
! ls
! wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
! unzip stanfordSentimentTreebank.zip

__MACOSX  sample_data  stanfordSentimentTreebank  stanfordSentimentTreebank.zip
--2020-11-23 09:07:38--  http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip [following]
--2020-11-23 09:07:38--  https://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6372817 (6.1M) [application/zip]
Saving to: ‘stanfordSentimentTreebank.zip.1’


2020-11-23 09:07:39 (7.91 MB/s) - ‘stanfordSentimentTreebank.zip.1’ saved [6372817/6372817]

Archive:  stanfordSentimentTreebank.zip
replace stanfordSentimentTreebank/datasetSentences.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: st

In [113]:
! cat stanfordSentimentTreebank/README.txt

Stanford Sentiment Treebank V1.0

This is the dataset of the paper:

Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank
Richard Socher, Alex Perelygin, Jean Wu, Jason Chuang, Christopher Manning, Andrew Ng and Christopher Potts
Conference on Empirical Methods in Natural Language Processing (EMNLP 2013)

If you use this dataset in your research, please cite the above paper.

@incollection{SocherEtAl2013:RNTN,
title = {{Parsing With Compositional Vector Grammars}},
author = {Richard Socher and Alex Perelygin and Jean Wu and Jason Chuang and Christopher Manning and Andrew Ng and Christopher Potts},
booktitle = {{EMNLP}},
year = {2013}
}

This file includes:
1. original_rt_snippets.txt contains 10,605 processed snippets from the original pool of Rotten Tomatoes HTML files. Please note that some snippet may contain multiple sentences.

2. dictionary.txt contains all phrases and their IDs, separated by a vertical line |

3. sentiment_labels.txt contains all phrase 

### 1.2 Exploring the dataset

In [114]:
# take a peek at the dataset format
! echo "----- contents of the treebank -------------------"
! ls stanfordSentimentTreebank
! echo "----- first 5 lines of dictionary.txt ------------"
! tail -n 5 stanfordSentimentTreebank/dictionary.txt
! echo "----- first 5 lines of sentiment_labels.txt ------"
! tail -n 5 stanfordSentimentTreebank/sentiment_labels.txt

----- contents of the treebank -------------------
datasetSentences.txt  dictionary.txt		README.txt	      SOStr.txt
datasetSplit.txt      original_rt_snippets.txt	sentiment_labels.txt  STree.txt
----- first 5 lines of dictionary.txt ------------
zoning ordinances to protect your community from the dullest science fiction|220441
zzzzzzzzz|179256
élan|220442
É|220443
É um passatempo descompromissado|220444
----- first 5 lines of sentiment_labels.txt ------
239227|0.36111
239228|0.38889
239229|0.33333
239230|0.88889
239231|0.5


In [115]:
# reading the dataset
dataset_df = pd.read_csv('stanfordSentimentTreebank/dictionary.txt', sep='\n')
dataset_df.head()

Unnamed: 0,!|0
0,! '|22935
1,! ''|18235
2,! Alas|179257
3,! Brilliant|22936
4,! Brilliant !|40532


In [116]:
# formatting the dataframe for processing
dataset_df['phrase_text'] = dataset_df['!|0'].apply(lambda x: x.split('|')[0])
dataset_df['phrase_ids'] = dataset_df['!|0'].apply(lambda x: x.split('|')[1])
dataset_df = dataset_df.drop('!|0', axis=1)

In [117]:
# take a peek at the dataframe
dataset_df.tail()

Unnamed: 0,phrase_text,phrase_ids
239226,zoning ordinances to protect your community fr...,220441
239227,zzzzzzzzz,179256
239228,élan,220442
239229,É,220443
239230,É um passatempo descompromissado,220444


In [118]:
# reading the sentiment data
sentiment_df = pd.read_csv('stanfordSentimentTreebank/sentiment_labels.txt', sep='\n')
sentiment_df.head()

Unnamed: 0,phrase ids|sentiment values
0,0|0.5
1,1|0.5
2,2|0.44444
3,3|0.5
4,4|0.42708


### 1.3 Formatting the dataset

In [119]:
# formatting the sentiment dataframe for processing
sentiment_df['phrase_ids'] = sentiment_df['phrase ids|sentiment values'].apply(lambda x: x.split('|')[0])
sentiment_df['sentiment_values'] = sentiment_df['phrase ids|sentiment values'].apply(lambda x: x.split('|')[1])
sentiment_df = sentiment_df.drop('phrase ids|sentiment values', axis=1)

In [120]:
sentiment_df.head()

Unnamed: 0,phrase_ids,sentiment_values
0,0,0.5
1,1,0.5
2,2,0.44444
3,3,0.5
4,4,0.42708


In [121]:
# let's merge the phrases and sentiments
dataset_sentiment_df = pd.merge(left=dataset_df, right=sentiment_df, how='inner', on='phrase_ids')
# let's also validate the number of datapoints
print(f"dataset df shape: {dataset_df.shape}")
print(f"sentiment df shape: {sentiment_df.shape}")
print(f"dataset_sentiment df shape: {dataset_sentiment_df.shape}")

dataset df shape: (239231, 2)
sentiment df shape: (239232, 2)
dataset_sentiment df shape: (239231, 3)


*seems good. we missed one datapoint while merging, that's okay for now.*

In [122]:
def recover_sentiment_class(sentiment_value: float):
  """
  recovering classes from sentiment_values
  [very negative, negative, neutral, positive, very positive]
  [0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.6, 0.8], (0.8, 1.0]
  [1, 2, 3, 4, 5]
  respectively
  :params:
    sentiment_value: floating value of sentiment
  """
  if sentiment_value <= 0.2:
    return 1
  elif sentiment_value <= 0.4:
    return 2
  elif sentiment_value <= 0.6:
    return 3
  elif sentiment_value <= 0.8:
    return 4
  else:
    return 5

In [123]:
dataset_sentiment_df['sentiment_class'] = dataset_sentiment_df['sentiment_values'].apply(
    lambda x: recover_sentiment_class(float(x)))

In [124]:
dataset_sentiment_df.tail()

Unnamed: 0,phrase_text,phrase_ids,sentiment_values,sentiment_class
239226,zoning ordinances to protect your community fr...,220441,0.13889,1
239227,zzzzzzzzz,179256,0.19444,1
239228,élan,220442,0.51389,3
239229,É,220443,0.5,3
239230,É um passatempo descompromissado,220444,0.5,3


## 2. Generate Input Features

### 2.1 Generating the word vectors

### 2.1.a With Keras Tokenizer

In [185]:
keras_tokenizer = Tokenizer(num_words=1500, split=' ')
keras_tokenizer.fit_on_texts(dataset_sentiment_df['phrase_text'].values)
sequenced_dataset = keras_tokenizer.texts_to_sequences(dataset_sentiment_df['phrase_text'])
sequenced_dataset = pad_sequences(sequenced_dataset)

### 2.1.b With Bert Tokenizer

In [125]:
# let's setup the tokenizer
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
# define the vocab and tokenizer from the bert_layer here
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [127]:
# simple function to encode the sentence
def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [223]:
encoded_phrases = dataset_sentiment_df['phrase_text'].apply(lambda x: encode_sentence(x))

In [225]:
encoded_phrases = pad_sequences(encoded_phrases)

*we're using bert layer for tokenization only!*

In [128]:
tokenizer.tokenize("don't be so judgemental")

['don', "'", 't', 'be', 'so', 'judgement', '##al']

### 2.2 Generating the POS Tags
*Indicates the part-of-speech tag of the word*

In [130]:
# load the spacy model
spacy_nlp_model = spacy.load("en_core_web_sm")

In [131]:
# let's create the spacy token objects of all the sequences
list_of_phrases_spacy_docs = [spacy_nlp_model(phrase) for phrase in list_of_phrases]

KeyboardInterrupt: ignored

In [None]:
def get_pos_tags(doc: spacy.tokens.doc.Doc):
  """
  returns a list of token's POS Tags
  """
  # generator of pos tags for the sequence or word
  tags = []
  for token in doc:
    tags.append(token.pos_)
  return tags


dict_of_pos_tags = {}
def get_pos_tag2id(pos_tag: str):
  """
  returns the id for the POS Tag from the universal dict of POS Tags
  """
  id = dict_of_pos_tags.get(pos_tag, None)
  if id is None:
    # i.e, POS Tag is not in the dict
    # Add the new POS Tag to the dict
    new_id = len(dict_of_pos_tags) + 1
    dict_of_pos_tags[pos_tag] = new_id
    return new_id
  else:
    return id


def get_pos_tag_ids(doc: spacy.tokens.doc.Doc):
  """
  returns a list of token's POS Tags ID's
  """
  tag_ids = []
  for token in doc:
    tag_ids.append(get_pos_tag2id(token.pos_))
  return tag_ids

In [None]:
list_of_phrases_pos_tags = [get_pos_tags(phrase) for phrase in list_of_phrases_spacy_docs]
list_of_phrases_pos_tag_ids = [get_pos_tag_ids(phrase) for phrase in list_of_phrases_spacy_docs]

### 2.3 Generate the word shape
*Indicates whether a word start with a captial letter?*

In [132]:
def get_word_shape(doc: spacy.tokens.doc.Doc):
  """
  returns a list of token's shape
  """
  shapes = []
  for token in doc:
    shapes.append(token.shape_)
  return shapes

def get_init_word_case_match(doc: spacy.tokens.doc.Doc):
  """
  returns a list of token's initial case match indicator
  indicators: 1 if the token begins with an UPPERCASE letter
  indicators: 2 if the token begins with an lowercase letter
  """
  init_word_case_match = []
  for token in doc:
    init_word_case_match.append(1 if token.text.istitle() else 2)
  return init_word_case_match

In [133]:
# list_of_phrases_word_shape
list_of_phrases_word_shape = [get_word_shape(phrase) for phrase in list_of_phrases_spacy_docs]
list_of_phrases_init_word_case_match = [get_init_word_case_match(phrase) for phrase in list_of_phrases_spacy_docs]

### 2.4 Generate the lemmatized word sequence
*Indicates whether a word end with an “ing” or with “ly” or neither?*

In [None]:
def get_word_lemma(doc: spacy.tokens.doc.Doc):
  """
  returns a list of token's lemma
  """
  lemmas = []
  for token in doc:
    lemmas.append(token.lemma_)
  return lemmas

def get_word_lemma_match(doc: spacy.tokens.doc.Doc):
  """
  returns a list of token's lemma match indicator
  indicators: 1 if there's a match of token with it's lemma
  indicators: 2 if there's not a match of token with it's lemma
  """
  lemma_match = []
  for token in doc:
    if token.text == token.lemma_:
      lemma_match.append(1)
    else:
      lemma_match.append(2)
  return lemma_match

In [None]:
list_of_phrases_lemma = [get_word_lemma(phrase) for phrase in list_of_phrases_spacy_docs]
list_of_phrases_lemma_match = [get_word_lemma_match(phrase) for phrase in list_of_phrases_spacy_docs]

In [53]:
y = dataset_sentiment_df['sentiment_class']

list_of_pharses_with_length = [[phrase, y[i], len(phrase)]
                 for i, phrase in enumerate(encoded_phrases)]

random.shuffle(list_of_pharses_with_length)
list_of_pharses_with_length.sort(key=lambda x: x[2])

sorted_phrases_sentiments = [(phrase_with_length[0], phrase_with_length[1]) for phrase_with_length in list_of_pharses_with_length]

## 3. Sentiment Analysis 
#### without multiple input features

### 3.1 Sentiment Analysis with Keras Tokenizer using LSTM

### 3.1.a 

loss: sparse_categorical_crossentropy

optimizer: adam

metrics: accuracy

In [217]:
vocab_size = 1500
embed_dim = 128
lstm_out = 196
input_length = 45

simple_model = Sequential()
simple_model.add(Embedding(vocab_size, embed_dim,input_length=input_length))
simple_model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
simple_model.add(Dense(5,activation='softmax'))
simple_model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])





In [218]:
X_train, X_test, y_train, y_test = train_test_split(
    sequenced_dataset,
    dataset_sentiment_df['sentiment_class'],
    test_size = 0.40,
    random_state = 42
    )

simple_model.fit(
    X_train,
    y_train,
    validation_data = (X_test,y_test),
    epochs = 1,
    batch_size=64
    )



<tensorflow.python.keras.callbacks.History at 0x7f55f2132e10>

In [219]:
simple_model.evaluate(X_test,y_test)



[nan, 0.0]

### 3.1.b

loss: sparse_categorical_crossentropy

optimizer: adam

metrics: sparse_categorical_accuracy

In [241]:
vocab_size = 1500
embed_dim = 128
lstm_out = 196
input_length = 45

simple_model = Sequential()
simple_model.add(Embedding(vocab_size, embed_dim,input_length=input_length))
simple_model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0))
simple_model.add(Dense(5,activation='softmax'))
simple_model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['sparse_categorical_accuracy'])

In [242]:
X_train, X_test, y_train, y_test = train_test_split(
    sequenced_dataset,
    dataset_sentiment_df['sentiment_class'],
    test_size = 0.40,
    random_state = 42
    )

simple_model.fit(
    X_train,
    y_train,
    validation_data = (X_test,y_test),
    epochs = 1,
    batch_size=32
    )



<tensorflow.python.keras.callbacks.History at 0x7f55f0ef37b8>

In [243]:
simple_model.evaluate(X_test,y_test)



[nan, 0.0]

### 3.2 Sentiment Analysis with Bert Tokenizer using LSTM

#### 3.2.a 

loss: sparse_categorical_crossentropy

optimizer: adam

metrics: accuracy

In [232]:
# Defining all the hyperparameters
vocab_size = len(tokenizer.vocab)
embed_dim = 128
lstm_out = 196
input_length = 64

simple_model_with_bert = Sequential()
simple_model_with_bert.add(Embedding(vocab_size, embed_dim,input_length=input_length))
simple_model_with_bert.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0))
simple_model_with_bert.add(Dense(5,activation='softmax'))
simple_model_with_bert.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [233]:
X_train, X_test, y_train, y_test = train_test_split(
    encoded_phrases,
    dataset_sentiment_df['sentiment_class'],
    test_size = 0.40,
    random_state = 42
    )

simple_model_with_bert.fit(
    X_train,
    y_train,
    validation_data = (X_test,y_test),
    epochs = 1,
    batch_size=32
    )



<tensorflow.python.keras.callbacks.History at 0x7f568c88e748>

In [None]:
simple_model_with_bert.evaluate(X_test,y_test)

### 3.2.b 

loss: sparse_categorical_crossentropy

optimizer: adam

metrics: sparse_categorical_accuracy

In [234]:
# Defining all the hyperparameters
vocab_size = len(tokenizer.vocab)
embed_dim = 128
lstm_out = 196
input_length = 64

simple_model_with_bert = Sequential()
simple_model_with_bert.add(Embedding(vocab_size, embed_dim,input_length=input_length))
simple_model_with_bert.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0))
simple_model_with_bert.add(Dense(5,activation='softmax'))
simple_model_with_bert.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['sparse_categorical_accuracy'])

In [235]:
X_train, X_test, y_train, y_test = train_test_split(
    encoded_phrases,
    dataset_sentiment_df['sentiment_class'],
    test_size = 0.40,
    random_state = 42
    )

simple_model_with_bert.fit(
    X_train,
    y_train,
    validation_data = (X_test,y_test),
    epochs = 1,
    batch_size=32
    )



<tensorflow.python.keras.callbacks.History at 0x7f55f17b6fd0>

In [236]:
simple_model_with_bert.evaluate(X_test, y_test)



[nan, 0.0]

### Experimental

Using Tensorflow dataset from_generator

In [156]:
encoded_phrases_with_len = [[encoded_phrase, y[i], len(encoded_phrase)]
                 for i, encoded_phrase in enumerate(encoded_phrases)]
random.shuffle(encoded_phrases_with_len)
encoded_phrases_with_len.sort(key=lambda x: x[2])
sorted_encoded_phrases_with_sentiment = [(encoded_phrase_with_len[0], encoded_phrase_with_len[1]) for encoded_phrase_with_len in encoded_phrases_with_len]

processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_encoded_phrases_with_sentiment, output_types=(tf.int32, tf.int32))

BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

TOTAL_BATCHES = math.ceil(len(sorted_encoded_phrases_with_sentiment) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 10
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

In [178]:
class PlainSentimentLSTM(tf.keras.Model):
  def __init__(self,
               vocabulary_size,
               lstm_out=200,
               embedding_dimensions=128,
               dropout_rate=0.2,
               recurrent_dropout=0,
               sentiment_classes=5,
               training=False,
               name="PlainSentimentLSTM"
               ):
        super(PlainSentimentLSTM, self).__init__(name=name)
        
        self.embedding = layers.Embedding(
            vocabulary_size,
            embedding_dimensions,
            input_length=512
            )
        self.lstm_layer = layers.LSTM(
            lstm_out,
            dropout=dropout_rate,
            recurrent_dropout=recurrent_dropout
            )
        self.last_dense = layers.Dense(
            units=sentiment_classes,
            activation="softmax"
            )
        
  def call(self, inputs, training):
    l = self.embedding(inputs)
    concatenated = self.lstm_layer(l, training)
    model_output = self.last_dense(concatenated)
    return model_output

sentiment_model = PlainSentimentLSTM(
    vocabulary_size=VOCAB_LENGTH
    )

sentiment_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])

### Experimental (Without LSTM)

Tri-Gram Word2Vec

In [237]:
class MCTC_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="mctc_model"):
        super(MCTC_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [238]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 5
DROPOUT_RATE = 0.2


text_model = MCTC_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])

In [239]:
X_train, X_test, y_train, y_test = train_test_split(
    encoded_phrases,
    dataset_sentiment_df['sentiment_class'],
    test_size = 0.40,
    random_state = 42
    )


text_model.fit(
    X_train,
    y_train,
    validation_data = (X_test,y_test),
    epochs = 1,
    batch_size=32
    )



<tensorflow.python.keras.callbacks.History at 0x7f55f1200b70>

In [240]:
text_model.evaluate(X_test, y_test)



[nan, 0.0]

### Experimental: Input features