In [1]:
!pip install pytorch_pretrained_bert



In [0]:
import numpy as np
import pandas as pd
import random
import re
import string 
import tensorflow as tf
import tensorflow_hub as hub

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import Precision, Recall, FalseNegatives, FalsePositives
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.utils import to_categorical
from pytorch_pretrained_bert import BertTokenizer
from tensorflow.keras.models import Model 

# Data Preprocessing 

In [0]:
# Import data
train = pd.read_excel('Trainset.xlsx')
test = pd.read_excel('Testset.xlsx')

# Eliminate the NAs
train = train.fillna('')
test = test.fillna('')

# Remove the rows without Opinion Category values
train = train[train.OpinionCategory != ''] 
test = test[test.OpinionCategory != ''] 

# Sort the data
train = train.sort_values('Sentence_ID').reset_index(drop=True)
test = test.sort_values('Sentence_ID').reset_index(drop=True)

In [4]:
train.head()

Unnamed: 0,ID_number,Review_ID,ID_and_Review,OutOfScope,Sentence_ID,OpinionCategory,OpinionFrom,Polarity,AspectTerm,OpinionTo,Text
0,1004293,1,1004293:0,,1,RESTAURANT#GENERAL,51,negative,place,56,Judging from previous posts this used to be a ...
1,1004293,1,1004293:1,,2,SERVICE#GENERAL,75,negative,staff,80,"We, there were four of us, arrived at noon - t..."
2,1004293,1,1004293:2,,3,SERVICE#GENERAL,0,negative,,0,"They never brought us complimentary noodles, i..."
3,1004293,1,1004293:3,,4,FOOD#QUALITY,4,negative,food,8,The food was lousy - too sweet or too salty an...
4,1004293,1,1004293:3,,4,FOOD#STYLE_OPTIONS,52,negative,portions,60,The food was lousy - too sweet or too salty an...


In [5]:
train.shape, test.shape

((2507, 11), (859, 11))

In [6]:
train.OpinionCategory.value_counts()

FOOD#QUALITY                849
SERVICE#GENERAL             449
RESTAURANT#GENERAL          422
AMBIENCE#GENERAL            255
FOOD#STYLE_OPTIONS          137
RESTAURANT#MISCELLANEOUS     98
FOOD#PRICES                  90
RESTAURANT#PRICES            80
DRINKS#QUALITY               47
DRINKS#STYLE_OPTIONS         32
LOCATION#GENERAL             28
DRINKS#PRICES                20
Name: OpinionCategory, dtype: int64

Train data consists of 11 variables. The four of them indicate the ID numbers of the sentences, the reviewer, the review and the combination of them. OutofScope variable loses its function when I eliminated the null OpinionCategory values. The Opinion Category shows the aspect which the review refers to. The Opinion Category consists of 12 classes and each class has an entity and a corresponding attribute, in other words, E#A pairs. 

In this notebook, I will deal only with the Opinion Category and the corresponding reviews under the Text column and leave the analysis for Polarity and AspectTerm (and related columns with AspectTerm) to other notebooks. 

In [7]:
# Since the opinion category consists of categories, they needed to be one-hot encoded for the model.
train.OpinionCategory = train.OpinionCategory.astype('category')
one_hot = to_categorical(train.OpinionCategory.cat.codes)
one_hot = pd.DataFrame(one_hot)

test.OpinionCategory = test.OpinionCategory.astype('category')
one_hot_test = to_categorical(test.OpinionCategory.cat.codes)
one_hot_test = pd.DataFrame(one_hot_test)

one_hot.shape, one_hot_test.shape # There are 12 opinion category classes.

((2507, 12), (859, 12))

In [0]:
# As mentioned above, for the analysis only three columns are necessary.
useful_train = train[['Sentence_ID','OpinionCategory','Text']]
useful_test = test[['Sentence_ID','OpinionCategory','Text']]

## Multilabeling 

In [0]:
# First, concatenate the train data and the one-hot-encoding of the opinion category classes.
data_train = pd.concat([useful_train, one_hot], axis=1)
data_test = pd.concat([useful_test, one_hot_test], axis=1)

# Since the reviewer may have mentioned more than one opinion in a sentence, 
# I sum the one-hot-encoded classes of each sentence.
multi_label = data_train.groupby('Sentence_ID').sum().reset_index(drop=True)
multi_label_test = data_test.groupby('Sentence_ID').sum().reset_index(drop=True)

# In each sentence, an opinion category may be refered more than one.
# Because of that, there were values besides 0 and 1, after the summation.
# Since being refered is important rather than how many times, the values such as 2 or 3 are reverted to one.
multi_label = np.array(multi_label.astype(bool).astype(int))
multi_label_test = np.array(multi_label_test.astype(bool).astype(int))

# After the multi-labeling of the sentences, remove the duplicates.
train_Text = data_train.drop_duplicates(subset=['Sentence_ID'], keep='last').Text
test_Text = data_test.drop_duplicates(subset=['Sentence_ID'], keep='last').Text

In [10]:
multi_label, multi_label.shape, multi_label_test, multi_label_test.shape

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 1],
        ...,
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 1]]), (1708, 12), array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 1],
        [0, 0, 0, ..., 0, 0, 0]]), (587, 12))

## Data Processing for the BERT Model

In [11]:
# add special tokens for BERT to work properly
sentences = ["[CLS] " + sent + " [SEP]" for sent in train_Text.astype(str)]
sentences_test = ["[CLS] " + sent + " [SEP]" for sent in test_Text.astype(str)]

sentences[0], sentences_test[0]

('[CLS] Judging from previous posts this used to be a good place, but not any longer. [SEP]',
 '[CLS] Yum! [SEP]')

For the tokenization, pre-trained Bert-Base-Uncased dictionary is used. They constructed it with WordPiece embeddings with a 30,000 token vocabulary. 

In [12]:
# Tokenize with BERT tokenizer both train and test data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
tokenized_texts_test = [tokenizer.tokenize(sent) for sent in sentences_test]

tokenized_texts[0], tokenized_texts_test[0]

(['[CLS]',
  'judging',
  'from',
  'previous',
  'posts',
  'this',
  'used',
  'to',
  'be',
  'a',
  'good',
  'place',
  ',',
  'but',
  'not',
  'any',
  'longer',
  '.',
  '[SEP]'],
 ['[CLS]', 'yu', '##m', '!', '[SEP]'])

For the BERT model to work, we need three inputs. 
- Input IDs: shows the ID number of each token with padding. The ID numbers are restored from the BERT vocabulary dictionary.
- Mask IDs: indicates which elements in the sequence are tokens and which are padding elements.
- Segment IDs: distinguishes different sentences, 0 for one-sentence sequence, 1 if there are two sentences.

The functions below are extracted from: https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22

In [0]:
def get_ids(tokens, tokenizer, max_seq_length):
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def get_masks(tokens, max_seq_length):
    if len(tokens) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def get_segments(tokens, max_seq_length):
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

In [14]:
# find the longest sequence for the padding
def find_max_list(list):
    list_len = [len(i) for i in list]
    return max(list_len)
    
longestSeq_train = find_max_list(tokenized_texts)
longestSeq_test = find_max_list(tokenized_texts_test)
max_seq_length = max(longestSeq_train, longestSeq_test)
print(max_seq_length)

93


In [0]:
# Find input_ids, mask_ids and segment_ids of the train and test data.
max_seq_length = max_seq_length
input_ids = []
for i in range(len(tokenized_texts)):
    input_ids.append(get_ids(tokenized_texts[i], tokenizer, max_seq_length))

mask_ids = [] 
for i in range(len(tokenized_texts)):
    mask_ids.append(get_masks(tokenized_texts[i], max_seq_length))
    
segments_ids = [] 
for i in range(len(tokenized_texts)):
    segments_ids.append(get_segments(tokenized_texts[i], max_seq_length))
    
input_ids_test = []
for i in range(len(tokenized_texts_test)):
    input_ids_test.append(get_ids(tokenized_texts_test[i], tokenizer, max_seq_length))

mask_ids_test = [] 
for i in range(len(tokenized_texts_test)):
    mask_ids_test.append(get_masks(tokenized_texts_test[i], max_seq_length))
    
segments_ids_test = [] 
for i in range(len(tokenized_texts_test)):
    segments_ids_test.append(get_segments(tokenized_texts_test[i], max_seq_length))
    
# For the model, I converted the lists to tensors.
input_ids = tf.convert_to_tensor(input_ids, dtype = tf.int32)
mask_ids = tf.convert_to_tensor(mask_ids, dtype = tf.int32)
segments_ids = tf.convert_to_tensor(segments_ids, dtype = tf.int32)

input_ids_test = tf.convert_to_tensor(input_ids_test, dtype = tf.int32)
mask_ids_test = tf.convert_to_tensor(mask_ids_test, dtype = tf.int32)
segments_ids_test = tf.convert_to_tensor(segments_ids_test, dtype = tf.int32)

# The BERT Model

I used the uncased BERT model with 12 hidden layers and 110M parameters, trained on Wikipedia and Book-Corpus data and hosted by Google on TensorFlow Hub. 

In [16]:
random.seed(123)
# Three Inputs of the Bert Model
InputIDLayer = Input(shape = (max_seq_length,), dtype = tf.int32, name = "InputIDs")
MaskIDLayer = Input(shape = (max_seq_length,), dtype = tf.int32, name = "MaskIDs")
SegmentIDLayer = Input(shape = (max_seq_length,), dtype = tf.int32, name = "SegmentIDs")

# Import the pre-trained uncased Bert model
bertLayer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=True)

# Since it is a classisfication problem, the pooled output is needed.
pooled_output, sequence_output = bertLayer([InputIDLayer, MaskIDLayer, SegmentIDLayer])
pooled_output = Dropout(0.5)(pooled_output)
output = Dense(units = 768, activation = "tanh")(pooled_output)
output = Dropout(0.5)(output)
output = Dense(units = 12, activation = "softmax")(output)

model = Model(inputs=[InputIDLayer, MaskIDLayer, SegmentIDLayer], outputs = output)

# Model Compilation
learning_rate = 2e-5
number_of_epochs = 10
optimizer = Adam(learning_rate = learning_rate, epsilon = 1e-08)
loss = CategoricalCrossentropy(from_logits = False)
metrics = [Precision(), Recall(),
          FalseNegatives(), FalsePositives()]

model.compile(optimizer = optimizer, 
              loss = loss,
              metrics = metrics)

# Model Training & Fine-Tuning on train data
earlyStopping = EarlyStopping(monitor = "val_loss", mode = "min", patience = 1)

bert_history = model.fit([input_ids, mask_ids, segments_ids], [multi_label],
                         epochs = number_of_epochs, 
                         batch_size = 64,
                         validation_split = 0.1,
                         callbacks = [earlyStopping]
                         )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [17]:
# Model Evaluation - Loss, Precision, Recall, PrecisionAtRecall, RecallAtPrecision, FalseNegatives, FalsePositives
results = model.evaluate([input_ids_test, mask_ids_test, segments_ids_test], multi_label_test)



In [18]:
f1_score = 2 * (results[1] * results[2])/(results[1] + results[2])
f1_score

0.692737424027747

In [34]:
# Dataframe for f1 scores:
BERT_models = {'Optimizers': ['Adam','Adam','SGD','SGD'],
        'DenseLayers': ['with', 'without', 'with', 'without'],
        'F1Scores': [f1_score,f1_score2,f1_score3,f1_score4]}

BERT_models = pd.DataFrame(BERT_models, columns = ['Optimizers', 'DenseLayers', 'F1Scores' ])
print(BERT_models)

  Optimizers DenseLayers  F1Scores
0       Adam        with  0.692737
1       Adam     without  0.711599
2        SGD        with  0.731745
3        SGD     without  0.722611


As can be seen above, the BERT model with SGD optimizer with extra dense layers gives the best result, with f1 score 0.73.

## Appendix

I would like to compare the changes on BERT model under the Appendix to compare with the main model.
- Main model without extra Dense layers
- SGD optimizer (vs. Adam)

### Without extra Dense layers

In [19]:
random.seed(123)
# Three Inputs of the Bert Model
InputIDLayer2 = Input(shape = (max_seq_length,), dtype = tf.int32, name = "InputIDs")
MaskIDLayer2 = Input(shape = (max_seq_length,), dtype = tf.int32, name = "MaskIDs")
SegmentIDLayer2 = Input(shape = (max_seq_length,), dtype = tf.int32, name = "SegmentIDs")

# Import the pre-trained uncased Bert model
bertLayer2 = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=True)

# Since it is a classisfication problem, the pooled output is needed.
pooled_output2, sequence_output2 = bertLayer2([InputIDLayer2, MaskIDLayer2, SegmentIDLayer2])
output2 = Dense(12, activation = 'softmax')(pooled_output2)

model2 = Model(inputs=[InputIDLayer2, MaskIDLayer2, SegmentIDLayer2], outputs = output2)

# Model Compilation
learning_rate = 2e-5
number_of_epochs = 10
optimizer2 = Adam(learning_rate = learning_rate, epsilon = 1e-08)
loss2 = CategoricalCrossentropy(from_logits = False)
metrics2 = [Precision(), Recall(),
          FalseNegatives(), FalsePositives()]

model2.compile(optimizer = optimizer2, 
              loss = loss2,
              metrics = metrics2)

# Model Training & Fine-Tuning on train data
earlyStopping2 = EarlyStopping(monitor = "val_loss", mode = "min", patience = 1)

bert_history2 = model2.fit([input_ids, mask_ids, segments_ids], [multi_label],
                         epochs = number_of_epochs, 
                         batch_size = 64,
                         validation_split = 0.1,
                         callbacks = [earlyStopping2]
                         )
# Model Evaluation - Loss, Precision, Recall, PrecisionAtRecall, RecallAtPrecision, FalseNegatives, FalsePositives
results2 = model2.evaluate([input_ids_test, mask_ids_test, segments_ids_test], multi_label_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [20]:
f1_score2 = 2 * (results2[1] * results2[2])/(results2[1] + results2[2])
f1_score2 

0.7115987665154876

### With SGD Optimizer

In [21]:
random.seed(123)
# Three Inputs of the Bert Model
InputIDLayer3 = Input(shape = (max_seq_length,), dtype = tf.int32, name = "InputIDs")
MaskIDLayer3 = Input(shape = (max_seq_length,), dtype = tf.int32, name = "MaskIDs")
SegmentIDLayer3 = Input(shape = (max_seq_length,), dtype = tf.int32, name = "SegmentIDs")

# Import the pre-trained uncased Bert model
bertLayer3 = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=True)

# Since it is a classisfication problem, the pooled output is needed.
pooled_output3, sequence_output3 = bertLayer3([InputIDLayer3, MaskIDLayer3, SegmentIDLayer3])
pooled_output3 = Dropout(0.5)(pooled_output3)
output3 = Dense(units = 768, activation = "tanh")(pooled_output3)
output3 = Dropout(0.5)(output3)
output3 = Dense(units = 12, activation = "softmax")(output3)

model3 = Model(inputs=[InputIDLayer3, MaskIDLayer3, SegmentIDLayer3], outputs = output3)

# Model Compilation
learning_rate = 0.01
number_of_epochs = 10
optimizer3 = SGD(learning_rate = learning_rate)
loss3 = CategoricalCrossentropy(from_logits = False)
metrics3 = [Precision(), Recall(),
          FalseNegatives(), FalsePositives()]

model3.compile(optimizer = optimizer3, 
              loss = loss3,
              metrics = metrics3)

# Model Training & Fine-Tuning on train data
earlyStopping3 = EarlyStopping(monitor = "val_loss", mode = "min", patience = 1)

bert_history3 = model3.fit([input_ids, mask_ids, segments_ids], [multi_label],
                         epochs = number_of_epochs, 
                         batch_size = 32,
                         validation_split = 0.1,
                         callbacks = [earlyStopping3]
                         )
# Model Evaluation - Loss, Precision, Recall, PrecisionAtRecall, RecallAtPrecision, FalseNegatives, FalsePositives
results3 = model3.evaluate([input_ids_test, mask_ids_test, segments_ids_test], multi_label_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10


In [22]:
f1_score3 = 2 * (results3[1] * results3[2])/(results3[1] + results3[2])
f1_score3

0.7317448191407636

### With SGD Optimizer without Extra Dense Layers

In [24]:
random.seed(123)
# Three Inputs of the Bert Model
InputIDLayer4 = Input(shape = (max_seq_length,), dtype = tf.int32, name = "InputIDs")
MaskIDLayer4 = Input(shape = (max_seq_length,), dtype = tf.int32, name = "MaskIDs")
SegmentIDLayer4 = Input(shape = (max_seq_length,), dtype = tf.int32, name = "SegmentIDs")

# Import the pre-trained uncased Bert model
bertLayer4 = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=True)

# Since it is a classisfication problem, the pooled output is needed.
pooled_output4, sequence_output4 = bertLayer4([InputIDLayer4, MaskIDLayer4, SegmentIDLayer4])
output4 = Dense(units = 12, activation = "softmax")(pooled_output4)

model4 = Model(inputs=[InputIDLayer4, MaskIDLayer4, SegmentIDLayer4], outputs = output4)

# Model Compilation
learning_rate = 0.01
number_of_epochs = 10
optimizer4 = SGD(learning_rate = learning_rate)
loss4 = CategoricalCrossentropy(from_logits = False)
metrics4 = [Precision(), Recall(),
          FalseNegatives(), FalsePositives()]

model4.compile(optimizer = optimizer4, 
              loss = loss4,
              metrics = metrics4)

# Model Training & Fine-Tuning on train data
earlyStopping4 = EarlyStopping(monitor = "val_loss", mode = "min", patience = 1)

bert_history4 = model4.fit([input_ids, mask_ids, segments_ids], [multi_label],
                         epochs = number_of_epochs, 
                         batch_size = 32,
                         validation_split = 0.1,
                         callbacks = [earlyStopping4]
                         )
# Model Evaluation - Loss, Precision, Recall, PrecisionAtRecall, RecallAtPrecision, FalseNegatives, FalsePositives
results4 = model4.evaluate([input_ids_test, mask_ids_test, segments_ids_test], multi_label_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10


In [26]:
f1_score4 = 2 * (results4[1] * results4[2])/(results4[1] + results4[2])
f1_score4

0.722610737010602