### NLPAUG - BERT Base Uncased

#### Un-augmented test set
#### Augment only the training set to equalize classes

#### Get Original Paper Data

In [1]:
# !pip install sklearn
# !pip install ekphrasis
# !pip install transformers
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

import transformers

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras import backend as K
from tensorflow import keras

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

import os

import logging
tf.get_logger().setLevel(logging.ERROR)

In [3]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
encoder = LabelEncoder()
encoder.classes_ = np.load('../Data/classes.npy', allow_pickle=True)

In [5]:
tf.__version__

'2.6.0'

In [6]:
transformers.__version__

'4.16.2'

In [7]:
# test, train, dev examples from base notebook

train_data_df = pd.read_csv('./Saved_Models/EDA_base_uncased_5aug/All_DA_BERT_base_uncased_train_examples.csv')
dev_data_df = pd.read_csv('./Saved_Models/EDA_base_uncased_5aug/All_DA_BERT_base_uncased_dev_examples.csv')
test_data_df = pd.read_csv('./Saved_Models/EDA_base_uncased_5aug/All_DA_BERT_base_uncased_test_examples.csv')

In [8]:
train_data_df.head()

Unnamed: 0.1,Unnamed: 0,post_id,text,target1,target2,target3,rationales,final_label,text_combined
0,0,22448349_gab,"['common', 'core', 'weed', 'too', 'much', 'rit...","['Men', 'Women']",['Women'],['None'],[],normal,common core weed too much ritalan chem trails ...
1,1,1178948520201637888_twitter,"['took', 'my', 'nan', 'to', 'the', 'hospital',...",['None'],['None'],['None'],[],normal,took my nan to the hospital for a x ray i turn...
2,2,1482573_gab,"['<user>', 'well', 'not', 'really', 'islam', '...",['Islam'],['Other'],['Islam'],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...",offensive,<user> well not really islam does not care for...
3,3,1097184028149587969_twitter,"['<user>', 'france', 'in', '<number>', 'after'...","['Islam', 'Other']",['Islam'],['Islam'],[],normal,<user> france in <number> after muslims take o...
4,4,1089569255111176192_twitter,"['i', 'will', 'not', 'tolerate', 'non', 'arab'...","['Arab', 'Men', 'Women']",['Arab'],"['Arab', 'Islam']","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,...",hatespeech,i will not tolerate non arab women slandering ...


In [9]:
X_train_id = train_data_df['post_id']
X_test_id = test_data_df['post_id']
X_dev_id = dev_data_df['post_id']

In [10]:
y_train = train_data_df['final_label']
y_test = test_data_df['final_label']
y_dev = dev_data_df['final_label']

In [11]:
# x_train_df = pd.DataFrame({'post_id' : X_train_id.to_list()})
# x_dev_df = pd.DataFrame({'post_id' : X_dev_id.to_list()})
# x_test_df = pd.DataFrame({'post_id' : X_test_id.to_list()})

# X_train_df = pd.merge(x_train_df, raw_data_final, how='inner', on='post_id')
# X_dev_df = pd.merge(x_dev_df, raw_data_final, how='inner', on='post_id')
# X_test_df = pd.merge(x_test_df, raw_data_final, how='inner', on='post_id')

X_train_text = train_data_df['text_combined'].to_list()
X_dev_text= dev_data_df['text_combined'].to_list()
X_test_text = test_data_df['text_combined'].to_list()

print(len(X_train_text))
print(len(X_dev_text))
print(len(X_test_text))

15383
1923
1923


In [12]:
original_train_data_df = train_data_df[['text_combined', 'final_label']]
original_train_data_df.head()

Unnamed: 0,text_combined,final_label
0,common core weed too much ritalan chem trails ...,normal
1,took my nan to the hospital for a x ray i turn...,normal
2,<user> well not really islam does not care for...,offensive
3,<user> france in <number> after muslims take o...,normal
4,i will not tolerate non arab women slandering ...,hatespeech


In [13]:
original_train_data_df['final_label'].value_counts()

normal        6251
hatespeech    4748
offensive     4384
Name: final_label, dtype: int64

### Get Augmented Data

In [14]:
nlpaug_ins_set1_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set1_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set2_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set2_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set3_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set3_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set4_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set4_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set5_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set5_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set6_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set6_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set7_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set7_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set8_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set8_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set9_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set9_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set10_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set10_0_2_df.csv')[['text_combined', 'final_label']]

nlpaug_sub_set1_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set1_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set2_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set2_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set3_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set3_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set4_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set4_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set5_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set5_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set6_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set6_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set7_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set7_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set8_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set8_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set9_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set9_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set10_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set10_0_2_df.csv')[['text_combined', 'final_label']]

In [15]:
nlpaug_ins_set1_df.head()

Unnamed: 0,text_combined,final_label
0,common core to weed too [UNK] much ritalan che...,normal
1,took my nan to the hospital for a x s ray i tu...,normal
2,< from user > well no not really islam does no...,offensive
3,< specific user > france in < series number > ...,normal
4,i will surely not tolerate non arab women slan...,hatespeech


In [25]:
# need to add 1503 hatespeech
# need to add 1867 offensive

import random

random.seed(42)

#insertion
offensive_only_ins_df = nlpaug_ins_set1_df[nlpaug_ins_set1_df['final_label'] == 'offensive']

offensive_sample_ins_df = offensive_only_ins_df.sample(n=1867)

hatespeech_only_ins_df = nlpaug_ins_set1_df[nlpaug_ins_set1_df['final_label'] == 'hatespeech']

hatespeech_sample_ins_df = hatespeech_only_ins_df.sample(n=1503)


#substitution
offensive_only_sub_df = nlpaug_sub_set1_df[nlpaug_sub_set1_df['final_label'] == 'offensive']

offensive_sample_sub_df = offensive_only_sub_df.sample(n=1867)

hatespeech_only_sub_df = nlpaug_sub_set1_df[nlpaug_sub_set1_df['final_label'] == 'hatespeech']

hatespeech_sample_sub_df = hatespeech_only_sub_df.sample(n=1503)

len(hatespeech_sample_ins_df)

1503

In [26]:
# combine sets
# ins_frames = [original_train_data_df, nlpaug_ins_set1_df, nlpaug_ins_set2_df, nlpaug_ins_set3_df, nlpaug_ins_set4_df, nlpaug_ins_set5_df]
# sub_frames = [original_train_data_df, nlpaug_sub_set1_df, nlpaug_sub_set2_df, nlpaug_sub_set3_df, nlpaug_sub_set4_df, nlpaug_sub_set5_df]

# ins_frames = [original_train_data_df, nlpaug_ins_set1_df, nlpaug_ins_set2_df]
# sub_frames = [original_train_data_df, nlpaug_sub_set1_df, nlpaug_sub_set2_df]

# ins_frames = [original_train_data_df, nlpaug_ins_set1_df, nlpaug_ins_set2_df, nlpaug_ins_set3_df, 
#               nlpaug_ins_set4_df, nlpaug_ins_set5_df, nlpaug_ins_set6_df, nlpaug_ins_set7_df, 
#               nlpaug_ins_set8_df, nlpaug_ins_set9_df, nlpaug_ins_set10_df]

# sub_frames = [original_train_data_df, nlpaug_sub_set1_df, nlpaug_sub_set2_df, nlpaug_sub_set3_df, 
#               nlpaug_sub_set4_df, nlpaug_sub_set5_df, nlpaug_sub_set6_df, nlpaug_sub_set7_df, 
#               nlpaug_sub_set8_df, nlpaug_sub_set9_df, nlpaug_sub_set10_df]

ins_frames = [original_train_data_df, offensive_sample_ins_df, hatespeech_sample_ins_df]
sub_frames = [original_train_data_df, offensive_sample_sub_df, hatespeech_sample_sub_df]

ins_train_df = pd.concat(ins_frames)
sub_train_df = pd.concat(sub_frames)

In [27]:
ins_train_df['final_label'].value_counts()

normal        6251
offensive     6251
hatespeech    6251
Name: final_label, dtype: int64

In [28]:
sub_train_df['final_label'].value_counts()

normal        6251
offensive     6251
hatespeech    6251
Name: final_label, dtype: int64

In [29]:
len(ins_train_df)

18753

In [30]:
len(sub_train_df)

18753

In [31]:
X_ins_train = ins_train_df['text_combined'].to_list()
X_sub_train = sub_train_df['text_combined'].to_list()

y_ins_train = ins_train_df['final_label']
y_sub_train = sub_train_df['final_label']

#### Convert labels to one-hot encoding

In [32]:
# convert class label to 1 hot encoding

def convert_to_oh(S):
    '''takes a pandas series of text labels and returns one hot encoding equivalent
    0 = normal, 1 = offensive, 2 = hatespeech
    ''' 
    S_numerical = S.apply(lambda x: 0 if x=='normal' else (1 if x=='offensive' else 2))
    S_oh = keras.utils.to_categorical(S_numerical, num_classes = 3, dtype = 'float32')
    return S_oh
    
# original dataset - train, dev, and train
y_train_orig = convert_to_oh(pd.Series(y_train))
y_dev_orig = convert_to_oh(pd.Series(y_dev))
y_test_orig = convert_to_oh(pd.Series(y_test))

# augmented with contextual insertion
y_ins_train_oh = convert_to_oh(y_ins_train)

# augmented with contextual substitution
y_sub_train_oh = convert_to_oh(y_sub_train)


In [33]:
len(y_ins_train_oh)

18753

In [34]:
len(y_sub_train_oh)

18753

In [35]:
y_sub_train_oh

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

### BERT Model

In [36]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#bert_model = TFBertModel.from_pretrained('bert-base-uncased')

In [37]:
max_length = 128

def bert_tokenize(train_set, dev_set, test_set, max_length):
    
    train = tokenizer(train_set, max_length=max_length, truncation=True, padding='max_length', return_tensors='tf')
    dev = tokenizer(dev_set, max_length=max_length, truncation=True, padding='max_length', return_tensors='tf')
    test = tokenizer(test_set, max_length=max_length, truncation=True, padding='max_length', return_tensors='tf')
    
    return train, dev, test

X_train_orig, X_dev_orig, X_test_orig = bert_tokenize(X_train_text, X_dev_text, X_test_text, max_length)

X_train_aug_ins, X_dev_aug_ins, X_test_aug_ins = bert_tokenize(X_ins_train, X_dev_text, X_test_text, max_length)

X_train_aug_sub, X_dev_aug_sub, X_test_aug_sub = bert_tokenize(X_sub_train, X_dev_text, X_test_text, max_length)


In [38]:
#tokenizer.save_pretrained("./Tokenizer_ALL_EDA_BERT_base_uncased")

In [39]:
X_train_orig.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [40]:
X_train_aug_ins.input_ids

<tf.Tensor: shape=(18753, 128), dtype=int32, numpy=
array([[  101,  2691,  4563, ...,     0,     0,     0],
       [  101,  2165,  2026, ...,     0,     0,     0],
       [  101,  1026,  5310, ...,     0,     0,     0],
       ...,
       [  101,  2116, 16939, ...,     0,     0,     0],
       [  101,  1026,  1037, ...,     0,     0,     0],
       [  101,  2009,  6526, ...,     0,     0,     0]])>

In [41]:
X_train_aug_sub.token_type_ids

<tf.Tensor: shape=(18753, 128), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>

In [42]:
X_train_aug_ins.attention_mask

<tf.Tensor: shape=(18753, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>

In [43]:
X_train_aug_sub.input_ids

<tf.Tensor: shape=(18753, 128), dtype=int32, numpy=
array([[ 101, 2691, 4563, ...,    0,    0,    0],
       [ 101, 2165, 2026, ...,    0,    0,    0],
       [ 101, 1026, 5310, ...,    0,    0,    0],
       ...,
       [ 101, 2054, 1037, ...,    0,    0,    0],
       [ 101, 1045, 2228, ...,    0,    0,    0],
       [ 101, 3773, 2032, ...,    0,    0,    0]])>

In [44]:
from keras import backend as K

def balanced_recall(y_true, y_pred):
    """This function calculates the balanced recall metric
    recall = TP / (TP + FN)
    """
    recall_by_class = 0
    # iterate over each predicted class to get class-specific metric
    for i in range(y_pred.shape[1]):
        y_pred_class = y_pred[:, i]
        y_true_class = y_true[:, i]
        true_positives = K.sum(K.round(K.clip(y_true_class * y_pred_class, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true_class, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        recall_by_class = recall_by_class + recall
    return recall_by_class / y_pred.shape[1]

def balanced_precision(y_true, y_pred):
    """This function calculates the balanced precision metric
    precision = TP / (TP + FP)
    """
    precision_by_class = 0
    # iterate over each predicted class to get class-specific metric
    for i in range(y_pred.shape[1]):
        y_pred_class = y_pred[:, i]
        y_true_class = y_true[:, i]
        true_positives = K.sum(K.round(K.clip(y_true_class * y_pred_class, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred_class, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        precision_by_class = precision_by_class + precision
    # return average balanced metric for each class
    return precision_by_class / y_pred.shape[1]

def balanced_f1_score(y_true, y_pred):
    """This function calculates the F1 score metric"""
    precision = balanced_precision(y_true, y_pred)
    recall = balanced_recall(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

In [45]:
def create_classification_model(bert_model, hidden_size = 5, 
                                train_layers = -1, 
                                optimizer=tf.keras.optimizers.Adam()):
    """
    Build a simple classification model with BERT. Let's keep it simple and don't add dropout, layer norms, etc.
    """

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                  'token_type_ids': token_type_ids,
                  'attention_mask': attention_mask}


    #restrict training to the train_layers outer transformer layers
    if not train_layers == -1:

            retrain_layers = []

            for retrain_layer_number in range(train_layers):

                layer_code = '_' + str(11 - retrain_layer_number)
                retrain_layers.append(layer_code)

            for w in bert_model.weights:
                if not any([x in w.name for x in retrain_layers]):
                    w._trainable = False


    bert_out = bert_model(bert_inputs)
    
    net = bert_out[0]
    
    classification_token = tf.keras.layers.Lambda(lambda x: x[:,0,:], name='get_first_vector')(net)
    
    dropout1 = tf.keras.layers.Dropout(0.4, name="dropout1")(classification_token)
    
    hidden = tf.keras.layers.Dense(hidden_size, name='hidden_layer')(dropout1)
    
    dropout2 = tf.keras.layers.Dropout(0.4, name="dropout2")(hidden)

    classification = tf.keras.layers.Dense(3, activation='sigmoid',name='classification_layer')(dropout2)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], 
                                          outputs=[classification])
    
    METRICS = [tf.keras.metrics.CategoricalAccuracy(name="accuracy"), 
               balanced_recall, 
               balanced_precision, 
               balanced_f1_score,
               tf.keras.metrics.AUC(curve='ROC', name="auc_roc")]
    
    
    classification_model.compile(optimizer=optimizer,
                            loss=tf.keras.losses.CategoricalCrossentropy(),
                            metrics= METRICS)


    return classification_model




#     classification_model.compile(optimizer=optimizer,
#                             loss=tf.keras.losses.CategoricalCrossentropy(),
#                             metrics=tf.keras.metrics.CategoricalAccuracy('accuracy'))

In [46]:
def fine_tune_BERT(x_train, x_dev, x_test, y_train, y_dev, y_test, name, learning_rate = 5e-05, 
                   epsilon=1e-08, train_layers = -1, epochs = 10, batch_size = 16):
    ''' Fine tunes BERT base uncased with given data, allows your to set some hyperparameters
        returns test set accuracy, f1 score, and AUC_ROC score
    '''
    try:
        del classification_model
    except:
        pass

    try:
        del bert_model
    except:
        pass
    
    tf.keras.backend.clear_session()
    bert_model = TFBertModel.from_pretrained('bert-base-uncased')

    # early stopping callback
    
    earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor = 'val_accuracy', 
                                                      patience = 4,
                                                      restore_best_weights = True)
    
    # Create a callback that saves the model's weights
    
    path_name = './Saved_Models/NLA_b_uncased_eqaug/' + name + '/' + name

    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=path_name, 
                                                     save_weights_only=True,
                                                     verbose=1,
                                                     monitor='val_accuracy',
                                                     save_best_only=True)
    
    # create classification model
    classification_model = create_classification_model(bert_model, 
                                                       optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon),
                                                       train_layers=train_layers)    
    
    model_fit = classification_model.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train,
                         validation_data=([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev),
                        epochs=epochs,
                        batch_size=batch_size,
                        callbacks = [earlystop_callback, cp_callback])
    
    y_preds_array = classification_model.predict([x_test.input_ids, x_test.token_type_ids, x_test.attention_mask])

    # convert to predicted one-hot encoding

    from keras.utils.np_utils import to_categorical
    y_preds = to_categorical(np.argmax(y_preds_array, 1), dtype = "int64")

    # convert back to labels

    y_test_cat = np.argmax(y_test, axis=1)
    y_preds_cat = np.argmax(y_preds, axis=1)
    
    # calculate metrics
    Accuracy = accuracy_score(y_test_cat, y_preds_cat)

    Macro_F1 = f1_score(y_test_cat, y_preds_cat, average='macro')

    ROC_AUC = roc_auc_score(y_test, y_preds, multi_class='ovo',average='macro')
    
    metrics_history = model_fit.history
    
    return Accuracy, Macro_F1, ROC_AUC, metrics_history

In [51]:
%%time
# original data set
Accuracy_orig, Macro_F1_orig, ROC_AUC_orig, metrics_orig = fine_tune_BERT(X_train_orig, X_dev_orig, X_test_orig, 
                                                            y_train_orig, y_dev_orig, y_test_orig, 'orig_data_base',
                                                            learning_rate = 2e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 30, batch_size = 64)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/30

Epoch 00001: val_accuracy improved from -inf to 0.54498, saving model to ./Saved_Models/NLA_b_uncased_eqaug/orig_data_base\orig_data_base
Epoch 2/30

Epoch 00002: val_accuracy improved from 0.54498 to 0.59698, saving model to ./Saved_Models/NLA_b_uncased_eqaug/orig_data_base\orig_data_base
Epoch 3/30

Epoch 00003: val_accuracy improved from 0.59698 to 0.62142, saving model to ./Saved_Models/NLA_b_uncased_eqaug/orig_data_base\orig_data_base
Epoch 4/30

Epoch 00004: val_accuracy improved from 0.62142 to 0.63703, saving model to ./Saved_Models/NLA_b_uncased_eqaug/orig_data_base\orig_data_base
Epoch 5/30

Epoch 00005: val_accuracy did not improve from 0.63703
Epoch 6/30

Epoch 00006: val_accuracy improved from 0.63703 to 0.64847, saving model to ./Saved_Models/NLA_b_uncased_eqaug/orig_data_base\orig_data_base
Epoch 7/30

Epoch 00007: val_accuracy did not improve from 0.64847
Epoch 8/30

Epoch 00008: val_accuracy improved from 0.64847 to 0.65367, saving model to ./Saved_Models/N

Epoch 18/30

Epoch 00018: val_accuracy did not improve from 0.67759
Epoch 19/30

Epoch 00019: val_accuracy did not improve from 0.67759
Epoch 20/30

Epoch 00020: val_accuracy did not improve from 0.67759
Epoch 21/30

Epoch 00021: val_accuracy did not improve from 0.67759
Wall time: 18min 14s


In [48]:
%%time
# augmented with contextual word insertion
Accuracy_aug_ins, Macro_F1_aug_ins, ROC_AUC_aug_ins, metrics_ins = fine_tune_BERT(X_train_aug_ins, X_dev_aug_ins, X_test_aug_ins, 
                                                            y_ins_train_oh, y_dev_orig, y_test_orig, 'NLA_ins_base', 
                                                            learning_rate = 2e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 30, batch_size = 64)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/30

Epoch 00001: val_accuracy improved from -inf to 0.54862, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_ins_base\NLA_ins_base
Epoch 2/30

Epoch 00002: val_accuracy improved from 0.54862 to 0.60062, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_ins_base\NLA_ins_base
Epoch 3/30

Epoch 00003: val_accuracy improved from 0.60062 to 0.61674, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_ins_base\NLA_ins_base
Epoch 4/30

Epoch 00004: val_accuracy improved from 0.61674 to 0.62402, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_ins_base\NLA_ins_base
Epoch 5/30

Epoch 00005: val_accuracy improved from 0.62402 to 0.63027, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_ins_base\NLA_ins_base
Epoch 6/30

Epoch 00006: val_accuracy did not improve from 0.63027
Epoch 7/30

Epoch 00007: val_accuracy improved from 0.63027 to 0.65627, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_ins_base\NLA_ins_base
Epoch 8/30

Epoch 00008: val_accuracy did 


Epoch 00018: val_accuracy improved from 0.67343 to 0.67551, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_ins_base\NLA_ins_base
Epoch 19/30

Epoch 00019: val_accuracy improved from 0.67551 to 0.68851, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_ins_base\NLA_ins_base
Epoch 20/30

Epoch 00020: val_accuracy did not improve from 0.68851
Epoch 21/30

Epoch 00021: val_accuracy did not improve from 0.68851
Epoch 22/30

Epoch 00022: val_accuracy did not improve from 0.68851
Epoch 23/30

Epoch 00023: val_accuracy did not improve from 0.68851
Wall time: 24min 8s


In [49]:
%%time
# augmented with contextual word substitution
Accuracy_aug_sub, Macro_F1_aug_sub, ROC_AUC_aug_sub, metrics_sub = fine_tune_BERT(X_train_aug_sub, X_dev_aug_sub, X_test_aug_sub, 
                                                            y_sub_train_oh, y_dev_orig, y_test_orig, 'NLA_sub_base', 
                                                            learning_rate = 2e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 30, batch_size = 64)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/30

Epoch 00001: val_accuracy improved from -inf to 0.52886, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_sub_base\NLA_sub_base
Epoch 2/30

Epoch 00002: val_accuracy improved from 0.52886 to 0.59230, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_sub_base\NLA_sub_base
Epoch 3/30

Epoch 00003: val_accuracy improved from 0.59230 to 0.60738, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_sub_base\NLA_sub_base
Epoch 4/30

Epoch 00004: val_accuracy improved from 0.60738 to 0.62298, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_sub_base\NLA_sub_base
Epoch 5/30

Epoch 00005: val_accuracy improved from 0.62298 to 0.63287, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_sub_base\NLA_sub_base
Epoch 6/30

Epoch 00006: val_accuracy improved from 0.63287 to 0.64431, saving model to ./Saved_Models/NLA_b_uncased_eqaug/NLA_sub_base\NLA_sub_base
Epoch 7/30

Epoch 00007: val_accuracy improved from 0.64431 to 0.64639, saving model to ./Saved_Models/NLA_b_

In [52]:
Accuracy_orig, Macro_F1_orig, ROC_AUC_orig, metrics_orig

(0.6999479979199168,
 0.6731388765955821,
 0.7644924170192088,
 {'loss': [1.3246972560882568,
   1.1469930410385132,
   1.039675235748291,
   0.9929202198982239,
   0.9725001454353333,
   0.9487781524658203,
   0.9147466421127319,
   0.8979606628417969,
   0.8888847827911377,
   0.878456175327301,
   0.8649579882621765,
   0.858691930770874,
   0.8414878249168396,
   0.8377479910850525,
   0.8355972766876221,
   0.8195599913597107,
   0.8189342617988586,
   0.8009267449378967,
   0.7953883409500122,
   0.7922791242599487,
   0.7801547050476074],
  'accuracy': [0.41110315918922424,
   0.4765000343322754,
   0.5161541700363159,
   0.5442371368408203,
   0.547617495059967,
   0.5622440576553345,
   0.5798608660697937,
   0.5831762552261353,
   0.5920171737670898,
   0.5978677868843079,
   0.6080738306045532,
   0.6126893162727356,
   0.6141194701194763,
   0.6219202876091003,
   0.6205551624298096,
   0.629006028175354,
   0.6336215138435364,
   0.6375219225883484,
   0.6456477642059326,


In [53]:
Accuracy_aug_ins, Macro_F1_aug_ins, ROC_AUC_aug_ins, metrics_ins

(0.6973478939157566,
 0.6751178012201473,
 0.7617227955848805,
 {'loss': [1.3024036884307861,
   1.0837291479110718,
   1.00544011592865,
   0.9591243267059326,
   0.9220449924468994,
   0.8994577527046204,
   0.8822932243347168,
   0.8637439608573914,
   0.8611544370651245,
   0.8425735235214233,
   0.831375241279602,
   0.8219071626663208,
   0.8147290945053101,
   0.7995330691337585,
   0.7930953502655029,
   0.7843232750892639,
   0.7776022553443909,
   0.768406093120575,
   0.763109564781189,
   0.7549741268157959,
   0.7447827458381653,
   0.7395809292793274,
   0.7282400727272034],
  'accuracy': [0.4043619632720947,
   0.4881352186203003,
   0.5216765403747559,
   0.5438596606254578,
   0.5652962327003479,
   0.5771876573562622,
   0.5897722840309143,
   0.5999040007591248,
   0.6028901934623718,
   0.616594672203064,
   0.6236335635185242,
   0.6300325393676758,
   0.6351516842842102,
   0.6355249881744385,
   0.6429371237754822,
   0.6494960784912109,
   0.6476830244064331,
  

In [54]:
Accuracy_aug_sub, Macro_F1_aug_sub, ROC_AUC_aug_sub, metrics_sub

(0.6666666666666666,
 0.6576111487683548,
 0.7444319834670491,
 {'loss': [1.2440074682235718,
   1.0977025032043457,
   1.03812837600708,
   1.0008174180984497,
   0.9690854549407959,
   0.9448031783103943,
   0.9391717314720154,
   0.9207548499107361,
   0.9076979756355286,
   0.8956285715103149,
   0.8856244683265686,
   0.8825658559799194,
   0.8721186518669128,
   0.854709267616272,
   0.8520892858505249,
   0.8456027507781982,
   0.839051365852356],
  'accuracy': [0.39108410477638245,
   0.4581133723258972,
   0.48888176679611206,
   0.5078654289245605,
   0.5281821489334106,
   0.5511651635169983,
   0.5487121939659119,
   0.5594304800033569,
   0.5691356062889099,
   0.5825201272964478,
   0.582893431186676,
   0.5844398140907288,
   0.5968645215034485,
   0.6063029766082764,
   0.6071028709411621,
   0.6103023290634155,
   0.6125953197479248],
  'balanced_recall': [0.5126809477806091,
   0.5698287487030029,
   0.6005765199661255,
   0.6089849472045898,
   0.6305580139160156,
  

In [55]:
trial_name_list = ['Original Data', 'Aug Contextual Ins', 'Aug Contextual Sub']

acc_list = [Accuracy_orig, Accuracy_aug_ins, Accuracy_aug_sub]

macro_f1_list = [Macro_F1_orig, Macro_F1_aug_ins, Macro_F1_aug_sub]

roc_auc_list = [ROC_AUC_orig, ROC_AUC_aug_ins, ROC_AUC_aug_sub]


In [56]:
result_dict = {'Trial Name' : trial_name_list, 'Test Accuracy Score' : acc_list, 
               'Test Macro F1 Score' : macro_f1_list, 'Test ROC AUC Score' : roc_auc_list}

In [57]:
results_df = pd.DataFrame(result_dict)

results_df

Unnamed: 0,Trial Name,Test Accuracy Score,Test Macro F1 Score,Test ROC AUC Score
0,Original Data,0.699948,0.673139,0.764492
1,Aug Contextual Ins,0.697348,0.675118,0.761723
2,Aug Contextual Sub,0.666667,0.657611,0.744432


In [58]:
results_df.to_csv('./Saved_Models/NLA_b_uncased_eqaug/All_NLA_BERT_base_uncased.csv')

In [59]:
metrics_org_df = pd.DataFrame(metrics_orig)

metrics_org_df

Unnamed: 0,loss,accuracy,balanced_recall,balanced_precision,balanced_f1_score,auc_roc,val_loss,val_accuracy,val_balanced_recall,val_balanced_precision,val_balanced_f1_score,val_auc_roc
0,1.324697,0.411103,0.578798,0.372478,0.452974,0.581809,0.950057,0.544982,0.666032,0.470511,0.550918,0.727738
1,1.146993,0.4765,0.645997,0.424302,0.511878,0.661717,0.856257,0.596984,0.773551,0.522667,0.622629,0.782396
2,1.039675,0.516154,0.690401,0.454843,0.548061,0.706183,0.827523,0.621425,0.794351,0.530726,0.635045,0.797673
3,0.99292,0.544237,0.717484,0.46533,0.564211,0.725116,0.802842,0.637025,0.792706,0.547489,0.646601,0.811258
4,0.9725,0.547617,0.722157,0.473215,0.571495,0.73451,0.793842,0.635985,0.794237,0.544184,0.644764,0.815884
5,0.948778,0.562244,0.731108,0.481321,0.580146,0.745401,0.788742,0.648466,0.799795,0.545515,0.647629,0.820648
6,0.914747,0.579861,0.746512,0.488189,0.590026,0.75892,0.780175,0.647946,0.783471,0.558236,0.651089,0.825267
7,0.897961,0.583176,0.751331,0.494249,0.5959,0.76563,0.76994,0.653666,0.80832,0.557581,0.659006,0.830186
8,0.888885,0.592017,0.766144,0.496984,0.602533,0.7707,0.76484,0.660426,0.807225,0.558516,0.659237,0.831005
9,0.878456,0.597868,0.766257,0.504379,0.608047,0.775519,0.764534,0.665107,0.784994,0.562225,0.65434,0.833648


In [60]:
metrics_list = [metrics_orig, metrics_ins, metrics_sub]
name_list = ['fit_metrics_orig.csv', 'fit_metrics_ins.csv', 'fit_metrics_sub.csv']

i = 0
for m in metrics_list:
    df = pd.DataFrame(m)
    df.to_csv(name_list[i])
    i += 1

        