### NLPAUG - BERT Base Uncased

#### Un-augmented test set
#### Augment only the training set

#### Get Original Paper Data

In [1]:
# !pip install sklearn
# !pip install ekphrasis
# !pip install transformers
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

import transformers

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras import backend as K
from tensorflow import keras

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

import os

import logging
tf.get_logger().setLevel(logging.ERROR)

In [3]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
encoder = LabelEncoder()
encoder.classes_ = np.load('../Data/classes.npy', allow_pickle=True)

In [5]:
tf.__version__

'2.6.0'

In [6]:
transformers.__version__

'4.16.2'

In [7]:
# test, train, dev examples from base notebook

train_data_df = pd.read_csv('./Saved_Models/EDA_base_uncased_5aug/All_DA_BERT_base_uncased_train_examples.csv')
dev_data_df = pd.read_csv('./Saved_Models/EDA_base_uncased_5aug/All_DA_BERT_base_uncased_dev_examples.csv')
test_data_df = pd.read_csv('./Saved_Models/EDA_base_uncased_5aug/All_DA_BERT_base_uncased_test_examples.csv')

In [8]:
train_data_df.head()

Unnamed: 0.1,Unnamed: 0,post_id,text,target1,target2,target3,rationales,final_label,text_combined
0,0,22448349_gab,"['common', 'core', 'weed', 'too', 'much', 'rit...","['Men', 'Women']",['Women'],['None'],[],normal,common core weed too much ritalan chem trails ...
1,1,1178948520201637888_twitter,"['took', 'my', 'nan', 'to', 'the', 'hospital',...",['None'],['None'],['None'],[],normal,took my nan to the hospital for a x ray i turn...
2,2,1482573_gab,"['<user>', 'well', 'not', 'really', 'islam', '...",['Islam'],['Other'],['Islam'],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...",offensive,<user> well not really islam does not care for...
3,3,1097184028149587969_twitter,"['<user>', 'france', 'in', '<number>', 'after'...","['Islam', 'Other']",['Islam'],['Islam'],[],normal,<user> france in <number> after muslims take o...
4,4,1089569255111176192_twitter,"['i', 'will', 'not', 'tolerate', 'non', 'arab'...","['Arab', 'Men', 'Women']",['Arab'],"['Arab', 'Islam']","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,...",hatespeech,i will not tolerate non arab women slandering ...


In [9]:
X_train_id = train_data_df['post_id']
X_test_id = test_data_df['post_id']
X_dev_id = dev_data_df['post_id']

In [10]:
y_train = train_data_df['final_label']
y_test = test_data_df['final_label']
y_dev = dev_data_df['final_label']

In [11]:
# x_train_df = pd.DataFrame({'post_id' : X_train_id.to_list()})
# x_dev_df = pd.DataFrame({'post_id' : X_dev_id.to_list()})
# x_test_df = pd.DataFrame({'post_id' : X_test_id.to_list()})

# X_train_df = pd.merge(x_train_df, raw_data_final, how='inner', on='post_id')
# X_dev_df = pd.merge(x_dev_df, raw_data_final, how='inner', on='post_id')
# X_test_df = pd.merge(x_test_df, raw_data_final, how='inner', on='post_id')

X_train_text = train_data_df['text_combined'].to_list()
X_dev_text= dev_data_df['text_combined'].to_list()
X_test_text = test_data_df['text_combined'].to_list()

print(len(X_train_text))
print(len(X_dev_text))
print(len(X_test_text))

15383
1923
1923


In [12]:
original_train_data_df = train_data_df[['text_combined', 'final_label']]
original_train_data_df.head()

Unnamed: 0,text_combined,final_label
0,common core weed too much ritalan chem trails ...,normal
1,took my nan to the hospital for a x ray i turn...,normal
2,<user> well not really islam does not care for...,offensive
3,<user> france in <number> after muslims take o...,normal
4,i will not tolerate non arab women slandering ...,hatespeech


In [13]:
original_train_data_df['final_label'].value_counts()

normal        6251
hatespeech    4748
offensive     4384
Name: final_label, dtype: int64

### Get Augmented Data

In [14]:
nlpaug_ins_set1_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set1_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set2_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set2_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set3_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set3_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set4_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set4_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set5_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set5_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set6_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set6_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set7_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set7_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set8_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set8_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set9_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set9_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_ins_set10_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_ins_set10_0_2_df.csv')[['text_combined', 'final_label']]

nlpaug_sub_set1_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set1_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set2_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set2_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set3_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set3_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set4_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set4_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set5_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set5_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set6_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set6_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set7_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set7_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set8_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set8_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set9_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set9_0_2_df.csv')[['text_combined', 'final_label']]
nlpaug_sub_set10_df = pd.read_csv('../test_data_set/NLPAUG/NLPAUG_sub_set10_0_2_df.csv')[['text_combined', 'final_label']]

In [15]:
nlpaug_ins_set1_df.head()

Unnamed: 0,text_combined,final_label
0,common core to weed too [UNK] much ritalan che...,normal
1,took my nan to the hospital for a x s ray i tu...,normal
2,< from user > well no not really islam does no...,offensive
3,< specific user > france in < series number > ...,normal
4,i will surely not tolerate non arab women slan...,hatespeech


In [16]:
# combine sets

# # 10 augmentations
# ins_frames = [original_train_data_df, nlpaug_ins_set1_df, nlpaug_ins_set2_df, nlpaug_ins_set3_df, 
#               nlpaug_ins_set4_df, nlpaug_ins_set5_df, nlpaug_ins_set6_df, nlpaug_ins_set7_df, 
#               nlpaug_ins_set8_df, nlpaug_ins_set9_df, nlpaug_ins_set10_df]

# sub_frames = [original_train_data_df, nlpaug_sub_set1_df, nlpaug_sub_set2_df, nlpaug_sub_set3_df, 
#               nlpaug_sub_set4_df, nlpaug_sub_set5_df, nlpaug_sub_set6_df, nlpaug_sub_set7_df, 
#               nlpaug_sub_set8_df, nlpaug_sub_set9_df, nlpaug_sub_set10_df]

# 7 augmentations
ins_frames = [original_train_data_df, nlpaug_ins_set1_df, nlpaug_ins_set2_df, nlpaug_ins_set3_df, 
              nlpaug_ins_set4_df, nlpaug_ins_set5_df, nlpaug_ins_set6_df, nlpaug_ins_set7_df]

sub_frames = [original_train_data_df, nlpaug_sub_set1_df, nlpaug_sub_set2_df, nlpaug_sub_set3_df, 
              nlpaug_sub_set4_df, nlpaug_sub_set5_df, nlpaug_sub_set6_df, nlpaug_sub_set7_df]

# # 5 augmentations
# ins_frames = [original_train_data_df, nlpaug_ins_set1_df, nlpaug_ins_set2_df, nlpaug_ins_set3_df, 
#               nlpaug_ins_set4_df, nlpaug_ins_set5_df]

# sub_frames = [original_train_data_df, nlpaug_sub_set1_df, nlpaug_sub_set2_df, nlpaug_sub_set3_df, 
#               nlpaug_sub_set4_df, nlpaug_sub_set5_df]

# # 2 augmentations
# ins_frames = [original_train_data_df, nlpaug_ins_set1_df, nlpaug_ins_set2_df]
# sub_frames = [original_train_data_df, nlpaug_sub_set1_df, nlpaug_sub_set2_df]



ins_train_df = pd.concat(ins_frames)
sub_train_df = pd.concat(sub_frames)

In [17]:
len(ins_train_df)

123064

In [18]:
len(sub_train_df)

123064

In [19]:
X_ins_train = ins_train_df['text_combined'].to_list()
X_sub_train = sub_train_df['text_combined'].to_list()

y_ins_train = ins_train_df['final_label']
y_sub_train = sub_train_df['final_label']

#### Convert labels to one-hot encoding

In [20]:
# convert class label to 1 hot encoding

def convert_to_oh(S):
    '''takes a pandas series of text labels and returns one hot encoding equivalent
    0 = normal, 1 = offensive, 2 = hatespeech
    ''' 
    S_numerical = S.apply(lambda x: 0 if x=='normal' else (1 if x=='offensive' else 2))
    S_oh = keras.utils.to_categorical(S_numerical, num_classes = 3, dtype = 'float32')
    return S_oh
    
# original dataset - train, dev, and train
y_train_orig = convert_to_oh(pd.Series(y_train))
y_dev_orig = convert_to_oh(pd.Series(y_dev))
y_test_orig = convert_to_oh(pd.Series(y_test))

# augmented with contextual insertion
y_ins_train_oh = convert_to_oh(y_ins_train)

# augmented with contextual substitution
y_sub_train_oh = convert_to_oh(y_sub_train)


In [21]:
len(y_ins_train_oh)

123064

In [22]:
len(y_sub_train_oh)

123064

In [23]:
y_sub_train_oh

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

### BERT Model

In [24]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#bert_model = TFBertModel.from_pretrained('bert-base-uncased')

In [25]:
max_length = 128

def bert_tokenize(train_set, dev_set, test_set, max_length):
    
    train = tokenizer(train_set, max_length=max_length, truncation=True, padding='max_length', return_tensors='tf')
    dev = tokenizer(dev_set, max_length=max_length, truncation=True, padding='max_length', return_tensors='tf')
    test = tokenizer(test_set, max_length=max_length, truncation=True, padding='max_length', return_tensors='tf')
    
    return train, dev, test

X_train_orig, X_dev_orig, X_test_orig = bert_tokenize(X_train_text, X_dev_text, X_test_text, max_length)

X_train_aug_ins, X_dev_aug_ins, X_test_aug_ins = bert_tokenize(X_ins_train, X_dev_text, X_test_text, max_length)

X_train_aug_sub, X_dev_aug_sub, X_test_aug_sub = bert_tokenize(X_sub_train, X_dev_text, X_test_text, max_length)


In [26]:
#tokenizer.save_pretrained("./Tokenizer_ALL_EDA_BERT_base_uncased")

In [27]:
X_train_orig.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [28]:
X_train_aug_ins.input_ids

<tf.Tensor: shape=(123064, 128), dtype=int32, numpy=
array([[  101,  2691,  4563, ...,     0,     0,     0],
       [  101,  2165,  2026, ...,     0,     0,     0],
       [  101,  1026,  5310, ...,     0,     0,     0],
       ...,
       [  101, 22814,  2103, ...,     0,     0,     0],
       [  101,  1037,  5152, ...,     0,     0,     0],
       [  101,  6014,  1045, ...,     0,     0,     0]])>

In [29]:
X_train_aug_sub.token_type_ids

<tf.Tensor: shape=(123064, 128), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>

In [30]:
X_train_aug_ins.attention_mask

<tf.Tensor: shape=(123064, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>

In [31]:
X_train_aug_sub.input_ids

<tf.Tensor: shape=(123064, 128), dtype=int32, numpy=
array([[  101,  2691,  4563, ...,     0,     0,     0],
       [  101,  2165,  2026, ...,     0,     0,     0],
       [  101,  1026,  5310, ...,     0,     0,     0],
       ...,
       [  101, 22814,  1998, ...,     0,     0,     0],
       [  101,  5152, 12290, ...,     0,     0,     0],
       [  101,  1045,  8239, ...,     0,     0,     0]])>

In [32]:
from keras import backend as K

def balanced_recall(y_true, y_pred):
    """This function calculates the balanced recall metric
    recall = TP / (TP + FN)
    """
    recall_by_class = 0
    # iterate over each predicted class to get class-specific metric
    for i in range(y_pred.shape[1]):
        y_pred_class = y_pred[:, i]
        y_true_class = y_true[:, i]
        true_positives = K.sum(K.round(K.clip(y_true_class * y_pred_class, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true_class, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        recall_by_class = recall_by_class + recall
    return recall_by_class / y_pred.shape[1]

def balanced_precision(y_true, y_pred):
    """This function calculates the balanced precision metric
    precision = TP / (TP + FP)
    """
    precision_by_class = 0
    # iterate over each predicted class to get class-specific metric
    for i in range(y_pred.shape[1]):
        y_pred_class = y_pred[:, i]
        y_true_class = y_true[:, i]
        true_positives = K.sum(K.round(K.clip(y_true_class * y_pred_class, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred_class, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        precision_by_class = precision_by_class + precision
    # return average balanced metric for each class
    return precision_by_class / y_pred.shape[1]

def balanced_f1_score(y_true, y_pred):
    """This function calculates the F1 score metric"""
    precision = balanced_precision(y_true, y_pred)
    recall = balanced_recall(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

In [33]:
def create_classification_model(bert_model, hidden_size = 5, 
                                train_layers = -1, 
                                optimizer=tf.keras.optimizers.Adam()):
    """
    Build a simple classification model with BERT. Let's keep it simple and don't add dropout, layer norms, etc.
    """

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                  'token_type_ids': token_type_ids,
                  'attention_mask': attention_mask}


    #restrict training to the train_layers outer transformer layers
    if not train_layers == -1:

            retrain_layers = []

            for retrain_layer_number in range(train_layers):

                layer_code = '_' + str(11 - retrain_layer_number)
                retrain_layers.append(layer_code)

            for w in bert_model.weights:
                if not any([x in w.name for x in retrain_layers]):
                    w._trainable = False


    bert_out = bert_model(bert_inputs)
    
    net = bert_out[0]
    
    classification_token = tf.keras.layers.Lambda(lambda x: x[:,0,:], name='get_first_vector')(net)
    
    dropout1 = tf.keras.layers.Dropout(0.4, name="dropout1")(classification_token)
    
    hidden = tf.keras.layers.Dense(hidden_size, name='hidden_layer')(dropout1)
    
    dropout2 = tf.keras.layers.Dropout(0.4, name="dropout2")(hidden)

    classification = tf.keras.layers.Dense(3, activation='sigmoid',name='classification_layer')(dropout2)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], 
                                          outputs=[classification])
    
    METRICS = [tf.keras.metrics.CategoricalAccuracy(name="accuracy"), 
               balanced_recall, 
               balanced_precision, 
               balanced_f1_score,
               tf.keras.metrics.AUC(curve='ROC', name="auc_roc")]
    
    
    classification_model.compile(optimizer=optimizer,
                            loss=tf.keras.losses.CategoricalCrossentropy(),
                            metrics= METRICS)


    return classification_model




#     classification_model.compile(optimizer=optimizer,
#                             loss=tf.keras.losses.CategoricalCrossentropy(),
#                             metrics=tf.keras.metrics.CategoricalAccuracy('accuracy'))

In [34]:
def fine_tune_BERT(x_train, x_dev, x_test, y_train, y_dev, y_test, name, learning_rate = 5e-05, 
                   epsilon=1e-08, train_layers = -1, epochs = 10, batch_size = 16):
    ''' Fine tunes BERT base uncased with given data, allows your to set some hyperparameters
        returns test set accuracy, f1 score, and AUC_ROC score
    '''
    try:
        del classification_model
    except:
        pass

    try:
        del bert_model
    except:
        pass
    
    tf.keras.backend.clear_session()
    bert_model = TFBertModel.from_pretrained('bert-base-uncased')

    # early stopping callback
    
    earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor = 'val_accuracy', 
                                                      patience = 4,
                                                      restore_best_weights = True)
    
    # Create a callback that saves the model's weights
    
    path_name = './Saved_Models/NLA_b_uncased_7aug/' + name + '/' + name

    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=path_name, 
                                                     save_weights_only=True,
                                                     verbose=1,
                                                     monitor='val_accuracy',
                                                     save_best_only=True)
    
    # create classification model
    classification_model = create_classification_model(bert_model, 
                                                       optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon),
                                                       train_layers=train_layers)    
    
    model_fit = classification_model.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train,
                         validation_data=([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev),
                        epochs=epochs,
                        batch_size=batch_size,
                        callbacks = [earlystop_callback, cp_callback])
    
    y_preds_array = classification_model.predict([x_test.input_ids, x_test.token_type_ids, x_test.attention_mask])

    # convert to predicted one-hot encoding

    from keras.utils.np_utils import to_categorical
    y_preds = to_categorical(np.argmax(y_preds_array, 1), dtype = "int64")

    # convert back to labels

    y_test_cat = np.argmax(y_test, axis=1)
    y_preds_cat = np.argmax(y_preds, axis=1)
    
    # calculate metrics
    Accuracy = accuracy_score(y_test_cat, y_preds_cat)

    Macro_F1 = f1_score(y_test_cat, y_preds_cat, average='macro')

    ROC_AUC = roc_auc_score(y_test, y_preds, multi_class='ovo',average='macro')
    
    metrics_history = model_fit.history
    
    return Accuracy, Macro_F1, ROC_AUC, metrics_history

In [36]:
%%time
# original data set
Accuracy_orig, Macro_F1_orig, ROC_AUC_orig, metrics_orig = fine_tune_BERT(X_train_orig, X_dev_orig, X_test_orig, 
                                                            y_train_orig, y_dev_orig, y_test_orig, 'orig_data_base',
                                                            learning_rate = 2e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 30, batch_size = 64)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/30

Epoch 00001: val_accuracy improved from -inf to 0.55226, saving model to ./Saved_Models/NLA_b_uncased_7aug/orig_data_base\orig_data_base
Epoch 2/30

Epoch 00002: val_accuracy improved from 0.55226 to 0.59594, saving model to ./Saved_Models/NLA_b_uncased_7aug/orig_data_base\orig_data_base
Epoch 3/30

Epoch 00003: val_accuracy improved from 0.59594 to 0.61258, saving model to ./Saved_Models/NLA_b_uncased_7aug/orig_data_base\orig_data_base
Epoch 4/30

Epoch 00004: val_accuracy improved from 0.61258 to 0.63287, saving model to ./Saved_Models/NLA_b_uncased_7aug/orig_data_base\orig_data_base
Epoch 5/30

Epoch 00005: val_accuracy improved from 0.63287 to 0.64275, saving model to ./Saved_Models/NLA_b_uncased_7aug/orig_data_base\orig_data_base
Epoch 6/30

Epoch 00006: val_accuracy improved from 0.64275 to 0.65055, saving model to ./Saved_Models/NLA_b_uncased_7aug/orig_data_base\orig_data_base
Epoch 7/30

Epoch 00007: val_accuracy improved from 0.65055 to 0.65159, saving model to ./S


Epoch 00017: val_accuracy improved from 0.67603 to 0.68435, saving model to ./Saved_Models/NLA_b_uncased_7aug/orig_data_base\orig_data_base
Epoch 18/30

Epoch 00018: val_accuracy did not improve from 0.68435
Epoch 19/30

Epoch 00019: val_accuracy did not improve from 0.68435
Epoch 20/30

Epoch 00020: val_accuracy did not improve from 0.68435
Epoch 21/30

Epoch 00021: val_accuracy improved from 0.68435 to 0.68747, saving model to ./Saved_Models/NLA_b_uncased_7aug/orig_data_base\orig_data_base
Epoch 22/30

Epoch 00022: val_accuracy did not improve from 0.68747
Epoch 23/30

Epoch 00023: val_accuracy did not improve from 0.68747
Epoch 24/30

Epoch 00024: val_accuracy improved from 0.68747 to 0.68799, saving model to ./Saved_Models/NLA_b_uncased_7aug/orig_data_base\orig_data_base
Epoch 25/30

Epoch 00025: val_accuracy did not improve from 0.68799
Epoch 26/30

Epoch 00026: val_accuracy did not improve from 0.68799
Epoch 27/30

Epoch 00027: val_accuracy did not improve from 0.68799
Epoch 28/

In [37]:
%%time
# augmented with contextual word insertion
Accuracy_aug_ins, Macro_F1_aug_ins, ROC_AUC_aug_ins, metrics_ins = fine_tune_BERT(X_train_aug_ins, X_dev_aug_ins, X_test_aug_ins, 
                                                            y_ins_train_oh, y_dev_orig, y_test_orig, 'NLA_ins_base', 
                                                            learning_rate = 2e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 30, batch_size = 64)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/30

Epoch 00001: val_accuracy improved from -inf to 0.65835, saving model to ./Saved_Models/NLA_b_uncased_7aug/NLA_ins_base\NLA_ins_base
Epoch 2/30

Epoch 00002: val_accuracy improved from 0.65835 to 0.68123, saving model to ./Saved_Models/NLA_b_uncased_7aug/NLA_ins_base\NLA_ins_base
Epoch 3/30

Epoch 00003: val_accuracy did not improve from 0.68123
Epoch 4/30

Epoch 00004: val_accuracy did not improve from 0.68123
Epoch 5/30

Epoch 00005: val_accuracy did not improve from 0.68123
Epoch 6/30

Epoch 00006: val_accuracy did not improve from 0.68123
Wall time: 32min 16s


In [38]:
%%time
# augmented with contextual word substitution
Accuracy_aug_sub, Macro_F1_aug_sub, ROC_AUC_aug_sub, metrics_sub = fine_tune_BERT(X_train_aug_sub, X_dev_aug_sub, X_test_aug_sub, 
                                                            y_sub_train_oh, y_dev_orig, y_test_orig, 'NLA_sub_base', 
                                                            learning_rate = 2e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 30, batch_size = 64)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/30

Epoch 00001: val_accuracy improved from -inf to 0.63963, saving model to ./Saved_Models/NLA_b_uncased_7aug/NLA_sub_base\NLA_sub_base
Epoch 2/30

Epoch 00002: val_accuracy improved from 0.63963 to 0.66043, saving model to ./Saved_Models/NLA_b_uncased_7aug/NLA_sub_base\NLA_sub_base
Epoch 3/30

Epoch 00003: val_accuracy improved from 0.66043 to 0.66979, saving model to ./Saved_Models/NLA_b_uncased_7aug/NLA_sub_base\NLA_sub_base
Epoch 4/30

Epoch 00004: val_accuracy did not improve from 0.66979
Epoch 5/30

Epoch 00005: val_accuracy improved from 0.66979 to 0.67967, saving model to ./Saved_Models/NLA_b_uncased_7aug/NLA_sub_base\NLA_sub_base
Epoch 6/30

Epoch 00006: val_accuracy did not improve from 0.67967
Epoch 7/30

Epoch 00007: val_accuracy did not improve from 0.67967
Epoch 8/30

Epoch 00008: val_accuracy did not improve from 0.67967
Epoch 9/30

Epoch 00009: val_accuracy did not improve from 0.67967
Wall time: 52min 23s


In [39]:
Accuracy_orig, Macro_F1_orig, ROC_AUC_orig, metrics_orig

(0.703068122724909,
 0.690205079744718,
 0.7691058819451243,
 {'loss': [1.221578598022461,
   1.0894231796264648,
   1.0119553804397583,
   0.9666054844856262,
   0.9280912280082703,
   0.907863438129425,
   0.897736668586731,
   0.8641343116760254,
   0.851116418838501,
   0.8380746841430664,
   0.8335153460502625,
   0.8157700896263123,
   0.8141983151435852,
   0.8020946383476257,
   0.7916494607925415,
   0.7746538519859314,
   0.7766710519790649,
   0.752069890499115,
   0.7583571076393127,
   0.743678092956543,
   0.7477236986160278,
   0.723038375377655,
   0.7163364887237549,
   0.7102758288383484,
   0.6982280015945435,
   0.701815664768219,
   0.6914464235305786,
   0.6741040349006653],
  'accuracy': [0.41597867012023926,
   0.48306572437286377,
   0.5294805765151978,
   0.5467724204063416,
   0.5674445629119873,
   0.5861665606498718,
   0.592147171497345,
   0.6056035757064819,
   0.6171748042106628,
   0.6204901337623596,
   0.6264708042144775,
   0.6267307996749878,
   0.

In [40]:
Accuracy_aug_ins, Macro_F1_aug_ins, ROC_AUC_aug_ins, metrics_ins

(0.6931877275091004,
 0.6772416323251157,
 0.7613586457377055,
 {'loss': [0.9941275119781494,
   0.8390501737594604,
   0.7714720964431763,
   0.7255303263664246,
   0.6789470314979553,
   0.6313865184783936],
  'accuracy': [0.5268071889877319,
   0.6211158633232117,
   0.6582916378974915,
   0.6838393211364746,
   0.7068760991096497,
   0.7322937846183777],
  'balanced_recall': [0.514661967754364,
   0.5694782137870789,
   0.576718807220459,
   0.5959262251853943,
   0.6238436102867126,
   0.6543011665344238],
  'balanced_precision': [0.45118430256843567,
   0.46170127391815186,
   0.45762139558792114,
   0.467529833316803,
   0.47906777262687683,
   0.4924197196960449],
  'balanced_f1_score': [0.47884345054626465,
   0.5085262060165405,
   0.5089513659477234,
   0.5226735472679138,
   0.5405933856964111,
   0.5607804656028748],
  'auc_roc': [0.652643620967865,
   0.6678628325462341,
   0.6611381769180298,
   0.6725638508796692,
   0.6901378035545349,
   0.7109382748603821],
  'val_lo

In [41]:
Accuracy_aug_sub, Macro_F1_aug_sub, ROC_AUC_aug_sub, metrics_sub

(0.6968278731149246,
 0.6806514041387136,
 0.7673139976027917,
 {'loss': [1.0900989770889282,
   0.9486828446388245,
   0.904265284538269,
   0.8716813921928406,
   0.8450084328651428,
   0.8172816634178162,
   0.7915228605270386,
   0.7639266848564148,
   0.7376047968864441],
  'accuracy': [0.4749155044555664,
   0.5540856719017029,
   0.5824042558670044,
   0.6008012294769287,
   0.6180524230003357,
   0.6328739523887634,
   0.647370457649231,
   0.6616557240486145,
   0.6782405972480774],
  'balanced_recall': [0.5957022309303284,
   0.6492529511451721,
   0.6705367565155029,
   0.6868736147880554,
   0.7063277363777161,
   0.7223654985427856,
   0.7361578345298767,
   0.7505174875259399,
   0.765038251876831],
  'balanced_precision': [0.41497358679771423,
   0.46997198462486267,
   0.49226096272468567,
   0.5075451731681824,
   0.5199236273765564,
   0.5299041867256165,
   0.542458176612854,
   0.5506762266159058,
   0.5604746341705322],
  'balanced_f1_score': [0.4885994791984558,
 

In [42]:
trial_name_list = ['Original Data', 'Aug Contextual Ins', 'Aug Contextual Sub']

acc_list = [Accuracy_orig, Accuracy_aug_ins, Accuracy_aug_sub]

macro_f1_list = [Macro_F1_orig, Macro_F1_aug_ins, Macro_F1_aug_sub]

roc_auc_list = [ROC_AUC_orig, ROC_AUC_aug_ins, ROC_AUC_aug_sub]


In [43]:
result_dict = {'Trial Name' : trial_name_list, 'Test Accuracy Score' : acc_list, 
               'Test Macro F1 Score' : macro_f1_list, 'Test ROC AUC Score' : roc_auc_list}

In [44]:
results_df = pd.DataFrame(result_dict)

results_df

Unnamed: 0,Trial Name,Test Accuracy Score,Test Macro F1 Score,Test ROC AUC Score
0,Original Data,0.703068,0.690205,0.769106
1,Aug Contextual Ins,0.693188,0.677242,0.761359
2,Aug Contextual Sub,0.696828,0.680651,0.767314


In [50]:
results_df.to_csv('./Saved_Models/NLA_b_uncased_7aug/All_NLA_BERT_base_uncased.csv')

In [49]:
metrics_org_df = pd.DataFrame(metrics_orig)

metrics_org_df

Unnamed: 0,loss,accuracy,balanced_recall,balanced_precision,balanced_f1_score,auc_roc,val_loss,val_accuracy,val_balanced_recall,val_balanced_precision,val_balanced_f1_score,val_auc_roc
0,1.221579,0.415979,0.550148,0.386916,0.453543,0.589448,0.93907,0.552262,0.639782,0.491838,0.55531,0.724094
1,1.089423,0.483066,0.632816,0.436892,0.516369,0.660733,0.866831,0.595944,0.792324,0.501178,0.612661,0.764484
2,1.011955,0.529481,0.707279,0.463571,0.559598,0.701929,0.831169,0.612585,0.802817,0.516703,0.627293,0.783952
3,0.966605,0.546772,0.72339,0.477743,0.574834,0.722412,0.81539,0.632865,0.764501,0.538485,0.630948,0.798358
4,0.928091,0.567445,0.741247,0.488633,0.588463,0.736602,0.790993,0.642746,0.812034,0.521577,0.633838,0.801836
5,0.907863,0.586167,0.75798,0.496148,0.599278,0.747566,0.779591,0.650546,0.81506,0.527232,0.638952,0.810263
6,0.897737,0.592147,0.762987,0.49573,0.600585,0.749263,0.772066,0.651586,0.803534,0.535509,0.6415,0.813298
7,0.864134,0.605604,0.774701,0.500991,0.608001,0.762195,0.767739,0.654706,0.854412,0.514094,0.640293,0.808661
8,0.851116,0.617175,0.782603,0.504961,0.613359,0.765922,0.759929,0.660426,0.836635,0.528399,0.646238,0.816757
9,0.838075,0.62049,0.788568,0.505723,0.615694,0.77079,0.763784,0.660946,0.844426,0.520159,0.642268,0.816209


In [48]:
metrics_list = [metrics_orig, metrics_ins, metrics_sub]
name_list = ['fit_metrics_orig.csv', 'fit_metrics_ins.csv', 'fit_metrics_sub.csv']

i = 0
for m in metrics_list:
    df = pd.DataFrame(m)
    df.to_csv(name_list[i])
    i += 1

        