### Synonym Replacement - BERT Base Uncased

#### Un-augmented test set
#### Augment only the training set

#### Get Original Paper Data

In [1]:
# !pip install sklearn
# !pip install ekphrasis
# !pip install transformers
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

import transformers

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras import backend as K
from tensorflow import keras

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

import logging
tf.get_logger().setLevel(logging.ERROR)

In [3]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [4]:
encoder = LabelEncoder()
encoder.classes_ = np.load('../Data/classes.npy', allow_pickle=True)

In [5]:
tf.__version__

'2.6.0'

In [6]:
transformers.__version__

'4.16.2'

In [7]:
# set random seed

import random

random.seed(42)

In [8]:
import sys

sys.path.append('../Preprocess')

from dataCollect import *

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading english - 1grams ...


In [9]:
params = {'data_file' : '../Data/dataset.json', 'class_names' : '../Data/classes.npy'}

raw_data = get_annotated_data(params)

raw_data_final = raw_data[['post_id', 'text', 'target1', 'target2', 'target3', 'rationales', 'final_label']]

raw_data_final

Unnamed: 0,post_id,text,target1,target2,target3,rationales,final_label
0,1179055004553900032_twitter,"[i, dont, think, im, getting, my, baby, them, ...",[None],[None],[None],[],normal
1,1179063826874032128_twitter,"[we, cannot, continue, calling, ourselves, fem...",[None],[None],[None],[],normal
2,1178793830532956161_twitter,"[nawt, yall, niggers, ignoring, me]",[African],[None],[African],[],normal
3,1179088797964763136_twitter,"[<user>, i, am, bit, confused, coz, chinese, p...",[Asian],[Asian],[Asian],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",hatespeech
4,1179085312976445440_twitter,"[this, bitch, in, whataburger, eating, a, burg...","[Caucasian, Women]","[Women, Caucasian]","[Women, Caucasian]","[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",hatespeech
...,...,...,...,...,...,...,...
20143,9989999_gab,"[if, ur, still, on, twitter, tell, carlton, i,...","[Men, Women, Other]",[None],[None],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,...",offensive
20144,9990225_gab,"[when, i, first, got, on, here, and, said, i, ...",[African],"[African, Other]",[None],"[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,...",offensive
20145,9991681_gab,"[was, macht, der, moslem, wenn, der, zion, geg...",[Islam],[Other],[None],[],normal
20146,9992513_gab,"[it, is, awful, look, at, world, demographics,...",[Hispanic],[Asian],[Asian],"[[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,...",hatespeech


In [10]:
def listToString(s): 
    
    # initialize an empty string
    str1 = "" 
    
    # traverse in the string  
    for ele in s: 
        str1 += ele
        str1 += ' '
    
    # return string  
    return str1 
        

In [11]:
# Select appropriate columns for original paper data
# re-assemble token list to text

raw_data_filtered = raw_data[raw_data['final_label'] != 'undecided']

orig_text_token = raw_data_filtered['text']

orig_text = [listToString(s) for s in orig_text_token]

orig_post_id = raw_data_filtered['post_id']

orig_labels = raw_data_filtered['final_label'].to_list()

### Test Train Split Original Data on post_id

In [12]:
# stratified test train split, return train, dev, and test sets

def create_train_dev_test(text, labels, dev_size, test_size):
    t = dev_size + test_size
    r = dev_size/t
    X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=t, stratify=labels)
    X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test)
    
    return X_train, X_dev, X_test, y_train, y_dev, y_test

In [13]:
X_train_id, X_dev_id, X_test_id, y_train, y_dev, y_test = create_train_dev_test(orig_post_id, orig_labels, 0.1, 0.1)

In [14]:
#len(X_train_df)

In [15]:
x_train_df = pd.DataFrame({'post_id' : X_train_id.to_list()})
x_dev_df = pd.DataFrame({'post_id' : X_dev_id.to_list()})
x_test_df = pd.DataFrame({'post_id' : X_test_id.to_list()})

X_train_df = pd.merge(x_train_df, raw_data_final, how='inner', on='post_id')
X_dev_df = pd.merge(x_dev_df, raw_data_final, how='inner', on='post_id')
X_test_df = pd.merge(x_test_df, raw_data_final, how='inner', on='post_id')

X_train_text = [listToString(s) for s in X_train_df['text']]
X_dev_text= [listToString(s) for s in X_dev_df['text']]
X_test_text = [listToString(s) for s in X_test_df['text']]

print(len(X_train_text))
print(len(X_dev_text))
print(len(X_test_text))

15383
1923
1923


### Get Augmented Data

In [16]:
# load augmented datasets generated by EDA
# sr = synonym replacement
# ri = random synonym insertion
# rs = random swap
# rd = random deletion
# dataframe name format: method_number 

sr_1_df = pd.read_csv('../test_data_set/EDA_5_0_7_sr_rest_0_1.csv')
ri_1_df = pd.read_csv('../test_data_set/EDA_5_0_7_ri_rest_0_1.csv')
rs_1_df = pd.read_csv('../test_data_set/EDA_5_0_7_rs_rest_0_1.csv')
rd_1_df = pd.read_csv('../test_data_set/EDA_5_0_7_rd_rest_0_1.csv')
all_1_df = pd.read_csv('../test_data_set/EDA_5_all_0_1s.csv')
all_5_df = pd.read_csv('../test_data_set/EDA_5_all_0_5s.csv')

# remove undecided labeled examples
sr_1_df_filtered = sr_1_df[sr_1_df['final_label'] != 'undecided']
ri_1_df_filtered = ri_1_df[ri_1_df['final_label'] != 'undecided']
rs_1_df_filtered = rs_1_df[rs_1_df['final_label'] != 'undecided']
rd_1_df_filtered = rd_1_df[rd_1_df['final_label'] != 'undecided']
all_1_df_filtered = all_1_df[all_1_df['final_label'] != 'undecided']
all_5_df_filtered = all_5_df[all_5_df['final_label'] != 'undecided']

len(sr_1_df_filtered)

115374

In [17]:
# separate train, dev, test for each set
sr_1_df_train = sr_1_df_filtered[sr_1_df_filtered['post_id'].isin(X_train_id)]
ri_1_df_train = ri_1_df_filtered[ri_1_df_filtered['post_id'].isin(X_train_id)]
rs_1_df_train = rs_1_df_filtered[rs_1_df_filtered['post_id'].isin(X_train_id)]
rd_1_df_train = rd_1_df_filtered[rd_1_df_filtered['post_id'].isin(X_train_id)]
all_1_df_train = all_1_df_filtered[all_1_df_filtered['post_id'].isin(X_train_id)]
all_5_df_train = all_5_df_filtered[all_5_df_filtered['post_id'].isin(X_train_id)]

# select text sets

aug_sr_text = sr_1_df_train['text_str'].to_list()
aug_ri_text = ri_1_df_train['text_str'].to_list()
aug_rs_text = rs_1_df_train['text_str'].to_list()
aug_rd_text = rd_1_df_train['text_str'].to_list()
aug_all_1_text = all_1_df_train['text_str'].to_list()
aug_all_5_text = all_5_df_train['text_str'].to_list()

# select label sets

aug_sr_labels = sr_1_df_train['final_label']
aug_ri_labels = ri_1_df_train['final_label']
aug_rs_labels = rs_1_df_train['final_label']
aug_rd_labels = rd_1_df_train['final_label']
aug_all_1_labels = all_1_df_train['final_label']
aug_all_5_labels = all_5_df_train['final_label']

len(aug_sr_text)

92298

In [18]:
# # combine with original data

# # leave in list format
# aug_sr_text = aug_sr_text + orig_text
# aug_ri_text = aug_ri_text + orig_text
# aug_rs_text = aug_rs_text + orig_text
# aug_rd_text = aug_rd_text + orig_text
# aug_all_1_text = aug_all_1_text + orig_text
# aug_all_5_text = aug_all_5_text + orig_text

# # in Series format
# aug_sr_labels = pd.Series(aug_sr_labels + orig_labels)
# aug_ri_labels = pd.Series(aug_ri_labels + orig_labels)
# aug_rs_labels = pd.Series(aug_rs_labels + orig_labels)
# aug_rd_labels = pd.Series(aug_rd_labels + orig_labels)
# aug_all_1_labels = pd.Series(aug_all_1_labels + orig_labels)
# aug_all_5_labels = pd.Series(aug_all_5_labels + orig_labels)

# len(aug_sr_text)

#### Convert labels to one-hot encoding

In [19]:
# convert class label to 1 hot encoding

original_labels = pd.Series(orig_labels)


def convert_to_oh(S):
    '''takes a pandas series of text labels and returns one hot encoding equivalent
    0 = normal, 1 = offensive, 2 = hatespeech
    ''' 
    S_numerical = S.apply(lambda x: 0 if x=='normal' else (1 if x=='offensive' else 2))
    S_oh = keras.utils.to_categorical(S_numerical, num_classes = 3, dtype = 'float32')
    return S_oh
    
# original dataset - train, dev, and train
y_train_orig = convert_to_oh(pd.Series(y_train))
y_dev_orig = convert_to_oh(pd.Series(y_dev))
y_test_orig = convert_to_oh(pd.Series(y_test))

# augmented with sr = 0.1
y_train_aug_sr = convert_to_oh(aug_sr_labels)

# augmented with ri = 0.1
y_train_aug_ri = convert_to_oh(aug_ri_labels)

# augmented with rs = 0.1
y_train_aug_rs = convert_to_oh(aug_rs_labels)

# augmented with rd = 0.1
y_train_aug_rd = convert_to_oh(aug_rd_labels)

# augmented with all = 0.1
y_train_all_1 = convert_to_oh(aug_all_1_labels)

# augmented with all = 0.5
y_train_all_5 = convert_to_oh(aug_all_5_labels)


In [20]:
len(aug_ri_text)

92298

In [21]:
len(y_train_aug_ri)

92298

In [22]:
y_train_aug_ri

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

### BERT Model

In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
#bert_model = TFBertModel.from_pretrained('bert-base-uncased')

In [24]:
aug_sr_text

['we arent continue calling ourselves feminists if the rights of all womxn to addressed yes list a sexual offences public cannot but will a trans lesbian bisexual on queer womxn be able to enter their information and the reporting sheet gender forum',
 'we cannot continue feminists ourselves calling if the be of list womxn arent addressed yes to a sexual offences public but all will a trans lesbian bisexual and queer womxn rights able to enter their information on the reporting sheet gender forum',
 'we cannot keep on vocation ourselves feminist if the right wing of all womxn arent come up to yes to a intimate discourtesy populace lean but will a trans sapphic epicene and bilk womxn be able bodied to go in their info on the account sheet of paper sexuality meeting place',
 'we cannot continue calling ourselves feminists if the rights of all womxn arent yes to a sexual offences public list but a trans bisexual and queer womxn be able to enter their information on reporting gender forum'

In [25]:
max_length = 128

def bert_tokenize(train_set, dev_set, test_set, max_length):
    
    train = tokenizer(train_set, max_length=max_length, truncation=True, padding='max_length', return_tensors='tf')
    dev = tokenizer(dev_set, max_length=max_length, truncation=True, padding='max_length', return_tensors='tf')
    test = tokenizer(test_set, max_length=max_length, truncation=True, padding='max_length', return_tensors='tf')
    
    return train, dev, test

X_train_orig, X_dev_orig, X_test_orig = bert_tokenize(X_train_text, X_dev_text, X_test_text, max_length)

X_train_aug_sr, X_dev_aug_sr, X_test_aug_sr = bert_tokenize(aug_sr_text, X_dev_text, X_test_text, max_length)

X_train_aug_ri, X_dev_aug_ri, X_test_aug_ri = bert_tokenize(aug_ri_text, X_dev_text, X_test_text, max_length)

X_train_aug_rs, X_dev_aug_rs, X_test_aug_rs = bert_tokenize(aug_rs_text, X_dev_text, X_test_text, max_length)

X_train_aug_rd, X_dev_aug_rd, X_test_aug_rd = bert_tokenize(aug_rd_text, X_dev_text, X_test_text, max_length)

X_train_all_1, X_dev_all_1, X_test_all_1 = bert_tokenize(aug_all_1_text, X_dev_text, X_test_text, max_length)

X_train_all_5, X_dev_all_5, X_test_all_5 = bert_tokenize(aug_all_5_text, X_dev_text, X_test_text, max_length)


In [26]:
X_train_orig.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [27]:
X_train_aug_sr.input_ids

<tf.Tensor: shape=(92298, 128), dtype=int32, numpy=
array([[ 101, 2057, 4995, ...,    0,    0,    0],
       [ 101, 2057, 3685, ...,    0,    0,    0],
       [ 101, 2057, 3685, ...,    0,    0,    0],
       ...,
       [ 101, 1996, 3795, ...,    0,    0,    0],
       [ 101, 1996, 3644, ...,    0,    0,    0],
       [ 101, 1996, 3644, ...,    0,    0,    0]])>

In [28]:
X_train_aug_ri.token_type_ids

<tf.Tensor: shape=(92298, 128), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>

In [29]:
X_train_all_1.attention_mask

<tf.Tensor: shape=(92298, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>

In [30]:
X_train_all_5.input_ids

<tf.Tensor: shape=(92298, 128), dtype=int32, numpy=
array([[  101, 22437,  1037, ...,     0,     0,     0],
       [  101,  3613,  4214, ...,     0,     0,     0],
       [  101,  2247,  2247, ...,     0,     0,     0],
       ...,
       [  101,  1996, 10620, ...,     0,     0,     0],
       [  101,  1996,  3644, ...,     0,     0,     0],
       [  101,  1996,  3644, ...,     0,     0,     0]])>

In [31]:
from keras import backend as K

def balanced_recall(y_true, y_pred):
    """This function calculates the balanced recall metric
    recall = TP / (TP + FN)
    """
    recall_by_class = 0
    # iterate over each predicted class to get class-specific metric
    for i in range(y_pred.shape[1]):
        y_pred_class = y_pred[:, i]
        y_true_class = y_true[:, i]
        true_positives = K.sum(K.round(K.clip(y_true_class * y_pred_class, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true_class, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        recall_by_class = recall_by_class + recall
    return recall_by_class / y_pred.shape[1]

def balanced_precision(y_true, y_pred):
    """This function calculates the balanced precision metric
    precision = TP / (TP + FP)
    """
    precision_by_class = 0
    # iterate over each predicted class to get class-specific metric
    for i in range(y_pred.shape[1]):
        y_pred_class = y_pred[:, i]
        y_true_class = y_true[:, i]
        true_positives = K.sum(K.round(K.clip(y_true_class * y_pred_class, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred_class, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        precision_by_class = precision_by_class + precision
    # return average balanced metric for each class
    return precision_by_class / y_pred.shape[1]

def balanced_f1_score(y_true, y_pred):
    """This function calculates the F1 score metric"""
    precision = balanced_precision(y_true, y_pred)
    recall = balanced_recall(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

In [32]:
def create_classification_model(bert_model, hidden_size = 5, 
                                train_layers = -1, 
                                optimizer=tf.keras.optimizers.Adam()):
    """
    Build a simple classification model with BERT. Let's keep it simple and don't add dropout, layer norms, etc.
    """

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                  'token_type_ids': token_type_ids,
                  'attention_mask': attention_mask}


    #restrict training to the train_layers outer transformer layers
    if not train_layers == -1:

            retrain_layers = []

            for retrain_layer_number in range(train_layers):

                layer_code = '_' + str(11 - retrain_layer_number)
                retrain_layers.append(layer_code)

            for w in bert_model.weights:
                if not any([x in w.name for x in retrain_layers]):
                    w._trainable = False


    bert_out = bert_model(bert_inputs)
    
    net = bert_out[0]
    
    classification_token = tf.keras.layers.Lambda(lambda x: x[:,0,:], name='get_first_vector')(net)
    
    dropout1 = tf.keras.layers.Dropout(0.4, name="dropout1")(classification_token)
    
    hidden = tf.keras.layers.Dense(hidden_size, name='hidden_layer')(dropout1)
    
    dropout2 = tf.keras.layers.Dropout(0.4, name="dropout2")(hidden)

    classification = tf.keras.layers.Dense(3, activation='sigmoid',name='classification_layer')(dropout2)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], 
                                          outputs=[classification])
    
    METRICS = [tf.keras.metrics.CategoricalAccuracy(name="accuracy"), 
               balanced_recall, 
               balanced_precision, 
               balanced_f1_score,
               tf.keras.metrics.AUC(curve='ROC', name="auc_roc")]
    
    
    classification_model.compile(optimizer=optimizer,
                            loss=tf.keras.losses.CategoricalCrossentropy(),
                            metrics= METRICS)


    return classification_model




#     classification_model.compile(optimizer=optimizer,
#                             loss=tf.keras.losses.CategoricalCrossentropy(),
#                             metrics=tf.keras.metrics.CategoricalAccuracy('accuracy'))

In [33]:
def fine_tune_BERT(x_train, x_dev, x_test, y_train, y_dev, y_test, learning_rate = 5e-05, 
                   epsilon=1e-08, train_layers = -1, epochs = 10, batch_size = 16):
    ''' Fine tunes BERT base uncased with given data, allows your to set some hyperparameters
        returns test set accuracy, f1 score, and AUC_ROC score
    '''
    try:
        del classification_model
    except:
        pass

    try:
        del bert_model
    except:
        pass
    
    tf.keras.backend.clear_session()
    bert_model = TFBertModel.from_pretrained('bert-large-uncased')

    earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", 
                                                      patience = 3,
                                                      restore_best_weights = True)

    classification_model = create_classification_model(bert_model, 
                                                       optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon),
                                                       train_layers=train_layers)    
    
    model_fit = classification_model.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train,
                         validation_data=([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev),
                        epochs=epochs,
                        batch_size=batch_size,
                        callbacks = [earlystop_callback])
    
    y_preds_array = classification_model.predict([x_test.input_ids, x_test.token_type_ids, x_test.attention_mask])

    # convert to predicted one-hot encoding

    from keras.utils.np_utils import to_categorical
    y_preds = to_categorical(np.argmax(y_preds_array, 1), dtype = "int64")

    # convert back to labels

    y_test_cat = np.argmax(y_test, axis=1)
    y_preds_cat = np.argmax(y_preds, axis=1)
    
    # calculate metrics
    Accuracy = accuracy_score(y_test_cat, y_preds_cat)

    Macro_F1 = f1_score(y_test_cat, y_preds_cat, average='macro')

    ROC_AUC = roc_auc_score(y_test, y_preds, multi_class='ovo',average='macro')
    
    metrics_history = model_fit.history
    
    return Accuracy, Macro_F1, ROC_AUC, metrics_history

In [35]:
%%time

# original data set
Accuracy_orig, Macro_F1_orig, ROC_AUC_orig, metrics_orig = fine_tune_BERT(X_train_orig, X_dev_orig, X_test_orig, 
                                                            y_train_orig, y_dev_orig, y_test_orig, 
                                                            learning_rate = 5e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 20, batch_size = 48)

# augmented with sr = 0.1
Accuracy_aug_sr, Macro_F1_aug_sr, ROC_AUC_aug_sr, metrics_sr = fine_tune_BERT(X_train_aug_sr, X_dev_aug_sr, X_test_aug_sr, 
                                                            y_train_aug_sr, y_dev_orig, y_test_orig, 
                                                            learning_rate = 5e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 20, batch_size = 48)

# augmented with ri = 0.1
Accuracy_aug_ri, Macro_F1_aug_ri, ROC_AUC_aug_ri, metrics_ri = fine_tune_BERT(X_train_aug_ri, X_dev_aug_ri, X_test_aug_ri, 
                                                            y_train_aug_ri, y_dev_orig, y_test_orig, 
                                                            learning_rate = 5e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 20, batch_size = 48)

# augmented with rs = 0.1
Accuracy_aug_rs, Macro_F1_aug_rs, ROC_AUC_aug_rs, metrics_rs = fine_tune_BERT(X_train_aug_rs, X_dev_aug_rs, X_test_aug_rs, 
                                                            y_train_aug_rs, y_dev_orig, y_test_orig, 
                                                            learning_rate = 5e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 20, batch_size = 48)

# augmented with rd = 0.1
Accuracy_aug_rd, Macro_F1_aug_rd, ROC_AUC_aug_rd, metrics_rd = fine_tune_BERT(X_train_aug_rd, X_dev_aug_rd, X_test_aug_rd, 
                                                            y_train_aug_rd, y_dev_orig, y_test_orig,
                                                            learning_rate = 5e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 20, batch_size = 48)

# augmented with all = 0.1
Accuracy_aug_all_1, Macro_F1_aug_all_1, ROC_AUC_aug_all_1, metrics_all_1 = fine_tune_BERT(X_train_all_1, X_dev_all_1, X_test_all_1, 
                                                            y_train_all_1, y_dev_orig, y_test_orig, 
                                                            learning_rate = 5e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 20, batch_size = 48)

# augmented with all = 0.5
Accuracy_aug_all_5, Macro_F1_aug_all_5, ROC_AUC_aug_all_5, metrics_all_5 = fine_tune_BERT(X_train_all_5, X_dev_all_5, X_test_all_5, 
                                                            y_train_all_5, y_dev_orig, y_test_orig, 
                                                            learning_rate = 5e-05, epsilon=1e-08, 
                                                            train_layers = 1, epochs = 20, batch_size = 48)

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Wall time: 13h 57min 39s


In [36]:
Accuracy_orig, Macro_F1_orig, ROC_AUC_orig, metrics_orig

(0.6609464378575143,
 0.6319246919999925,
 0.7326758982835102,
 {'loss': [1.414437174797058,
   1.2325364351272583,
   1.13459050655365,
   1.130818486213684,
   1.0874961614608765,
   1.0560382604599,
   1.0214968919754028,
   0.9854006767272949,
   0.9597710967063904,
   0.9445297122001648,
   1.0980370044708252,
   1.0452228784561157,
   0.981499969959259,
   0.9482216238975525,
   0.9320251941680908,
   0.9102798104286194,
   0.8996171355247498,
   0.8778436779975891,
   0.8663511872291565,
   0.8538827896118164],
  'accuracy': [0.37924981117248535,
   0.40310731530189514,
   0.42709484696388245,
   0.4508223235607147,
   0.4779301881790161,
   0.48833128809928894,
   0.510498583316803,
   0.5326659083366394,
   0.5521679520606995,
   0.5612689256668091,
   0.5004875659942627,
   0.5054280757904053,
   0.5413768291473389,
   0.5581486225128174,
   0.5746603608131409,
   0.5830461978912354,
   0.5898719429969788,
   0.6011180877685547,
   0.6129493713378906,
   0.6200351119041443],


In [37]:
Accuracy_aug_sr, Macro_F1_aug_sr, ROC_AUC_aug_sr, metrics_sr

(0.6703068122724909,
 0.6538685661420338,
 0.7450707431347788,
 {'loss': [1.1054279804229736,
   0.9279928207397461,
   0.8564785718917847,
   0.8107340931892395,
   0.7710567116737366,
   0.7259699702262878,
   0.6634292006492615],
  'accuracy': [0.4498472213745117,
   0.5655810236930847,
   0.6128085255622864,
   0.6387462615966797,
   0.6589092016220093,
   0.6806864738464355,
   0.7096686959266663],
  'balanced_recall': [0.4705999493598938,
   0.5378668308258057,
   0.6412853002548218,
   0.6750226020812988,
   0.6892650127410889,
   0.7149829864501953,
   0.7492078542709351],
  'balanced_precision': [0.409149706363678,
   0.48599928617477417,
   0.4892413914203644,
   0.49492835998535156,
   0.5056651830673218,
   0.5193694829940796,
   0.5352498888969421],
  'balanced_f1_score': [0.43542370200157166,
   0.5083931088447571,
   0.5535715818405151,
   0.5698105692863464,
   0.5818925499916077,
   0.6002241969108582,
   0.6231601238250732],
  'auc_roc': [0.6076003313064575,
   0.7021

In [38]:
Accuracy_aug_ri, Macro_F1_aug_ri, ROC_AUC_aug_ri, metrics_ri

(0.6448257930317213,
 0.5888863055619634,
 0.7136823288681992,
 {'loss': [0.9977815747261047,
   0.848426342010498,
   0.7991135716438293,
   0.7769655585289001,
   0.7588010430335999,
   0.7368476390838623,
   0.711444616317749],
  'accuracy': [0.5153090953826904,
   0.6095581650733948,
   0.6384970545768738,
   0.6512166857719421,
   0.662029504776001,
   0.6740015745162964,
   0.6886281371116638],
  'balanced_recall': [0.7195204496383667,
   0.7729053497314453,
   0.7849979996681213,
   0.7913492918014526,
   0.7885153889656067,
   0.7843513488769531,
   0.7920415997505188],
  'balanced_precision': [0.3967333734035492,
   0.4356802999973297,
   0.4325752258300781,
   0.4377046823501587,
   0.4441376328468323,
   0.4495792090892792,
   0.45542436838150024],
  'balanced_f1_score': [0.5104688405990601,
   0.5563392639160156,
   0.556854248046875,
   0.5626671314239502,
   0.5672370195388794,
   0.5705582499504089,
   0.5773205757141113],
  'auc_roc': [0.6407245993614197,
   0.707136094

In [39]:
Accuracy_aug_rs, Macro_F1_aug_rs, ROC_AUC_aug_rs, metrics_rs

(0.6775871034841394,
 0.6542451466652167,
 0.7465263218765769,
 {'loss': [1.0660706758499146,
   0.8527681827545166,
   0.7865257859230042,
   0.7437723875045776,
   0.7178320288658142,
   0.6820753812789917],
  'accuracy': [0.47251296043395996,
   0.6175540089607239,
   0.6562764048576355,
   0.6796788573265076,
   0.6960497498512268,
   0.7139157652854919],
  'balanced_recall': [0.6313719749450684,
   0.7743516564369202,
   0.8015339374542236,
   0.8229678869247437,
   0.8323614001274109,
   0.8414154648780823],
  'balanced_precision': [0.3908116817474365,
   0.448280394077301,
   0.46522659063339233,
   0.47511905431747437,
   0.48521748185157776,
   0.4926108121871948],
  'balanced_f1_score': [0.4816170334815979,
   0.5669378638267517,
   0.5878910422325134,
   0.601685106754303,
   0.6122819185256958,
   0.6206327080726624],
  'auc_roc': [0.6162319183349609,
   0.7302096486091614,
   0.7563493251800537,
   0.7717620730400085,
   0.7827745676040649,
   0.7944461703300476],
  'val_l

In [40]:
Accuracy_aug_rd, Macro_F1_aug_rd, ROC_AUC_aug_rd, metrics_rd

(0.6734269370774831,
 0.6482129102106291,
 0.7407903276343201,
 {'loss': [1.1635398864746094,
   1.0358493328094482,
   1.128196358680725,
   0.9664922952651978,
   0.8705714344978333,
   0.8428657054901123,
   0.8227418065071106,
   0.802208423614502,
   0.7920058369636536,
   0.7733984589576721,
   0.7601956725120544],
  'accuracy': [0.3703330457210541,
   0.46836334466934204,
   0.39625993371009827,
   0.5276170372962952,
   0.605982780456543,
   0.623599648475647,
   0.6348024606704712,
   0.6440117955207825,
   0.6512817144393921,
   0.6606968641281128,
   0.6664716601371765],
  'balanced_recall': [0.6582804918289185,
   0.6761072874069214,
   0.6070119142532349,
   0.6166681051254272,
   0.5802937746047974,
   0.5663964748382568,
   0.5621505379676819,
   0.5638511776924133,
   0.549810528755188,
   0.5466416478157043,
   0.5568165183067322],
  'balanced_precision': [0.3352229595184326,
   0.34241780638694763,
   0.3367694616317749,
   0.34790241718292236,
   0.35342496633529663,

In [41]:
Accuracy_aug_all_1, Macro_F1_aug_all_1, ROC_AUC_aug_all_1, metrics_all_1

(0.6692667706708268,
 0.6651056681811542,
 0.7508844564104412,
 {'loss': [1.0542728900909424,
   0.8443873524665833,
   0.7781355381011963,
   0.7510735988616943,
   0.6788820624351501,
   0.6138486862182617],
  'accuracy': [0.4936618208885193,
   0.627055823802948,
   0.666374146938324,
   0.6828208565711975,
   0.7142516374588013,
   0.7447073459625244],
  'balanced_recall': [0.6870146989822388,
   0.7157942652702332,
   0.7388201355934143,
   0.7321839928627014,
   0.7397295236587524,
   0.74458247423172],
  'balanced_precision': [0.411501944065094,
   0.5028978586196899,
   0.5256931185722351,
   0.5356272459030151,
   0.5532181859016418,
   0.5672118067741394],
  'balanced_f1_score': [0.5125657320022583,
   0.5897437334060669,
   0.6132490634918213,
   0.6174566745758057,
   0.6317146420478821,
   0.6426250338554382],
  'auc_roc': [0.6440902352333069,
   0.7572376132011414,
   0.784781813621521,
   0.7935385704040527,
   0.8110112547874451,
   0.8251016139984131],
  'val_loss': [0

In [42]:
Accuracy_aug_all_5, Macro_F1_aug_all_5, ROC_AUC_aug_all_5, metrics_all_5

(0.6801872074882995,
 0.6553712454377197,
 0.748144649373535,
 {'loss': [1.1269656419754028,
   0.9892517924308777,
   0.9310850501060486,
   0.8949293494224548,
   0.8623104095458984,
   0.8377537727355957,
   0.8240257501602173,
   0.8075839281082153,
   0.8223223686218262,
   0.7890941500663757,
   0.7513064742088318,
   0.7109605073928833,
   0.6454113721847534],
  'accuracy': [0.4057617783546448,
   0.5073782801628113,
   0.5612147450447083,
   0.5876400470733643,
   0.6147153973579407,
   0.6277925968170166,
   0.635680079460144,
   0.645236074924469,
   0.6352142095565796,
   0.6527660489082336,
   0.6727989912033081,
   0.6956380605697632,
   0.7260503768920898],
  'balanced_recall': [0.6335519552230835,
   0.6663182377815247,
   0.6404587626457214,
   0.6408832669258118,
   0.6626099348068237,
   0.6521501541137695,
   0.6525240540504456,
   0.65675288438797,
   0.6455704569816589,
   0.6332390308380127,
   0.648090124130249,
   0.639869749546051,
   0.6412532329559326],
  'ba

In [43]:
trial_name_list = ['Original Data', 'Augmented SR 0.1', 'Augmented RI 0.1', 
                   'Augmented RS 0.1', 'Augmented RD 0.1', 'Augmented All 0.1', 'Augmented All 0.5']

acc_list = [Accuracy_orig, Accuracy_aug_sr, Accuracy_aug_ri, Accuracy_aug_rs, 
            Accuracy_aug_rd, Accuracy_aug_all_1, Accuracy_aug_all_5]

macro_f1_list = [Macro_F1_orig, Macro_F1_aug_sr, Macro_F1_aug_ri, Macro_F1_aug_rs, 
                 Macro_F1_aug_rd, Macro_F1_aug_all_1, Macro_F1_aug_all_5]

roc_auc_list = [ROC_AUC_orig, ROC_AUC_aug_sr, ROC_AUC_aug_ri, ROC_AUC_aug_rs, 
                ROC_AUC_aug_rd, ROC_AUC_aug_all_1, ROC_AUC_aug_all_5]

In [44]:
result_dict = {'Trial Name' : trial_name_list, 'Test Accuracy Score' : acc_list, 
               'Test Macro F1 Score' : macro_f1_list, 'Test ROC AUC Score' : roc_auc_list}

In [45]:
results_df = pd.DataFrame(result_dict)

results_df

Unnamed: 0,Trial Name,Test Accuracy Score,Test Macro F1 Score,Test ROC AUC Score
0,Original Data,0.660946,0.631925,0.732676
1,Augmented SR 0.1,0.670307,0.653869,0.745071
2,Augmented RI 0.1,0.644826,0.588886,0.713682
3,Augmented RS 0.1,0.677587,0.654245,0.746526
4,Augmented RD 0.1,0.673427,0.648213,0.74079
5,Augmented All 0.1,0.669267,0.665106,0.750884
6,Augmented All 0.5,0.680187,0.655371,0.748145


In [47]:
results_df.to_csv('All_DA_BERT_large_uncased_unaug_test.csv')