### BERT Baseline - Base Uncased - Functional Based on Walkthrough

In [1]:
# !pip install sklearn
# !pip install ekphrasis
# !pip install transformers
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import transformers

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras import backend as K
from tensorflow import keras

import logging
tf.get_logger().setLevel(logging.ERROR)

In [3]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [6]:
encoder = LabelEncoder()
encoder.classes_ = np.load('../Data/classes.npy', allow_pickle=True)

In [7]:
tf.__version__

'2.6.0'

In [8]:
transformers.__version__

'4.16.2'

In [9]:
# set random seed

import random

random.seed(42)

In [10]:
import sys

sys.path.append('../Preprocess')

from dataCollect import *

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading english - 1grams ...


In [13]:
params = {'data_file' : '../Data/dataset.json', 'class_names' : '../Data/classes.npy'}

raw_data = get_annotated_data(params)

raw_data

Unnamed: 0,post_id,text,annotatorid1,target1,label1,annotatorid2,target2,label2,annotatorid3,target3,label3,rationales,final_label
0,1179055004553900032_twitter,"[i, dont, think, im, getting, my, baby, them, ...",1,[None],normal,2,[None],normal,3,[None],normal,[],normal
1,1179063826874032128_twitter,"[we, cannot, continue, calling, ourselves, fem...",1,[None],normal,2,[None],normal,3,[None],normal,[],normal
2,1178793830532956161_twitter,"[nawt, yall, niggers, ignoring, me]",4,[African],normal,2,[None],normal,3,[African],hatespeech,[],normal
3,1179088797964763136_twitter,"[<user>, i, am, bit, confused, coz, chinese, p...",1,[Asian],hatespeech,4,[Asian],offensive,3,[Asian],hatespeech,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",hatespeech
4,1179085312976445440_twitter,"[this, bitch, in, whataburger, eating, a, burg...",4,"[Caucasian, Women]",hatespeech,2,"[Women, Caucasian]",hatespeech,3,"[Women, Caucasian]",offensive,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",hatespeech
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20143,9989999_gab,"[if, ur, still, on, twitter, tell, carlton, i,...",217,"[Men, Women, Other]",offensive,199,[None],offensive,215,[None],normal,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,...",offensive
20144,9990225_gab,"[when, i, first, got, on, here, and, said, i, ...",220,[African],offensive,223,"[African, Other]",offensive,231,[None],normal,"[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,...",offensive
20145,9991681_gab,"[was, macht, der, moslem, wenn, der, zion, geg...",206,[Islam],offensive,203,[Other],normal,211,[None],normal,[],normal
20146,9992513_gab,"[it, is, awful, look, at, world, demographics,...",209,[Hispanic],hatespeech,253,[Asian],hatespeech,222,[Asian],offensive,"[[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,...",hatespeech


In [14]:
def listToString(s): 
    
    # initialize an empty string
    str1 = "" 
    
    # traverse in the string  
    for ele in s: 
        str1 += ele
        str1 += ' '
    
    # return string  
    return str1 
        

In [15]:
raw_data_filtered = raw_data[raw_data['final_label'] != 'undecided']

all_text_token = raw_data_filtered['text']

all_text = [listToString(s) for s in all_text_token]

all_labels = raw_data_filtered['final_label']

# convert class label to 1 hot encoding

all_labels_numerical = all_labels.apply(lambda x: 0 if x=='normal' else (1 if x=='offensive' else 2))

all_labels_oh = keras.utils.to_categorical(all_labels_numerical, num_classes = 3, dtype = 'float32')

In [16]:
all_text 

['i dont think im getting my baby them white 9 he has two white j and nikes not even touched ',
 'we cannot continue calling ourselves feminists if the rights of all womxn arent addressed yes to a sexual offences public list but will a trans lesbian bisexual and queer womxn be able to enter their information on the reporting sheet gender forum ',
 'nawt yall niggers ignoring me ',
 '<user> i am bit confused coz chinese ppl can not access twitter thn how this ching chong using it i think he pakistani 🤔 🤔 🤔 ',
 'this bitch in whataburger eating a burger with no top bun and holding it palm up i hate white bitches ',
 'laura loomer raped me while screaming at me in her disgusting kike language and said we must exterminate the goyim laura loomer loomergate ',
 'and this is why i end up with nigger trainee doctors who can not speak properly lack basic knowledge of biology it truly scary if the public only knew ',
 'nogs jews and dykes how enriching ',
 'guilty until proven innocent unless yo

In [17]:
all_labels_oh

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]], dtype=float32)

In [18]:
# test train split, 80-20 split (train, test)
X_train, X_test, y_train, y_test = train_test_split(all_text, all_labels_oh, test_size=0.2, stratify=all_labels_oh)

In [19]:
# test train split, split 20% test into 10% test, 10% dev

X_dev, X_test, y_dev, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test)

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [21]:
max_length = 128

x_train = tokenizer(X_train, 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

x_test = tokenizer(X_test, 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

x_dev = tokenizer(X_dev, 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

In [22]:
x_train.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [23]:
x_train.input_ids

<tf.Tensor: shape=(15383, 128), dtype=int32, numpy=
array([[  101,  1026,  5310, ...,     0,     0,     0],
       [  101,  2204,  2391, ...,     0,     0,     0],
       [  101,  2748, 17990, ...,     0,     0,     0],
       ...,
       [  101,  1026,  5310, ...,     0,     0,     0],
       [  101,  1057, 10587, ...,     0,     0,     0],
       [  101,  2057,  2439, ...,     0,     0,     0]])>

In [24]:
x_train.token_type_ids

<tf.Tensor: shape=(15383, 128), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>

In [25]:
x_train.attention_mask

<tf.Tensor: shape=(15383, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>

In [26]:
x_test.input_ids

<tf.Tensor: shape=(1923, 128), dtype=int32, numpy=
array([[  101,  2138,  6616, ...,     0,     0,     0],
       [  101,  1037, 22212, ...,     0,     0,     0],
       [  101, 27593, 15593, ...,     0,     0,     0],
       ...,
       [  101,  8398,  2005, ...,     0,     0,     0],
       [  101,  2183,  2000, ...,     0,     0,     0],
       [  101,  2339,  2024, ...,     0,     0,     0]])>

In [27]:
def create_classification_model(hidden_size = 5, 
                                train_layers = -1, 
                                optimizer=tf.keras.optimizers.Adam()):
    """
    Build a simple classification model with BERT. Let's keep it simple and don't add dropout, layer norms, etc.
    """

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                  'token_type_ids': token_type_ids,
                  'attention_mask': attention_mask}


    #restrict training to the train_layers outer transformer layers
    if not train_layers == -1:

            retrain_layers = []

            for retrain_layer_number in range(train_layers):

                layer_code = '_' + str(11 - retrain_layer_number)
                retrain_layers.append(layer_code)

            for w in bert_model.weights:
                if not any([x in w.name for x in retrain_layers]):
                    w._trainable = False


    bert_out = bert_model(bert_inputs)
    
    net = bert_out[0]
    
    classification_token = tf.keras.layers.Lambda(lambda x: x[:,0,:], name='get_first_vector')(net)
    
    dropout1 = tf.keras.layers.Dropout(0.4, name="dropout1")(classification_token)
    
    hidden = tf.keras.layers.Dense(hidden_size, name='hidden_layer')(dropout1)
    
    dropout2 = tf.keras.layers.Dropout(0.4, name="dropout2")(hidden)

    classification = tf.keras.layers.Dense(3, activation='sigmoid',name='classification_layer')(dropout2)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], 
                                          outputs=[classification])
    
    classification_model.compile(optimizer=optimizer,
                            loss=tf.keras.losses.CategoricalCrossentropy(),
                            metrics='Accuracy')


    return classification_model




#     classification_model.compile(optimizer=optimizer,
#                             loss=tf.keras.losses.CategoricalCrossentropy(),
#                             metrics=tf.keras.metrics.CategoricalAccuracy('accuracy'))

In [28]:
try:
    del classification_model
except:
    pass

try:
    del bert_model
except:
    pass

tf.keras.backend.clear_session()
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

classification_model = create_classification_model(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-05, 
                                                                                      epsilon=1e-08),
                                                  train_layers=1)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [29]:
classification_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask_layer (InputLaye [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_ids_layer (InputLayer)    [(None, 128)]        0                                            
__________________________________________________________________________________________________
token_type_ids_layer (InputLaye [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109482240   attention_mask_layer[0][0]       
                                                                 input_ids_layer[0][0]        

In [30]:
classification_model.fit([x_train.input_ids, x_train.token_type_ids, x_train.attention_mask],
                         y_train,
                         validation_data=([x_dev.input_ids, x_dev.token_type_ids, x_dev.attention_mask],
                         y_dev),
                        epochs=20,
                        batch_size=16)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x26536d2db80>

In [35]:
# Model predictions

y_preds_array = classification_model.predict([x_test.input_ids, x_test.token_type_ids, x_test.attention_mask])

# convert to predicted one-hot encoding

from keras.utils.np_utils import to_categorical
y_preds = to_categorical(np.argmax(y_preds_array, 1), dtype = "int64")

y_preds

array([[0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       ...,
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0]], dtype=int64)

In [36]:
# convert back to labels

y_test_cat = np.argmax(y_test, axis=1)
y_preds_cat = np.argmax(y_preds, axis=1)

In [37]:
# confusion matrix and classification report

from sklearn import metrics

# Print the confusion matrix
print(metrics.confusion_matrix(y_test_cat, y_preds_cat))

# Print the precision and recall, among other metrics
print(metrics.classification_report(y_test_cat, y_preds_cat, digits=3))

[[525 184  72]
 [149 286 113]
 [ 45  90 459]]
              precision    recall  f1-score   support

           0      0.730     0.672     0.700       781
           1      0.511     0.522     0.516       548
           2      0.713     0.773     0.742       594

    accuracy                          0.660      1923
   macro avg      0.651     0.656     0.653      1923
weighted avg      0.662     0.660     0.660      1923



In [38]:
# Scores

from sklearn.metrics import roc_auc_score

print("Accuracy Score:", round(metrics.accuracy_score(y_test_cat, y_preds_cat), 3))

print("Macro F1 Score:", round(metrics.f1_score(y_test_cat, y_preds_cat, average='macro'), 3))

print("ROC_AUC Score:", round(roc_auc_score(y_test, y_preds, multi_class='ovo',average='macro'), 3))

Accuracy Score: 0.66
Macro F1 Score: 0.653
ROC_AUC Score: 0.743
