Necessary imports

In [1]:
#importing the necessary packages and libraries
import pandas as pd
import numpy as np

import torch
from torchvision import datasets, transforms
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torchvision.utils import make_grid

import os
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import xml.etree.cElementTree as et
from collections import defaultdict

Installing transformers and importing the tokenizer as well as the model of DistilBERT




In [2]:
#installing transformers
!pip3 install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 28.9 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 13.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [3]:
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, DistilBertConfig, DistilBertTokenizerFast, TFDistilBertModel
from transformers import Trainer, TrainingArguments

In [4]:
model_name = "distilbert-base-uncased" #setting model name as the transformer we want to use, i.e. distilBERT

In [5]:
max_length = 128 #setting a value for max_length which is the max number of words to tokenize in a given text

Reading the training, validation and testing files

In [37]:
#reading the input files
training_df = pd.read_csv('training_df.csv')
validation_df = pd.read_csv('validation_df.csv')
testing_df = pd.read_csv('testing_df.csv')

In [38]:
#replacing the null strings by empty strings. This is required to use a transformer
training_df = training_df.fillna('')
validation_df = validation_df.fillna('')
testing_df = testing_df.fillna('')

In [39]:
# over- and under-sampling to help address class imbalance
positive_samples = training_df.loc[training_df["Ground_Truth"] == 1]
negative_samples = training_df.loc[training_df["Ground_Truth"] == 0]
training_df = pd.concat([positive_samples, positive_samples, negative_samples.sample(frac=0.2)], ignore_index=True)

In [40]:
len(training_df.loc[training_df["Ground_Truth"]==1])/len(training_df.loc[training_df["Ground_Truth"] == 0])

0.9629773211862764

In [41]:
train_text = training_df['Comments'].values.tolist()
val_text = validation_df['Comments'].values.tolist()
test_text = testing_df['Comments'].values.tolist()

In [42]:
train_labels = torch.tensor(training_df['Ground_Truth'].values.tolist())
val_labels = torch.tensor(validation_df['Ground_Truth'].values.tolist())
test_labels = torch.tensor(testing_df['Ground_Truth'].values.tolist())

Batch encoding

In [43]:
## writing a function to perform batch encoding of texts
## input: tokenizer object, list of text strings, batch size and max_length
# output: senquence of texts and corresponding attention masks encoded as tensor objects
def batch_encode(tokenizer, texts, batch_size=256, max_length=max_length):
    input_ids = []
    attention_mask = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer.batch_encode_plus(batch,
                                             max_length=max_length, 
                                             padding='max_length', #dynamic padding 
                                             truncation=True, 
                                             return_attention_mask=True, #to return attention mask
                                             return_token_type_ids=False, #to return the token type IDS
                                             )
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])
    
    
    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)

In [44]:
#initializing tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

Implementing the batch_encode function defined above to tokenize the train, validation and test text

In [45]:
X_train_ids, X_train_attention = batch_encode(tokenizer, train_text) # X_train_attention

In [46]:
X_val_ids, X_val_attention = batch_encode(tokenizer, val_text[:20000]) # X_val_attention

In [78]:
X_test_ids, X_test_attention = batch_encode(tokenizer, test_text[:50000])

Setting an intial configuration for the pretrained DistilBERT Model

In [48]:
config = DistilBertConfig(dropout=0.2, #dropout for fully connected layers
                          attention_dropout=0.2, #dropout for attention probabilities
                          output_hidden_states=True) #returning the hidden states of all layers
distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)
for layer in distilBERT.layers:
    layer.trainable = False #freezing the layers

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_transform', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Building the model

In [49]:
max_length = 128
dropout = 0.2
lr = 0.001
random_state = 42 #ensuring reproducibility

# defining a function to build a model of DistilBERT architecture
# input: a base Huggingface transformer model without a classification head, max_length
# output: a compiled keras model with custom classification layers built on top of the base architecture

def build_model(transformer, max_length=max_length):
    
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=random_state) 
    
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,), 
                                                  name='input_attention', 
                                                  dtype='int32')

    last_hidden_state = transformer([input_ids_layer, input_attention_layer])[0] #the element at index 0 is the hidden state at the output of the model's last layer
    
    cls_token = last_hidden_state[:, 0, :]

    x = tf.keras.layers.Dense(8, activation=tf.nn.relu)(cls_token)
    x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(x)
    
    output = tf.keras.layers.Dense(1, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,  
                                   kernel_constraint=None,
                                   bias_initializer='zeros'
                                   )(x) #single node for binary classification

    model = tf.keras.Model([input_ids_layer, input_attention_layer], output) #model definiton

    model.compile(tf.keras.optimizers.Adam(learning_rate=lr), #model compiling using Adam optimizer, binary cross entropy loss and binary accuracy metric
                  loss=tf.keras.losses.BinaryFocalCrossentropy(),
                  metrics=[tf.keras.metrics.BinaryAccuracy()])
    
    return model

In [50]:
model = build_model(distilBERT, max_length=max_length) #building model

Training the model

In [51]:
epochs = 10
batch_size = 64
num_steps = len(training_df['Comments'].index) // batch_size

# Train the model
training = model.fit(
    x = [X_train_ids, X_train_attention],
    y = train_labels.numpy(),
    epochs = epochs,
    batch_size = batch_size,
    steps_per_epoch = num_steps,
    validation_data = ([X_val_ids, X_val_attention], val_labels[:20000].numpy()),
    verbose=2
)

Epoch 1/10
474/474 - 220s - loss: 0.1412 - binary_accuracy: 0.7143 - val_loss: 0.1611 - val_binary_accuracy: 0.7094 - 220s/epoch - 464ms/step
Epoch 2/10
474/474 - 212s - loss: 0.1334 - binary_accuracy: 0.7365 - val_loss: 0.1389 - val_binary_accuracy: 0.7520 - 212s/epoch - 448ms/step
Epoch 3/10
474/474 - 213s - loss: 0.1318 - binary_accuracy: 0.7403 - val_loss: 0.1211 - val_binary_accuracy: 0.8099 - 213s/epoch - 448ms/step
Epoch 4/10
474/474 - 212s - loss: 0.1304 - binary_accuracy: 0.7460 - val_loss: 0.1428 - val_binary_accuracy: 0.7384 - 212s/epoch - 448ms/step
Epoch 5/10
474/474 - 212s - loss: 0.1295 - binary_accuracy: 0.7468 - val_loss: 0.1532 - val_binary_accuracy: 0.7132 - 212s/epoch - 448ms/step
Epoch 6/10
474/474 - 212s - loss: 0.1286 - binary_accuracy: 0.7502 - val_loss: 0.1341 - val_binary_accuracy: 0.7656 - 212s/epoch - 448ms/step
Epoch 7/10
474/474 - 213s - loss: 0.1274 - binary_accuracy: 0.7536 - val_loss: 0.1379 - val_binary_accuracy: 0.7549 - 213s/epoch - 449ms/step
Epoch 

In [52]:
model.save_weights("transformer_ci3_weights.h5") #saving the weights of the model

In [53]:
model.load_weights("transformer_ci3_weights.h5")

In [54]:
results = model.evaluate(x = [X_test_ids, X_test_attention],y = np.array(test_labels[:10000]),batch_size = batch_size)



In [95]:
predictions= model.predict([X_val_ids[:20000], X_val_attention[:20000]])

In [96]:
prediction_labels = np.argmax(predictions, axis=1)

In [97]:
set(prediction_labels)

{0}

In [98]:
true_labels = np.array(val_labels[:20000])

In [99]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, balanced_accuracy_score, f1_score
print("Accuracy-Score of Model is: ", accuracy_score(true_labels,prediction_labels))
print("Balanced-Accuracy-Score of Model is: ", balanced_accuracy_score(true_labels,prediction_labels))
print("Recall-Score of Model is: ", recall_score(true_labels,prediction_labels,zero_division=1))
print("Precision-Score of Model is: ", precision_score(true_labels,prediction_labels,zero_division=1))
print("F1-Score of Model is: ", f1_score(true_labels,prediction_labels))

Accuracy-Score of Model is:  0.8856
Balanced-Accuracy-Score of Model is:  0.5
Recall-Score of Model is:  0.0
Precision-Score of Model is:  1.0
F1-Score of Model is:  0.0
