This code was adapted from @abhimishra91's multi-label classification tutorial (from the GitHub repository abhimishra91/transformers-tutorials, opened in Colab): https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb#scrollTo=Ov1_3R_pAcMo.

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# dataset stored on Drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory and file name of the tab-separated dataset; the two are concatenated
# when the file is loaded.
# NOTE(review): DATA_PATH is relative while Drive is mounted at /content/drive,
# so this presumably relies on the working directory being /content — confirm.
DATA_PATH = "drive/My Drive/colab/DeepNLP/data/"
FILE_NAME = "sc_join_gte4_onehot.tsv"

In [None]:
# Install the libraries imported below (quietly, via -q).
# NOTE(review): versions are not pinned, so results may differ between runs — consider pinning.
!pip install -q transformers
!pip install -q tensorflow

[K     |████████████████████████████████| 2.3MB 10.0MB/s 
[K     |████████████████████████████████| 3.3MB 34.3MB/s 
[K     |████████████████████████████████| 901kB 45.7MB/s 
[?25h

In [None]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
import tensorflow as tf

In [None]:
from torch import cuda

# Abort early unless TensorFlow reports the first GPU of the runtime.
device_name = tf.test.gpu_device_name()
if not device_name == '/device:GPU:0':
  raise SystemError('GPU device not found')
print(f'Found GPU at: {device_name}')

# PyTorch side: count the visible GPUs and pick the device that the model and
# every batch are moved to; the last line displays the name of GPU 0.
n_gpu = torch.cuda.device_count()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.get_device_name(0)

Found GPU at: /device:GPU:0


'Tesla P100-PCIE-16GB'

In [None]:
# Read the tab-separated dataset and discard the 'relation' column, keeping the
# sentence text and the one-hot label columns.
df = pd.read_csv(DATA_PATH + FILE_NAME, sep="\t").drop(columns=["relation"])
df.head()

Unnamed: 0,sentence,R-2-3,R-11-50,R-4-50,R-1-3,R-1-4,R-5-5,R-2-6,R-5-6,R-6-50,R-11-12,R-12-2,R-5-50,R-3-50,R-4-5,R-3-6,R-3-4,R-1-5,R-5-4,R-3-5,R-1-50,R-2-5,R-50-5,R-6-5,R-2-1,R-4-6,R-12-3,R-2-50,R-11-3,R-2-4,R-11-5,R-12-5,R-50-2,R-12-50,R-1-2
0,"This essay is about skin damage, latitude and ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,The skin damage is on our bodies that have num...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,There are three main varieties of skin cancer ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,That would be what skin damage is.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Latitude and direct sunlight would be the cols...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Label columns are selected by name rather than by position: everything except
# the input text and the derived 'relations' column (added to df further down).
# This keeps the cell idempotent when it is re-run after 'relations' exists.
cols = list(df)
label_cols = [col for col in cols if col not in ("sentence", "relations")]
num_labels = len(label_cols)
label_cols[:10]

['R-2-3',
 'R-11-50',
 'R-4-50',
 'R-1-3',
 'R-1-4',
 'R-5-5',
 'R-2-6',
 'R-5-6',
 'R-6-50',
 'R-11-12']

In [None]:
# Convert all one-hot label columns from float to int in a single assignment.
df[label_cols] = df[label_cols].astype(int)
df.head()

Unnamed: 0,sentence,R-2-3,R-11-50,R-4-50,R-1-3,R-1-4,R-5-5,R-2-6,R-5-6,R-6-50,R-11-12,R-12-2,R-5-50,R-3-50,R-4-5,R-3-6,R-3-4,R-1-5,R-5-4,R-3-5,R-1-50,R-2-5,R-50-5,R-6-5,R-2-1,R-4-6,R-12-3,R-2-50,R-11-3,R-2-4,R-11-5,R-12-5,R-50-2,R-12-50,R-1-2
0,"This essay is about skin damage, latitude and ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,The skin damage is on our bodies that have num...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,There are three main varieties of skin cancer ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,That would be what skin damage is.,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Latitude and direct sunlight would be the cols...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [None]:
# Pack each row's label values into one array, stored in a single 'relations' column.
df['relations'] = [row for row in df[label_cols].to_numpy()]

In [None]:
# Independent copy holding only the model input (sentence) and target (relations).
df2 = df[['sentence', 'relations']].copy(deep=True)
df2.head(5)

Unnamed: 0,sentence,relations
0,"This essay is about skin damage, latitude and ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,The skin damage is on our bodies that have num...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,There are three main varieties of skin cancer ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,That would be what skin damage is.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Latitude and direct sunlight would be the cols...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 128              # token length every sentence is truncated/padded to by the dataset
TRAIN_BATCH_SIZE = 32      # batch size of the training DataLoader
VALID_BATCH_SIZE = 4       # batch size of the test DataLoader
EPOCHS = 4                 # number of passes over the training data
LEARNING_RATE = 2e-5       # learning rate handed to the Adam optimizer
# Tokenizer for the same checkpoint name that the model loads ('bert-base-uncased').
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
class CustomDataset(Dataset):
    """Dataset that tokenizes sentences on the fly for multi-label training.

    Args:
        dataframe: frame with a ``sentence`` column (text) and a ``relations``
            column (one sequence of 0/1 label values per row).
        tokenizer: object exposing ``encode_plus`` that returns ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: length every encoded sentence is truncated/padded to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.sentence = dataframe.sentence
        self.targets = self.data.relations
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.sentence)

    def __getitem__(self, index):
        """Return the encoded sentence and its label vector at position ``index``.

        Rows are looked up by position (``.iloc``), so the dataset also works
        when the frame's index is not a clean 0..n-1 range.

        Returns:
            dict with long tensors ``ids``, ``mask``, ``token_type_ids`` and a
            float tensor ``targets``.
        """
        sentence = str(self.sentence.iloc[index])
        # Collapse any run of whitespace into a single space before tokenizing.
        sentence = " ".join(sentence.split())

        inputs = self.tokenizer.encode_plus(
            sentence,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
# Split df2 into train/test frames and wrap each one in a CustomDataset.

train_size = 0.8
# Reproducible random sample for training; every remaining row goes to test.
# Both frames get a fresh 0..n-1 index.
train_dataset = df2.sample(frac=train_size, random_state=42)
test_dataset = df2.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print(f"FULL Dataset: {df2.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (10874, 2)
TRAIN Dataset: (8699, 2)
TEST Dataset: (2175, 2)


In [None]:
# Training batches are reshuffled on every pass over the loader.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation keeps the dataset order so that predictions line up with the rows
# of test_dataset and repeated evaluations are directly comparable.
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Creating the customized model, by adding a dropout and a dense layer on top of BERT to get the final output for the model.

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    Args:
        n_labels: number of output logits; defaults to ``len(label_cols)``.
        dropout: dropout probability applied before the linear head.
    """

    def __init__(self, n_labels=None, dropout=0.3):
        super(BERTClass, self).__init__()
        if n_labels is None:
            n_labels = len(label_cols)
        # return_dict=False makes the encoder return a plain tuple, which
        # forward() unpacks.
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(dropout)
        # Take the input width from the encoder config instead of hard-coding 768.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, n_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return one raw logit per label for every sequence in the batch.

        No sigmoid is applied here; the second element of the encoder's output
        tuple is passed through dropout and the linear head.
        """
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        return self.l3(self.l2(pooled))

# Build the classifier and move its parameters to the selected device; as the
# last expression of the cell, the module structure is displayed below.
model = BERTClass()
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy computed on raw logits against the given targets."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [None]:
# Adam over every model parameter (encoder and classification head alike).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch, log_every=5000):
    """Run one pass over ``training_loader`` and update ``model`` in place.

    Args:
        epoch: epoch number, used only in the progress message.
        log_every: print the loss every this many steps (step 0 always logs).
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % log_every == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear gradients left over from the previous step, then backpropagate
        # and apply the update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Fine-tune for EPOCHS passes over the training data; train() prints the loss.
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.6953293085098267
Epoch: 1, Loss:  0.09134535491466522
Epoch: 2, Loss:  0.06055288389325142
Epoch: 3, Loss:  0.050444189459085464


In [None]:
def validation(epoch):
    """Run the model over ``testing_loader`` without gradient tracking.

    ``epoch`` is accepted but not used by the body.

    Returns:
        (fin_outputs, fin_targets): two lists with one entry per example —
        sigmoid probabilities per label, and the corresponding target values.
    """
    model.eval()
    fin_outputs, fin_targets = [], []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            probabilities = torch.sigmoid(logits)

            fin_targets += targets.cpu().detach().numpy().tolist()
            fin_outputs += probabilities.cpu().detach().numpy().tolist()
    return fin_outputs, fin_targets

In [None]:
# Probability cut-off above which a label counts as predicted.
threshold = 0.06

# The model is already fully trained here, so a single pass over the test
# loader is sufficient; evaluating once per epoch only repeats the same numbers
# and duplicates every prediction in the collected lists.
outputs, targets = validation(EPOCHS - 1)

# Kept as one-element lists of raw probabilities/targets so that the cell that
# flattens outputs_global / targets_global works unchanged.
outputs_global, targets_global = [outputs], [targets]

# Binarize once and use the same 0/1 predictions for every metric.
outputs = (np.array(outputs) >= threshold).astype(int)
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

  average, "true nor predicted", 'F-score is', len(true_sum)


Accuracy Score = 0.4868965517241379
F1 Score (Micro) = 0.449438202247191
F1 Score (Macro) = 0.20809963998798064
Accuracy Score = 0.4868965517241379
F1 Score (Micro) = 0.449438202247191
F1 Score (Macro) = 0.20809963998798064
Accuracy Score = 0.4868965517241379
F1 Score (Micro) = 0.449438202247191
F1 Score (Macro) = 0.20809963998798064
Accuracy Score = 0.4868965517241379
F1 Score (Micro) = 0.449438202247191
F1 Score (Macro) = 0.20809963998798064


In [None]:
# Concatenate the per-run probability lists and binarize them with `threshold`.
out = [
    [1 if prob >= threshold else 0 for prob in row]
    for run in outputs_global
    for row in run
]

# Concatenate the matching per-run target lists in the same order.
target = [row for run in targets_global for row in run]

In [None]:
# Per-label precision/recall/F1. target_names shows the relation names instead
# of bare column positions; zero_division=0 reports 0 for labels that are never
# predicted without emitting a warning for each of them.
print(metrics.classification_report(target, out, target_names=label_cols, zero_division=0))

              precision    recall  f1-score   support

           0       0.23      0.93      0.36       176
           1       0.29      0.12      0.17        64
           2       0.25      0.41      0.31       148
           3       0.48      0.46      0.47       112
           4       0.00      0.00      0.00         8
           5       0.00      0.00      0.00        12
           6       0.00      0.00      0.00         4
           7       0.51      0.98      0.67       500
           8       0.36      0.94      0.52       376
           9       0.52      0.98      0.68       204
          10       0.00      0.00      0.00         8
          11       0.31      0.98      0.47       892
          12       0.32      0.81      0.45       408
          13       0.44      0.85      0.58       208
          14       0.00      0.00      0.00        36
          15       0.31      0.87      0.46       212
          16       0.00      0.00      0.00        24
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# outputs_global[0][:5]