https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb


In [1]:
!pip install -q transformers


[K     |████████████████████████████████| 3.1 MB 5.4 MB/s 
[K     |████████████████████████████████| 895 kB 36.9 MB/s 
[K     |████████████████████████████████| 59 kB 3.6 MB/s 
[K     |████████████████████████████████| 596 kB 24.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 37.5 MB/s 
[?25h

# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [None]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
import numpy as np
from sklearn import metrics


In [None]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'



Importing and Pre-Processing the domain data


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import the csv into pandas dataframe and add the headers
df = pd.read_csv('/content/drive/MyDrive/stack_nlp_large.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,PostId,PostCreationDate,OwnerUserId,OwnerCreationDate,ReputationAtPostCreation,OwnerUndeletedAnswerCountAtPostTime,Title,BodyMarkdown,Tag1,Tag2,Tag3,Tag4,Tag5,PostClosedDate,OpenStatus,titletext,closed_reason_label
0,7,23,08/01/2008 12:09:41,48,08/01/2008 13:25:15,1,0,Latest information on PHP upcoming releases,I'm trying to track the progress of PHP 5.3 an...,php,,,,,05/18/2012 11:12:42,not constructive,Latest information on PHP upcoming releases. I...,4
1,30,126,08/01/2008 16:10:30,58,08/01/2008 13:56:33,11,1,How would you access Object properties from wi...,"What is the ""purist"" or ""correct"" way to acces...",oo,java,php,theory,,05/08/2012 18:11:27,not constructive,How would you access Object properties from wi...,4
2,31,129,08/01/2008 16:22:42,48,08/01/2008 13:25:15,11,1,How to export data from SQL Server to MySQL,I've been banging my head against SQL Server 2...,csv,ansi,sql,php,mssql,07/03/2012 14:30:16,off topic,How to export data from SQL Server to MySQL. I...,3
3,37,173,08/01/2008 18:33:08,83,08/01/2008 16:31:56,16,4,How do I version my MS SQL database in SVN?,I've been wanting to get my databases under ve...,subversion,svn,sql,mssql,versioncontrol,06/29/2012 15:08:28,not constructive,How do I version my MS SQL database in SVN?. I...,4
4,41,177,08/01/2008 18:37:55,83,08/01/2008 16:31:56,16,4,How do I programmatically create a PDF in my ....,Please recommend a good library for programmat...,pdf,.net,,,,04/25/2012 11:32:29,not constructive,How do I programmatically create a PDF in my ....,4


In [None]:
# # Converting the codes to appropriate categories using a dictionary
def convertReasonToLabel(reason):
    mapper = {"not a real question": 0, "too localized": 1, "off topic": 2, 'not constructive': 3, "open": 4 }
    return mapper.get(reason)

df['category'] = df['OpenStatus'].apply(convertReasonToLabel)

# remove all open 
df = df[df.category != 4]

# df.head()

In [None]:
# # Removing unwanted columns and only leaving title of news and the category which will be the target
df = df[['titletext','category']]
df

Unnamed: 0,titletext,category
0,Latest information on PHP upcoming releases. I...,3
1,How would you access Object properties from wi...,3
2,How to export data from SQL Server to MySQL. I...,2
3,How do I version my MS SQL database in SVN?. I...,3
4,How do I programmatically create a PDF in my ....,3
...,...,...
49995,How does a Java interface reduce coupling?. Ho...,0
49996,IE 8 CSS Selectors. I'm tring to set the style...,1
49997,"Problems using a ""Magellan 8500xt""-3D Scanner/...",2
49998,Vector declaration globally and in the main cl...,0


In [None]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 5
VALID_BATCH_SIZE = 2
EPOCHS = 2
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')


In [None]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.titletext[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.category[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [None]:
short_df = df.sample(5000)
# df = new_df 
short_df["category"].unique()

array([2, 0, 1, 3])

In [None]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=short_df.sample(frac=train_size,random_state=200)
test_dataset=short_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(short_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (5000, 2)
TRAIN Dataset: (4000, 2)
TEST Dataset: (1000, 2)


In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
model = DistillBERTClass()
model.to(device)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [None]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [None]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)

        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        # if _%5000==0:
        #     loss_step = tr_loss/nb_tr_steps
        #     accu_step = (n_correct*100)/nb_tr_examples 
        #     print(f"Training Loss per 5000 steps: {loss_step}")
        #     print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [None]:
for epoch in range(EPOCHS):
    train(epoch)



The Total Accuracy for Epoch 0: 42.225
Training Loss Epoch: 1.2238589463382958
Training Accuracy Epoch: 42.225
The Total Accuracy for Epoch 1: 50.825
Training Loss Epoch: 1.0800287288799881
Training Accuracy Epoch: 50.825


In [None]:
def validation(model, testing_loader):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    fin_targets=[]
    fin_outputs=[]
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            # print("big_idx", big_idx)
            # print("targets", targets)

            
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(big_idx.cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets


In [None]:
for epoch in range(EPOCHS):
    outputs, targets = validation(model, testing_loader)
    # outputs = np.array(outputs) >= 0.5
    print(outputs)
    print(targets)
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")



[3, 0, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 2, 3, 0, 0, 0, 0, 3, 0, 3, 3, 3, 0, 3, 3, 0, 2, 0, 0, 0, 3, 0, 0, 3, 0, 0, 2, 0, 3, 3, 0, 0, 0, 0, 0, 3, 2, 0, 3, 0, 3, 3, 0, 3, 0, 0, 0, 0, 2, 3, 0, 3, 0, 2, 0, 2, 0, 2, 3, 3, 3, 3, 3, 2, 0, 0, 0, 3, 3, 0, 3, 2, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 2, 3, 3, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 0, 0, 3, 0, 3, 3, 0, 2, 3, 0, 0, 3, 0, 0, 3, 2, 0, 0, 0, 3, 3, 2, 3, 0, 0, 0, 3, 2, 0, 2, 0, 3, 3, 3, 0, 2, 0, 0, 0, 0, 2, 0, 3, 3, 0, 2, 0, 3, 3, 3, 0, 3, 0, 3, 0, 0, 3, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 3, 3, 0, 3, 3, 0, 3, 0, 3, 0, 0, 0, 3, 0, 3, 2, 3, 3, 3, 0, 0, 2, 0, 2, 0, 3, 3, 0, 0, 2, 3, 0, 2, 3, 0, 3, 0, 0, 0, 2, 3, 2, 3, 2, 0, 0, 3, 0, 0, 3, 3, 0, 0, 3, 0, 2, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 3, 3, 2, 3, 0, 0, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 2, 3, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 3, 0, 3, 0, 2, 3, 3, 3, 0, 0, 0, 3, 0, 0, 0, 0, 3, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 



[0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 2, 2, 3, 0, 3, 0, 0, 0, 3, 3, 3, 0, 0, 2, 0, 0, 3, 3, 0, 2, 2, 0, 0, 3, 0, 0, 3, 0, 3, 3, 0, 0, 0, 3, 2, 2, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 3, 3, 3, 2, 3, 0, 3, 0, 0, 2, 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 2, 3, 3, 0, 0, 2, 0, 3, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 3, 0, 3, 2, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 2, 3, 0, 2, 3, 0, 2, 0, 0, 3, 3, 0, 3, 3, 0, 0, 0, 2, 3, 2, 3, 2, 3, 0, 0, 0, 0, 0, 0, 3, 2, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 0, 0, 3, 2, 0, 0, 3, 3, 3, 0, 0, 3, 3, 2, 0, 3, 2, 0, 0, 3, 0, 0, 0, 0, 3, 3, 2, 2, 0, 0, 0, 3, 3, 3, 3, 2, 3, 0, 2, 0, 3, 3, 3, 0, 0, 0, 3, 3, 0, 3, 3, 0, 0, 0, 3, 2, 0, 0, 3, 0, 3, 0, 3, 2, 3, 3, 0, 0, 0, 0, 0, 3, 3, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 2, 3, 0, 3, 3, 3, 2, 0, 0, 3, 3, 2, 3, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 3, 0, 2, 3, 0, 0, 3, 2, 3, 0, 3, 0, 0, 3, 0, 2, 0, 2, 3, 3, 3, 3, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 0, 0, 0, 3, 

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_curve
y_true = np.array([0, 0, 1, 1])
y_scores = np.array([0.1, 0.4, 0.35, 0.8])
precision, recall, thresholds = precision_recall_curve(
    y_true, y_scores)
precision

recall

thresholds


array([0.35, 0.4 , 0.8 ])

In a multi-class classification setup, micro-average is preferable if you suspect there might be class imbalance (i.e you may have many more examples of one class than of other classes).

https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb#scrollTo=iNCaZ2epNcSO
