In [42]:
#!pip install transformers
import numpy as np
import pandas as pd
from transformers import BertModel, BertTokenizer, BertConfig
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch
import torch.nn as nn
from sklearn import metrics
from tqdm import tqdm
from torch import cuda
import ast

In [44]:
# Read the train / validation / test splits from CSV files in the working directory.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('head_train.csv', 'head_val.csv', 'head_test.csv')
)
# train_df.groupby(['label']).size().plot.bar()
#set(val_df['author'].tolist()+train_df['author'].tolist())

In [45]:
# Reference notebook:
# https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb#scrollTo=7KnNeQx6SI78
MAX_LEN = 512
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 8
EPOCHS = 20
LEARNING_RATE = 5e-05
# The first row's 'label' string is parsed as a Python literal; its length is
# taken as the number of classes.
NUM_CLASSES = len(ast.literal_eval(train_df['label'].loc[0]))
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the tokenizer of the pre-trained BERT checkpoint
# (the model itself is instantiated in a later cell).
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
device

'cpu'

In [46]:
def get_y(x):
    """Parse a label stored as a Python-literal string into an integer array.

    Args:
        x: String containing a Python literal, e.g. ``"[0, 1, 0]"``.

    Returns:
        ``numpy.ndarray`` with dtype ``int`` built from the parsed literal.
    """
    parsed_label = ast.literal_eval(x)
    return np.asarray(parsed_label, dtype=int)

In [55]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one text row at a time and yields a class index.

    Each item is a dict with:
        'ids':     token ids, 1-D long tensor of length ``max_len``
        'mask':    attention mask, 1-D long tensor of length ``max_len``
        'targets': 0-dim long tensor holding the class index

    All tensors are moved to the module-level ``device`` before being returned.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """
        Args:
            dataframe: DataFrame with a ``text`` column and a ``label`` column
                whose entries are strings that ``get_y`` can parse.
            tokenizer: Callable invoked with the text plus ``padding``,
                ``max_length``, ``truncation`` and ``return_tensors`` keywords.
            max_len: Length every sequence is padded / truncated to.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.max_len = max_len
        # Every label is a vector; the class index is the position of its maximum
        # (presumably the labels are one-hot - confirm against the CSV).
        label_vectors = [get_y(raw_label) for raw_label in self.data['label']]
        if label_vectors:
            self.targets = np.argmax(np.asarray(label_vectors), axis=1)
        else:
            # argmax(axis=1) fails on an empty 1-D array; keep an empty dataset usable.
            self.targets = np.empty(0, dtype=np.int64)

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize the row at position ``index`` and return its tensors."""
        inputs = self.tokenizer(
            # Positional lookup, so the dataset does not depend on the frame's index labels.
            self.text.iloc[index],
            padding='max_length',
            max_length=self.max_len,  # was hard-coded to 512, ignoring the constructor argument
            truncation=True,
            return_tensors="pt"
        )

        # return_tensors="pt" yields tensors with a leading batch axis of size 1.
        # Drop it so the DataLoader collates to (batch, max_len) instead of
        # (batch, 1, max_len), which BertModel rejects for input_ids.
        return {
            'ids': inputs['input_ids'].squeeze(0).to(device, dtype=torch.long),
            'mask': inputs['attention_mask'].squeeze(0).to(device, dtype=torch.long),
            'targets': torch.tensor(self.targets[index]).to(device, dtype=torch.long)
        }


# Wrap every split in a CustomDataset; reset_index gives each frame a fresh 0..n-1 index.
training_set, val_set, test_set = (
    CustomDataset(split_df.reset_index(drop=True), tokenizer, MAX_LEN)
    for split_df in (train_df, val_df, test_df)
)

# Training batches are shuffled; evaluation batches keep the row order.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **test_params)
test_loader = DataLoader(test_set, **test_params)

In [56]:
class BertForClassification(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear classification head.

    The encoder weights are loaded from the module-level ``model_name`` checkpoint
    and the head has ``NUM_CLASSES`` outputs.
    """

    def __init__(self, config):
        """
        Args:
            config: Object exposing ``hidden_dropout_prob`` and ``hidden_size``
                (e.g. a ``BertConfig``); used to size the dropout and the head.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, NUM_CLASSES)

    def forward(self, input_ids, attention_mask):
        """Return raw (unnormalized) class scores for a batch.

        Args:
            input_ids: Token ids passed to the BERT encoder.
            attention_mask: Attention mask passed to the BERT encoder.

        Returns:
            Output of the linear head applied to the dropped-out pooled output.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        # No activation on the logits: CrossEntropyLoss applies log-softmax itself.
        # The former ReLU clamped every negative score to 0, discarding information
        # and zeroing the gradient for those classes.
        return self.classifier(pooled_output)


# Size the classification head from the checkpoint's own config rather than from
# BertConfig() defaults, so the head still matches the encoder if model_name changes.
model = BertForClassification(BertConfig.from_pretrained(model_name))

# Fine-tune only the top of the encoder: freeze every BERT weight first ...
for param in model.bert.parameters():
    param.requires_grad = False
# ... then unfreeze the last 4 encoder layers and the pooler.
# The classifier head lives outside model.bert and therefore stays trainable.
for unfrozen in (model.bert.encoder.layer[-4:], model.bert.pooler):
    for param in unfrozen.parameters():
        param.requires_grad = True
model.to(device)
model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [57]:
# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps; the first 10% of them are warmup.
num_train_optimization_steps = EPOCHS * len(training_loader)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(num_train_optimization_steps * 0.1),
    num_training_steps=num_train_optimization_steps,
)

In [58]:
#https://stackoverflow.com/questions/71998978/early-stopping-in-pytorch
class EarlyStopper:
    """Signal when the validation loss has stopped improving.

    Attributes are plain fields:
        patience: number of counted "bad" checks that triggers a stop.
        min_delta: margin above the best loss that a loss must exceed to count as bad.
        counter: bad checks counted since the last new best loss.
        min_validation_loss: best (lowest) loss seen so far.
    """

    def __init__(self, patience=1, min_delta=0):
        self.min_validation_loss = np.inf
        self.counter = 0
        self.min_delta = min_delta
        self.patience = patience

    def early_stop(self, validation_loss):
        """Record ``validation_loss`` and return True when training should stop.

        A loss below the best one resets the counter. A loss more than
        ``min_delta`` above the best one increments it. Anything in between
        leaves the counter untouched.
        """
        if validation_loss < self.min_validation_loss:
            # New best: remember it and start counting from scratch.
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        tolerated = self.min_validation_loss + self.min_delta
        if not (validation_loss > tolerated):
            return False

        self.counter += 1
        return self.counter >= self.patience

In [59]:
def loss_fn(outputs, targets):
    """Cross-entropy loss between class scores and class-index targets.

    Args:
        outputs: Class scores produced by the model.
        targets: Target tensor accepted by ``nn.CrossEntropyLoss``.

    Returns:
        The loss tensor computed by ``nn.CrossEntropyLoss`` with default settings.
    """
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)
    return criterion(outputs, targets)

In [62]:
def val(epoch, val_loader: DataLoader, model: nn.Module):
    """Evaluate ``model`` on ``val_loader`` and report the accuracy.

    Side effects: prints the accuracy, the targets and the predictions, and
    stores them in the module-level ``val_df`` as ``truth`` / ``pred`` columns.
    ``val_loader`` must therefore iterate the rows of ``val_df`` in order.

    Args:
        epoch: Zero-based epoch index; only used in the printed message.
        val_loader: Loader yielding dicts with 'ids', 'mask' and 'targets'.
        model: Classifier called with ``input_ids`` / ``attention_mask`` that
            returns one score per class for each sample.

    Returns:
        0-dim tensor with the mean validation loss over all samples
        (previously only the loss of the last batch was returned).

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    with torch.no_grad():
        model.eval()
        fin_targets = []
        fin_outputs = []
        total_loss = 0.0
        for data in val_loader:
            # Accept both (batch, seq) and (batch, 1, seq) batches: BertModel
            # requires 2-D input_ids, so any size-1 middle axis is folded away.
            input_ids = data['ids'].reshape(-1, data['ids'].size(-1))
            attention_mask = data['mask'].reshape(-1, data['mask'].size(-1))
            targets = data['targets']
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(preds.cpu().detach().numpy().tolist())
            # loss_fn averages over the batch; weight by the batch size so a
            # smaller final batch does not skew the overall mean.
            total_loss = total_loss + loss_fn(outputs, targets) * targets.size(0)
        if not fin_targets:
            raise ValueError("val_loader produced no samples")
        mean_loss = total_loss / len(fin_targets)
        acc = (torch.tensor(fin_targets) == torch.tensor(fin_outputs)).float().mean().item()
        print(f"Epoch {epoch + 1}, Val accuracy = {acc}")
        print(f'Val truth:{fin_targets}')
        print(f'Val output:{fin_outputs}')
        val_df['truth'] = fin_targets
        val_df['pred'] = fin_outputs
        return mean_loss


def train(epoch, training_loader: DataLoader, val_loader: DataLoader, model: nn.Module,
          early_stopping_patience=None):
    """Fine-tune ``model`` for up to ``epoch`` epochs, validating after each one.

    Uses the module-level ``optimizer`` and ``scheduler``; the scheduler is
    stepped once per batch. After every epoch the summed training loss and the
    training accuracy are printed and ``val`` is run on ``val_loader``.

    Args:
        epoch: Number of epochs to run.
        training_loader: Loader yielding dicts with 'ids', 'mask' and 'targets'.
        val_loader: Loader handed to ``val`` after each epoch.
        model: Classifier called with ``input_ids`` / ``attention_mask``.
        early_stopping_patience: When given, training stops once an
            ``EarlyStopper`` with this patience fires on the validation loss.
            The default ``None`` disables early stopping (the previous behavior).
    """
    early_stopper = None
    if early_stopping_patience is not None:
        early_stopper = EarlyStopper(patience=early_stopping_patience)

    # A separate loop variable keeps the ``epoch`` argument (the epoch count) intact.
    for epoch_idx in range(epoch):
        fin_targets = []
        fin_outputs = []
        model.train()
        total_loss_train = 0
        for data in tqdm(training_loader):
            # Bring ids AND mask to (batch, seq); previously only the ids were
            # squeezed and the 3-D mask worked only through broadcasting.
            input_ids = data['ids'].reshape(-1, data['ids'].size(-1))
            attention_mask = data['mask'].reshape(-1, data['mask'].size(-1))
            targets = data['targets']

            # Clear stale gradients before the backward pass of this batch.
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(preds.cpu().detach().numpy().tolist())
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss_train += loss.item()

        acc = (torch.tensor(fin_targets) == torch.tensor(fin_outputs)).float().mean().item()
        print(f"Epoch {epoch_idx + 1}, Loss= {total_loss_train}, Train accuracy = {acc}")

        # Evaluate the model on the dev set
        validation_loss = val(epoch_idx, val_loader, model)
        if early_stopper is not None and early_stopper.early_stop(float(validation_loss)):
            break


def test(test_loader, model: nn.Module):
    """Predict on ``test_loader`` with the fine-tuned model and report the accuracy.

    Side effects: stores targets and predictions in the module-level ``test_df``
    as ``truth`` / ``pred`` columns and prints the accuracy. ``test_loader``
    must therefore iterate the rows of ``test_df`` in order.

    Args:
        test_loader: Loader yielding dicts with 'ids', 'mask' and 'targets'.
        model: Classifier called with ``input_ids`` / ``attention_mask`` that
            returns one score per class for each sample.

    Returns:
        The accuracy as a float (previously nothing was returned).
    """
    with torch.no_grad():
        model.eval()
        fin_targets = []
        fin_outputs = []
        for data in test_loader:
            # Accept both (batch, seq) and (batch, 1, seq) batches: BertModel
            # requires 2-D input_ids, so any size-1 middle axis is folded away.
            input_ids = data['ids'].reshape(-1, data['ids'].size(-1))
            attention_mask = data['mask'].reshape(-1, data['mask'].size(-1))
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)
            fin_targets.extend(data['targets'].cpu().detach().numpy().tolist())
            fin_outputs.extend(preds.cpu().detach().numpy().tolist())
        test_df['truth'] = fin_targets
        test_df['pred'] = fin_outputs
        acc = (torch.tensor(fin_targets) == torch.tensor(fin_outputs)).float().mean().item()
        print(f"Test accuracy = {acc}")
        return acc


# Run fine-tuning for the configured number of epochs.
train(
    epoch=EPOCHS,
    training_loader=training_loader,
    val_loader=val_loader,
    model=model,
)

  7%|▋         | 17/240 [01:19<17:44,  4.77s/it][E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument
  7%|▋         | 17/240 [01:22<17:56,  4.83s/it]


KeyboardInterrupt: 

In [21]:

# test(test_loader, model)
# test_df[['text', 'truth', 'pred']]
# Show each validation text next to its true and predicted class
# (the 'truth' / 'pred' columns are written by val()).
val_df.loc[:, ['text', 'truth', 'pred']]


KeyboardInterrupt

