<a href="https://colab.research.google.com/github/gonzaq94/Active-Learning-for-Image-Classification/blob/master/fine_tune_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
COMET_API_KEY = ""

In [30]:
import comet_ml
import comet_llm
import os
from datasets import load_dataset
import random
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


## Load dataset

Take 1000 random examples of the CONLL dataset.

In [40]:
# Load the CONLL dataset
dataset_train = load_dataset("conll2003", split="train").shuffle().select(range(1000))
dataset_valid = load_dataset("conll2003", split="validation").shuffle().select(range(1000))

In [41]:
print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 1000
})
Dataset({
    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
    num_rows: 1000
})


In [7]:
for i in range(5):
    print(dataset_train[i])

{'id': '11812', 'tokens': ['The', 'individual', 'Names', ',', 'however', ',', 'now', 'must', 'decide', 'whether', 'to', 'accept', 'Lloyd', "'s", 'settlement', 'offer', 'or', 'reject', 'the', 'offer', 'and', 'pursue', 'litigation', '.'], 'pos_tags': [12, 16, 24, 6, 30, 6, 30, 20, 37, 15, 35, 37, 22, 27, 21, 21, 10, 37, 12, 21, 10, 37, 21, 7], 'chunk_tags': [11, 12, 12, 0, 3, 0, 3, 21, 22, 17, 21, 22, 11, 11, 12, 12, 0, 21, 11, 12, 0, 21, 11, 0], 'ner_tags': [0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{'id': '4686', 'tokens': ['3.', 'Margaret', 'Crowley', '(', 'Australia', ')', '2:02.40'], 'pos_tags': [22, 22, 22, 4, 22, 5, 11], 'chunk_tags': [11, 12, 12, 0, 11, 0, 11], 'ner_tags': [0, 1, 2, 0, 5, 0, 0]}
{'id': '5513', 'tokens': ['Stabaek', '20', '7', '8', '5', '41', '34', '29'], 'pos_tags': [21, 11, 11, 11, 11, 11, 11, 11], 'chunk_tags': [11, 12, 12, 12, 12, 12, 12, 12], 'ner_tags': [3, 0, 0, 0, 0, 0, 0, 0]}
{'id': '2014', 'tokens': ['"', 'I', 'got', 'more'

 ## Load BERT model

Train a BERT on this dataset to classify tokens (the ner_tags column). Report performance.

In [9]:
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=9)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Dataset preparation

In [42]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]

        tokens = []
        tokens_labels = []
        attention_masks = []

        for word, label in zip(text, labels):
          # Tokenize each word
          word_encoding = self.tokenizer(
              word,
              add_special_tokens=False,
              max_length=self.max_length,
              truncation=True,
              return_tensors="pt"
          )

          tokens.extend(word_encoding['input_ids'][0].tolist())
          attention_masks.extend(word_encoding['attention_mask'][0].tolist())
          tokens_labels.extend([label] * len(word_encoding['input_ids'][0]))

        # add padding
        padding_len = self.max_length - len(tokens)
        tokens += [0] * padding_len
        attention_masks += [0] * padding_len
        tokens_labels += [-100] * padding_len

        return {
            'input_ids': torch.tensor(tokens).flatten(),
            'attention_mask': torch.tensor(attention_masks).flatten(),
            'labels': torch.tensor(tokens_labels).flatten()
        }


In [47]:
train_custom_dataset = CustomDataset(
    texts=dataset_train['tokens'],
    labels=dataset_train['ner_tags'],
    tokenizer=tokenizer,
    max_length=256
)

valid_custom_dataset = CustomDataset(
    texts=dataset_valid['tokens'],
    labels=dataset_valid['ner_tags'],
    tokenizer=tokenizer,
    max_length=256
)

## Fine-tune model

In [15]:
# log results and assets to Comet
os.environ["COMET_LOG_ASSETS"] = "True"
os.environ["COMET_WORKSPACE"] = "gonzaq94"
os.environ["COMET_PROJECT_NAME"] = "ner_classif-bert"


In [48]:
def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(-1)  # Get the predicted class indices
    true_predictions = [p for (p, l) in zip(predictions.flatten(), labels.flatten()) if l != -100]
    true_labels = [l for l in labels.flatten() if l != -100]

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, true_predictions, average='weighted')
    acc = accuracy_score(true_labels, true_predictions)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [49]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    dataloader_drop_last=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_custom_dataset,
    eval_dataset=valid_custom_dataset,
    compute_metrics=compute_metrics
)




In [50]:
trainer.train()

[1;38;5;39mCOMET INFO:[0m An experiment with the same configuration options is already running and will be reused.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0872,0.122693,0.971808,0.971357,0.971303,0.971808
2,0.0625,0.140124,0.96962,0.969203,0.969551,0.96962
3,0.0412,0.139428,0.97247,0.972332,0.972464,0.97247
4,0.022,0.142345,0.973945,0.97324,0.973245,0.973945
5,0.0113,0.14566,0.974098,0.973601,0.973626,0.974098


KeyboardInterrupt: 

 ## Evaluate the model

In [51]:
eval_results = trainer.evaluate()
print(eval_results)

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0872,0.122693,0.971808,0.971357,0.971303,0.971808
2,0.0625,0.140124,0.96962,0.969203,0.969551,0.96962
3,0.0412,0.139428,0.97247,0.972332,0.972464,0.97247
4,0.022,0.142345,0.973945,0.97324,0.973245,0.973945
5,0.0113,0.14566,0.974098,0.973601,0.973626,0.974098


{'eval_loss': 0.14565999805927277, 'eval_accuracy': 0.9740980102793751, 'eval_f1': 0.9736012078645125, 'eval_precision': 0.9736256430804959, 'eval_recall': 0.9740980102793751}
