# ALBERT fine-tuning for token classification (NER)

In [1]:
# Mount Google Drive at /content/drive; every data and model path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#install dependencies
!pip install transformers
!pip install datasets
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m84.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://

In [3]:
import os
import random
import numpy as np
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from ast import literal_eval
import pandas as pd
from datasets import Dataset, load_metric


# Set seed for reproducibility
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and,
    when PyTorch is importable, PyTorch's CPU and CUDA generators. The
    PyTorch part matters because the token-classification head is randomly
    initialised when the model is created, which happens before the Trainer
    gets a chance to apply its own seed.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only
    # affects processes launched after this point, not the running kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Local import keeps the function usable in environments without PyTorch.
    try:
        import torch
    except ImportError:
        return
    torch.manual_seed(seed)
    # Safe to call without a GPU: it is ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [4]:
# Load a sentence-level CSV and decode its list-valued columns
def load_data_from_csv(file_path):
    """Read a CSV and turn the stringified 'word' and 'tag' cells into Python objects.

    Each cell of the 'word' and 'tag' columns is parsed with
    ``ast.literal_eval`` (e.g. the text "['a', 'b']" becomes the list
    ['a', 'b']). All other columns are returned as pandas read them.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with the two columns decoded.
    """
    frame = pd.read_csv(file_path)
    for column in ("word", "tag"):
        frame[column] = frame[column].apply(literal_eval)
    return frame

In [5]:
# Read the CoNLL-formatted train/validation CSVs from Drive (train first, then validation)
train_data, valid_data = (
    load_data_from_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/NLP_project/data/output/train_conll.csv",
        "/content/drive/MyDrive/NLP_project/data/output/valid_conll.csv",
    )
)

# Wrap the DataFrames as Hugging Face datasets and keep both splits in one dict
train_dataset, valid_dataset = map(Dataset.from_pandas, (train_data, valid_data))
datasets = dict(train=train_dataset, validation=valid_dataset)

# seqeval scorer, used later by compute_metrics
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [6]:
# Display both splits (columns and row counts) as a sanity check.
datasets

{'train': Dataset({
     features: ['Unnamed: 0', 'word', 'tag'],
     num_rows: 5302
 }),
 'validation': Dataset({
     features: ['Unnamed: 0', 'word', 'tag'],
     num_rows: 1090
 })}

In [7]:
# Name of the pretrained checkpoint; the tokenizer here and the model below are both loaded from it
model_checkpoint = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

In [8]:
# Build the NER label vocabulary from the tags that occur in the training split.
# NOTE: a tag that appears only in another split is absent from label2id and
# will raise KeyError when that split is passed to tokenize_and_align_labels.
label_list = sorted({tag for tags in train_data["tag"] for tag in tags})
label2id = {label: index for index, label in enumerate(label_list)}
id2label = {index: label for index, label in enumerate(label_list)}
num_labels = len(label_list)

# Define the model. Passing both mappings stores the tag names in the model
# config, so the saved model reports real tag names instead of generic
# LABEL_<n> placeholders when it is reloaded.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Downloading pytorch_model.bin:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForTokenClassification: ['predictions.bias', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably

In [9]:
# Display the sorted tag vocabulary; a tag's position in this list is its integer id.
label_list

['B-person',
 'B-problem',
 'B-pronoun',
 'B-test',
 'B-treatment',
 'I-person',
 'I-problem',
 'I-test',
 'I-treatment',
 'O']

In [10]:
# Tokenization with word-level labels aligned to sub-word tokens
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align their tags to sub-tokens.

    Uses the module-level ``tokenizer`` and ``label2id``. Sentences are
    truncated and padded to the tokenizer's maximum length. For every
    sentence, the first sub-token of each word receives the id of that
    word's tag; special/padding tokens (no word id) and the remaining
    sub-tokens of a word receive -100, the value ``compute_metrics``
    filters out (and the conventional ignore index for the loss).

    Args:
        examples: Batch with a "word" entry (list of word lists) and a
            "tag" entry (list of tag lists, parallel to "word").

    Returns:
        The tokenizer output with an added "labels" entry holding one list
        of label ids per sentence.
    """
    encoded = tokenizer(
        examples["word"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
    )

    aligned = []
    for row, tags in enumerate(examples["tag"]):
        token_to_word = encoded.word_ids(batch_index=row)
        # Pair every token's word index with the one before it (None for the
        # first token); zip stops at the shorter sequence.
        shifted = [None, *token_to_word]
        aligned.append([
            label2id[tags[current]]
            if current is not None and current != previous
            else -100
            for current, previous in zip(token_to_word, shifted)
        ])

    encoded["labels"] = aligned
    return encoded

In [11]:
# Tokenize both splits in batches (train first, then validation)
tokenized_train_dataset, tokenized_valid_dataset = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, valid_dataset)
)

Map:   0%|          | 0/5302 [00:00<?, ? examples/s]

Map:   0%|          | 0/1090 [00:00<?, ? examples/s]

In [16]:
# Training arguments.
# Evaluation and checkpointing both run once per epoch so that the best epoch
# can be restored when training ends. The per-epoch numbers recorded further
# down in this notebook show validation loss rising after epoch 3 and overall
# F1 peaking before the final epoch, so keeping the last-epoch weights is not
# the best choice.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/NLP_project/data/output",
    evaluation_strategy="epoch",
    save_strategy="epoch",  # must match evaluation_strategy for load_best_model_at_end
    save_total_limit=2,  # bound the number of checkpoints kept on Drive
    load_best_model_at_end=True,
    metric_for_best_model="overall_f1",  # key returned by compute_metrics
    greater_is_better=True,
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/NLP_project/data/log",
    seed=42,
)


In [17]:
# Metric computation for evaluation
def compute_metrics(p):
    """Turn raw model outputs into seqeval scores.

    Uses the module-level ``label_list`` (id -> tag name) and ``metric``.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores and is reduced with argmax over axis 2; ``labels``
            holds the gold label ids, with -100 marking positions to skip.

    Returns:
        Dict with the four ``overall_*`` scores followed by one
        ``<entity>_f1`` entry for every other key reported by the metric.
    """
    logits, gold_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label.
        kept = [
            (predicted, gold)
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary_keys = (
        "overall_precision",
        "overall_recall",
        "overall_f1",
        "overall_accuracy",
    )
    overall_result = {name: results[name] for name in summary_keys}

    # label-wise f1 score: every remaining key is a per-entity result dict
    for name, scores in results.items():
        if name not in overall_result:
            overall_result[name + "_f1"] = scores["f1"]

    return overall_result


In [18]:
# Trainer definition: wires together the model, the training arguments, the
# tokenized train/validation splits, the tokenizer and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [19]:
# Fine-tune the model, then save the model and the tokenizer to the same Drive folder.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so the
# two save calls below may not have run in the recorded session -- confirm before
# relying on the saved artifacts.
trainer.train()

model.save_pretrained("/content/drive/MyDrive/NLP_project/model-albert")
tokenizer.save_pretrained("/content/drive/MyDrive/NLP_project/model-albert")

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
# Evaluate on the held-out test split.
# Load the test CSV and run it through the same loading and tokenization steps as
# the train/validation splits. Tags are mapped through label2id, which was built
# from the training tags only.
test_data = load_data_from_csv("/content/drive/MyDrive/NLP_project/data/output/test_conll.csv")
test_dataset = Dataset.from_pandas(test_data)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

# Evaluate the model held by the trainer on the tokenized test split
test_results = trainer.evaluate(tokenized_test_dataset)

# Print every reported value with four decimal places
print("Test results:")
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")

In [None]:
import os
import matplotlib.pyplot as plt


def _plot_curves(epochs, curves, ylabel, title, filename):
    """Draw per-epoch curves on a fresh figure, save it as a PNG and show it.

    A new figure is created explicitly for every chart, so curves from an
    earlier chart can never end up on a later one, whatever the backend
    does on ``plt.show()``.

    Args:
        epochs: X values shared by all curves.
        curves: Mapping of legend label -> list of Y values (one per epoch).
        ylabel: Label for the Y axis.
        title: Figure title.
        filename: Path the figure is saved to before it is shown.
    """
    fig, ax = plt.subplots()
    for label, values in curves.items():
        ax.plot(epochs, values, label=label)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    # Save before showing: some backends discard the figure on show().
    fig.savefig(filename)
    plt.show()
    plt.close(fig)


# Define the data (per-epoch values recorded from a training run)
epochs = [1, 2, 3, 4, 5, 6]
training_losses = [0.3746, 0.2032, 0.1547, 0.0804, 0.0594, 0.0402]
validation_losses = [0.234656, 0.231905, 0.231322, 0.237391, 0.261234, 0.278624]
precision = [0.800605, 0.812020, 0.821649, 0.842291, 0.839628, 0.844256]
recall = [0.794921, 0.837520, 0.835336, 0.847351, 0.837794, 0.840797]
f1 = [0.797753, 0.824573, 0.828436, 0.844814, 0.838710, 0.842523]
accuracy = [0.931789, 0.935972, 0.938178, 0.944613, 0.944843, 0.946084]
person_f1 = [0.869691, 0.883789, 0.894587, 0.896117, 0.891732, 0.891389]
problem_f1 = [0.730175, 0.757660, 0.778596, 0.813354, 0.795421, 0.800799]
pronoun_f1 = [0.932249, 0.960630, 0.958115, 0.963158, 0.957447, 0.954907]
test_f1 = [0.782123, 0.826305, 0.803793, 0.823446, 0.830162, 0.831351]
treatment_f1 = [0.769014, 0.803020, 0.792478, 0.806345, 0.802703, 0.813187]

# Plot the training and validation losses
_plot_curves(
    epochs,
    {'Training Loss': training_losses, 'Validation Loss': validation_losses},
    ylabel='Loss',
    title='Training and Validation Loss',
    filename='albert_training_validation_loss.png',
)

# Plot the precision, recall, and F1 scores
_plot_curves(
    epochs,
    {'Precision': precision, 'Recall': recall, 'F1': f1},
    ylabel='Score',
    title='Precision, Recall, and F1',
    filename='albert_precision_recall_f1.png',
)

# Plot the accuracy
_plot_curves(
    epochs,
    {'Accuracy': accuracy},
    ylabel='Accuracy',
    title='Accuracy',
    filename='albert_accuracy.png',
)

# Plot the F1 scores for each entity type
_plot_curves(
    epochs,
    {
        'Person': person_f1,
        'Problem': problem_f1,
        'Pronoun': pronoun_f1,
        'Test': test_f1,
        'Treatment': treatment_f1,
    },
    ylabel='F1',
    title='Entity Type F1 Scores',
    filename='albert_entitywise_f1.png',
)
