In [None]:
!pip install --upgrade transformers datasets evaluate
!pip install wandb

Collecting transformers
  Downloading transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.48.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
#@title Imports
# Dataset
from datasets import load_dataset

# Model
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Fine-tuning
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Evaluation
from evaluate import evaluator
import evaluate
# from datasets import load_metric

# Experiment tracking
import wandb

# API keys
from dotenv import load_dotenv

# Utility
import random
import numpy as np
import torch
import os

In [None]:
#@title Preprocess SQUAD dataset
def preprocess_train_function(examples, tokenizer, max_length=384):
    """Tokenize question/context pairs and attach answer-span token labels.

    Args:
        examples: Batch (dict of lists) with "question", "context" and
            "answers" entries. Each answer is a dict with parallel
            "answer_start" and "text" lists; only their first item is used.
        tokenizer: Callable tokenizer. Its result must support item access
            for "offset_mapping", item assignment, and `sequence_ids(i)`.
        max_length: Length every feature is truncated/padded to. Truncation
            is requested with "only_second", i.e. on the context argument.

    Returns:
        The tokenizer output with "start_positions" and "end_positions"
        added. Both are 0 for an example that has no answer, or whose answer
        is not fully contained in the (possibly truncated) context.
    """
    inputs = tokenizer(
        [q.strip() for q in examples["question"]],
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs["offset_mapping"]
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, (offset, answer) in enumerate(zip(offset_mapping, answers)):
        # Token indices that belong to the context (sequence id 1).
        context_tokens = [
            k for k, seq_id in enumerate(inputs.sequence_ids(i)) if seq_id == 1
        ]
        has_answer = len(answer["answer_start"]) > 0 and len(answer["text"]) > 0

        # Nothing to label: no gold answer or no context tokens at all.
        if not context_tokens or not has_answer:
            start_positions.append(0)
            end_positions.append(0)
            continue

        context_start = context_tokens[0]
        context_end = context_tokens[-1]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends must be covered: an answer cut off by truncation would
        # otherwise get an end label on the token right after the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) that ends at or after
            # the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

def preprocess_validation_function(examples, tokenizer):
    """Tokenize validation question/context pairs into evaluation features.

    The tokenizer is asked to return overflowing tokens, so one example may
    yield several features. Each feature records the id of the example it
    came from under "example_id", and the offsets of every token that is not
    part of the context (sequence id != 1) are replaced with None.

    Args:
        examples: Batch (dict of lists) with "id", "question" and "context".
        tokenizer: Callable tokenizer whose result supports `pop`, item
            access/assignment and `sequence_ids(i)`.

    Returns:
        The tokenizer output, without "overflow_to_sample_mapping" and with
        "example_id" added.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    encoded = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # Maps each produced feature back to the index of its source example.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")

    source_ids = []
    for feature_idx in range(len(encoded["input_ids"])):
        source_ids.append(examples["id"][feature_to_example[feature_idx]])

        seq_ids = encoded.sequence_ids(feature_idx)
        masked_offsets = []
        for token_idx, span in enumerate(encoded["offset_mapping"][feature_idx]):
            # Keep offsets for context tokens only.
            masked_offsets.append(span if seq_ids[token_idx] == 1 else None)
        encoded["offset_mapping"][feature_idx] = masked_offsets

    encoded["example_id"] = source_ids
    return encoded

In [None]:
#@title Set Seed
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer passed unchanged to every seeding function.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # The CUDA generators are only seeded when a GPU is available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Fix the seed used for the rest of the notebook
set_seed(42)

In [None]:
#@title Evaluation 1 - Metrics
def compute_metrics(model, tokenizer, validation_dataset, dataset_name = "squad"):
    """Score a QA model with the `evaluate` question-answering evaluator.

    Args:
        model: Model (or pipeline) passed as `model_or_pipeline`.
        tokenizer: Tokenizer passed alongside the model.
        validation_dataset: Dataset passed as `data`.
        dataset_name: Passed to the evaluator as the metric name; the value
            "squad_v2" additionally turns on `squad_v2_format`.

    Returns:
        Whatever the evaluator's `compute` call returns.
    """
    qa_evaluator = evaluator("question-answering")
    compute_kwargs = {
        "model_or_pipeline": model,
        "tokenizer": tokenizer,
        "data": validation_dataset,
        "metric": dataset_name,
        "squad_v2_format": dataset_name == "squad_v2",
    }
    return qa_evaluator.compute(**compute_kwargs)

In [None]:
#@title Evaluation 2 - Trainer
def compute_metrics2(eval_pred):
    """Compute the "squad" metric from a Trainer evaluation prediction.

    Args:
        eval_pred: Unpacked into (logits, labels).

    Returns:
        The result of the metric's `compute` call on the arg-max of the
        logits and the labels.

    NOTE(review): the "squad" metric presumably expects lists of dicts
    (id + answer text), not arg-max index arrays, and the `trainer.evaluate()`
    output in this notebook contains no metric values — confirm whether this
    function is ever invoked and whether it would succeed.
    """
    squad_metric = evaluate.load("squad")
    raw_logits, gold_labels = eval_pred
    predicted_indices = np.argmax(raw_logits, axis=-1)
    return squad_metric.compute(predictions=predicted_indices, references=gold_labels)

In [None]:
#@title Set Seed
# NOTE(review): `set_seed` is already defined in an earlier cell of this
# notebook; this identical redefinition shadows it. Only the call below is
# needed to re-seed at this point.
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer passed unchanged to every seeding function.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # The CUDA generators are only seeded when a GPU is available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Re-seed right before the model and data are set up
set_seed(42)

In [None]:
#@title HF & WandB API setup
# Load variables from a .env file (if any) into the process environment,
# then read both secrets from it. A missing variable yields None.
load_dotenv()
hf_write_token = os.environ.get('HF_WRITE_TOKEN')
wandb_api_key = os.environ.get('WANDB_API_KEY')

In [None]:
#@title Setup WandB
# Authenticate with Weights & Biases using the key read from the environment.
wandb.login(key = wandb_api_key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mirmak-eren[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="Thesis-fine-tuning-for-experiments",
    # display name of the run
    # NOTE(review): TrainingArguments below sets run_name to "...-finetuned"
    # while this name ends in "-finetuning" — confirm which one is intended.
    name = "bert-base-cased-mysquad-unstructured-finetuning"
)

In [None]:
#@title Model & Dataset parameters
# Checkpoint name handed to `from_pretrained` and dataset id handed to `load_dataset`.
model_name = "bert-base-cased"
dataset_name = "martineden/structurized_squad"

In [None]:
#@title Load Model & Tokenizer
# Load the checkpoint with a question-answering head, plus its tokenizer.
# The log below reports the `qa_outputs` weights as newly initialized, i.e.
# the QA head is untrained until fine-tuning.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
#@title Load Dataset
# Load the dataset; the log below shows a train and a validation split.
my_dataset = load_dataset(dataset_name)

README.md:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/31.9M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
my_dataset["train"]

Dataset({
    features: ['squad_id', 'title', 'context', 'structurized_context', 'question', 'answer_text', 'answer_start_index', 'structurized_answer_start_index', 'answers', 'structurized_answers'],
    num_rows: 87599
})

In [None]:
my_dataset["validation"]

Dataset({
    features: ['squad_id', 'title', 'context', 'structurized_context', 'question', 'answer_text', 'answer_start_index', 'structurized_answer_start_index', 'answers', 'structurized_answers'],
    num_rows: 10570
})

# Preprocessing

Filter the dataset: keep only rows that have answers for both the unstructured and the structurized contexts.

In [None]:
# Filtering function
def filter_function(example):
    """Tell whether a row should be kept by the dataset filter.

    A row is kept when its 'structurized_answer_start_index' is not -1
    (presumably -1 marks an answer that was not found in the structurized
    context — confirm against the dataset card).
    """
    structurized_start = example['structurized_answer_start_index']
    return structurized_start != -1

In [None]:
# Apply the filter to both 'train' and 'validation' datasets.
# Note: this rebinds `my_dataset`, so the unfiltered splits are no longer
# reachable unless the dataset is reloaded.
my_dataset = my_dataset.filter(filter_function)

Filter:   0%|          | 0/87599 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
print(my_dataset["train"])
print(my_dataset["validation"])

Dataset({
    features: ['squad_id', 'title', 'context', 'structurized_context', 'question', 'answer_text', 'answer_start_index', 'structurized_answer_start_index', 'answers', 'structurized_answers'],
    num_rows: 67700
})
Dataset({
    features: ['squad_id', 'title', 'context', 'structurized_context', 'question', 'answer_text', 'answer_start_index', 'structurized_answer_start_index', 'answers', 'structurized_answers'],
    num_rows: 4832
})


In [None]:
# Drop the structurized / redundant answer columns from both splits; the
# remaining columns are shown by the print cell below.
columns_to_drop = [
    'structurized_context',
    'structurized_answers',
    'structurized_answer_start_index',
    'answer_text',
    'answer_start_index',
]
for split_name in ("train", "validation"):
    my_dataset[split_name] = my_dataset[split_name].remove_columns(columns_to_drop)

In [None]:
print(my_dataset["train"])
print(my_dataset["validation"])

Dataset({
    features: ['squad_id', 'title', 'context', 'question', 'answers'],
    num_rows: 67700
})
Dataset({
    features: ['squad_id', 'title', 'context', 'question', 'answers'],
    num_rows: 4832
})


In [None]:
# The preprocessing code reads the example identifier from an 'id' column,
# so rename 'squad_id' accordingly.
column_mapping = {
    'squad_id': 'id'
}

# Rename on the DatasetDict itself: this applies the mapping to every split
# and keeps `my_dataset` a DatasetDict instead of degrading it to a plain dict.
my_dataset = my_dataset.rename_columns(column_mapping)

In [None]:
#@title Preprocessing for Extractive QA Fine-Tuning
# set up train and val dataset
tokenized_squad_dataset = {}

# Training features: one feature per example, with start/end token labels added.
tokenized_squad_dataset["train"] = my_dataset["train"].map(
    lambda x: preprocess_train_function(x, tokenizer), batched=True
)

# Validation features: one example may produce several features (overflowing
# tokens), so the original columns must be dropped. Use the validation split's
# own column names rather than the train split's.
tokenized_squad_dataset["validation"] = my_dataset["validation"].map(
    lambda x: preprocess_validation_function(x, tokenizer),
    batched=True,
    remove_columns=my_dataset["validation"].column_names,
)

# Collator that batches (and pads, if needed) the tokenized features.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/67700 [00:00<?, ? examples/s]

Map:   0%|          | 0/4832 [00:00<?, ? examples/s]

In [None]:
print(tokenized_squad_dataset["train"])
print(tokenized_squad_dataset["validation"])

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'],
    num_rows: 67700
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 4970
})


# Fine-Tuning

In [None]:
#@title Training Arguments / Hyperparameters
# Hyperparameters for the fine-tuning run. Values not listed here keep the
# TrainingArguments defaults (printed in full by the next cell).
training_args = TrainingArguments(
    output_dir="bert-base-cased-unstructured-squad",
    # optimisation
    num_train_epochs=2,
    learning_rate=5e-5,
    lr_scheduler_type="constant",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=512,
    # logging
    logging_first_step=True,
    logging_steps=50,
    logging_dir='./logs',
    # checkpointing
    save_steps=100,
    # experiment tracking
    report_to="wandb",
    run_name="bert-base-cased-mysquad-unstructured-finetuned",
)

In [None]:
# Snapshot every hyperparameter (explicit and default) as a plain dictionary
training_args_dict = training_args.to_dict()

# One "name: value" line per hyperparameter
for name in training_args_dict:
    print(f"{name}: {training_args_dict[name]}")

output_dir: bert-base-cased-unstructured-squad
overwrite_output_dir: False
do_train: False
do_eval: False
do_predict: False
eval_strategy: no
prediction_loss_only: False
per_device_train_batch_size: 64
per_device_eval_batch_size: 512
per_gpu_train_batch_size: None
per_gpu_eval_batch_size: None
gradient_accumulation_steps: 1
eval_accumulation_steps: None
eval_delay: 0
torch_empty_cache_steps: None
learning_rate: 5e-05
weight_decay: 0.0
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1e-08
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: -1
lr_scheduler_type: constant
lr_scheduler_kwargs: {}
warmup_ratio: 0.0
warmup_steps: 0
log_level: passive
log_on_each_node: True
logging_dir: ./logs
logging_strategy: steps
logging_first_step: True
logging_steps: 50
logging_nan_inf_filter: True
save_strategy: steps
save_steps: 100
save_total_limit: None
save_safetensors: True
save_on_each_node: False
save_only_model: False
restore_callback_states_from_checkpoint: False
no_cuda: False
use_cpu: False
us

In [None]:
#@title Trainer setup
# Build the Trainer. The tokenizer is passed as `processing_class`: the
# `tokenizer=` keyword is deprecated in the transformers version installed
# above (4.48.0) and triggers the warning printed under this cell.
# NOTE(review): the tokenized validation set has no label columns, so
# `compute_metrics2` is presumably never called during evaluation — confirm.
trainer = Trainer(args = training_args,
                  model = model,
                  processing_class = tokenizer,
                  train_dataset = tokenized_squad_dataset["train"],
                  eval_dataset = tokenized_squad_dataset["validation"],
                  data_collator = data_collator,
                  compute_metrics = compute_metrics2
                )

  trainer = Trainer(args = training_args,


In [None]:
# Run fine-tuning. Per `training_args`, the loss is logged every 50 steps
# (plus the first step) and a checkpoint is saved every 100 steps.
trainer.train()

Step,Training Loss
1,5.915
50,3.3424
100,1.8554
150,1.5397
200,1.3819
250,1.2676
300,1.2535
350,1.1682
400,1.121
450,1.1185


TrainOutput(global_step=2116, training_loss=0.9732603472437434, metrics={'train_runtime': 1909.0843, 'train_samples_per_second': 70.924, 'train_steps_per_second': 1.108, 'total_flos': 2.65346856465408e+16, 'train_loss': 0.9732603472437434, 'epoch': 2.0})

In [None]:
#@title Evaluate performance
# NOTE(review): the output below contains only runtime/throughput numbers and
# no metric values; the actual EM/F1 scores come from the `compute_metrics`
# call in the next cell.
trainer.evaluate()

{'eval_runtime': 22.9594,
 'eval_samples_per_second': 216.469,
 'eval_steps_per_second': 0.436,
 'epoch': 2.0}

In [None]:
# Score the fine-tuned model on the raw (untokenized) validation split with
# the question-answering evaluator; uses the default "squad" metric.
result = compute_metrics(model, tokenizer, my_dataset["validation"])
print(result)

Device set to use cuda:0


Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

{'exact_match': 79.28394039735099, 'f1': 86.72720188038475, 'total_time_in_seconds': 52.29136182000002, 'samples_per_second': 92.4053195752093, 'latency_in_seconds': 0.010821887793874175}


In [None]:
#@title Save Fine-tuned model
# Upload the fine-tuned model to the Hugging Face Hub, authenticating with the
# write token read from the environment.
trainer.push_to_hub(commit_message = "BERT MySQUAD Unstructured Fine-Tuning Result" , token = hf_write_token)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/martineden/bert-base-cased-unstructured-squad/commit/4351b035c7172c680c1599fcdaeea3bfa6af1a29', commit_message='BERT MySQUAD Unstructured Fine-Tuning Result', commit_description='', oid='4351b035c7172c680c1599fcdaeea3bfa6af1a29', pr_url=None, repo_url=RepoUrl('https://huggingface.co/martineden/bert-base-cased-unstructured-squad', endpoint='https://huggingface.co', repo_type='model', repo_id='martineden/bert-base-cased-unstructured-squad'), pr_revision=None, pr_num=None)

In [None]:
# Close the W&B run started above; the run history/summary is printed below.
wandb.finish()

0,1
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁█▇▅▆▅▅▄▅▅▆▅▄▄▄▃▅▄▆▄▅▄▂▅▆▄▇▅▃▄▄▅▅▃▄▇▄▄▃▆
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▅▃▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/runtime,22.9594
eval/samples_per_second,216.469
eval/steps_per_second,0.436
total_flos,2.65346856465408e+16
train/epoch,2.0
train/global_step,2116.0
train/grad_norm,9.53811
train/learning_rate,5e-05
train/loss,0.6698
train_loss,0.97326
