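# Unstructured Knowledge Fine-Tuning

This notebook fine-tunes `bert-base-cased` for extractive question answering on SQuAD v1.1 with the Hugging Face `Trainer`, tracks the run in Weights & Biases, evaluates exact match / F1 on the validation split, and pushes the resulting model to the Hugging Face Hub.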

## Prerequisites

In [None]:
# %pip installs into the environment of the running kernel; -q keeps the install log out of the notebook.
# datasets / evaluate are pinned to the versions shown in this notebook's recorded install log.
%pip install -q transformers datasets==3.2.0 evaluate==0.4.3

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7

Experiment tracking: Weights & Biases

In [None]:
# %pip installs into the environment of the running kernel; -q keeps the install log out of the notebook.
%pip install -q wandb



In [None]:
#@title Imports
# Dataset
from datasets import load_dataset

# Model
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Fine-tuning
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Evaluation
import numpy as np
from evaluate import evaluator
# from datasets import load_metric

# Experiment tracking
import wandb

# API keys
from google.colab import userdata

## Helper functions

In [None]:
#@title Preprocess SQUAD dataset
def preprocess_train_function(examples, tokenizer):
    """Tokenize a batch of SQuAD-style examples and attach answer-span labels.

    Every question is stripped of surrounding whitespace, paired with its
    context and encoded to exactly 384 tokens (only the context is truncated,
    everything is padded). The first gold answer of each example is then
    converted from character offsets to token indices.

    Args:
        examples: Batch with "question", "context" and "answers" columns. Each
            entry of "answers" holds parallel "text" and "answer_start" lists.
        tokenizer: Callable tokenizer whose result supports item access,
            provides "offset_mapping" and exposes ``sequence_ids(i)``
            (presumably a Hugging Face fast tokenizer - confirm at call site).

    Returns:
        The tokenizer output with "start_positions" and "end_positions" added.
        Both labels are 0 when the example has no answer (e.g. SQuAD v2) or
        when the answer is not fully contained in the truncated context.
    """
    inputs = tokenizer(
        [q.strip() for q in examples["question"]],
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs["offset_mapping"]
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, (offset, answer) in enumerate(zip(offset_mapping, answers)):
        # Default label (0, 0): "no answer inside this feature".
        start_idx, end_idx = 0, 0

        # Unanswerable examples carry empty answer lists; keep the default.
        if answer["answer_start"]:
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
            sequence_ids = inputs.sequence_ids(i)
            num_tokens = len(sequence_ids)

            # Locate the first and last token that belong to the context
            # (sequence id 1); the bounds checks guard against encodings
            # without a trailing special/padding token.
            idx = 0
            while idx < num_tokens and sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while idx < num_tokens and sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            # Only label the span when the WHOLE answer survived truncation.
            # Comparing the context start with start_char and the context end
            # with end_char rejects partially truncated answers, which would
            # otherwise get an end label pointing at the separator token.
            answer_inside = (
                context_start <= context_end
                and offset[context_start][0] <= start_char
                and offset[context_end][1] >= end_char
            )
            if answer_inside:
                # Last context token that starts at or before the answer start.
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_idx = idx - 1

                # First context token that ends at or after the answer end.
                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_idx = idx + 1

        start_positions.append(start_idx)
        end_positions.append(end_idx)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

def preprocess_validation_function(examples, tokenizer):
    """Tokenize a validation batch into features that remember their source example.

    Questions are whitespace-stripped and encoded together with their contexts
    (384 tokens, context-only truncation, overflowing tokens returned as extra
    features, padded to full length). For every resulting feature the id of the
    originating example is recorded, and offsets of tokens that do not belong
    to the context (sequence id other than 1) are replaced by ``None``.

    Args:
        examples: Batch with "id", "question" and "context" columns.
        tokenizer: Callable tokenizer whose result provides "input_ids",
            "offset_mapping", "overflow_to_sample_mapping" and ``sequence_ids(i)``.

    Returns:
        The tokenizer output without "overflow_to_sample_mapping", with masked
        "offset_mapping" entries and a new "example_id" list.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    encoded = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    feature_to_example = encoded.pop("overflow_to_sample_mapping")

    example_ids = []
    feature_count = len(encoded["input_ids"])
    for feature_idx in range(feature_count):
        # One example can yield several features; keep the link back to it.
        example_ids.append(examples["id"][feature_to_example[feature_idx]])

        segment_ids = encoded.sequence_ids(feature_idx)
        masked_offsets = []
        for token_idx, span in enumerate(encoded["offset_mapping"][feature_idx]):
            # Only context tokens keep their character span.
            masked_offsets.append(span if segment_ids[token_idx] == 1 else None)
        encoded["offset_mapping"][feature_idx] = masked_offsets

    encoded["example_id"] = example_ids
    return encoded

In [None]:
#@title Evaluation 1 - Metrics
def compute_metrics(model, tokenizer, validation_dataset, dataset_name = "squad"):
    """Score a question-answering model with the `evaluate` task evaluator.

    Args:
        model: Model (or pipeline) handed to the evaluator as ``model_or_pipeline``.
        tokenizer: Tokenizer forwarded to the evaluator.
        validation_dataset: Dataset forwarded to the evaluator as ``data``.
        dataset_name: Name of the metric to compute; the SQuAD v2 input format
            is enabled only when this equals "squad_v2".

    Returns:
        Whatever the evaluator's ``compute`` call returns.
    """
    qa_evaluator = evaluator("question-answering")
    return qa_evaluator.compute(
        model_or_pipeline=model,
        tokenizer=tokenizer,
        data=validation_dataset,
        metric=dataset_name,
        squad_v2_format=(dataset_name == "squad_v2"),
    )

In [None]:
#@title Evaluation 2 - Trainer
def compute_metrics2(eval_pred):
    """Token-position accuracy of predicted answer spans, for `Trainer(compute_metrics=...)`.

    The SQuAD text metric cannot be computed here: it needs example ids and
    answer strings, whereas this hook only receives logits and label positions.
    Text-level exact match / F1 are produced by `compute_metrics` instead.

    Args:
        eval_pred: Pair ``(predictions, labels)``. Assumes predictions is
            ``(start_logits, end_logits)`` and labels is
            ``(start_positions, end_positions)``, which is what the Trainer is
            expected to pass for question-answering heads - TODO confirm
            against the installed transformers version.

    Returns:
        Dict with "start_accuracy", "end_accuracy" and "span_accuracy"
        (fraction of examples whose start AND end token are both correct).

    Raises:
        ValueError: If no labels are supplied, i.e. the evaluation features
            carry no start/end positions.
    """
    predictions, labels = eval_pred
    if labels is None:
        raise ValueError(
            "compute_metrics2 needs start/end position labels in the evaluation dataset"
        )

    start_logits, end_logits = predictions[0], predictions[1]
    start_labels = np.asarray(labels[0])
    end_labels = np.asarray(labels[1])

    # Avoid a NaN (mean of an empty array) when there is nothing to score.
    if start_labels.size == 0:
        return {"start_accuracy": 0.0, "end_accuracy": 0.0, "span_accuracy": 0.0}

    # Most likely start / end token per example.
    start_hit = np.argmax(start_logits, axis=-1) == start_labels
    end_hit = np.argmax(end_logits, axis=-1) == end_labels

    return {
        "start_accuracy": float(np.mean(start_hit)),
        "end_accuracy": float(np.mean(end_hit)),
        "span_accuracy": float(np.mean(start_hit & end_hit)),
    }

In [None]:
#@title HF & WandB API setup
# Read both credentials through Colab's `userdata` helper so no key is hard-coded in the notebook.
# hf_write_token is used when pushing the model to the Hub; wandb_api_key when logging in to W&B.
hf_write_token = userdata.get('HF_TOKEN_WRITE')
wandb_api_key = userdata.get('WANDB_API_KEY')

In [None]:
#@title Setup WandB
# Authenticate against Weights & Biases with the key fetched in the previous cell.
wandb.login(key = wandb_api_key)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Start a new W&B run to track this notebook; the training arguments below set
# report_to="wandb", so the training logs are presumably attached to this run.
wandb.init(
    # W&B project under which this run is logged
    project="bert-squad-qa-fine-tuning"
)

[34m[1mwandb[0m: Currently logged in as: [33mirmak-eren[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
#@title Model & Dataset parameters
# Checkpoint name passed to `from_pretrained` for both the model and the tokenizer.
model_name = "bert-base-cased"
# Dataset name passed to `load_dataset`.
dataset_name = "squad"

In [None]:
#@title Load Model & Tokenizer
# Load the checkpoint with a question-answering head. The recorded output reports that
# qa_outputs.weight / qa_outputs.bias are newly initialised, i.e. the head is untrained
# until the fine-tuning below has run.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# Tokenizer matching the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
#@title Load Dataset
# Download the dataset; the recorded output shows a train split (87599 examples)
# and a validation split (10570 examples).
squad_dataset = load_dataset(dataset_name)

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

## Preprocessing

In [None]:
#@title Preprocessing for Extractive QA Fine-Tuning
# Tokenize both splits; the results are collected in a plain dict keyed by split name.
tokenized_squad_dataset = {}

# Training features: tokenized inputs plus start/end token labels.
# The original columns are kept alongside the new ones.
tokenized_squad_dataset["train"] = squad_dataset["train"].map(
    lambda x: preprocess_train_function(x, tokenizer), batched=True
)

# Validation features: long contexts overflow into several features, so the row
# count changes and the original columns have to be dropped. Use the validation
# split's OWN column names rather than relying on them matching the train split.
tokenized_squad_dataset["validation"] = squad_dataset["validation"].map(
    lambda x: preprocess_validation_function(x, tokenizer),
    batched=True,
    remove_columns=squad_dataset["validation"].column_names,
)

# Collator handed to the Trainer to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
# Inspect the tokenized training split (columns and row count).
tokenized_squad_dataset["train"]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'],
    num_rows: 87599
})

In [None]:
# Inspect the tokenized validation split; it has more rows than the raw split
# because overflowing contexts are split into several features.
tokenized_squad_dataset["validation"]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 10798
})

## Fine-Tuning

In [None]:
#@title Training Arguments / Hyperparameters
training_args = TrainingArguments(
    # Where checkpoints are written and how the run is named / reported
    output_dir="bert-base-cased-squadv1-finetuned",
    run_name="bert-base-cased-squadv1-finetuned",
    report_to="wandb",
    # Optimisation
    num_train_epochs=2,
    learning_rate=5e-5,
    lr_scheduler_type="constant",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=512,
    # Logging and checkpoint cadence (in optimisation steps)
    logging_dir="./logs",
    logging_first_step=True,
    logging_steps=50,
    save_steps=100,
)

In [None]:
#@title Trainer setup
# Wire model, data, collator and metric hook into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_squad_dataset["train"],
    eval_dataset=tokenized_squad_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics2,
)

In [None]:
# Run the fine-tuning; the recorded run finished 2 epochs in 2738 optimisation steps.
trainer.train()



Step,Training Loss
1,5.9968
50,3.4775
100,2.0332
150,1.7107
200,1.543
250,1.4594
300,1.3814
350,1.3182
400,1.2431
450,1.2361


TrainOutput(global_step=2738, training_loss=1.0551023694004256, metrics={'train_runtime': 2476.0324, 'train_samples_per_second': 70.758, 'train_steps_per_second': 1.106, 'total_flos': 3.4334001889975296e+16, 'train_loss': 1.0551023694004256, 'epoch': 2.0})

In [None]:
#@title Evaluate performance
# Run the Trainer's evaluation loop on the tokenized validation features.
# NOTE(review): the recorded output contains only runtime statistics and no metric
# values - presumably because the validation features carry no start/end position
# labels, so no loss or metrics are computed. Confirm before relying on this cell.
trainer.evaluate()

{'eval_runtime': 50.3677,
 'eval_samples_per_second': 214.383,
 'eval_steps_per_second': 0.437,
 'epoch': 2.0}

In [None]:
# Text-level exact match / F1 on the raw validation split. The configured
# dataset_name is passed through so the metric always matches the loaded dataset
# instead of silently falling back to the "squad" default.
result = compute_metrics(model, tokenizer, squad_dataset["validation"], dataset_name)
print(result)

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

{'exact_match': 80.09460737937559, 'f1': 87.57757654304311, 'total_time_in_seconds': 110.98169544600023, 'samples_per_second': 95.24093101589882, 'latency_in_seconds': 0.010499687364806077}


In [None]:
#@title Save Fine-tuned model
# Publish the trainer's output to the Hugging Face Hub using the write token read earlier;
# the recorded output shows model.safetensors and training_args.bin being uploaded.
trainer.push_to_hub(commit_message = "BERT-QA SQUADv1 Fine-Tuning Result" , token = hf_write_token)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/martineden/bert-base-cased-squadv1-finetuned/commit/e711f911bacd035368bb2c43f838f366bbc397de', commit_message='BERT-QA SQUADv1 Fine-Tuning Result', commit_description='', oid='e711f911bacd035368bb2c43f838f366bbc397de', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Close the W&B run opened with wandb.init; the recorded output shows the run history and summary.
wandb.finish()

VBox(children=(Label(value='0.004 MB of 0.025 MB uploaded\r'), FloatProgress(value=0.151576175354349, max=1.0)…

0,1
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁██▆▇▅██▅▆▆▅▆▆▅▅▆▅▆▅▅▅▇▆▅▆▅▅▆▆▇▅▃▄▅▄▅▆▆▅
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▅▃▂▂▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/runtime,50.3677
eval/samples_per_second,214.383
eval/steps_per_second,0.437
total_flos,3.4334001889975296e+16
train/epoch,2.0
train/global_step,2738.0
train/grad_norm,7.48677
train/learning_rate,5e-05
train/loss,0.8266
train_loss,1.0551
