In [1]:
from clearml import Task
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DefaultDataCollator
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

In [21]:
#task = Task.init(project_name="tutorial", task_name="qa_exp_tiny")

# Initialize or connect to an existing task
task = Task.current_task() or Task.init(project_name='tutorial', task_name='qa_exp', task_type=Task.TaskTypes.optimizer)


ClearML Task: created new task id=206dc60eaf284e4e9c7dbc4e3c7691d6
ClearML results page: https://app.clear.ml/projects/7ceabae35f3c457db0aa8403b4fad826/experiments/206dc60eaf284e4e9c7dbc4e3c7691d6/output/log


In [2]:
squad = load_dataset("squad", split="train[:500]")
squad = squad.train_test_split(test_size=0.2)
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 400
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 100
    })
})

In [None]:
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

In [23]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [24]:
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [25]:
model

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [6]:
data_collator = DefaultDataCollator()

peft_config = LoraConfig(
    task_type=TaskType.QUESTION_ANS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1, 
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin", "lin1", "lin2", "qa_outputs"],
)

In [26]:
data_collator = DefaultDataCollator()

peft_config = LoraConfig(
    task_type=TaskType.QUESTION_ANS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1, 
    target_modules=["query", "key", "value", "dense", "qa_outputs"],
)

In [27]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 671,250 || all params: 67,043,364 || trainable%: 1.0012176596627818


In [28]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [30]:
trainer.train()

Parameters must be of builtin type (Transformers_2/accelerator_config[AcceleratorConfig])


Epoch,Training Loss,Validation Loss
1,No log,2.26544
2,No log,1.610186
3,No log,1.477376


TrainOutput(global_step=75, training_loss=2.4598543294270834, metrics={'train_runtime': 208.5988, 'train_samples_per_second': 5.753, 'train_steps_per_second': 0.36, 'total_flos': 119460693196800.0, 'train_loss': 2.4598543294270834, 'epoch': 3.0})

In [31]:
# Save the model
trainer.save_model("./fine_tuned_tiny")



In [15]:
task.upload_artifact(name="fine_tuned_tiny", artifact_object="./fine_tuned_tiny")

Action failed <400/110: tasks.add_or_update_artifacts/v2.10 (Invalid task status: expected=created, status=completed)> (task=e93a14ed82364560909cc1c3a4716da5, artifacts=[{'key': 'notebook', 'type': 'custom', 'uri': 'https://files.clear.ml/tutorial/qa_exp.e93a14ed82364560909cc1c3a4716da5/artifacts/notebook/clearml_tutorial.ipynb', 'content_size': 12941, 'hash': '498d70d4bac5576aca8d68a8d605df19661094065a6c93853bd678cb2a4324bb', 'timestamp': 1726901381, 'type_data': {'preview': 'See `notebook preview` artifact'}, 'display_data': [('UPDATE', '2024-09-21 06:49:41')]}, {'key': 'notebook preview', 'type': 'custom', 'uri': 'https://files.clear.ml/tutorial/qa_exp.e93a14ed82364560909cc1c3a4716da5/artifacts/notebook%20preview/notebook_e93a14ed82364560909cc1c3a4716da5.html', 'content_size': 303651, 'hash': '6effef04f98c0a9e3857f9a9b6b020c3f76bc77fd6973decbec6d99faf81aca8', 'timestamp': 1726901384, 'type_data': {'preview': 'Click `FILE PATH` link', 'content_type': 'text/html'}, 'display_data': [('

True

In [32]:
task.close()