In [1]:
import os
from transformers import AutoTokenizer,AutoModelForQuestionAnswering, TrainingArguments, Trainer,AutoConfig
from transformers import DefaultDataCollator
from transformers import TrainingArguments
from transformers import Trainer
from datasets import load_dataset
import torch
from transformers import DistilBertModel
from transformers import PreTrainedModel,PretrainedConfig
from transformers.modeling_outputs import QuestionAnsweringModelOutput
import numpy as np
import re
import string
import collections
from datasets import DatasetDict
import ast
from transformers import EarlyStoppingCallback

In [8]:
# Fixed seed so the train/eval split is identical after every kernel restart;
# without it, eval_loss (and therefore early stopping) is not comparable
# between runs.
SPLIT_SEED = 42

# Load the whole CSV as one split, then hold out 10% of the rows. The result
# is a DatasetDict whose "train" and "test" splits are used for training and
# evaluation further down.
raw_dataset = load_dataset("csv", data_files="/content/Task2dataSet_train.csv",split="train[:]")
my_dataset = raw_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Tokenizer for the same checkpoint the model's encoder is loaded from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")




Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-5841f28079bd0717/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-5841f28079bd0717/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
# Adapted from https://huggingface.co/docs/transformers/tasks/question_answering
def preprocess_function(examples):
    """Tokenize question/context pairs and derive token-level answer labels.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: A batch (dict of equal-length lists) with the keys
            ``"question"``, ``"context"`` and ``"answers"``. Every ``answers``
            entry is a string containing a Python literal of a dict with
            ``"answer_start"`` and ``"text"`` lists; only the first element
            of each list is used.

    Returns:
        The tokenizer output (without ``"offset_mapping"``) extended with:

        * ``"start_positions"`` / ``"end_positions"``: token indices of the
          answer span, or ``0`` for both when the answer is not fully
          contained in the (possibly truncated) context window;
        * ``"context"`` and ``"answer"``: the untouched input columns.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # The answers column is stored as text, so parse the dict literal.
        answer = ast.literal_eval(answers[i])
        start_char = int(answer["answer_start"][0])
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context
        # (sequence id 1). The second scan is bounded so a context that runs
        # to the very last token cannot index past the end.
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # The window must cover BOTH ends of the answer: an answer that is
        # only partly kept after truncation would otherwise get an end label
        # pointing at the token after the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token whose start offset is <= start_char.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) whose end offset is
            # >= end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    inputs["context"] = examples["context"]
    inputs["answer"] = answers
    return inputs


In [10]:

class DistillBERTQA(PreTrainedModel):
    """DistilBERT encoder with a linear head for extractive question answering.

    The encoder is always loaded from the ``distilbert-base-uncased``
    checkpoint; ``config`` is only forwarded to ``PreTrainedModel``. The head
    maps every token's hidden state to two values (start and end logit).
    """

    def __init__(self,config: PretrainedConfig):
        super().__init__(config)
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        # Size the head from the loaded encoder instead of hard-coding 768,
        # so it stays correct if the checkpoint above is ever swapped.
        hidden_size = self.distilbert.config.dim
        self.qa_outputs = torch.nn.Linear(hidden_size, 2)
        self.dropout = torch.nn.Dropout(0.3)

    def forward(self, input_ids=None, attention_mask=None,start_positions=None,end_positions=None,return_dict=None):
        """Compute start/end logits and, when labels are given, the span loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            start_positions: Optional gold start token indices.
            end_positions: Optional gold end token indices.
            return_dict: When truthy, return a ``QuestionAnsweringModelOutput``;
                otherwise (including the default ``None``) return a tuple.

        Returns:
            Tuple form: ``(loss?, start_logits, end_logits, *extra encoder
            outputs)`` where ``loss`` is present only if both label tensors
            were supplied. Dict form: ``QuestionAnsweringModelOutput`` with
            the same information.
        """
        distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = distilbert_output[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.qa_outputs(hidden_states)
        # Split the last dimension (size 2) into separate start / end logits.
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()  # (bs, max_query_len)
        end_logits = end_logits.squeeze(-1).contiguous()  # (bs, max_query_len)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # On multi-GPU the labels may carry an extra trailing dimension.
            if start_positions.dim() > 1:
                start_positions = start_positions.squeeze(-1)
            if end_positions.dim() > 1:
                end_positions = end_positions.squeeze(-1)
            # Positions beyond the model input are clamped to an index that
            # the loss is told to ignore, so they contribute nothing.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        # NOTE: return_dict=None is treated as False here (tuple output).
        if not return_dict:
            output = (start_logits, end_logits) + distilbert_output[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=distilbert_output.hidden_states,
            attentions=distilbert_output.attentions,
        )


In [11]:
# Run the QA preprocessing over every split in batches and drop the original
# CSV columns from the result.
source_columns = my_dataset["train"].column_names
tokenized_data = my_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=source_columns,
)

# Instantiate the custom QA model from the stock DistilBERT configuration.
config = AutoConfig.from_pretrained("distilbert-base-uncased")
model = DistillBERTQA(config)

# Collator used by the Trainer to batch the tokenized examples.
data_collator = DefaultDataCollator()

  0%|          | 0/41 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:

# Tag appended to the output directory name to identify this run.
LOG_NAME = "task2_50epochs_2e-5_FINAL_ES"

training_args = TrainingArguments(
    # NOTE(review): there is no path separator between "training_dir" and
    # LOG_NAME, so the directory is "/content/training_dirtask2_..." - kept
    # unchanged in case existing checkpoints live there; confirm intent.
    output_dir="/content/training_dir"+ LOG_NAME,
    # Evaluate and checkpoint once per epoch; both strategies are set to the
    # same value alongside load_best_model_at_end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Upper bound only: early stopping below normally ends the run sooner.
    num_train_epochs=50,
    weight_decay=0.01,
    # Track the lowest eval loss and restore that checkpoint after training.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    # Configured here rather than patched onto trainer.args afterwards, so
    # all run settings live in one place.
    save_total_limit=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Stop once eval_loss has failed to improve for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()
# Persist the final (best) model and the trainer state to output_dir.
trainer.save_model()
trainer.save_state()
