# Huggingface transformers - Fine tuning a pretrained model

Note: The Auto class automatically uses pipeline() to retrieve the model architecture
Note: Every time a model is initiated, a tokenizer must be initiated with the same model name to ensure that the rules for tokenization are the same as with the pretrained model. 

### Importing modules

In [45]:
from transformers import pipeline
from transformers import AutoTokenizer
from datasets import load_dataset

### Loading dataset

In [48]:
ds = load_dataset("microsoft/orca-math-word-problems-200k")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 200035
    })
})


### Tokenizer

In [50]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
    

def tokenize_function(examples):
    #return tokenizer(examples["text"], padding="max_length", truncation=True)
    concatenated_text = examples["question"] + examples["answer"]
    return tokenizer(concatenated_text, padding="max_length", truncation=True)

# examples = batch of data
# the tokenizer function applies the tokenizer to the text field of each example, converting them into token IDs
# truncation = true ensures that all sequences longer than the max length are truncated

tokenized_datasets = ds.map(tokenize_function, batched=True)

#creating a smaller dataset for fine tuning to reduce training time
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
#shuffle ensures randomness, select creates a smaller dataset


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/200035 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ArrowInvalid: Column 2 named input_ids expected length 1000 but got length 2000

### Train with Pytorch

In [None]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")

# Hyperparameters
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer") #there are many more hyperparameters that can be set
# e.g eval_strategy="epoch" ensures that the evaluation metric is returned at the end of each epoch

#Trainer does not evaluate model performance during training!!!

#Evaluate
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1) #converts logits to predictions
    return metric.compute(predictions=predictions, references=labels) #compute.metric calculates the accuracy of the predictions

# Trainer
# Creating a Trainer object with the model, training arguments, dataset, etc
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset
)
    #compute_metrics=compute_metrics,

#Fine-tuning
trainer.train()

In [None]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  # Tokenize
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  device = model.device
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

In [None]:
test_text = "Jungkook is the 5th place. Find the number of people who crossed the finish line faster than Jungkook."
print("Question input (test):", test_text)
print("Model's answer: ")
print(inference(test_text, model, tokenizer))

In [60]:
from datasets import load_dataset, load_metric
dataset = load_dataset("microsoft/orca-math-word-problems-200k")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 200035
    })
})

## Huggingface Question Answering Tutorial 

In [63]:
# This flag is the difference between SQUAD v1 or 2 (if you're using another dataset, it indicates if impossible
# answers are allowed or not).
squad_v2 = False
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

# LOADING DATASET
from datasets import load_dataset, load_metric
dataset = load_dataset("microsoft/orca-math-word-problems-200k")

# ENSURING DATASET IS SPLIT
# If there's no validation split, create it
if "validation" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
    train_dataset = dataset['train']
    eval_dataset = dataset['test']
else:
    train_dataset = dataset['train']
    eval_dataset = dataset['validation']
    
#PREPROCESSING TRAINING DATA
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast) #ensures that the tokenizer being used is a fast tokenizer

max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]
    # Ensure that the tokenizer is prepared for the structure of your data
    pad_on_right = tokenizer.padding_side == "right"
    
    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

tokenized_datasets = dataset.map(prepare_train_features, batched=True)

#FINE-TUNING
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

from transformers import default_data_collator

data_collator = default_data_collator

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)
trainer.train()
trainer.save_model("test-squad-trained")

# TESTING
from transformers import pipeline       
# Load the fine-tuned model and tokenizer using the pipeline
question_answerer = pipeline("question-answering", model="path/to/your/fine-tuned-model", tokenizer="test-squad-trained")

# Define the question and context
question = "What is the capital of France?"
context = "France, officially the French Republic, is a country whose territory consists of metropolitan France in Western Europe and several overseas regions and territories. The capital of France is Paris."

# Get the answer using the pipeline
result = question_answerer(question=question, context=context)

# Print the result
print(f"Question: {question}")
print(f"Answer: {result['answer']}")

Map:   0%|          | 0/180031 [00:00<?, ? examples/s]

TypeError: 'BatchEncoding' object is not callable