In [None]:
!pip install -qqq -U wandb --progress-bar off
import wandb
from huggingface_hub import login
from google.colab import userdata

# Log in to the Hugging Face Hub with the value stored under 'HF_TOKEN' in Colab's userdata.
login(userdata.get('HF_TOKEN'))

# Log in to Weights & Biases with the key stored under 'wandb' in Colab's userdata.
wb_token = userdata.get('wandb')
wandb.login(key=wb_token)

In [None]:
!pip install -q -U git+https://github.com/huggingface/transformers.git --progress-bar off
!pip install -q -U git+https://github.com/huggingface/accelerate.git --progress-bar off
!pip install datasets evaluate --progress-bar off

In [None]:
# Checkpoint that is fine-tuned below. Model cards: e.g. https://huggingface.co/google/flan-t5-small
# Other checkpoint ids kept from earlier runs:
#   "google-t5/t5-small", "google-t5/t5-base", "google/t5-v1_1-small", "google/flan-t5-small"
base_model_id = "google/flan-t5-base"

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the pretrained seq2seq model and its tokenizer from the same checkpoint.
model = T5ForConditionalGeneration.from_pretrained(base_model_id)
tokenizer = T5Tokenizer.from_pretrained(base_model_id)

# Register "<s>" as the separator token (generate_input places it between question and context).
num_added = tokenizer.add_special_tokens({'sep_token': "<s>"})

# Adding a token can grow the vocabulary. Enlarge the embedding matrix only when the
# tokenizer now holds more ids than the model has embedding rows; never shrink it, so
# checkpoints whose embedding matrix already has spare rows are left untouched.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Keep the cell output of the original: the number of tokens that were added.
num_added

In [None]:
# Sanity check: show the separator token currently set on the tokenizer.
print(tokenizer.sep_token)

In [None]:
from datasets import load_dataset

# Load the dataset; its 'train' and 'validation' splits are tokenized further down.
mrqa = load_dataset("enriquesaou/mrqa-squadded-sample")

In [None]:
# Display the loaded dataset object.
mrqa

In [None]:
# Default values for the preprocessing functions defined below.
max_length = 512  # forwarded to the tokenizer as `max_length`
stride = 128  # forwarded to the tokenizer as `stride`

In [None]:
# adapted from https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_seq2seq_qa.py
def generate_input(_question, _context):
    """Build the source text for one question/context pair.

    The result is the following pieces joined by single spaces: the literal
    "question:", the stripped question, ``tokenizer.sep_token``, the literal
    "context:" and the stripped context.
    """
    pieces = (
        "question:",
        _question.strip(),
        tokenizer.sep_token,
        "context:",
        _context.strip(),
    )
    return " ".join(pieces)

def preprocess_mrqa_batch(examples):
    """Turn a batch of examples into parallel lists of source and target texts.

    Args:
        examples: batch mapping with "question", "context" and "answers" columns;
            each entry of "answers" is indexed by "text".

    Returns:
        ``(inputs, targets)`` where ``inputs[i]`` is
        ``generate_input(question_i, context_i)`` and ``targets[i]`` is the first
        answer text of example ``i``, or "" when it has no answer texts.
    """
    questions, contexts, answers = (
        examples[column] for column in ("question", "context", "answers")
    )

    inputs = []
    for question, context in zip(questions, contexts):
        inputs.append(generate_input(question, context))

    targets = []
    for answer in answers:
        texts = answer["text"]
        targets.append(texts[0] if len(texts) > 0 else "")

    return inputs, targets

def preprocess_training(examples, _max_length=max_length, _stride=stride, padding="max_length", truncation=True):
    """Tokenize a batch of examples into model inputs plus ``labels``.

    Args:
        examples: batch accepted by ``preprocess_mrqa_batch``.
        _max_length: ``max_length`` given to the tokenizer for sources and targets.
        _stride: ``stride`` given to the tokenizer for sources and targets.
        padding: ``padding`` given to the tokenizer.
        truncation: ``truncation`` given to the tokenizer.

    Returns:
        The tokenizer output for the source texts, with a "labels" entry holding the
        target token ids in which every pad id has been replaced by -100.
    """
    sources, answers = preprocess_mrqa_batch(examples)

    # Sources and targets are tokenized with exactly the same settings.
    # NOTE(review): `stride` is forwarded but overflowing windows are not requested
    # here, so it presumably does not change the returned encodings - confirm.
    shared_kwargs = dict(
        max_length=_max_length,
        stride=_stride,
        padding=padding,
        truncation=truncation,
    )
    encoded = tokenizer(sources, **shared_kwargs)
    encoded_targets = tokenizer(text_target=answers, **shared_kwargs)

    # Replace tokenizer.pad_token_id in the labels to ignore padding in the loss
    pad_id = tokenizer.pad_token_id
    encoded["labels"] = [
        [-100 if token == pad_id else token for token in row]
        for row in encoded_targets["input_ids"]
    ]
    return encoded


In [None]:
def preprocess_validation(examples, _max_length=max_length, _stride=stride, padding="max_length", truncation=True):
    """Tokenize a batch for validation, yielding one feature per context window.

    Sources longer than ``_max_length`` are split into several overlapping windows
    (``return_overflowing_tokens=True``); every window records the id of the example
    it came from and receives a copy of that example's labels.

    NOTE(review): this relies on ``return_offsets_mapping`` and on the tokenizer
    returning "overflow_to_sample_mapping"; per the transformers tokenizer docs
    these are fast-tokenizer features. Confirm the notebook's tokenizer supports
    them before calling this function.

    Args:
        examples: batch accepted by ``preprocess_mrqa_batch`` that also has an "id" column.
        _max_length: ``max_length`` given to the tokenizer for sources and targets.
        _stride: number of overlapping tokens between consecutive windows of a source.
        padding: ``padding`` given to the tokenizer.
        truncation: ``truncation`` given to the tokenizer.

    Returns:
        The tokenizer output for the source windows (including the offset mapping),
        extended with "example_id" and "labels" lists aligned with the windows.
    """
    inputs, targets = preprocess_mrqa_batch(examples)

    model_inputs = tokenizer(inputs,
                             max_length=_max_length,
                             # Previously accepted but never forwarded, so windows had no overlap.
                             stride=_stride,
                             padding=padding,
                             truncation=truncation,
                             return_overflowing_tokens=True,
                             return_offsets_mapping=True)
    labels = tokenizer(text_target=targets,
                       max_length=_max_length,
                       padding=padding,
                       truncation=truncation)

    # Replace pad ids in the labels by -100 so padding is ignored in the loss.
    pad_id = tokenizer.pad_token_id
    masked_labels = [
        [(token if token != pad_id else -100) for token in label]
        for label in labels["input_ids"]
    ]

    # Maps every produced window back to the index of its example in this batch.
    sample_mapping = model_inputs.pop("overflow_to_sample_mapping")

    model_inputs["example_id"] = []
    labels_out = []
    for i in range(len(model_inputs["input_ids"])):
        sample_index = sample_mapping[i]
        model_inputs["example_id"].append(examples["id"][sample_index])
        labels_out.append(masked_labels[sample_index])

    model_inputs["labels"] = labels_out
    return model_inputs

In [None]:
# Tokenize both splits in batches, dropping the original columns from the result.
# NOTE(review): the validation split is also processed with preprocess_training
# (preprocess_validation is defined above but not used here) - confirm this is intended.
train_split = mrqa['train']
train_mrqa = train_split.map(
    preprocess_training,
    remove_columns=train_split.column_names,
    batched=True,
)

val_split = mrqa['validation']
val_mrqa = val_split.map(
    preprocess_training,
    remove_columns=val_split.column_names,
    batched=True,
)

# Show the raw dataset next to the two tokenized splits.
mrqa, train_mrqa, val_mrqa

#plots

In [None]:
# Tokenize a sample sentence and display what the tokenizer returns.
result = tokenizer('this is T5 tokenizer! is dog the same as dogs?')
result

In [None]:
# Show the tokenizer class and the vocabulary size it reports.
print(type(tokenizer))
print(tokenizer.vocab_size)

In [None]:
# Decode every token id on its own to see how the sample sentence was split.
# The loop variable is not called `id`: at notebook top level that would overwrite
# the builtin `id` for every later cell.
for token_id in result['input_ids']:
    print(tokenizer.decode(token_id))

# Decode the whole sequence at once, dropping special tokens.
print(tokenizer.decode(result['input_ids'], skip_special_tokens=True))

In [None]:
import matplotlib.pyplot as plt

def plot_data_lengths(tok_dataset):
    """Print and plot the ``input_ids`` lengths of a tokenized dataset.

    Prints the number of examples followed by the full list of lengths, then shows
    a 20-bin histogram of those lengths.

    Args:
        tok_dataset: iterable of examples, each indexable by 'input_ids'.
    """
    lengths = []
    for example in tok_dataset:
        lengths.append(len(example['input_ids']))

    print(len(lengths))
    # NOTE(review): this prints one number per example, which can be very long.
    print(lengths)

    # Plotting the histogram
    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=20, alpha=0.7, color='blue')
    plt.title('Distribution of Lengths of input_ids')
    plt.xlabel('Length of input_ids')
    plt.ylabel('Frequency')
    plt.show()


In [None]:
#plot_data_lengths(train_mrqa), plot_data_lengths(val_mrqa)

#train

In [None]:
# Name of the fine-tuned model; used as the training `output_dir` below.
my_model_id = "flan-t5-base-mrqa-16"

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Switch the model to training mode before the trainer is built.
model.train()

"""
# base
training_args = Seq2SeqTrainingArguments(
    output_dir=my_model_id,
    #eval_strategy="steps",
    #max_steps=5,
    do_train=True,
    fp16=True, #https://discuss.huggingface.co/t/training-loss-0-0-validation-loss-nan/27950/4
    eval_strategy="epoch",
    num_train_epochs=5, #overfit at >3
    learning_rate=3e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    gradient_accumulation_steps=3,
    weight_decay=0.01,
    report_to="wandb",
)
"""
"""
# flan base 2
training_args = Seq2SeqTrainingArguments(
    output_dir=my_model_id,
    #eval_strategy="steps",
    #max_steps=5,
    do_train=True,
    eval_strategy="epoch",
    num_train_epochs=5,#4
    learning_rate=3e-5,
    per_device_train_batch_size=6,#4
    per_device_eval_batch_size=6,#4
    gradient_accumulation_steps=3,#2
    weight_decay=0.01,
    report_to="wandb",
)
"""
"""
# flan small
training_args = Seq2SeqTrainingArguments(
    output_dir=my_model_id,
    #eval_strategy="steps",
    #max_steps=5,
    do_train=True,
    eval_strategy="epoch",
    num_train_epochs=7,
    learning_rate=3e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    gradient_accumulation_steps=3,
    weight_decay=0.01,
    report_to="wandb",
)
"""
"""
# flan base
training_args = Seq2SeqTrainingArguments(
    output_dir=my_model_id,
    #eval_strategy="steps",
    #max_steps=5,
    do_train=True,
    eval_strategy="epoch",
    num_train_epochs=2,
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    report_to="wandb",
)

"""
# small
# NOTE(review): this active configuration is labelled "small" while base_model_id
# and my_model_id both name a "base" checkpoint - confirm the label.
training_args = Seq2SeqTrainingArguments(
    output_dir=my_model_id,
    do_train=True,
    # schedule: evaluate once per epoch
    num_train_epochs=8,
    eval_strategy="epoch",
    # optimisation
    learning_rate=3e-5,
    weight_decay=0.01,
    # batching: 14 examples per device, gradients accumulated over 3 steps
    per_device_train_batch_size=14,
    gradient_accumulation_steps=3,
    per_device_eval_batch_size=14,
    # logging
    report_to="wandb",
)

# Both datasets come from the tokenization cell above.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_mrqa,
    eval_dataset=val_mrqa,
)

trainer.train()

In [None]:
# Upload the result of the training run through the trainer's push_to_hub().
trainer.push_to_hub()

In [None]:
!nvidia-smi

In [None]:
import torch, gc

# Collect unreachable Python objects first so that any CUDA tensors they still hold
# are released; only then can empty_cache() return those blocks, because it frees
# cached memory that is no longer occupied by tensors.
gc.collect()
torch.cuda.empty_cache()

In [None]:
!pip install numba

from numba import cuda
# Reset the current CUDA device through numba.
# NOTE(review): presumably a last-resort way to release GPU memory; objects that already
# live on the GPU (e.g. `model`) are likely unusable afterwards - confirm before running
# any further cells in the same kernel.
device = cuda.get_current_device()
device.reset()