In [1]:
from lm_finetuning_playground.dataset import moliere
from lm_finetuning_playground.dataset import personal
from lm_finetuning_playground.dataset.tools import extract_question_and_answers

In [2]:
from peft import LoraConfig, TaskType, PeftModel
from peft import get_peft_model

from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM

from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
from typing import List
import pandas as pd

In [4]:
import torch
# Select the compute device used by `.to(device)` calls in later cells.
# MPS (Apple Silicon) is checked first so behaviour on macOS is unchanged;
# the CUDA branch lets the notebook use an NVIDIA GPU instead of silently
# falling back to the CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'mps'

# Pick a model

In [5]:
# Identifier of the pretrained checkpoint. Later cells load the tokenizer and
# the seq2seq model from it, and reuse its second path component to name the
# training output directory.
MODEL_ID = "google/flan-t5-base"

In [6]:
from transformers import AutoTokenizer

# Tokenizer matching MODEL_ID. It is a notebook-level global shared by the
# length measurement, `preprocess_function`, the data collator, the Trainer
# and `give_answer`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Load data
Load a few small toy datasets to experiment with PEFT.

In [7]:
# Load the raw corpora as lists of sentences.
texts_moliere = moliere.get_moliere_sentences() # pure Molière text, for training a causal LM that talks like Molière's characters
texts_perso = personal.load_personal_sentences() # personal sentences, for a classifier that tells Molière's lines apart from the author's own

In [8]:
# Prepare the question-answering dataset from the Molière sentences.
# NOTE(review): presumably `questions[i]` is answered by `answers[i]`; `ids` is
# not used by any visible cell — confirm against `extract_question_and_answers`.
questions, answers, ids = extract_question_and_answers(texts_moliere)

### Prepare datasets

In [9]:
# Classification frame: Molière sentences get label 1, personal sentences 0.
# The two label runs are built separately and then concatenated; multiplying by
# the length of the *combined* list would label every row 1.
clf_texts = texts_moliere + texts_perso
clf_labels = [1] * len(texts_moliere) + [0] * len(texts_perso)
assert len(clf_texts) == len(clf_labels), "every text needs exactly one label"
clf_df = pd.DataFrame(data={"text": clf_texts, "label": clf_labels})

# Question/answer frame: one row per (question, answer) pair.
qna_df = pd.DataFrame(data={"question": questions, "answer": answers})

In [10]:
# Convert both pandas frames into `Dataset` objects in one pass.
clf_ds, qna_ds = (Dataset.from_pandas(frame) for frame in (clf_df, qna_df))

In [11]:
# Label names for the classification dataset. Molière sentences carry label 1
# in `clf_df` (personal sentences are meant to be 0), so 1 must map to
# "MOLIERE". `label2id` is derived from `id2label` so the two cannot disagree.
id2label = {0: "OTHER", 1: "MOLIERE"}
label2id = {name: idx for idx, name in id2label.items()}

In [12]:
# Peek at the first question/answer pair as a sanity check.
qna_ds[0]

{'question': 'charmante Élise, vous devenez mélancolique, après les obligeantes assurances que vous avez eu la bonté de me donner de votre foi ?',
 'answer': 'Je vous vois soupirer, hélas !'}

In [13]:
from datasets import concatenate_datasets
import numpy as np
# Measure tokenized lengths over the whole Q&A dataset. The resulting maxima
# are the `max_length` limits used when padding/truncating in preprocessing.

# Source side: tokenize every question with truncation enabled and no padding.
tokenized_inputs = qna_ds.map(
    lambda batch: tokenizer(batch["question"], truncation=True),
    batched=True,
    remove_columns=["answer"],
)
input_lenghts = list(map(len, tokenized_inputs["input_ids"]))
# The longest question sets the limit (no percentile cut-off is applied).
max_source_length = max(input_lenghts)
print(f"Max source length: {max_source_length}")

# Target side: same measurement on the answers.
tokenized_targets = qna_ds.map(
    lambda batch: tokenizer(batch["answer"], truncation=True),
    batched=True,
    remove_columns=["answer"],
)
target_lenghts = list(map(len, tokenized_targets["input_ids"]))
# The longest answer sets the limit (no percentile cut-off is applied).
max_target_length = max(target_lenghts)
print(f"Max target length: {max_target_length}")

Map: 100%|█████████████████████████████████████████████████████████████████████████████████| 2415/2415 [00:00<00:00, 112565.92 examples/s]


Max source length: 167


Map: 100%|█████████████████████████████████████████████████████████████████████████████████| 2415/2415 [00:00<00:00, 120256.96 examples/s]

Max target length: 297





In [14]:
def preprocess_function(sample, padding="max_length", prefix=""):
    """Tokenize a batch of question/answer pairs into seq2seq training features.

    Args:
        sample: Batch mapping with a "question" and an "answer" entry, each a
            list of strings (the shape `Dataset.map(..., batched=True)` passes).
        padding: Padding strategy forwarded to the tokenizer for both inputs
            and targets. With "max_length", padded label positions are
            replaced by -100.
        prefix: Text prepended to every question before tokenization. Defaults
            to "" (no prefix), which is what this function always did; pass
            e.g. "answer: " to experiment with a task prefix.

    Returns:
        The tokenizer output for the questions, with an added "labels" entry
        holding the tokenized answers.
    """
    inputs = [prefix + item for item in sample["question"]]

    # Questions are truncated/padded to the longest question measured earlier.
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Answers are tokenized as targets via the `text_target` keyword argument.
    labels = tokenizer(text_target=sample["answer"], max_length=max_target_length, padding=padding, truncation=True)

    # When padding to a fixed length here, swap pad token ids for -100 in the
    # labels so that padded positions are ignored by the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        labels["input_ids"] = [
            [(-100 if token_id == pad_id else token_id) for token_id in label]
            for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [15]:
# Replace the raw text columns with token ids, attention masks and labels.
# NOTE: this rebinds `qna_ds` and drops "question"/"answer", so the cell cannot
# be re-run on its own — rebuild `qna_ds` from `qna_df` first.
qna_ds = qna_ds.map(preprocess_function, batched=True, remove_columns=["question", "answer"])

Map: 100%|███████████████████████████████████████████████████████████████████████████████████| 2415/2415 [00:00<00:00, 7212.49 examples/s]


In [16]:
# Hold out 20% of the tokenized pairs for evaluation. The fixed seed makes the
# shuffled split (and therefore the reported losses) reproducible across
# kernel restarts.
tokenized_dataset = qna_ds.train_test_split(train_size=0.8, seed=42)

# Try PEFT

In [17]:
# Load the pretrained seq2seq checkpoint, then move it to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
model = model.to(device)

## LoRA

In [18]:
# LoRA adapter settings for a seq2seq language model, configured for training
# (inference_mode=False). The rank r=4 is also encoded in the "lora-4" output
# directory name used by the training arguments.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
)

In [19]:
# Wrap the base model with the LoRA configuration, registering the adapter
# under the name "moliere", then report how many parameters are trainable.
model_ft = get_peft_model(model, peft_config, adapter_name="moliere")
model_ft.print_trainable_parameters()

trainable params: 442,368 || all params: 248,020,224 || trainable%: 0.17835964860672007


In [20]:
# Checkpoints are written under ./<model name>/lora-4, where <model name> is
# the part of MODEL_ID after the slash.
lora_output_dir = f"./{MODEL_ID.split('/')[1]}/lora-4"

# Evaluate and checkpoint once per epoch.
# NOTE(review): the recorded run logged "Could not locate the best model at
# .../pytorch_model.bin", so `load_best_model_at_end` apparently did not reload
# a checkpoint for this adapter — confirm against the installed
# transformers/peft versions.
training_args = TrainingArguments(
    output_dir=lora_output_dir,
    num_train_epochs=2,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [21]:
from transformers import DataCollatorForSeq2Seq

# Label value used for padded positions; `preprocess_function` writes the same
# value (-100) into padded label slots so they are ignored in the loss.
label_pad_token_id = -100
# Batch collator for seq2seq training. It receives the base `model` (not the
# PEFT-wrapped `model_ft`) and pads batches to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)


In [22]:
# Assemble the training loop: the LoRA-wrapped model, the 80/20 train/test
# split produced by `train_test_split`, and the seq2seq collator defined above.
trainer = Trainer(
    model=model_ft,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [23]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.8686,3.12571
2,3.49,3.091448


Could not locate the best model at ./flan-t5-base/lora-4/checkpoint-1932/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=1932, training_loss=3.6073533842035457, metrics={'train_runtime': 696.6427, 'train_samples_per_second': 5.547, 'train_steps_per_second': 2.773, 'total_flos': 869909712666624.0, 'train_loss': 3.6073533842035457, 'epoch': 2.0})

# Eval

In [24]:
def give_answer(model, questions: List[str], max_new_tokens: int = 10):
    """Generate and print one sampled answer per question.

    Args:
        model: Model exposing `generate` (e.g. the fine-tuned `model_ft`).
        questions: Prompts to answer. May have different lengths; an empty
            list prints nothing.
        max_new_tokens: Upper bound on generated tokens per answer. Defaults
            to 10, the value previously hard-coded.

    Returns:
        None. Each "question --> answer" pair is printed.
    """
    if not questions:
        return

    # Pad to the longest question so a batch of unequal-length prompts can be
    # stacked into one tensor; the attention mask tells `generate` which
    # positions are padding.
    encoded = tokenizer(questions, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(
        input_ids=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        max_new_tokens=max_new_tokens,
        do_sample=True,  # nucleus sampling: output varies from call to call
        top_p=0.9,
    )
    answers = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
    for question, answer in zip(questions, answers):
        print(question, "-->", answer)

In [148]:
# Smoke-test the fine-tuned adapter on a single hand-written prompt.
# NOTE: this rebinds `questions`, which earlier held the questions extracted
# from the Molière corpus.
questions = ["Répond en une phrase: à quoi penses-tu ?"]
give_answer(model_ft, questions)

Répond en une phrase: à quoi penses-tu ? --> Monsieur!
