In [1]:
! pip install transformers datasets




In [2]:
!pip install transformers datasets evaluate rouge_score



In [3]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (renders a widget); done up front so the
# trainer.push_to_hub() call at the end of the notebook can upload the result.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset

# Load every split of the BillSum dataset; later cells index it with "train" and "test".
billsum = load_dataset("billsum")

In [5]:
# Peek at the first training record to see the raw fields (text, summary, title).
billsum["train"][0]

{'text': "SECTION 1. LIABILITY OF BUSINESS ENTITIES PROVIDING USE OF FACILITIES \n              TO NONPROFIT ORGANIZATIONS.\n\n    (a) Definitions.--In this section:\n            (1) Business entity.--The term ``business entity'' means a \n        firm, corporation, association, partnership, consortium, joint \n        venture, or other form of enterprise.\n            (2) Facility.--The term ``facility'' means any real \n        property, including any building, improvement, or appurtenance.\n            (3) Gross negligence.--The term ``gross negligence'' means \n        voluntary and conscious conduct by a person with knowledge (at \n        the time of the conduct) that the conduct is likely to be \n        harmful to the health or well-being of another person.\n            (4) Intentional misconduct.--The term ``intentional \n        misconduct'' means conduct by a person with knowledge (at the \n        time of the conduct) that the conduct is harmful to the health \n        or w

In [6]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer created here and the model loaded further down.
checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
!pip install -qqq --upgrade bitsandbytes transformers peft accelerate datasets trl flash_attn
!pip install wandb -qqq

In [8]:
from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# 4-bit NF4 quantization with double quantization enabled and a bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the seq2seq checkpoint with the quantization settings above; device placement is
# delegated to device_map="auto". (attn_implementation='flash_attention_2' is left disabled.)
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    device_map="auto",
    quantization_config=bnb_config,
)

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
from peft import prepare_model_for_kbit_training

# Prepare the 4-bit quantized model for training before the LoRA adapters are attached.
# NOTE(review): presumably this is also what enables gradient checkpointing (the training
# log later reports `use_cache=True` being switched off for it) — confirm in the PEFT docs.
model = prepare_model_for_kbit_training(model)

In [10]:
# Display the module tree; the attention and feed-forward projections print as Linear4bit.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear4bit(in_features=768, out_features=768, bias=False)
              (k): Linear4bit(in_features=768, out_features=768, bias=False)
              (v): Linear4bit(in_features=768, out_features=768, bias=False)
              (o): Linear4bit(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear4bit(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear4bit(in_features=768, out_features=2048, bias=Fa

In [11]:
from peft import LoraConfig, get_peft_model,  TaskType

# Module names that receive LoRA adapters: the attention projections (q, k, v, o)
# and the two gated feed-forward input projections (wi_0, wi_1).
lora_target_modules = ["q", "k", "v", "o", "wi_0", "wi_1"]

# LoRA hyper-parameters for the sequence-to-sequence task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=256,
    lora_alpha=256,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the model with the adapters and report how many parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 91,226,112 || all params: 338,803,968 || trainable%: 26.9259


In [12]:
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents together with their reference summaries.

    Every entry of ``examples["text"]`` gets ``prefix`` prepended and is tokenized
    with truncation at 1024 tokens. ``examples["summary"]`` is tokenized as the
    target text with truncation at 128 tokens, and its ``input_ids`` are stored
    under the ``"labels"`` key of the returned encoding.
    """
    prompts = [prefix + document for document in examples["text"]]

    encoded = tokenizer(prompts, truncation=True, max_length=1024)
    encoded_targets = tokenizer(
        text_target=examples["summary"],
        truncation=True,
        max_length=128,
    )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [13]:
# Run preprocess_function over every split in batches; the tokenized fields are added
# alongside the original text/summary/title columns.
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/18949 [00:00<?, ? examples/s]

Map:   0%|          | 0/3269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [14]:
from transformers import DataCollatorForSeq2Seq

# Pass the model object, not the checkpoint name: the collator only uses `model` to call
# its `prepare_decoder_input_ids_from_labels` method, which a plain string does not have,
# so `model=checkpoint` was silently ignored.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [15]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [16]:
# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# NOTE(review): the output directory (and therefore the Hub repo name) says "phi-3-mini",
# but the checkpoint being fine-tuned is google/flan-t5-base — consider renaming.
training_args = Seq2SeqTrainingArguments(
    output_dir="./phi-3-mini-LoRA",
    num_train_epochs=3,
    # 16 per device x 4 accumulation steps = 64 examples per optimizer step on one device.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    # Optimizer and learning-rate schedule.
    optim="adamw_torch",
    learning_rate=1e-3,
    lr_scheduler_type="inverse_sqrt",
    warmup_ratio=0.1,
    bf16=use_bf16,
    fp16=not use_bf16,
    # Evaluate every 50 steps, checkpoint once per epoch, log every 10 steps.
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="epoch",
    logging_steps=10,
    log_level="debug",
)

# No compute_metrics is passed, so evaluation reports the validation loss only.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Using auto half precision backend
Currently training with a batch size of: 16
The following columns in the training set don't have a corresponding argument in `PeftModelForSeq2SeqLM.forward` and have been ignored: summary, text, title. If summary, text, title are not expected by `PeftModelForSeq2SeqLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 18,949
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 888
  Number of trainable parameters = 91,226,112
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a

 ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/user/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
50,1.6034,1.353447
100,1.44,1.253438
150,1.39,1.204502
200,1.3075,1.170997
250,1.2984,1.148228
300,1.2933,1.149988
350,1.2286,1.138462
400,1.206,1.123653
450,1.2097,1.111188
500,1.1982,1.107443


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSeq2SeqLM.forward` and have been ignored: summary, text, title. If summary, text, title are not expected by `PeftModelForSeq2SeqLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 3269
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSeq2SeqLM.forward` and have been ignored: summary, text, title. If summary, text, title are not expected by `PeftModelForSeq2SeqLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 3269
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `PeftModelForSeq2SeqLM.forward` and have been ignored: summary, text, title. If summary, text, title are not expected by `PeftModelForSeq2SeqLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  

TrainOutput(global_step=888, training_loss=1.267869283487131, metrics={'train_runtime': 5165.7392, 'train_samples_per_second': 11.005, 'train_steps_per_second': 0.172, 'total_flos': 1.0964375090233344e+17, 'train_loss': 1.267869283487131, 'epoch': 2.9974683544303797})

In [17]:
# Save a final checkpoint to the output directory and upload it (LoRA adapter weights and
# training args) to the Hugging Face Hub; relies on the login performed earlier.
trainer.push_to_hub()

Saving model checkpoint to ./phi-3-mini-LoRA
loading configuration file config.json from cache at /home/user/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_le

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/365M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dhanishetty/phi-3-mini-LoRA/commit/5cca5631ab3451edcbda3bf5d8adbe9987862cd8', commit_message='End of training', commit_description='', oid='5cca5631ab3451edcbda3bf5d8adbe9987862cd8', pr_url=None, pr_revision=None, pr_num=None)