In [2]:
from datasets import load_dataset
HUGGINGFACE_DATASET_NAME = "knkarthick/dialogsum"
dataset = load_dataset(HUGGINGFACE_DATASET_NAME)

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
# Define the model name
model_name = "google/flan-t5-base"

# Load the model and tokenizer
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Print model and tokenizer information
print(original_model)
print(tokenizer)


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [4]:
index = 200
dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']
print(f"Dialogue: {dialogue}")
print(f"Summary: {summary}")

Dialogue: #Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.
Summary: #Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.


In [5]:
prompt = f"""
Summarize the following conversation

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
    )[0],
    skip_special_tokens=True
)

In [6]:
print(output)

#Person1#: I'm thinking of upgrading my computer.


In [7]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["q","v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM #FlLAN-T5
)

In [8]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f'trainable model parameters: {trainable_model_params}\n \
            all model parameters: {all_model_params} \n \
            percentage of trainable model parameters: {(trainable_model_params / all_model_params) * 100} %'


In [9]:
peft_model = get_peft_model(original_model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
             all model parameters: 251116800 
             percentage of trainable model parameters: 1.4092820552029972 %


In [20]:
def tokenize_function(example):
    return tokenizer(
        example['dialogue'],               # Replace "source" with the input text field
        text_target=example['summary'],   # Replace "target" with the target text field
        max_length=512,                  # Maximum token length
        truncation=True,                 # Truncate sequences longer than `max_length`
        padding='max_length',            # Pad to `max_length`
    )

# Tokenize the dataset
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

# Access the tokenized train dataset
train_dataset = tokenized_datasets["train"]

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [21]:
from transformers import TrainingArguments, Trainer
from root import data_output_path
import time

output_dir = data_output_path() + f"peft-dialogue-summary-training-{str(int(time.time()))}"

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

peft_trainer.train()
peft_model_path= data_output_path() + "peft-dialogue-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,51.5


('/Users/gergelymadi-nagy/Library/CloudStorage/OneDrive-math.bme.hu/git/practicingLLM/data/output/peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 '/Users/gergelymadi-nagy/Library/CloudStorage/OneDrive-math.bme.hu/git/practicingLLM/data/output/peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 '/Users/gergelymadi-nagy/Library/CloudStorage/OneDrive-math.bme.hu/git/practicingLLM/data/output/peft-dialogue-summary-checkpoint-local/tokenizer.json')

In [26]:
from transformers import GenerationConfig
index = 200
dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']
prompt = f"""
Summarize the following conversation

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    peft_model.generate(input_ids=inputs.input_ids, generation_config=GenerationConfig(new_max_totokens=200))[0],
    skip_special_tokens=True
)
print(output)

#Person1#: I'd like to upgrade my computer. #Person2:


('4.46.3', '1.2.1')