In [1]:
import os
os.environ['all_proxy'] = "socks5://127.0.0.1:10808"

In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [4]:
huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(huggingface_dataset_name)

dataset

Found cached dataset csv (/home/frog/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [5]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("roneneldan/TinyStories-33M")
# model = AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-33M")

In [6]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [7]:
index = 200

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:

-------------------------------------------------------------------

In [8]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])

print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Loading cached processed dataset at /home/frog/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-3579c015ced2fa53.arrow


Loading cached processed dataset at /home/frog/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-657b7992ea4cc6bb.arrow
Loading cached processed dataset at /home/frog/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-ced6a81e5227fe5c.arrow


Shapes of the datasets:
Training: (12460, 2)
Validation: (500, 2)
Test: (1500, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
})


In [9]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-4,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1000
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [10]:
import wandb
wandb. init(mode="disabled") 

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.




In [11]:
trainer.train()

  0%|          | 0/1000 [00:00<?, ?it/s]

{'loss': 49.5, 'learning_rate': 9.99e-05, 'epoch': 0.0}
{'loss': 44.75, 'learning_rate': 9.98e-05, 'epoch': 0.0}
{'loss': 40.75, 'learning_rate': 9.970000000000001e-05, 'epoch': 0.0}
{'loss': 36.5, 'learning_rate': 9.960000000000001e-05, 'epoch': 0.0}
{'loss': 34.75, 'learning_rate': 9.95e-05, 'epoch': 0.0}
{'loss': 33.75, 'learning_rate': 9.94e-05, 'epoch': 0.0}
{'loss': 31.125, 'learning_rate': 9.93e-05, 'epoch': 0.0}
{'loss': 30.875, 'learning_rate': 9.92e-05, 'epoch': 0.01}
{'loss': 28.75, 'learning_rate': 9.910000000000001e-05, 'epoch': 0.01}
{'loss': 27.25, 'learning_rate': 9.900000000000001e-05, 'epoch': 0.01}
{'loss': 27.125, 'learning_rate': 9.89e-05, 'epoch': 0.01}
{'loss': 26.25, 'learning_rate': 9.88e-05, 'epoch': 0.01}
{'loss': 24.75, 'learning_rate': 9.87e-05, 'epoch': 0.01}
{'loss': 24.375, 'learning_rate': 9.86e-05, 'epoch': 0.01}
{'loss': 23.375, 'learning_rate': 9.850000000000001e-05, 'epoch': 0.01}
{'loss': 22.375, 'learning_rate': 9.84e-05, 'epoch': 0.01}
{'loss': 2

TrainOutput(global_step=1000, training_loss=1.57051953125, metrics={'train_runtime': 227.3612, 'train_samples_per_second': 35.186, 'train_steps_per_second': 4.398, 'train_loss': 1.57051953125, 'epoch': 0.64})

In [12]:
device = torch.device("cuda")
instruct_model = trainer.model
original_model = original_model.to(device)
instruct_model = instruct_model.to(device)

In [28]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=64, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [29]:
peft_model = get_peft_model(original_model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 7077888
all model parameters: 254655744
percentage of trainable model parameters: 2.78%


In [30]:
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=10,
    logging_steps=1,
    max_steps=400   
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

In [31]:
peft_trainer.train()

  0%|          | 0/400 [00:00<?, ?it/s]

{'loss': 0.3027, 'learning_rate': 0.0009975000000000001, 'epoch': 0.0}
{'loss': 0.3262, 'learning_rate': 0.000995, 'epoch': 0.0}
{'loss': 0.3535, 'learning_rate': 0.0009925000000000001, 'epoch': 0.0}
{'loss': 0.3066, 'learning_rate': 0.00099, 'epoch': 0.0}
{'loss': 0.291, 'learning_rate': 0.0009875, 'epoch': 0.0}
{'loss': 0.3105, 'learning_rate': 0.000985, 'epoch': 0.0}
{'loss': 0.5312, 'learning_rate': 0.0009825, 'epoch': 0.0}
{'loss': 0.3359, 'learning_rate': 0.00098, 'epoch': 0.01}
{'loss': 0.3223, 'learning_rate': 0.0009775, 'epoch': 0.01}
{'loss': 0.2578, 'learning_rate': 0.000975, 'epoch': 0.01}
{'loss': 0.3652, 'learning_rate': 0.0009725000000000001, 'epoch': 0.01}
{'loss': 0.3184, 'learning_rate': 0.0009699999999999999, 'epoch': 0.01}
{'loss': 0.2871, 'learning_rate': 0.0009675, 'epoch': 0.01}
{'loss': 0.2734, 'learning_rate': 0.000965, 'epoch': 0.01}
{'loss': 0.2969, 'learning_rate': 0.0009625, 'epoch': 0.01}
{'loss': 0.2969, 'learning_rate': 0.00096, 'epoch': 0.01}
{'loss': 0

TrainOutput(global_step=400, training_loss=0.148619384765625, metrics={'train_runtime': 88.0268, 'train_samples_per_second': 36.353, 'train_steps_per_second': 4.544, 'train_loss': 0.148619384765625, 'epoch': 0.26})

In [32]:
peft_model = peft_trainer.model

In [50]:
index = 200
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.to(device).generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{baseline_human_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
print(dash_line)
print(f'PEFT MODEL:\n {peft_model_text_output}')

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
#Person1# wants to upgrade his system and asks #Person2# to help him with upgrading his hardware.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
#Person2# wants to upgrade the system and #Person1# suggests adding a painting program to #Person2#'s software. #Person2# suggests adding a CD-ROM drive and a CD-ROM drive.
---------------------------------------------------------------------------------------------------
PEFT MODEL:
 #Person2# wants to upgrade their system and their hardware. #Person1# suggests adding a painting program to their software. #Person2# also recommends a CD-ROM drive.


In [None]:
import transformers
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_int8_training

target_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out", "wte"]
config = LoraConfig(r=32, lora_alpha=32, target_modules=target_modules, lora_dropout=0.01, bias="none", task_type="CAUSAL_LM")
# rank = 32
# model = transformers.GPTJForCausalLM.from_pretrained(
#    "kakaobrain/kogpt",
#     revision="KoGPT6B-ryan1.5b-float16",  # or float32 version: revision=KoGPT6B-ryan1.5b
#      pad_token_id=tokenizer.eos_token_id,
#     use_cache=False,
#      device_map={"": rank},
#      torch_dtype=torch.float16,
#      load_in_8bit=True,)
# model = prepare_model_for_int8_training(model)
lora_model = get_peft_model(model, config)

In [None]:
print(print_number_of_trainable_model_parameters(lora_model))

trainable model parameters: 2419232
all model parameters: 70933280
percentage of trainable model parameters: 3.41%


In [None]:
output_dir = f'./peft'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=10,
    logging_steps=1,
    max_steps=100   
)
    
peft_trainer = Trainer(
    model=lora_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

In [None]:
import wandb
wandb. init(mode="disabled") 



In [None]:
peft_trainer.train()

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

{'loss': 8.3094, 'learning_rate': 0.00099, 'epoch': 0.0}
{'loss': 0.6848, 'learning_rate': 0.00098, 'epoch': 0.0}
{'loss': 0.4427, 'learning_rate': 0.0009699999999999999, 'epoch': 0.0}
{'loss': 0.377, 'learning_rate': 0.00096, 'epoch': 0.0}
{'loss': 0.2299, 'learning_rate': 0.00095, 'epoch': 0.0}
{'loss': 0.3646, 'learning_rate': 0.00094, 'epoch': 0.0}
{'loss': 0.2827, 'learning_rate': 0.00093, 'epoch': 0.0}
{'loss': 0.142, 'learning_rate': 0.00092, 'epoch': 0.0}
{'loss': 0.2537, 'learning_rate': 0.00091, 'epoch': 0.0}
{'loss': 0.2379, 'learning_rate': 0.0009000000000000001, 'epoch': 0.0}
{'loss': 0.179, 'learning_rate': 0.0008900000000000001, 'epoch': 0.0}
{'loss': 0.138, 'learning_rate': 0.00088, 'epoch': 0.0}
{'loss': 0.3155, 'learning_rate': 0.00087, 'epoch': 0.0}
{'loss': 0.2185, 'learning_rate': 0.00086, 'epoch': 0.0}
{'loss': 0.2874, 'learning_rate': 0.00085, 'epoch': 0.0}
{'loss': 0.2378, 'learning_rate': 0.00084, 'epoch': 0.01}
{'loss': 0.2416, 'learning_rate': 0.00083, 'epoch

TrainOutput(global_step=100, training_loss=0.23069281838834285, metrics={'train_runtime': 23.3152, 'train_samples_per_second': 17.156, 'train_steps_per_second': 4.289, 'train_loss': 0.23069281838834285, 'epoch': 0.03})

In [None]:
peft_trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./peft/tokenizer_config.json',
 './peft/special_tokens_map.json',
 './peft/vocab.json',
 './peft/merges.txt',
 './peft/added_tokens.json',
 './peft/tokenizer.json')

In [None]:
lora_model = peft_trainer.model

In [None]:
prompt = ''' one day'''
inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    lora_model.generate(
        # inputs["input_ids"], 
        input_ids=inputs.input_ids.to('cuda'),
        max_new_tokens=200,
        pad_token_id=tokenizer.eos_token_id
    )[0], 
    skip_special_tokens=True
)
output

' one day, he was to to the man, to to to the man, and, to to to to to to the man, and to to to to.,'

In [None]:
index = 200
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to('cuda')
tokenizer.pad_token = tokenizer.eos_token
original_model_outputs = peft_trainer.model.generate(input_ids=input_ids, pad_token_id=tokenizer.eos_token_id, generation_config=GenerationConfig(max_new_tokens=2009, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

In [None]:
original_model_text_output

"\nSummarize the following conversation.\n\n#Person1#: Have you considered upgrading your system?\n#Person2#: Yes, but I'm not sure what exactly I would need.\n#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.\n#Person2#: That would be a definite bonus.\n#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.\n#Person2#: How can we do that?\n#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?\n#Person2#: No.\n#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.\n#Person2#: That sounds great. Thanks.\n\nSummary: "

In [None]:
output_dir = f'./peft'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1000   
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)