In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if torch.cuda.is_available():
    print("CUDA available, using CUDA")
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    print("MLX available, using MLX")
    device = torch.device("mps")
else:
    print("Using CPU")
    device = torch.device("cpu")

CUDA available, using CUDA


In [3]:
training_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(training_dataset_name)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [4]:
model_name = "google/flan-t5-base"
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"All parameters: {all_model_params} \n Trainable parameters: {trainable_model_params} \n Percentage Trainable: {trainable_model_params/all_model_params * 100}"

print_number_of_trainable_model_parameters(base_model)

'All parameters: 247577856 \n Trainable parameters: 247577856 \n Percentage Trainable: 100.0'

In [6]:
index = 150

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']
prompt = f"""
Summarize the following conversation.
{dialogue}
Summary:
"""
inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    base_model.generate(
        inputs["input_ids"],
        max_new_tokens = 200,
    )[0],
    skip_special_tokens=True
)
dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE SUMMARY:\n {summary}\n')
print(dash_line)
print(f'MODEL GENERATED SUMMARY:\n {output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.
#Person1#: Taxi!
#Person2#: Where will you go, sir?
#Person1#: Friendship Hotel.
#Person2#: OK, it's not far from here.
#Person1#: I have something important to do, can you fast the speed?
#Person2#: Sure, I'll try my best. Here we are.
#Person1#: It's fast! How much should I pay you?
#Person2#: The reading on the meter is 15 yuan.
#Person1#: Here's 20 yuan, keep the change.
#Person2#: Thank you very much.
Summary:

---------------------------------------------------------------------------------------------------
BASELINE SUMMARY:
 #Person1# takes a taxi to the Friendship Hotel for something important.

---------------------------------------------------------------------------------------------------
MODEL GENERATED SUMMARY:
 The taxi will pick up Person1 at Friendship Hotel.


In [7]:
def tokenize_function(example):
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    example['input_ids'] = tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt').input_ids
    example['labels'] = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors='pt').input_ids
    return example

In [8]:
tokenized_datasets = dataset.map(tokenize_function, batched=False)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary'])

Map:   8%|▊         | 999/12460 [02:23<27:29,  6.95 examples/s]


ArrowMemoryError: realloc of size 4294967296 failed

In [25]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)
print(f"Shapes of datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")
print(tokenized_datasets)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Filter: 100%|██████████| 12460/12460 [00:08<00:00, 1389.15 examples/s]

[A
Filter: 100%|██████████| 500/500 [00:00<00:00, 1331.43 examples/s]

[A
[A
Filter: 100%|██████████| 1500/1500 [00:01<00:00, 1388.28 examples/s]

Shapes of datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})





In [26]:
output_dir = f'./training-{str(int(time.time()))}'
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [27]:
trainer.train()

  0%|          | 0/1 [05:33<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 7.08 GiB is allocated by PyTorch, and 62.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trained_model_dir = './trained_model'
trainer.save_model(trained_model_dir)

In [None]:
trained_model_dir = './trained_model'
trained_model = AutoModelForSeq2SeqLM.from_pretrained(trained_model_dir)