In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install nltk
!pip install sentencepiece



In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [3]:
if torch.cuda.is_available():
    print("CUDA available, using CUDA")
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    print("MLX available, using MLX")
    device = torch.device("mps")
else:
    print("Using CPU")
    device = torch.device("cpu")

CUDA available, using CUDA


In [4]:
training_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(training_dataset_name)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [5]:
model_name = "google/flan-t5-base"
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"All parameters: {all_model_params} \n Trainable parameters: {trainable_model_params} \n Percentage Trainable: {trainable_model_params/all_model_params * 100}"

print_number_of_trainable_model_parameters(base_model)

'All parameters: 247577856 \n Trainable parameters: 247577856 \n Percentage Trainable: 100.0'

In [7]:
index = 150

dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']
prompt = f"""
Summarize the following conversation.
{dialogue}
Summary:
"""
inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    base_model.generate(
        inputs["input_ids"],
        max_new_tokens = 200,
    )[0],
    skip_special_tokens=True
)
dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE SUMMARY:\n {summary}\n')
print(dash_line)
print(f'MODEL GENERATED SUMMARY:\n {output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following conversation.
#Person1#: Taxi!
#Person2#: Where will you go, sir?
#Person1#: Friendship Hotel.
#Person2#: OK, it's not far from here.
#Person1#: I have something important to do, can you fast the speed?
#Person2#: Sure, I'll try my best. Here we are.
#Person1#: It's fast! How much should I pay you?
#Person2#: The reading on the meter is 15 yuan.
#Person1#: Here's 20 yuan, keep the change.
#Person2#: Thank you very much.
Summary:

---------------------------------------------------------------------------------------------------
BASELINE SUMMARY:
 #Person1# takes a taxi to the Friendship Hotel for something important.

---------------------------------------------------------------------------------------------------
MODEL GENERATED SUMMARY:
 The taxi will pick you up at the Friendship Hotel.


In [8]:
def tokenize_function(example):
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    example['input_ids'] = tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt').input_ids
    example['labels'] = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors='pt').input_ids
    return example

In [9]:
tokenized_datasets = dataset.map(tokenize_function, batched=True, batch_size=32)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary'])

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [10]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)
print(f"Shapes of datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")
print(tokenized_datasets)

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Shapes of datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


In [11]:
output_dir = f'./training-{str(int(time.time()))}'
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,
    num_train_epochs=50,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=100
)
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [12]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mdhrumeen[0m ([33mdhrumeen-umass[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,49.0
2,23.875
3,14.9375
4,6.0312
5,4.5625
6,4.3125
7,4.0
8,3.6406
9,3.1094
10,2.8438


TrainOutput(global_step=100, training_loss=1.41803466796875, metrics={'train_runtime': 292.467, 'train_samples_per_second': 2.735, 'train_steps_per_second': 0.342, 'total_flos': 535480249614336.0, 'train_loss': 1.41803466796875, 'epoch': 6.25})

In [25]:
trained_model_dir = './trained_model'
trainer.save_model(trained_model_dir)

In [14]:
trained_model_dir = './trained_model'
trained_model = AutoModelForSeq2SeqLM.from_pretrained(trained_model_dir)

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(device)
base_model.to(device)
trained_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [16]:
generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)
original_model_outputs = base_model.generate(input_ids=input_ids, generation_config=generation_config)
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

In [17]:
trained_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=generation_config)
trained_model_text_output = tokenizer.decode(trained_model_outputs[0], skip_special_tokens=True)

In [18]:
human_baseline_summary = summary
dash_line = '-' * 50
print(dash_line)
print(f'BASELINE HUMAN SUMMARY: \n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL: \n{original_model_text_output}')
print(dash_line)
print(f'TRAINED MODEL: \n{trained_model_text_output}')

--------------------------------------------------
BASELINE HUMAN SUMMARY: 
#Person1# takes a taxi to the Friendship Hotel for something important.
--------------------------------------------------
ORIGINAL MODEL: 
#Person1# is a taxi to Friendship Hotel. #Person2# will try his best to get the meter, but #Person1# will try his best.
--------------------------------------------------
TRAINED MODEL: 
#Person1# will go to Friendship Hotel. #Person1# will try his best to pay #Person1#.


In [12]:
!pip install huggingface_hub

Python(59479) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
!hugginface-cli login

In [21]:
!git clone https://huggingface.co/dhrumeen/mt5-small_summarization
!cd mt5-small_summarization


Cloning into 'mt5-small_summarization'...
remote: Enumerating objects: 13, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 13 (delta 0), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (13/13), 4.14 KiB | 303.00 KiB/s, done.


In [31]:
!ls
!cd ./mt5-small_summarization
!pwd
!git add .
!git commit -m "Upload of trained model"
!git push


mt5-small_summarization  trained_model	      training-1737356102  training-1737357642
sample_data		 training-1737355690  training-1737357497  wandb
/content
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
