# Clone the Dataset Repository


In [None]:
!git clone https://huggingface.co/datasets/billsum

Cloning into 'billsum'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 64 (delta 28), reused 16 (delta 16), pack-reused 28 (from 1)[K
Unpacking objects: 100% (64/64), 17.67 KiB | 754.00 KiB/s, done.
Filtering content: 100% (3/3), 108.46 MiB | 27.54 MiB/s, done.


# Install Necessary Libraries

In [None]:
! pip install -q transformers[torch]

In [None]:
!pip install transformers datasets rouge_score accelerate bitsandbytes

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsand

# Load the Dataset


In [None]:
from datasets import load_dataset
import pandas as pd
from datasets import Dataset
import re
import string

# Load the dataset from parquet files
train_df = pd.read_parquet('/content/billsum/data/train-00000-of-00001.parquet')
test_df = pd.read_parquet('/content/billsum/data/test-00000-of-00001.parquet')
ca_test_df = pd.read_parquet('/content/billsum/data/ca_test-00000-of-00001.parquet')

# Function to normalize text
def normalize_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Apply normalization
train_df['text'] = train_df['text'].apply(normalize_text)
train_df['summary'] = train_df['summary'].apply(normalize_text)
test_df['text'] = test_df['text'].apply(normalize_text)
test_df['summary'] = test_df['summary'].apply(normalize_text)
ca_test_df['text'] = ca_test_df['text'].apply(normalize_text)
ca_test_df['summary'] = ca_test_df['summary'].apply(normalize_text)

In [None]:
# Convert pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
ca_test_dataset = Dataset.from_pandas(ca_test_df)

# Split the dataset
billsum = train_dataset.train_test_split(test_size=1)

# Tokenize the Dataset


In [None]:
  import nltk
  nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Tokenize the dataset
from transformers import AutoTokenizer
from nltk.tokenize import sent_tokenize

tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Function for segmentation

tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Function for segmentation
def segment_text(text, max_len=512):
    sentences = sent_tokenize(text)
    segments = []
    current_segment = ""

    for sentence in sentences:
        sentence_tokenized_length = len(tokenizer(sentence)["input_ids"])

        # If the sentence alone is longer than max_len, truncate it
        if sentence_tokenized_length > max_len:
            truncated_sentence = tokenizer.decode(tokenizer(sentence, max_length=max_len, truncation=True)["input_ids"])
            segments.append(truncated_sentence)
        else:
            if len(tokenizer(current_segment + sentence)["input_ids"]) <= max_len:
                current_segment += sentence + " "
            else:
                segments.append(current_segment.strip())
                current_segment = sentence + " "

    if current_segment:
        segments.append(current_segment.strip())

    return segments

# Preprocessing function with normalization and segmentation
def preprocess_function(examples):
    all_model_inputs = []
    all_labels = []

    for doc, summary in zip(examples["text"], examples["summary"]):
        # Normalise text and summary
        doc = normalize_text(doc)
        summary = normalize_text(summary)

        # Segment the document
        segments = segment_text(doc)

        for segment in segments:
            inputs = "summarize: " + segment
            model_inputs = tokenizer(inputs, max_length=512, truncation=True)
            labels = tokenizer(summary, max_length=128, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            all_model_inputs.append(model_inputs)

    return {key: [d[key] for d in all_model_inputs] for key in all_model_inputs[0]}

# Tokenize the dataset
tokenized_billsum = billsum.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/3789 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2370 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/15160 [00:00<?, ? examples/s]

# Prepare Data Collator

In [None]:
# Data collator
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-small")

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


# Define Evaluation Metrics

In [None]:
# Evaluation metric
import evaluate

rouge = evaluate.load("rouge")
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

# Fine-Tune the Model


In [None]:
# Fine-tuning the model
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
!pip install accelerate>=0.21.0

# Train the Model

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_fine_tuned_flan_t5_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model("my_fine_tuned_flan_t5_model")

# Summarization example
from transformers import pipeline

text = test_df.iloc[100]['text']
text = "summarize: " + text

summarizer = pipeline("summarization", model="my_fine_tuned_flan_t5_model")
pred = summarizer(text)

# Display the summary
print(pred[0]['summary_text'])

# Evaluation
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("my_fine_tuned_flan_t5_model")
inputs = tokenizer(text, return_tensors="pt").input_ids

from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("my_fine_tuned_flan_t5_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
summary_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Compute ROUGE score
preds = [summary_text]
labels = [test_df.iloc[100]['summary']]
rouge.compute(predictions=preds, references=labels, use_stemmer=True)



Epoch,Training Loss,Validation Loss




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.585985,0.2384,0.1661,0.2222,0.2222,19.0
2,No log,2.484938,0.2474,0.179,0.2323,0.2323,19.0
3,2.898200,2.445721,0.25,0.1821,0.2348,0.2348,19.0
4,2.898200,2.433619,0.2507,0.1829,0.2355,0.2354,19.0


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Token indices sequence length is longer than the specified maximum sequence length for this model (1579 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1577 > 512). Running this sequence through the model will result in indexing errors


afghanistan and central asian republics sustainable food production act of 2001 authorizes the administrator of the united states agency for international development to provide assistance in accordance with the provisions of this act to develop durable food production for the u.s. and the mountainous regions of central asia through restocking seed replacing breeding livestock restoring basic irrigation systems and providing access to credit for food production processing or marketing enterprises through rural microenterprise loans and v providing technical assistance to the national and regional governments of such countries


{'rouge1': 0.6214689265536724,
 'rouge2': 0.44571428571428573,
 'rougeL': 0.5423728813559322,
 'rougeLsum': 0.5423728813559322}