# Clone the Dataset Repository


In [1]:
!git clone https://huggingface.co/datasets/billsum

Cloning into 'billsum'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 64 (delta 28), reused 17 (delta 17), pack-reused 28 (from 1)[K
Unpacking objects: 100% (64/64), 17.67 KiB | 670.00 KiB/s, done.


# Install Necessary Libraries

In [2]:
! pip install -q transformers[torch]

In [3]:
!pip install transformers datasets rouge_score accelerate bitsandbytes

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsand

# Load the Dataset


In [10]:
from datasets import load_dataset
import pandas as pd
from datasets import Dataset
import re
import string

# Load the dataset from parquet files
train_df = pd.read_parquet('/content/billsum/data/train-00000-of-00001.parquet')
test_df = pd.read_parquet('/content/billsum/data/test-00000-of-00001.parquet')
ca_test_df = pd.read_parquet('/content/billsum/data/ca_test-00000-of-00001.parquet')

# Function to normalize text
def normalize_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Apply normalization
train_df['text'] = train_df['text'].apply(normalize_text)
train_df['summary'] = train_df['summary'].apply(normalize_text)
test_df['text'] = test_df['text'].apply(normalize_text)
test_df['summary'] = test_df['summary'].apply(normalize_text)
ca_test_df['text'] = ca_test_df['text'].apply(normalize_text)
ca_test_df['summary'] = ca_test_df['summary'].apply(normalize_text)

# Convert pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
ca_test_dataset = Dataset.from_pandas(ca_test_df)

# Split the dataset
billsum = train_dataset.train_test_split(test_size=1)

In [11]:
billsum

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18948
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1
    })
})

# Tokenize the Dataset


In [13]:
  import nltk
  nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
# Tokenize the dataset
from transformers import AutoTokenizer
from nltk.tokenize import sent_tokenize

tokenizer = AutoTokenizer.from_pretrained("t5-base")

# def preprocess_function(examples):
#     inputs = ["summarize: " + doc for doc in examples["text"]]
#     model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
#     labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

# Function for segmentation

tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Function for segmentation
def segment_text(text, max_len=512):
    sentences = sent_tokenize(text)
    segments = []
    current_segment = ""

    for sentence in sentences:
        sentence_tokenized_length = len(tokenizer(sentence)["input_ids"])

        # If the sentence alone is longer than max_len, truncate it
        if sentence_tokenized_length > max_len:
            truncated_sentence = tokenizer.decode(tokenizer(sentence, max_length=max_len, truncation=True)["input_ids"])
            segments.append(truncated_sentence)
        else:
            if len(tokenizer(current_segment + sentence)["input_ids"]) <= max_len:
                current_segment += sentence + " "
            else:
                segments.append(current_segment.strip())
                current_segment = sentence + " "

    if current_segment:
        segments.append(current_segment.strip())

    return segments

# Preprocessing function with normalization and segmentation
def preprocess_function(examples):
    all_model_inputs = []
    all_labels = []

    for doc, summary in zip(examples["text"], examples["summary"]):
        # Normalize text and summary
        doc = normalize_text(doc)
        summary = normalize_text(summary)

        # Segment the document
        segments = segment_text(doc)

        for segment in segments:
            inputs = "summarize: " + segment
            model_inputs = tokenizer(inputs, max_length=512, truncation=True)
            labels = tokenizer(summary, max_length=128, truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            all_model_inputs.append(model_inputs)

    return {key: [d[key] for d in all_model_inputs] for key in all_model_inputs[0]}

# Tokenize the dataset
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/18948 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

# Prepare Data Collator

In [15]:
# Data collator
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="t5-base")

In [16]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


# Define Evaluation Metrics

In [17]:
# Evaluation metric
import evaluate

rouge = evaluate.load("rouge")
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

# Fine-Tune the Model


In [18]:
# Fine-tuning the model
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)

In [19]:
!pip install accelerate>=0.21.0

# Train the Model

In [20]:
# Load flan-t5-tiny model and tokenizer
model_tiny = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
tokenizer_tiny = AutoTokenizer.from_pretrained("google/flan-t5-base")
model_tiny.to("cuda")
training_args_tiny = Seq2SeqTrainingArguments(
    output_dir="my_fine_tuned_flan_t5_tiny_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
)

trainer_tiny = Seq2SeqTrainer(
    model=model_tiny,
    args=training_args_tiny,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer_tiny,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer_tiny.train()
trainer_tiny.save_model("my_fine_tuned_flan_t5_tiny_model")
from transformers import pipeline
from transformers import AutoTokenizer


text = test_df.iloc[100]['text']
text = "summarize: " + text
# Summarization example with the tiny model
summarizer_tiny = pipeline("summarization", model="my_fine_tuned_flan_t5_tiny_model")
pred_tiny = summarizer_tiny(text)

# Display the summary from the tiny model
print(pred_tiny[0]['summary_text'])



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.0,,0.1129,0.0984,0.1129,0.1129,9.0


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Token indices sequence length is longer than the specified maximum sequence length for this model (1579 > 512). Running this sequence through the model will result in indexing errors


afghanistan and central asian republics sustainable food production act of 2001 sec 3 assistance a assistancethe administrator of the united states agency for international development shall provide assistance to nongovernmental organizations for the purpose of carrying out the activities described in paragraph 2 activities supported by this section shall be i procurement of seed for local food production ii replacement of breeding livestock iv establishment of access to credit for food production processing or marketing enterprises through rural microenterprise loan programs and v providing technical assistance


In [21]:
# Evaluation
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("my_fine_tuned_flan_t5_tiny_model")
inputs = tokenizer(text, return_tensors="pt").input_ids

from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("my_fine_tuned_flan_t5_tiny_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
summary_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Compute ROUGE score
preds = [summary_text]
labels = [test_df.iloc[100]['summary']]
rouge.compute(predictions=preds, references=labels, use_stemmer=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (1577 > 512). Running this sequence through the model will result in indexing errors


{'rouge1': 0.11320754716981131,
 'rouge2': 0.09615384615384615,
 'rougeL': 0.11320754716981131,
 'rougeLsum': 0.11320754716981131}