In [1]:
!pip install transformers[torch] datasets scikit-learn
!pip install accelerate -U
!pip install --upgrade transformers
!pip install evaluate

import numpy as np
import pandas as pd
import torch
import transformers
import re
import evaluate

from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments, AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



In [2]:
from datasets import load_dataset

ds = load_dataset("pszemraj/qmsum-cleaned", "default")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
ds['train']

Dataset({
    features: ['id', 'pid', 'input', 'output', 'input_token_count', 'output_token_count'],
    num_rows: 1257
})

In [4]:
train_size = 0.8
test_size = 0.2

from sklearn.model_selection import train_test_split

train_ds, test_ds = ds['train'].train_test_split(test_size=test_size, seed=42).values()

# Create a new DatasetDict
split_ds = DatasetDict({
    'train': train_ds,
    'test': test_ds,
})

# Check the sizes of the splits
print(f"Train size: {len(split_ds['train'])}")
print(f"Test size: {len(split_ds['test'])}")


Train size: 1005
Test size: 252


In [5]:
from transformers import AutoModelForSeq2SeqLM

checkpoint = "google-t5/t5-base" # de modif dupa cu "google-t5/t5-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
inputs = tokenizer("I loved reading the Hunger Games!")
inputs

{'input_ids': [27, 1858, 1183, 8, 26049, 5880, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['▁I', '▁loved', '▁reading', '▁the', '▁Hunger', '▁Games', '!', '</s>']

In [8]:
max_input_length = 512
max_target_length = 30

def preprocess_function(examples):
    # Tokenize the inputs (full text)
    model_inputs = tokenizer(
        examples["input"],  # Adjusted to use 'input' column
        max_length=max_input_length,
        truncation=True,
    )

    # Tokenize the targets (summarization)
    labels = tokenizer(
        examples["output"],  # Adjusted to use 'output' column
        max_length=max_target_length,
        truncation=True
    )

    # Add labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
tokenized_train_ds = train_ds.map(preprocess_function, batched=True)
tokenized_test_ds = test_ds.map(preprocess_function, batched=True)

# Print to check the results
print("Tokenized Train Dataset:")
print(tokenized_train_ds)

print("\nTokenized Test Dataset:")
print(tokenized_test_ds)

Map:   0%|          | 0/1005 [00:00<?, ? examples/s]

Map:   0%|          | 0/252 [00:00<?, ? examples/s]

Tokenized Train Dataset:
Dataset({
    features: ['id', 'pid', 'input', 'output', 'input_token_count', 'output_token_count', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1005
})

Tokenized Test Dataset:
Dataset({
    features: ['id', 'pid', 'input', 'output', 'input_token_count', 'output_token_count', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 252
})


In [10]:
!pip install rouge_score



In [11]:
from rouge_score import rouge_scorer

In [12]:
rouge_score = evaluate.load("rouge")

In [13]:
!pip install nltk

import nltk

nltk.download("punkt")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Define the batch size and number of epochs
batch_size = 8
num_train_epochs = 8

# Calculate logging steps
logging_steps = len(tokenized_train_ds) // batch_size

# Define the model checkpoint name
model_name = checkpoint.split("/")[-1]

# Set up the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-qmsum",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)



In [16]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Initialize the ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)  # Fixed initialization of ROUGE scorer
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    results = [scorer.score(pred, label) for pred, label in zip(decoded_preds, decoded_labels)]  # Fixed ROUGE score computation
    # Aggregate ROUGE scores
    rouge_results = {
        'rouge1': np.mean([r['rouge1'].fmeasure for r in results]) * 100,
        'rouge2': np.mean([r['rouge2'].fmeasure for r in results]) * 100,
        'rougeL': np.mean([r['rougeL'].fmeasure for r in results]) * 100,
    }
    return {k: round(v, 4) for k, v in rouge_results.items()}

In [18]:
# Remove the columns with strings since the collator won’t know how to pad these elements
tokenized_train_ds = tokenized_train_ds.remove_columns(split_ds["train"].column_names)
tokenized_test_ds = tokenized_test_ds.remove_columns(split_ds["test"].column_names)

# Wrangle the data into the expected format for the data collator
features = [tokenized_train_ds[i] for i in range(2)]
batch = data_collator(features)

# Display the batch to see what the data collator produces
print("Batch produced by data collator:")
print(batch)

Batch produced by data collator:
{'input_ids': tensor([[  363,   410, 10771,  ..., 11475,   269,     1],
        [  363,  1275,   410,  ...,     3,    76,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[10771,   205,  3665,     8,  2859,    11,  5243,    24,     8,   126,
           800,    47,    12,   995,  7404,    12,  3806,   306,   593,  4145,
             6,   114,   352,  2309,     6,    57,  1452,     5, 19237,     1],
        [ 4329,  5259,    24,     8,  4322,    54,    36,    15,   102,    42,
             3,     9,   659,    16,     8,  4322,    54, 25063,   116,   151,
             3,  4651,   102,     5,  2786,  3440,   974,    24,    34,     1]]), 'decoder_input_ids': tensor([[    0, 10771,   205,  3665,     8,  2859,    11,  5243,    24,     8,
           126,   800,    47,    12,   995,  7404,    12,  3806,   306,   593,
          4145,     6,   114,   352,  2309,     6,    57,  1452,     5, 19237],


In [19]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [20]:
from nltk.tokenize import sent_tokenize

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,3.5399,3.292869,27.9871,8.2442,23.2939
2,3.1401,3.207594,27.7588,7.6926,22.8498
3,2.9706,3.167818,28.9533,8.4516,23.4899
4,2.8244,3.150926,28.274,8.0721,22.897
5,2.7238,3.14722,27.9718,8.26,22.7717
6,2.6687,3.151297,28.3972,8.4436,22.9446
7,2.5844,3.155423,28.6233,8.5011,23.1638
8,2.5715,3.156747,28.3882,8.4191,22.8604




TrainOutput(global_step=1008, training_loss=2.874473747752962, metrics={'train_runtime': 796.1716, 'train_samples_per_second': 10.098, 'train_steps_per_second': 1.266, 'total_flos': 4896021440102400.0, 'train_loss': 2.874473747752962, 'epoch': 8.0})

cu t5 small: (dublu click si se vede mai bine)

Epoch	Training Loss	Validation Loss	Rouge1	Rouge2	Rougel
1	3.395600	3.535436	27.651900	8.074600	23.132100
2	3.407000	3.511521	27.495900	8.111100	23.100400
3	3.360000	3.489753	27.761100	8.336600	23.186300
4	3.303200	3.480370	27.567600	8.237600	23.138700
5	3.260200	3.472662	28.163800	8.681900	23.487800
6	3.258000	3.464433	27.880200	8.563400	23.381500
7	3.216700	3.462648	27.649000	8.553300	23.210100
8	3.203000	3.461719	27.642300	8.516300	23.150500

In [22]:
trainer.evaluate()

{'eval_loss': 3.1567471027374268,
 'eval_rouge1': 28.3882,
 'eval_rouge2': 8.4191,
 'eval_rougeL': 22.8604,
 'eval_runtime': 20.7152,
 'eval_samples_per_second': 12.165,
 'eval_steps_per_second': 1.545,
 'epoch': 8.0}

In [23]:
trainer.push_to_hub(commit_message="Training complete", tags="summarization")

events.out.tfevents.1720738867.7a7653d1f0ac.6172.0:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

events.out.tfevents.1720739721.7a7653d1f0ac.6172.1:   0%|          | 0.00/509 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ecat3rina/t5-base-finetuned-qmsum/commit/59c024e3c2f75da83b16b7e803165e9527aee737', commit_message='Training complete', commit_description='', oid='59c024e3c2f75da83b16b7e803165e9527aee737', pr_url=None, pr_revision=None, pr_num=None)

In [24]:
# Replace 'your-username/your-model-name' with your actual model hub path
tokenizer = AutoTokenizer.from_pretrained("ecat3rina/t5-small-finetuned-qmsum")
model = AutoModelForSeq2SeqLM.from_pretrained("ecat3rina/t5-small-finetuned-qmsum")

# Prepare the inputs
input_text = "Recent research has highlighted the significant impact of gut microbiota on human health and disease. The gut microbiota, a diverse community of microorganisms residing in the gastrointestinal tract, plays a crucial role in digestion, metabolism, and immune system regulation. Studies have shown that an imbalance in gut microbiota, known as dysbiosis, can lead to various health issues including obesity, diabetes, and inflammatory bowel diseases. Researchers are exploring ways to modify the gut microbiota through diet, probiotics, and fecal transplants to prevent or treat these conditions. The growing body of evidence suggests that maintaining a healthy gut microbiota is essential for overall well-being and may offer new therapeutic strategies for chronic diseases."  # Replace with your actual input text
inputs = tokenizer(input_text, max_length=512, truncation=True, return_tensors="pt")

# Generate summaries
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=30,  # or another value depending on your max_target_length
    num_beams=4,  # Optional: use beam search
    early_stopping=True  # Optional: stop early if all beams finish
)

# Decode the generated tokens to text
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(summary)

The growing body of evidence suggests that maintaining a healthy gut microbiota is essential for overall well-being and may offer new therapeutic strategies
