In [1]:
# Step 1: Install dependencies
!pip install -q transformers datasets evaluate sentencepiece rouge_score huggingface_hub
!pip install -U datasets

# Step 2: Import libraries
from datasets import load_dataset
from transformers import (
    BartTokenizer, BartForConditionalGeneration,
    DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
)
import evaluate
import numpy as np
import torch
from huggingface_hub import notebook_login, create_repo, upload_folder

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m504.7 kB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3

In [2]:

# Step 3: Load XSUM dataset
dataset = load_dataset("EdinburghNLP/xsum", download_mode="force_redownload")
train_dataset = dataset["train"].shuffle(seed=42).select(range(5000))       # 5k
val_dataset = dataset["validation"].shuffle(seed=42).select(range(1000))   # 1k

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

xsum.py:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/304M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/17.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [3]:
# Step 4: Load pretrained tokenizer and model
model_ckpt = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_ckpt)
model = BartForConditionalGeneration.from_pretrained(model_ckpt)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [4]:
# Step 5: Preprocessing
max_input_length = 512
max_target_length = 64

def preprocess(example):
    inputs = tokenizer(example["document"], max_length=max_input_length, truncation=True, padding="max_length")
    targets = tokenizer(example["summary"], max_length=max_target_length, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_train = train_dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)
tokenized_val = val_dataset.map(preprocess, batched=True, remove_columns=dataset["validation"].column_names)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
# Step 6: Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [6]:
# Step 7: Load ROUGE evaluator
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [7]:
# Step 8: Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    report_to="none"  # disable WandB
)

In [8]:
# Step 9: Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [9]:
# Step 10: Train!
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.9186,0.931361,0.2381,0.075,0.1716,0.1714
2,0.5128,0.944562,0.2929,0.1048,0.2114,0.2113
3,0.2589,1.095783,0.3041,0.1099,0.2191,0.219




TrainOutput(global_step=3750, training_loss=0.5860455444335938, metrics={'train_runtime': 5717.9183, 'train_samples_per_second': 2.623, 'train_steps_per_second': 0.656, 'total_flos': 1.625328451584e+16, 'train_loss': 0.5860455444335938, 'epoch': 3.0})

In [10]:
# Step 11: Save model and tokenizer
model.save_pretrained("bart-xsum-finetuned")
tokenizer.save_pretrained("bart-xsum-finetuned")

('bart-xsum-finetuned/tokenizer_config.json',
 'bart-xsum-finetuned/special_tokens_map.json',
 'bart-xsum-finetuned/vocab.json',
 'bart-xsum-finetuned/merges.txt',
 'bart-xsum-finetuned/added_tokens.json')

In [11]:
# Step 12: Inference example
input_text = "The BBC reported that heavy rainfall has caused major flooding across the city. Emergency services have been deployed to assist affected residents."

inputs = tokenizer([input_text], max_length=512, truncation=True, return_tensors="pt").to(model.device)
summary_ids = model.generate(inputs["input_ids"], max_length=64, min_length=10, length_penalty=2.0, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Summary:", summary)

Summary: Heavy rain has continued to lash parts of the south-east Belfast city centre, causing extensive flooding in the early hours of Monday.


In [18]:
# Step 13: Upload to Hugging Face Hub
notebook_login()  # Will ask for token from https://huggingface.co/settings/tokens

repo_name = "Farizkuy/bart-xsum-finetuned-fariz"  # Ganti dengan nama repo kamu
create_repo(repo_name, exist_ok=True)

upload_folder(
    repo_id=repo_name,
    folder_path="bart-xsum-finetuned",
    path_in_repo=".",
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/farizkuy/bart-xsum-finetuned-fariz/commit/19facfe6f64f1d676c285f9dcd889c4b8c8b1597', commit_message='Upload folder using huggingface_hub', commit_description='', oid='19facfe6f64f1d676c285f9dcd889c4b8c8b1597', pr_url=None, repo_url=RepoUrl('https://huggingface.co/farizkuy/bart-xsum-finetuned-fariz', endpoint='https://huggingface.co', repo_type='model', repo_id='farizkuy/bart-xsum-finetuned-fariz'), pr_revision=None, pr_num=None)