In [1]:
!pip install transformers datasets evaluate rouge_score accelerate peft bitsandbytes -q

import warnings
warnings.filterwarnings('ignore')

import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import PrefixTuningConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
import evaluate

import importlib
import transformers.utils.import_utils as import_utils
from transformers.utils import is_bitsandbytes_available
import_utils._bitsandbytes_available = True
is_bitsandbytes_available.cache_clear()
import transformers.integrations.bitsandbytes
importlib.reload(transformers.integrations.bitsandbytes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Device: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
print(f"Using: {device}")


PyTorch: 2.5.1+cu121
CUDA: True
Device: NVIDIA GeForce RTX 2050
Memory: 4.29 GB
Using: cuda


In [2]:
import torch
print("CUDA:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0))
print("VRAM (GB):", torch.cuda.get_device_properties(0).total_memory / 1024**3)


CUDA: True
GPU: NVIDIA GeForce RTX 2050
VRAM (GB): 3.99951171875


In [3]:
base_url = "https://huggingface.co/datasets/EdinburghNLP/xsum/resolve/refs%2Fconvert%2Fparquet/default"
train_url = f"{base_url}/train/0000.parquet"
val_url = f"{base_url}/validation/0000.parquet"
test_url = f"{base_url}/test/0000.parquet"

print("Loading XSum dataset...")
train_df = pd.read_parquet(train_url)
val_df = pd.read_parquet(val_url)
test_df = pd.read_parquet(test_url)

print(f"Train: {len(train_df):,}")
print(f"Validation: {len(val_df):,}")
print(f"Test: {len(test_df):,}")

Loading XSum dataset...
Train: 204,045
Validation: 11,332
Test: 11,334


In [4]:
sample = train_df.iloc[0]
print(f"Document: {sample['document'][:300]}...")
print(f"\nSummary: {sample['summary']}")

Document: The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many...

Summary: Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.


In [5]:
TRAIN_SIZE = 50000
VAL_SIZE = 500

train_subset_df = train_df.sample(n=TRAIN_SIZE, random_state=42).reset_index(drop=True)
val_subset_df = val_df.sample(n=VAL_SIZE, random_state=42).reset_index(drop=True)

train_dataset = Dataset.from_pandas(train_subset_df)
val_dataset = Dataset.from_pandas(val_subset_df)

print(f"Training: {len(train_dataset)}")
print(f"Validation: {len(val_dataset)}")

Training: 50000
Validation: 500


In [6]:
MODEL_NAME = "microsoft/phi-2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="left"
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print(f"Tokenizer loaded: {MODEL_NAME}")
print(f"Vocab size: {tokenizer.vocab_size:,}")

Tokenizer loaded: microsoft/phi-2
Vocab size: 50,257


In [7]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)

model = prepare_model_for_kbit_training(model)

print(f"Model loaded: {MODEL_NAME}")
print(f"Parameters: {model.num_parameters():,}")

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.67s/it]


Model loaded: microsoft/phi-2
Parameters: 2,779,683,840


In [8]:
model.gradient_checkpointing_disable()
model.config.use_cache = True

prefix_config = PrefixTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=20,
    encoder_hidden_size=2560,
    prefix_projection=True
)

model = get_peft_model(model, prefix_config)
model.print_trainable_parameters()


trainable params: 426,201,600 || all params: 3,205,885,440 || trainable%: 13.2943


In [9]:
INSTRUCTION_TEMPLATE = "Summarize: {document}\n\nSummary: "
MAX_LENGTH = 1024

def preprocess_function(examples):
    prompts = []
    full_texts = []
    
    for doc, summ in zip(examples['document'], examples['summary']):
        prompt = INSTRUCTION_TEMPLATE.format(document=doc)
        full_text = prompt + summ + tokenizer.eos_token
        prompts.append(prompt)
        full_texts.append(full_text)
    
    model_inputs = tokenizer(
        full_texts,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors=None
    )
    
    labels = []
    for i, (prompt, input_ids) in enumerate(zip(prompts, model_inputs['input_ids'])):
        prompt_tokens = tokenizer(prompt, truncation=True, max_length=MAX_LENGTH)['input_ids']
        prompt_length = len(prompt_tokens)
        
        label = [-100] * prompt_length + input_ids[prompt_length:]
        label = label[:MAX_LENGTH]
        if len(label) < MAX_LENGTH:
            label = label + [-100] * (MAX_LENGTH - len(label))
        
        labels.append(label)
    
    model_inputs['labels'] = labels
    return model_inputs

print("Preprocessing function ready")

Preprocessing function ready


In [10]:
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
    desc="Tokenizing train"
)

tokenized_val = val_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=val_dataset.column_names,
    desc="Tokenizing validation"
)

print(f"Tokenized train: {len(tokenized_train)}")
print(f"Tokenized val: {len(tokenized_val)}")

Tokenizing train: 100%|██████████| 50000/50000 [01:43<00:00, 483.44 examples/s]
Tokenizing validation: 100%|██████████| 500/500 [00:00<00:00, 584.59 examples/s]

Tokenized train: 50000
Tokenized val: 500





In [11]:
sample_labels = tokenized_train[0]['labels']
masked = sum(1 for l in sample_labels if l == -100)
unmasked = sum(1 for l in sample_labels if l != -100)

print(f"Total tokens: {len(sample_labels)}")
print(f"Masked (prompt): {masked}")
print(f"Unmasked (summary): {unmasked}")

Total tokens: 1024
Masked (prompt): 699
Unmasked (summary): 325


In [12]:
OUTPUT_DIR = "./phi2-xsum-prefix-tuned"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=1e-3,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    fp16=torch.cuda.is_available(),
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=50,
    save_strategy="epoch",
    eval_strategy="epoch",
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    remove_unused_columns=False,
    label_names=["labels"]
)

print(f"Output: {OUTPUT_DIR}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Effective batch: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"LR: {training_args.learning_rate}")
print(f"Epochs: {training_args.num_train_epochs}")

Output: ./phi2-xsum-prefix-tuned
Batch size: 2
Effective batch: 16
LR: 0.001
Epochs: 2


In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

print("Trainer initialized")

Trainer initialized


In [14]:
training_args = TrainingArguments(
    output_dir="./prefix_phi2",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="no",

    # ❌ MATIKAN AMP
    fp16=False,
    bf16=False,

    # ❌ MATIKAN checkpointing
    gradient_checkpointing=False,

    # ⚡ Biarkan CUDA tetap dipakai
    no_cuda=False,

    report_to="none"
)


In [15]:
model.config.use_cache = False


In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    optimizers=(torch.optim.AdamW(model.parameters(), lr=2e-4), None),
)


In [17]:
model.save_pretrained(f"{OUTPUT_DIR}/final_model")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model")
print(f"Model saved to {OUTPUT_DIR}/final_model")

Model saved to ./phi2-xsum-prefix-tuned/final_model


In [18]:
rouge = evaluate.load("rouge")

def generate_summary(document, max_new_tokens=100):
    prompt = INSTRUCTION_TEMPLATE.format(document=document)
    
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH - max_new_tokens
    ).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2
        )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    if "Summary: " in generated_text:
        summary = generated_text.split("Summary: ")[-1].strip()
    else:
        summary = generated_text[len(prompt):].strip()
    
    return summary

print("Generation function ready")

Generation function ready


In [19]:
NUM_EVAL = 100

predictions = []
references = []

model.eval()
for i in range(NUM_EVAL):
    sample = val_dataset[i]
    generated = generate_summary(sample['document'])
    
    predictions.append(generated)
    references.append(sample['summary'])
    
    if (i + 1) % 20 == 0:
        print(f"{i + 1}/{NUM_EVAL}")

print(f"Generated {len(predictions)} summaries")

20/100
40/100
60/100
80/100
100/100
Generated 100 summaries


In [20]:
results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)

print("ROUGE Scores:")
print(f"ROUGE-1: {results['rouge1']*100:.2f}%")
print(f"ROUGE-2: {results['rouge2']*100:.2f}%")
print(f"ROUGE-L: {results['rougeL']*100:.2f}%")

ROUGE Scores:
ROUGE-1: 10.25%
ROUGE-2: 1.61%
ROUGE-L: 7.40%


In [21]:
for idx in [0, 25, 50]:
    if idx >= len(predictions):
        continue
    
    print(f"\n{'='*60}")
    print(f"EXAMPLE {idx + 1}")
    print(f"{'='*60}")
    
    doc = val_dataset[idx]['document']
    print(f"\nDocument: {doc[:400]}...")
    print(f"\nReference: {references[idx]}")
    print(f"\nGenerated: {predictions[idx]}")
    
    score = rouge.compute(predictions=[predictions[idx]], references=[references[idx]])
    print(f"\nROUGE-1: {score['rouge1']*100:.1f}% | ROUGE-2: {score['rouge2']*100:.1f}% | ROUGE-L: {score['rougeL']*100:.1f}%")


EXAMPLE 1

Document: Rita King, 81, who had dementia, died after being shot at De La Mer House, in Walton-on-the-Naze, Essex, in December.
Her husband, Ronald King, who has a mental health condition, admitted manslaughter on the grounds of diminished responsibility.
Judge Charles Gratwicke, at Chelmsford Crown Court, told him: "This was not a mercy killing."
He added: "This was without a doubt a carefully planned kill...

Reference: An 87-year-old man who shot his wife dead at a care home has been sentenced to six years in prison.

Generated: A man who claimed self-inflicted shotgun shots into his wife due to his own mental disorder was found guilty of manslaughter by reason of insanity and received a suspended sentence. The couple lived together in a care home where she suffered from advanced dementia while he had multiple mental illnesses including one affecting decision making abilities. He initially pleaded not guilty to murder but later changed it to manslaughter under defense of

In [22]:
test_doc = val_dataset[10]['document']
test_ref = val_dataset[10]['summary']

print(f"Document: {test_doc[:400]}...")
print(f"\nReference: {test_ref}")

for temp in [0.3, 0.7, 1.0]:
    summary = generate_summary(test_doc, max_new_tokens=100)
    print(f"\nTemp={temp}: {summary}")

Document: He was playing against opponents who, earlier that season, had wound him up so badly that it completely put him off his game. Players who 24 hours earlier had spoken to a newspaper to explain, in detail, how this imposter was going to be stopped.
The supporters didn't like him. Nothing was going right.
And then, taking possession 40 yards out, Ibrahimovic explodes.
A dart forward. The acceleration...

Reference: It was typical of Zlatan Ibrahimovic.

Temp=0.3: 

Temp=0.7: 

Temp=1.0: 


In [23]:
test_doc2 = val_dataset[30]['document']
test_ref2 = val_dataset[30]['summary']

print(f"Document: {test_doc2[:500]}...")
print(f"\nReference: {test_ref2}")

generated = generate_summary(test_doc2)
print(f"\nGenerated: {generated}")

score = rouge.compute(predictions=[generated], references=[test_ref2])
print(f"\nROUGE-1: {score['rouge1']*100:.1f}% | ROUGE-2: {score['rouge2']*100:.1f}% | ROUGE-L: {score['rougeL']*100:.1f}%")

Document: Stacey Burrows, 16, and Lucy Pygott, 17, were killed while on a training run in Aldershot, Hampshire, on 8 November.
Judge Phillip Gillibrand revealed he had lost his own brother in a car crash as he expressed sympathy for the girls' relatives.
Michael Casey, 24, is charged with causing death by dangerous driving.
Mr Casey, of St Paul's Road, Tottenham, London, was bailed to appear at Winchester Crown Court on 13 April.
Addressing the teenagers' relatives, Judge Gillibrand said: "I do not know t...

Reference: A judge has told a court he understood the "trauma and anguish" of the families of two teenage athletes who died after being hit by a car.

Generated: A tragic incident occurred recently during a routine running session near Aldershot, resulting in the loss of two young lives. The victims, both teenagers named Stacey Burrows (16) and Lucy Pygott (17), tragically met their untimely deaths due to a car accident that took place on November 8th outside Aldershot, located in

In [24]:
pred_lengths = [len(p.split()) for p in predictions]
ref_lengths = [len(r.split()) for r in references]

print("Length Statistics:")
print(f"\nGenerated: {np.mean(pred_lengths):.1f} ± {np.std(pred_lengths):.1f} words")
print(f"Reference: {np.mean(ref_lengths):.1f} ± {np.std(ref_lengths):.1f} words")
print(f"Difference: {np.mean(pred_lengths) - np.mean(ref_lengths):.1f} words")

Length Statistics:

Generated: 50.8 ± 37.8 words
Reference: 21.7 ± 4.9 words
Difference: 29.1 words


In [25]:
print("="*60)
print("FINAL SUMMARY")
print("="*60)
print(f"Model: microsoft/phi-2 (Prefix Tuning)")
print(f"Dataset: XSum")
print(f"Training: {TRAIN_SIZE} samples")
print(f"Validation: {VAL_SIZE} samples")
print(f"\nROUGE Scores:")
print(f"  ROUGE-1: {results['rouge1']*100:.2f}%")
print(f"  ROUGE-2: {results['rouge2']*100:.2f}%")
print(f"  ROUGE-L: {results['rougeL']*100:.2f}%")
print(f"\nSaved: {OUTPUT_DIR}/final_model")
print("="*60)

FINAL SUMMARY
Model: microsoft/phi-2 (Prefix Tuning)
Dataset: XSum
Training: 50000 samples
Validation: 500 samples

ROUGE Scores:
  ROUGE-1: 10.25%
  ROUGE-2: 1.61%
  ROUGE-L: 7.40%

Saved: ./phi2-xsum-prefix-tuned/final_model
