<a href="https://colab.research.google.com/github/emirunlu26/qwen3-4B-finetuned-turkish-youtube-text-summarizer/blob/main/finetune_text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive under /content/drive (Colab only); force_remount=True asks
# for a fresh mount on every run of this cell.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
!pip install -q -U bitsandbytes transformers peft accelerate scipy einops evaluate trl rouge_score

In [None]:
!pip install -U datasets huggingface_hub fsspec

**LOAD DATASET**

In [None]:
# Load the Turkish YouTube summarization dataset by its Hub repository id.
# Later cells read its "train", "validation" and "test" splits.
from datasets import load_dataset
dataset = load_dataset("emirunlu26/turkish-youtube-text-summarization")

**FINE-TUNE QWEN3-4B ON TURKISH YOUTUBE TEXT SUMMARIZATION**

In [None]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    EarlyStoppingCallback
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np

In [None]:
# 4-BIT (NF4) QUANTIZATION SETTINGS FOR THE BASE MODEL
# Matrix multiplications on the quantized weights are computed in float16.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
# LOAD THE TOKENIZER AND THE BASE MODEL, QUANTIZED TO 4 BITS VIA bnb_config
model_name = "Qwen/Qwen3-4B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="cuda:0",  # place the whole model on the first GPU
    torch_dtype=torch.float16,
)

In [None]:
# GENERATE FORMATTED PROMPTS FROM SAMPLES
def generate_prompts(samples):
  """Build one summarization prompt per sample of a column-oriented batch.

  Args:
    samples: Mapping of column name to list of values; the "title",
      "category", "channel" and "text" columns are read.

  Returns:
    A dict with a single "prompt" key holding the list of prompt strings.
    Each prompt is the fixed Turkish instruction followed by the sample's
    title, category, channel and text on separate lines.
  """
  instruction_prompt = "Bu Youtube videosunu, ana teması ve önemli noktalarına odaklanarak kısa ama öz ve soyutlayıcı bir şekilde özetle (abstractive summary):\n"

  rows = zip(samples["title"], samples["category"], samples["channel"], samples["text"])
  prompts = [
      instruction_prompt
      + f"Başlık: {title}\nKategori: {category}\nKanal: {channel}\nMetin: {text}"
      for title, category, channel, text in rows
  ]
  return {"prompt": prompts}

In [None]:
# WRAP EACH GENERATED PROMPT IN THE TOKENIZER'S CHAT TEMPLATE
def apply_chat_template(samples):
  """Render every prompt of the batch as a single-turn user chat message.

  Uses the module-level ``tokenizer``. The template is rendered to text
  (``tokenize=False``) with the generation prompt appended, and
  ``enable_thinking=False`` is forwarded to the template.

  Args:
    samples: Column-oriented batch with a "prompt" list of strings.

  Returns:
    A dict whose "prompt" key holds the rendered chat strings, in input order.
  """
  rendered_prompts = []
  for prompt in samples["prompt"]:
    conversation = [{"role": "user", "content": prompt}]
    rendered_prompts.append(
        tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
    )
  return {"prompt": rendered_prompts}

In [None]:
 # PREPROCESS SAMPLES
def preprocess_samples(samples, max_length=8000):
  """Tokenize a batch into fixed-length causal-LM training examples.

  A causal LM computes its loss by comparing the logits at position i with
  ``labels[i + 1]``, so the target tokens must be part of ``input_ids``.
  Each example is therefore built as

      [chat-formatted prompt tokens] + [summary tokens] + [EOS]

  and ``labels`` repeats ``input_ids`` on the summary/EOS span while holding
  -100 on the prompt and padding positions, so only the summary contributes
  to the loss. (Tokenizing prompt and summary into two separate tensors, as
  was done before, leaves the summary out of the model input entirely.)

  Uses the module-level ``tokenizer``.

  Args:
    samples: Column-oriented batch with "prompt" and "summary" lists.
    max_length: Length every returned sequence is padded/truncated to.
      Defaults to 8000, the value previously hard-coded here.

  Returns:
    Dict with "input_ids", "attention_mask" and "labels"; each value is a
    list with one ``max_length``-long list of ints per sample.

  Raises:
    ValueError: If ``max_length`` is smaller than 1.
  """
  if max_length < 1:
    raise ValueError("max_length must be at least 1")

  ignore_index = -100  # positions carrying this label are excluded from the loss
  prompts = list(samples["prompt"])
  summaries = list(samples["summary"])
  batch = {"input_ids": [], "attention_mask": [], "labels": []}
  if not prompts:
    return batch

  eos_id = tokenizer.eos_token_id
  # The pad id only fills masked positions (attention 0, label -100), so any
  # valid id works when the tokenizer does not define one.
  pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

  # The prompt already went through the chat template, which contains every
  # special token it needs; do not let the tokenizer add more.
  prompt_ids_batch = tokenizer(prompts, add_special_tokens=False)["input_ids"]
  summary_ids_batch = tokenizer(summaries, add_special_tokens=False)["input_ids"]

  for prompt_ids, summary_ids in zip(prompt_ids_batch, summary_ids_batch):
    # The target span always survives truncation; EOS teaches the model to stop.
    if eos_id is not None:
      target_ids = list(summary_ids)[:max_length - 1] + [eos_id]
    else:
      target_ids = list(summary_ids)[:max_length]

    # NOTE: the prompt is cut on the right to make room for the target, which
    # drops the tail of the chat template for over-long inputs.
    context_ids = list(prompt_ids)[:max_length - len(target_ids)]

    sequence = context_ids + target_ids
    n_padding = max_length - len(sequence)

    # Right-pad to a constant length so all columns stay rectangular.
    batch["input_ids"].append(sequence + [pad_id] * n_padding)
    batch["attention_mask"].append([1] * len(sequence) + [0] * n_padding)
    batch["labels"].append(
        [ignore_index] * len(context_ids) + target_ids + [ignore_index] * n_padding
    )

  return batch

In [None]:
# RUN THE PREPROCESSING STAGES OVER THE DATASET, IN ORDER:
# raw columns -> prompt text -> chat-formatted prompt -> token ids + labels
dataset = (
    dataset
    .map(generate_prompts, batched=True)
    .map(apply_chat_template, batched=True)
    .map(preprocess_samples, batched=True)
)

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# CREATE PARAMETER-EFFICIENT MODEL FROM THE BASE MODEL USING LoRA
lora_config = LoraConfig(
    r=32,  # Rank of the low-rank update matrices
    lora_alpha=32,  # Scaling factor; the update is scaled by lora_alpha / r
    # Adapt all four attention projections. Qwen3 names its attention output
    # projection `o_proj`; the previous entry 'dense' does not name any module
    # in this architecture, so it matched nothing and the output projection
    # was left unadapted.
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'o_proj'
    ],
    bias="none",  # Bias terms are not trained
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

# Enabling gradient checkpointing to reduce memory usage during fine-tuning
model.gradient_checkpointing_enable()
# Called right after enabling checkpointing on the frozen, quantized base
# model — presumably so gradients can flow back through the checkpointed
# blocks to the adapters (TODO confirm against the PEFT docs).
model.enable_input_require_grads()

peft_model = get_peft_model(model, lora_config)

In [None]:
# DEFINE WHICH METRICS ARE CALCULATED DURING TRAINING (NOT USED DURING TRAINING DUE TO MEMORY CONSTRAINTS)
def build_compute_metrics(tokenizer):
  """Create a ``compute_metrics`` callback reporting mean ROUGE F1 scores.

  The returned callback relies on the module-level ``scorer`` and
  ``modify_rouge_score``; both must be defined before it is invoked.

  Args:
    tokenizer: Tokenizer used to decode token ids back to text. It must
      provide ``batch_decode`` and ``pad_token_id``.

  Returns:
    A function ``compute_metrics(eval_preds)`` that takes a
    ``(predictions, labels)`` pair and returns a dict mapping each ROUGE
    type to its F1 score averaged over the batch (empty dict for an empty
    batch).
  """
  def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
      # Some models return extra outputs next to the logits; keep the first.
      preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    if preds.ndim == 3:
      # (batch, seq, vocab) logits from a teacher-forced forward pass: take
      # the most likely token, and since the logits at position i predict
      # token i + 1, line predictions up with the labels they refer to.
      preds = preds.argmax(axis=-1)[:, :-1]
      labels = labels[:, 1:]
      # Only score positions that carry a real label.
      preds = np.where(labels != -100, preds, -100)

    # -100 is not a valid token id; swap it for the pad id, which
    # skip_special_tokens is expected to drop while decoding.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    references = tokenizer.batch_decode(labels, skip_special_tokens=True)
    candidates = tokenizer.batch_decode(preds, skip_special_tokens=True)

    per_sample = [
        modify_rouge_score(scorer.score(reference.strip("\n"), candidate.strip("\n")))
        for reference, candidate in zip(references, candidates)
    ]
    if not per_sample:
      return {}
    return {
        name: float(np.mean([sample_scores[name] for sample_scores in per_sample]))
        for name in per_sample[0]
    }

  return compute_metrics

In [None]:
# Authenticate with the Hugging Face Hub via the CLI; the training arguments
# in this notebook set push_to_hub=True, which needs a logged-in account.
!huggingface-cli login

In [None]:
# DEFINE TRAINING ARGUMENTS. BATCH SIZE SET TO 2 IN ORDER TO REDUCE GPU MEMORY USAGE
batch_size = 2
logging_steps = 50
output_dir = "qwen3-4B-finetuned-turkish-youtube-text-summarizer"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.1,
    eval_strategy="steps",
    # Spelled out for readers: with eval_strategy="steps" and no eval_steps,
    # the evaluation interval falls back to logging_steps anyway.
    eval_steps=logging_steps,
    logging_steps=logging_steps,
    fp16=True,  # mixed-precision training
    push_to_hub=True  # uploads to the Hub; requires the login cell to have run
)

# DEFINE TRAINER
trainer = Trainer(
    model=peft_model,
    args=training_args,
    compute_metrics=None, # Set to None to reduce memory usage of GPU
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement and is available in every transformers release that can
    # load Qwen3.
    processing_class=tokenizer,
)

In [None]:
# FINE-TUNE THE MODEL
# Runs the training loop of the `trainer` built from `training_args`; the
# value returned by train() is displayed as this cell's output.
trainer.train()

In [None]:
# SAVE THE MODEL
# Writes to `output_dir`, the same directory passed to TrainingArguments.
# NOTE(review): the trainer wraps a PEFT model, so this presumably stores only
# the LoRA adapter weights rather than the full base model — confirm.
trainer.save_model(output_dir)

In [None]:
# EVALUATE MODEL ON TEST SET
# The trainer was created with compute_metrics=None, so no ROUGE scores are
# produced here; ROUGE is computed from generated summaries later in the
# notebook.
trainer.evaluate(eval_dataset=dataset["test"])

**TESTING THE FINE-TUNED MODEL - CALCULATING ROUGE-1, ROUGE-2 AND ROUGE-L F1 SCORES**

In [None]:
from transformers import AutoModelForCausalLM,AutoTokenizer
from peft import PeftModel
import torch

# Base checkpoint and the fine-tuned LoRA adapter repository to stack on it
adapter_model_name = "emirunlu26/qwen3-4B-finetuned-turkish-youtube-text-summarizer"
base_model_name = "Qwen/Qwen3-4B"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="cuda",
)

# Attach the adapter to the base weights, then switch to evaluation mode
model = PeftModel.from_pretrained(base_model, adapter_model_name).to("cuda")
model.eval()

In [None]:
def generate_prompt(sample):
  """Build the summarization prompt for a single sample.

  Args:
    sample: Mapping with "title", "category", "channel" and "text" entries.

  Returns:
    The fixed Turkish instruction followed by the sample's title, category,
    channel and text, each on its own line.
  """
  instruction_prompt = "Bu Youtube videosunu, ana teması ve önemli noktalarına odaklanarak kısa ama öz ve soyutlayıcı bir şekilde özetle (abstractive summary):\n"

  title, category, channel, text = (
      sample[key] for key in ("title", "category", "channel", "text")
  )
  return (
      instruction_prompt
      + f"Başlık: {title}\nKategori: {category}\nKanal: {channel}\nMetin: {text}"
  )

In [None]:
def generate_summary(model,model_input):
  """Generate a summary for one tokenized prompt and return it as text.

  Uses the module-level ``tokenizer`` for decoding.

  Args:
    model: Object whose ``generate`` accepts the entries of ``model_input``
      as keyword arguments plus ``max_new_tokens``.
    model_input: Mapping of tokenized inputs that also exposes
      ``input_ids`` as an attribute; only its first row is used.

  Returns:
    The decoded continuation (at most 2000 new tokens), without special
    tokens and with leading/trailing newlines stripped.
  """
  prompt_length = len(model_input.input_ids[0])
  generated_ids = model.generate(**model_input, max_new_tokens=2000)
  # Skip the first prompt_length ids so only newly generated tokens are decoded.
  new_token_ids = generated_ids[0][prompt_length:].tolist()
  return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip("\n")

In [None]:
def preprocess_sample(sample):
  """Turn one raw sample into tokenized model input on the model's device.

  Builds the prompt with ``generate_prompt``, renders it as a single-turn
  user message through the module-level ``tokenizer``'s chat template
  (generation prompt appended, ``enable_thinking=False``), tokenizes the
  result as a batch of one and moves it to ``model.device``.

  Args:
    sample: Raw dataset row accepted by ``generate_prompt``.

  Returns:
    The tokenizer output for the rendered chat text, after ``.to(model.device)``.
  """
  conversation = [{"role": "user", "content": generate_prompt(sample)}]
  chat_text = tokenizer.apply_chat_template(
      conversation,
      tokenize=False,
      add_generation_prompt=True,
      enable_thinking=False,
  )
  encoded = tokenizer([chat_text], return_tensors="pt")
  return encoded.to(model.device)

In [None]:
# Tokenize every test sample up front, then generate one summary per sample.
test_set = dataset["test"]
test_samples = [preprocess_sample(raw_sample) for raw_sample in test_set]
summaries = []  # (reference summary, generated summary) pairs, in test-set order

for index, sample in enumerate(test_samples):
  reference_summary = test_set[index]["summary"]
  candidate_summary = generate_summary(model, sample)
  print(f"{index+1}. özet hazırlandı.")
  summaries.append((reference_summary, candidate_summary))

In [None]:
import re

from rouge_score import rouge_scorer


class _TurkishWordTokenizer:
  """Unicode-aware word tokenizer for scoring Turkish text with ROUGE.

  rouge_score's default tokenizer keeps only the characters a-z and 0-9, so
  Turkish letters such as ç, ğ, ı, ö, ş and ü act as word separators and
  words are split or truncated before n-grams are counted. This tokenizer
  keeps every Unicode word character instead.
  """

  def tokenize(self, text):
    """Lower-case `text` with Turkish casing rules and split it into words."""
    # In Turkish, dotted İ lower-cases to i and dotless I to ı; str.lower()
    # alone maps I -> i and İ -> i + combining dot.
    lowered = text.replace("İ", "i").replace("I", "ı").lower()
    return re.findall(r"\w+", lowered)


# use_stemmer is dropped: the built-in stemmer is the English Porter stemmer,
# and it is only applied by the default tokenizer that is replaced here.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    tokenizer=_TurkishWordTokenizer(),
)

In [None]:
def modify_rouge_score(score):
  """Reduce a ROUGE result mapping to its F1 values.

  Args:
    score: Mapping of metric name to an object with an ``fmeasure``
      attribute.

  Returns:
    A new dict mapping each metric name to that ``fmeasure`` value.
  """
  f1_by_metric = {}
  for metric_name in score:
    f1_by_metric[metric_name] = score[metric_name].fmeasure
  return f1_by_metric

In [None]:
import numpy as np

# Score every (reference, candidate) pair and keep only the F1 value per metric.
rouge_scores = [
    modify_rouge_score(scorer.score(reference, candidate))
    for reference, candidate in summaries
]

# One array of per-sample F1 scores per ROUGE variant. The ROUGE-L array gets
# its own name: it used to be assigned to `rouge_2_scores`, which made the
# reported "rouge2" average equal to the ROUGE-L average.
rouge_1_scores = np.array([score["rouge1"] for score in rouge_scores])
rouge_2_scores = np.array([score["rouge2"] for score in rouge_scores])
rouge_l_scores = np.array([score["rougeL"] for score in rouge_scores])

# Mean F1 over the test set, rounded to two decimals.
average_result = {
    "rouge1": round(float(np.mean(rouge_1_scores)),2),
    "rouge2": round(float(np.mean(rouge_2_scores)),2),
    "rougeL": round(float(np.mean(rouge_l_scores)),2)
}

In [None]:
# Show the rounded mean ROUGE-1 / ROUGE-2 / ROUGE-L F1 scores held in average_result.
print(average_result)