In [1]:
from transformers import TextStreamer
import time
import torch
import json
import gc
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Import our library components
from transfer import Trainer, SFTConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import gc
import time
import torch

def clear_memory(extra_names=()):
    """
    Release as much GPU memory as possible on every visible CUDA device.

    Well-known notebook globals (``model``, ``base_model``, ``inputs``,
    ``tokenizer``) and any names listed in ``extra_names`` are removed from
    this module's global namespace, as is every global that is a CUDA tensor.
    The garbage collector and the CUDA caching allocator are then flushed and
    the allocated/reserved memory of each device is printed.

    Args:
        extra_names: Iterable of additional global variable names to delete.

    Returns:
        None. When CUDA is unavailable only a CPU garbage collection runs.
    """
    namespace = globals()

    # Guard: nothing GPU-related to do on a CPU-only machine.
    if not torch.cuda.is_available():
        gc.collect()
        print("CUDA not available. Collected CPU garbage only.")
        return None

    # Best effort: move the well-known model objects back to the CPU first.
    for model_name in ("model", "base_model"):
        candidate = namespace.get(model_name, None)
        try:
            if candidate is not None and hasattr(candidate, "to"):
                candidate.to("cpu")
        except Exception:
            pass

    # Remove the usual globals (plus caller-supplied names) if they exist.
    doomed_names = ("inputs", "base_model", "model", "tokenizer", *extra_names)
    for global_name in doomed_names:
        namespace.pop(global_name, None)

    # Remove any remaining global that is itself a CUDA tensor.
    # Iterate over a snapshot because the namespace is mutated in the loop.
    for global_name, value in list(namespace.items()):
        try:
            if torch.is_tensor(value) and value.is_cuda:
                del namespace[global_name]
        except Exception:
            pass

    # Let Python reclaim the objects that were just unreferenced.
    gc.collect()
    time.sleep(0.1)

    device_ids = range(torch.cuda.device_count())

    # Flush the allocator on every device; a failure on one device must not
    # prevent the others from being cleaned.
    for device_id in device_ids:
        try:
            with torch.cuda.device(device_id):
                torch.cuda.synchronize()
                torch.cuda.empty_cache()
                # Collect interprocess memory handles as well.
                torch.cuda.ipc_collect()
                # Resetting peak stats is optional, so its failure is ignored.
                try:
                    torch.cuda.reset_peak_memory_stats(device_id)
                except Exception:
                    pass
        except Exception:
            pass

    # Second GC pass, followed by a final synchronize per device.
    gc.collect()
    for device_id in device_ids:
        try:
            with torch.cuda.device(device_id):
                torch.cuda.synchronize()
        except Exception:
            pass

    # Report what is still allocated / reserved on each device.
    bytes_per_gib = 1024 ** 3
    for idx in device_ids:
        alloc = torch.cuda.memory_allocated(idx) / bytes_per_gib
        reserv = torch.cuda.memory_reserved(idx) / bytes_per_gib
        print(f"cuda:{idx} -> allocated: {alloc:.2f} GB | reserved: {reserv:.2f} GB")

# run it
# clear_memory()


In [3]:
import yaml

def load_dataset_config(config_path):
    """
    Load a YAML configuration file for dataset fine-tuning.

    The file is decoded explicitly as UTF-8 so that non-ASCII content is read
    the same way on every platform instead of depending on the locale's
    default encoding.

    Args:
        config_path (str): The path to the YAML configuration file.

    Returns:
        dict: A dictionary containing the loaded configuration. An empty
        file yields an empty dictionary.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
        ValueError: If the top-level YAML node is not a mapping.
    """
    with open(config_path, 'r', encoding='utf-8') as file:
        # safe_load only builds plain Python objects (no arbitrary constructors).
        config = yaml.safe_load(file)

    # safe_load returns None for an empty document; callers expect a dict.
    if config is None:
        return {}

    if not isinstance(config, dict):
        raise ValueError(
            f"Expected a mapping at the top level of {config_path!r}, "
            f"got {type(config).__name__}"
        )
    return config

# Load the dataset configuration that the rest of the notebook relies on.
config_file = 'skk_dataset_config.yaml'
dataset_config = load_dataset_config(config_file)

# Echo every top-level entry so the configuration can be checked at a glance.
print("Loaded Dataset Configuration:")
for option_name in dataset_config:
    print(f"- {option_name}: {dataset_config[option_name]}")

# Spot-check the entries used below (each split entry is printed as stored).
print(f"\nDataset Name: {dataset_config['dataset_name']}")
for label, split_key in (("Train", "train"), ("Validation", "val"), ("Test", "test")):
    print(f"{label} File: {dataset_config[split_key]}")

Loaded Dataset Configuration:
- dataset_name: skk_dataset
- train: {'type': 'json', 'path': 'dataset/ksmi_train_topic_modelled.jsonl'}
- val: {'type': 'json', 'path': 'dataset/ksmi_val_topic_modelled.jsonl'}
- train_rl: {'type': 'json', 'path': 'dataset/ksmi_train_rl_topic_modelled.jsonl'}
- val_rl: {'type': 'json', 'path': 'dataset/ksmi_val_rl_topic_modelled.jsonl'}
- test: {'type': 'json', 'path': 'dataset/ksmi_test_topic_modelled.jsonl'}
- data_sft: {'input_field': ['instruction', 'input'], 'target_field': 'output'}
- data_rl: {'input_field': 'prompt', 'chosen_field': 'chosen', 'rejected_field': 'rejected'}
- prompt_template: # Question:
{query_text}

# Note:
If the question contains technical terms use the RAG tools to get definitions first, DO NOT interpret the meaning by yourself! Then, create a plan to answer the user's question.

Dataset Name: skk_dataset
Train File: {'type': 'json', 'path': 'dataset/ksmi_train_topic_modelled.jsonl'}
Validation File: {'type': 'json', 'path': 'd

In [4]:
from random import randrange
from datasets import load_dataset, concatenate_datasets, DatasetDict

# Read each split from the file the dataset config lists for it. The splits
# are loaded one at a time and only combined afterwards.
_loaded_splits = {}
for _split in ("train", "val", "test"):
    _spec = dataset_config[_split]
    _loaded_splits[_split] = load_dataset(_spec['type'], data_files={_split: _spec['path']})

train_data = _loaded_splits["train"]
val_data = _loaded_splits["val"]
test_data = _loaded_splits["test"]

# Optional: shrink the splits for a quick smoke test.
# train_data['train'] = train_data['train'].select(range(1))
# test_data['test'] = test_data['test'].shuffle(seed=42).select(range(20))

combined_data = DatasetDict(
    {
        "train": train_data['train'],
        "val": val_data['val'],
        "test": test_data['test'],
    }
)
combined_data

DatasetDict({
    train: Dataset({
        features: ['id', 'file_name', 'instruction', 'input', 'output', 'complexity', 'input_length', 'output_length', 'dominant_topic', 'topic_keywords', 'topic_name'],
        num_rows: 2222
    })
    val: Dataset({
        features: ['id', 'file_name', 'instruction', 'input', 'output', 'complexity', 'input_length', 'output_length', 'dominant_topic', 'topic_keywords', 'topic_name'],
        num_rows: 247
    })
    test: Dataset({
        features: ['instruction', 'output', 'dominant_topic', 'topic_keywords', 'topic_name'],
        num_rows: 200
    })
})

In [5]:
# Work with the training split on its own from here on.
train_data = combined_data['train']
dataset_size = len(train_data)
# Show the first record to get a feel for the schema.
print(train_data[0])
train_data

{'id': 1529, 'file_name': 'ksmi_complex_dataset_full', 'instruction': 'Jelaskan alur yang tepat untuk memperbarui status kelas dan level pada akhir tahun ketika selama satu tahun proyek berpindah dari Prospect, menemukan hidrokarbon, mencapai Discovery under Evaluation, lalu mendapatkan persetujuan pengakhiran eksplorasi.', 'input': '', 'output': '## Alur Pembaruan Status Kelas dan Level Proyek Eksplorasi\n\nPembaruan status kelas dan level proyek eksplorasi pada akhir tahun mengikuti alur yang logis berdasarkan perkembangan data dan evaluasi yang dilakukan selama satu tahun. Dalam kasus ini, proyek dimulai dari tahap Prospect, menemukan hidrokarbon, mencapai Discovery under Evaluation, dan akhirnya mendapatkan persetujuan pengakhiran eksplorasi. Berikut adalah penjelasan rinci mengenai alur tersebut:\n\n### 1. Tahap Awal: Prospect\nProyek pada awalnya berada di level Prospect, yaitu tahap di mana potensi akumulasi petroleum telah teridentifikasi berdasarkan data yang cukup untuk menen

Dataset({
    features: ['id', 'file_name', 'instruction', 'input', 'output', 'complexity', 'input_length', 'output_length', 'dominant_topic', 'topic_keywords', 'topic_name'],
    num_rows: 2222
})

In [6]:
# Rename the text columns to "prompt" / "response", the column names the
# SFT config is given via prompt_column / response_column.
# The result is derived from combined_data rather than from train_data itself
# so the cell is idempotent: re-running it no longer fails because the
# "instruction" column has already been renamed.
train_data = (
    combined_data['train']
    .rename_column("instruction", "prompt")
    .rename_column("output", "response")
)
train_data[0]

{'id': 1529,
 'file_name': 'ksmi_complex_dataset_full',
 'prompt': 'Jelaskan alur yang tepat untuk memperbarui status kelas dan level pada akhir tahun ketika selama satu tahun proyek berpindah dari Prospect, menemukan hidrokarbon, mencapai Discovery under Evaluation, lalu mendapatkan persetujuan pengakhiran eksplorasi.',
 'input': '',
 'response': '## Alur Pembaruan Status Kelas dan Level Proyek Eksplorasi\n\nPembaruan status kelas dan level proyek eksplorasi pada akhir tahun mengikuti alur yang logis berdasarkan perkembangan data dan evaluasi yang dilakukan selama satu tahun. Dalam kasus ini, proyek dimulai dari tahap Prospect, menemukan hidrokarbon, mencapai Discovery under Evaluation, dan akhirnya mendapatkan persetujuan pengakhiran eksplorasi. Berikut adalah penjelasan rinci mengenai alur tersebut:\n\n### 1. Tahap Awal: Prospect\nProyek pada awalnya berada di level Prospect, yaitu tahap di mana potensi akumulasi petroleum telah teridentifikasi berdasarkan data yang cukup untuk mene

In [7]:
# Pull out the validation split, which is used for evaluation during training.
eval_data = combined_data['val']
dataset_size = len(eval_data)
# Show the first validation record as a sanity check.
print(eval_data[0])
eval_data

{'id': 2119, 'file_name': 'ksmi_complex_dataset_full', 'instruction': 'Jelaskan bagaimana faktor teknologi produksi (misalnya fracking) memengaruhi keyakinan alir konklusif dan mengubah status proyek, serta bagaimana perubahan ini menggeser penilaian dari risiko ke ketidakpastian dalam kerangka P90–P10.', 'input': '', 'output': '## Pengaruh Teknologi Produksi terhadap Keyakinan Alir Konklusif dan Perubahan Penilaian Risiko ke Ketidakpastian\n\nTeknologi produksi, seperti fracking, memiliki pengaruh signifikan terhadap keyakinan alir konklusif dalam proyek eksplorasi migas. Dalam konteks eksplorasi, alir konklusif merujuk pada kemampuan fluida petroleum untuk mengalir secara terus-menerus ke permukaan. Pada tahap awal eksplorasi, keyakinan terhadap kemampuan alir ini sering kali terbatas karena kurangnya data atau uji sumur yang memadai. Namun, penerapan teknologi produksi yang tepat dapat meningkatkan keyakinan bahwa fluida dapat mengalir secara berkelanjutan, bahkan dalam kondisi rese

Dataset({
    features: ['id', 'file_name', 'instruction', 'input', 'output', 'complexity', 'input_length', 'output_length', 'dominant_topic', 'topic_keywords', 'topic_name'],
    num_rows: 247
})

In [8]:
# Rename the text columns to "prompt" / "response" so the validation split
# uses the same column names as the training split.
# Derived from combined_data (not from eval_data itself) so that re-running
# the cell does not fail on the already-renamed "instruction" column.
eval_data = (
    combined_data['val']
    .rename_column("instruction", "prompt")
    .rename_column("output", "response")
)


In [9]:
# Pull out the held-out test split.
test_data = combined_data['test']
dataset_size = len(test_data)
# Show one randomly chosen test record (a different one on every run).
print(test_data[randrange(dataset_size)])
test_data

{'instruction': 'Mengapa suatu proyek bisa mengalami project level maturity X2. Development Undetermined?', 'output': 'Suatu proyek dapat berada pada level maturity X2. Development Undetermined apabila belum mendapatkan persetujuan Penentuan Status Eksplorasi (PSE) atau belum memiliki Rencana Pengembangan (POD) yang disetujui SKK Migas. Kondisi ini menunjukkan bahwa proyek memerlukan data tambahan untuk menentukan status komersialitasnya.\n\nBeberapa faktor yang dapat menyebabkan proyek berada pada level X2 antara lain:\n\nBeralih dari level X1. Discovery under Evaluation\nProyek belum memiliki kajian teknis yang memadai, misalnya data subsurface yang belum lengkap atau terverifikasi.\n\nBeralih dari level X0. Development Pending, dengan kendala sebagai berikut:\n\nEvaluasi keekonomian belum selesai atau belum memenuhi ambang batas kelayakan investasi.\n\nKajian teknis belum memadai, seperti keterbatasan data subsurface atau desain fasilitas produksi yang belum lengkap atau terverifika

Dataset({
    features: ['instruction', 'output', 'dominant_topic', 'topic_keywords', 'topic_name'],
    num_rows: 200
})

In [10]:
# Rename the text columns to "prompt" / "response" so the test split uses the
# same column names as the training and validation splits.
# Derived from combined_data (not from test_data itself) so that re-running
# the cell does not fail on the already-renamed "instruction" column.
test_data = (
    combined_data['test']
    .rename_column("instruction", "prompt")
    .rename_column("output", "response")
)

In [11]:
# Display the test split to confirm the columns are now "prompt" / "response".
test_data

Dataset({
    features: ['prompt', 'response', 'dominant_topic', 'topic_keywords', 'topic_name'],
    num_rows: 200
})

In [12]:
# Sanity check: confirm a CUDA device is visible before any model is loaded.
import torch
torch.cuda.is_available()

True

In [13]:
# Identifier of the base checkpoint; it is passed both to the SFT config
# (model_name) and to run_inference (model_to_load).
BASE_MODEL_NAME = "Qwen/Qwen3-4B"

In [14]:
# Memory caps handed to from_pretrained(max_memory=...) in run_inference:
# device 0 is limited to 3.5 GiB and the CPU to 30 GiB, so that with
# device_map="auto" weights that do not fit on the GPU are offloaded to CPU RAM.
max_memory = {0: "3.5GiB", "cpu": "30GiB"}

In [15]:
def clear_gpu_memory():
    """
    Run a garbage-collection pass and release cached CUDA memory.

    After the cleanup, the memory still allocated and reserved on device 0 is
    printed. On a machine without CUDA only the Python garbage collector runs
    and a short notice is printed, so the function is safe to call anywhere.

    Returns:
        None
    """
    print("--- Clearing GPU memory ---")
    gc.collect()

    # Check availability *before* touching the CUDA runtime:
    # torch.cuda.synchronize() raises when no CUDA device is usable.
    if not torch.cuda.is_available():
        print("CUDA not available. Collected CPU garbage only.")
        return

    # Wait for pending GPU work first so its memory can actually be released
    # when the cache is emptied.
    torch.cuda.synchronize()
    torch.cuda.empty_cache()

    print(
        f"GPU memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
    print(
        f"GPU memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")


In [16]:
# Quantization settings passed as quantization_config when run_inference
# loads the model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",  # use the NF4 4-bit data type
    bnb_4bit_compute_dtype=torch.bfloat16,  # run computations in bfloat16
    bnb_4bit_use_double_quant=True,  # enable nested (double) quantization
)


In [17]:
def run_inference(
    model_to_load,
    adapter_path=None,
    prompts = None,
    max_new_tokens=1024,
    temperature=0.7,
    top_p=0.9,
    enable_thinking=False,
    do_stream=True,
):
    """
    Load a quantized causal LM, answer each prompt, then free the model.

    The model is loaded with the notebook-level ``bnb_config`` and
    ``max_memory`` settings. GPU memory is released in a ``finally`` block,
    so it is also freed when generation fails or is interrupted.

    Args:
        model_to_load: Model name or path given to ``from_pretrained``.
        adapter_path: Optional LoRA adapter location applied on top of the
            base model via ``PeftModel.from_pretrained``.
        prompts: A single prompt string or an iterable of prompt strings.
            ``None`` or an empty iterable returns ``[]`` without loading
            the model.
        max_new_tokens: Passed to ``generate`` as ``max_new_tokens``.
        temperature: Sampling temperature. Sampling is enabled only when
            this is greater than 0; otherwise decoding is greedy.
        top_p: Nucleus-sampling threshold; only used when sampling.
        enable_thinking: Forwarded to the tokenizer's chat template.
        do_stream: If True, the answer is printed token by token while it
            is being generated.

    Returns:
        list[dict]: One ``{"prompt": ..., "response": ...}`` entry per
        prompt. ``response`` is the decoded generated text in both streaming
        and non-streaming mode.
    """
    # Normalise the prompts: a bare string is one prompt (not a sequence of
    # characters) and None means "nothing to do".
    if prompts is None:
        prompts = []
    elif isinstance(prompts, str):
        prompts = [prompts]
    else:
        prompts = list(prompts)

    # Avoid loading a multi-GB model when there is nothing to generate.
    if not prompts:
        print("No prompts provided; nothing to generate.")
        return []

    print(f"\n--- Loading model: {model_to_load} ---")

    responses = []
    # Pre-declare everything that may hold GPU memory so the finally block
    # can drop the references even if loading fails half-way.
    model = None
    tokenizer = None
    model_inputs = None
    generated_ids = None
    new_ids = None

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_to_load)

        model = AutoModelForCausalLM.from_pretrained(
            model_to_load,
            device_map="auto",
            quantization_config=bnb_config,
            max_memory=max_memory,
            dtype=torch.bfloat16,
        )

        if adapter_path:
            print(f"--- Applying LoRA adapter from: {adapter_path} ---")
            model = PeftModel.from_pretrained(model, adapter_path)

        # Sampling-only arguments are passed only when sampling is enabled,
        # so greedy decoding is not handed a temperature of 0.
        do_sample = temperature > 0
        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": do_sample,
            "pad_token_id": tokenizer.eos_token_id,
        }
        if do_sample:
            generation_kwargs["temperature"] = temperature
            generation_kwargs["top_p"] = top_p

        # ========================================================
        # LOOP THROUGH ALL PROMPTS GIVEN
        # ========================================================
        for prompt_text in prompts:
            print(f"\n========== PROMPT ==========\n{prompt_text}\n")

            messages = [{"role": "user", "content": prompt_text}]
            input_text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=enable_thinking,
            )

            model_inputs = tokenizer(
                input_text, return_tensors="pt").to(model.device)

            streamer = None
            if do_stream:
                print("Streaming response:\n")
                streamer = TextStreamer(tokenizer, skip_prompt=True)

            start_time = time.time()
            with torch.no_grad():
                # generate() returns the full sequence even when a streamer
                # is attached, so the text can be kept in both modes.
                generated_ids = model.generate(
                    **model_inputs,
                    streamer=streamer,
                    **generation_kwargs,
                )
            elapsed = time.time() - start_time

            # Strip the prompt tokens; only the newly generated part is kept.
            prompt_len = model_inputs.input_ids.shape[-1]
            new_ids = generated_ids[0][prompt_len:]
            text = tokenizer.decode(new_ids, skip_special_tokens=True)

            if do_stream:
                # The answer itself has already been printed by the streamer.
                print(f"\n[Streamed Answer Took {elapsed:.2f} sec]")
            else:
                # Guard against a zero-length interval on coarse clocks.
                tps = len(new_ids) / elapsed if elapsed > 0 else float("inf")
                print(f"[Answer Took {elapsed:.2f} sec, TPS={tps:.2f}]")
                print("\nRESPONSE:\n", text)

            responses.append({"prompt": prompt_text, "response": text})
    finally:
        # Cleanup: drop every local reference to GPU-backed objects before
        # emptying the cache, whether or not generation succeeded.
        del model, tokenizer, model_inputs, generated_ids, new_ids
        clear_gpu_memory()

    return responses

In [18]:
# Fine-tuning configuration handed to Trainer(task="sft", config=config).
# NOTE(review): SFTConfig comes from the project-local `transfer` package;
# the per-option notes below are inferred from the option names — confirm
# against that library.
config = SFTConfig(
    model_name=BASE_MODEL_NAME,
    num_epochs=3,
    # LoRA settings (presumably adapter rank, scaling factor and dropout).
    use_lora=True,
    lora_r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    # Column names produced by the rename cells above.
    prompt_column="prompt",
    response_column="response",
    # The inference cell later reads the adapter back from this directory.
    output_dir="./Qwen3-4B-sft-output",

    # Evaluation options; the renamed validation split is the evaluation set.
    enable_evaluation=True,
    evaluation_dataset=eval_data,
    evaluation_metrics=["perplexity", "semantic_entropy", "token_entropy"],
    evaluation_batch_size=2,
    save_evaluation_results=True,
    evaluation_results_path="./evaluation_results.json"
)

In [18]:
# Start from a clean GPU so training has as much memory as possible.
clear_gpu_memory()

# === FINE-TUNING ===
banner = "=" * 50
print("\n" + banner)
print("         STARTING FINE-TUNING")
print(banner)
print("Note: This may be slow on a 4GB GPU due to RAM-CPU offloading, but it will work.")

# Build the SFT trainer from the config above, train on the renamed training
# split and save the result.
trainer = Trainer(task="sft", config=config)
trainer.train(train_data)
trainer.save_model()
print("Fine-tuning complete. Model saved to:", config.output_dir)


--- Clearing GPU memory ---
GPU memory allocated: 0.00 GB
GPU memory reserved: 0.00 GB

         STARTING FINE-TUNING
Note: This may be slow on a 4GB GPU due to RAM-CPU offloading, but it will work.
✅ Quantization enabled: nf4 with bfloat16


Fetching 3 files:   0%|          | 0/3 [13:30<?, ?it/s]


KeyboardInterrupt: 

KeyboardInterrupt: 

In [None]:
import random

# How many held-out prompts to run through the fine-tuned model, and the seed
# that makes the selection repeatable across kernel restarts.
NUM_TEST_PROMPTS = 5
PROMPT_SAMPLE_SEED = 42

# Materialise the column as a plain list so random.sample gets a sequence.
_all_test_prompts = list(test_data["prompt"])

# A dedicated Random instance keeps the global random state untouched; the
# min() guard avoids a ValueError when the split has fewer prompts than asked.
PROMPT_TO_TEST = random.Random(PROMPT_SAMPLE_SEED).sample(
    _all_test_prompts,
    k=min(NUM_TEST_PROMPTS, len(_all_test_prompts)),
)


In [None]:
# === AFTER FINE-TUNING ===
banner = "=" * 50
print("\n" + banner)
print("        INFERENCE AFTER FINE-TUNING")
print(banner)

# Reload the base model, attach the adapter stored in config.output_dir and
# stream an answer for each sampled test prompt.
response_after = run_inference(
    BASE_MODEL_NAME,
    adapter_path=config.output_dir,
    prompts=PROMPT_TO_TEST,
    do_stream=True,
)
# for r in response_after:
#     print(f"Prompt: {r['prompt']}\nResponse: {r['response']}\n")

# Final cleanup
clear_gpu_memory()
