## Cluster fine-tuning notebook for the Divan10 case study

In [1]:
import pandas as pd
import numpy as np
import openai
import re
import os
import glob
import json
import seaborn as sns
import matplotlib.pyplot as plt
from loading_helpers import *

In [None]:
from huggingface_hub import login

# SECURITY: never commit an access token to the notebook. The token is read
# from the environment instead (export HUGGINGFACE_TOKEN=... or HF_TOKEN=...
# before starting the kernel). The token that was previously hard-coded here
# must be considered leaked and should be revoked in the Hugging Face settings.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_TOKEN")

# With token=None, `login` falls back to its interactive prompt.
login(token=HUGGINGFACE_TOKEN)

## Loading model

In [3]:
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, BitsAndBytesConfig
import torch 


model_id = "ArmGPT/ArmenianGPT-0.1-12B"

# Directory where the model checkpoint is downloaded/cached. The default is
# the original cluster scratch path; set the HF_CACHE_DIR environment variable
# to run the notebook on another machine without editing this cell.
HF_CACHE_DIR = os.environ.get(
    "HF_CACHE_DIR",
    "/rcp-scratch/iccluster040_scratch/students/nour/hf",
)

# BitsAndBytesConfig for 4-bit quantization.
# NOTE(review): this config is currently NOT used -- the `quantization_config`
# argument below is commented out, so the model is loaded unquantized.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # enable 4-bit loading
    bnb_4bit_use_double_quant=True,         # nested quantization (quantize the quantization constants too)
    bnb_4bit_quant_type="nf4",              # NF4 quantization data type
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation on the de-quantized weights
)

# The processor builds chat-template prompts and tokenizes them.
processor = AutoProcessor.from_pretrained(model_id)

# device_map="auto" lets the loader place the weights across the available
# devices; the recorded output of this cell reports that some parameters were
# offloaded to the CPU.
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id,
    #quantization_config=bnb_config,
    device_map="auto",
    cache_dir=HF_CACHE_DIR,
)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
    Found GPU0 NVIDIA GeForce GTX TITAN X which is of cuda capability 5.2.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (7.0) - (12.0)
    
    Please install PyTorch with a following CUDA
    configurations:  12.6 following instructions at
    https://pytorch.org/get-started/locally/
    
    Found GPU1 NVIDIA GeForce GTX TITAN X which is of cuda capability 5.2.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (7.0) - (12.0)
    
NVIDIA GeForce GTX TITAN X with CUDA capability sm_52 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities s

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


## Loading Dataset

In [4]:
from datasets import Dataset, DatasetDict
import json
import pandas as pd
from pathlib import Path

def load_ner_json_files(train_path, eval_path, test_path):
    """
    Load the train, eval, and test JSON files into a DatasetDict.

    Each file is first read as JSON Lines (one JSON object per line); if
    pandas rejects it with a ValueError, it is re-read as a regular JSON
    document (e.g. an array of records).

    Args:
        train_path (str or Path): path to train JSON file
        eval_path  (str or Path): path to eval JSON file
        test_path  (str or Path): path to test JSON file

    Returns:
        DatasetDict with keys: 'train', 'validation', 'test'.
        Note that the data loaded from `eval_path` is stored under the
        'validation' key (not 'eval').
    """
    def load_json_to_df(path):
        # Preferred format: JSON Lines.
        try:
            return pd.read_json(path, lines=True)
        except ValueError:
            # Fallback: normal JSON array
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            return pd.DataFrame(data)

    # Split name -> source file, in the order the splits are loaded.
    split_paths = {
        "train": train_path,
        "validation": eval_path,
        "test": test_path,
    }

    # Read each file into a DataFrame and convert it to a Hugging Face Dataset.
    return DatasetDict({
        split_name: Dataset.from_pandas(load_json_to_df(path))
        for split_name, path in split_paths.items()
    })


# Load the three NER splits; the relative file names are resolved against the
# kernel's current working directory.
dataset=load_ner_json_files("train_ner.jsonl","eval_ner.jsonl","test_ner.jsonl")



### Map empty outputs to the "O" placeholder

In [5]:
def normalize_output(example):
    """
    Replace a missing or blank target with the placeholder label "O".

    Args:
        example (dict): one dataset record; must contain the key "output".

    Returns:
        dict: the same record (modified in place). "output" is set to "O"
        when it was None, empty, or whitespace-only; otherwise it is left
        untouched (it is not stripped).
    """
    output_text = example["output"]
    # A null value in the source JSON would arrive here as None; treat it the
    # same as an empty string instead of failing on `.strip()`.
    if output_text is None or not output_text.strip():
        example["output"] = "O"  # or "No entities found."
    return example

# Run `normalize_output` over every split. The result is a plain dict keyed by
# split name (it rebinds `dataset`, replacing the object loaded above).
dataset = dict(
    (split_name, split_ds.map(normalize_output))
    for split_name, split_ds in dataset.items()
)


Map:   0%|          | 0/3650 [00:00<?, ? examples/s]

Map:   0%|          | 0/512 [00:00<?, ? examples/s]

Map:   0%|          | 0/535 [00:00<?, ? examples/s]

## Preprocess dataset

In [6]:
# Token budget: the full chat (system + user + assistant) is padded/truncated
# to max_input_length + max_target_length tokens.
max_input_length = 256
max_target_length = 128
def preprocess(example):
    """
    Turn one NER record into fixed-length model inputs and labels.

    The record is rendered as a three-turn chat (system prompt, user turn with
    example["input"], assistant turn with example["output"]) through the
    processor's chat template, then tokenized, truncated and padded to
    `max_input_length + max_target_length` tokens.

    Args:
        example (dict): record with the keys "input" and "output".

    Returns:
        dict with "input_ids", "attention_mask" and "labels" (lists of ints).
        Labels equal the input ids wherever the attention mask is 1 (so the
        prompt tokens are part of the labels as well) and are -100 on padding
        and on the final position.
    """
    # Tolerate a missing (None) target and use the same "O" placeholder that
    # `normalize_output` writes, so both cells agree on what an empty target is.
    output_text = (example["output"] or "").strip()
    if not output_text:
        output_text = "O"

    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are an Armenian AI assistant for NER tasks."}]},
        {"role": "user", "content": [{"type": "text", "text": example["input"]}]},
        {"role": "assistant", "content": [{"type": "text", "text": output_text}]},
    ]

    processed = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=False,  # the assistant turn is already in `messages`
        max_length=max_input_length + max_target_length,
        padding="max_length",
        truncation=True,
        return_dict=True,
        return_tensors=None
    )

    # The processor output is indexed as a batch; this call holds a single
    # conversation, so take row 0.
    input_ids = processed["input_ids"][0]
    attention_mask = processed["attention_mask"][0]

    # -100 marks positions to exclude from the loss: every padded position...
    labels = [tok if mask == 1 else -100 for tok, mask in zip(input_ids, attention_mask)]
    # ...and always the last position.
    # NOTE(review): the reason for masking the final token is not visible in
    # this notebook -- confirm it is intended.
    if len(labels) > 1:
        labels[-1] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Tokenize every split one record at a time and drop the original columns, so
# only the fields returned by `preprocess` remain.
tokenized_dataset = {
    split_name: split_ds.map(
        preprocess,
        batched=False,
        remove_columns=split_ds.column_names,
    )
    for split_name, split_ds in dataset.items()
}

Map:   0%|          | 0/3650 [00:00<?, ? examples/s]

Map:   0%|          | 0/512 [00:00<?, ? examples/s]

Map:   0%|          | 0/535 [00:00<?, ? examples/s]

## Load Trainer

In [7]:
from transformers import TrainingArguments

# Training configuration for the fine-tuning run.
# NOTE(review): evaluation is scheduled every 10 steps while logging happens
# every 50 and checkpoints every 200 -- confirm eval_steps=10 is intended, as
# each evaluation walks the whole validation split.
training_args = TrainingArguments(
    output_dir="./gemma3_ner",
    # Batching: 1 sequence per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=5e-5,
    # Checkpointing / evaluation / logging cadence (all step-based).
    save_strategy="steps",
    save_steps=200,
    eval_strategy="steps",
    eval_steps=10,
    logging_steps=50,
    # Mixed precision is disabled for both fp16 and bf16.
    fp16=False,
    bf16=False,
    # Limit the number of checkpoints kept on disk and reload the best one
    # when training finishes.
    save_total_limit=2,
    load_best_model_at_end=True,
)

In [8]:
from transformers import Trainer

# Build the Trainer on the already-loaded model and the tokenized splits.
# `processing_class` replaces the deprecated `tokenizer` argument; passing the
# processor here means it is saved together with the model checkpoints.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    processing_class=processor,  # processor handles tokenization
)

  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
# Start fine-tuning with the Trainer built above.
# NOTE(review): the recorded run of this cell failed with
# "CUDA error: no kernel image is available for execution on the device".
# The model-loading cell's output reports GPUs of CUDA capability 5.2 while the
# installed PyTorch supports 7.0-12.0, so this presumably needs a compatible
# PyTorch build or different GPUs before it can run -- confirm.
trainer.train()


AcceleratorError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Single destination for the metrics file, the model weights and the processor.
FINAL_MODEL_DIR = "./gemma3_ner_prototype"

# The metrics file is written before `save_model` runs, so the directory has to
# be created explicitly; otherwise `open(...)` fails on a fresh run.
os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

# Evaluate on the Trainer's eval dataset and keep a copy of the metrics on disk.
metrics = trainer.evaluate()
print(metrics)
with open(os.path.join(FINAL_MODEL_DIR, "metrics.json"), "w") as f:
    json.dump(metrics, f, indent=4)

# Persist the fine-tuned model and the processor side by side so the directory
# can be reloaded on its own.
trainer.save_model(FINAL_MODEL_DIR)
processor.save_pretrained(FINAL_MODEL_DIR)