In [None]:
## Import all necessary library
import torch
import pandas as pd
import numpy as nn
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer,BitsAndBytesConfig,DataCollatorForLanguageModeling
    )
from peft import LoraConfig, get_peft_model, PeftModel

## AutoModelforCausalLM : This is a library by higgingface to load any open source language model
## AutoTokenizer : Pre trained Tokenized Vocab list
## TrainingArguments : Hyper parameter setup
## Trainer : initiate the finetune process
## BitsAndBytesCongig : To configure Large language model quantization
## DataCollatorForLanguageModeling : This simpify traineing by creating batch, padding and labels for the input data
## LoraConfig : to configure Low Rank Adaption by injecting adapters and freezing the main layers
## get_peft_model : Adding adapter layers
## PeftModel : Save and retrive the peft model

In [None]:
## This line of the code will check the model output before finetuning with our own custom dataset.

# Load the model and tokenizer
model_name = "meta-llama/Llama-3.1-8B"

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)



config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [None]:
# Example input for testing from the finetuned dataset
input_text = "What significant invention did NVIDIA create in 1999?"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate a response
outputs = model.generate(**inputs)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What significant invention did NVIDIA create in 1999? - Computer Studies
In 1999, NVIDIA created the GeForce 256, the first GPU (


In [None]:
# Example input for testing from the finetuned dataset
input_text = "Where can NVIDIA's financial reports be accessed?"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate a response
outputs = model.generate(**inputs)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Where can NVIDIA's financial reports be accessed? 
NVIDIA's financial reports can be accessed via the Financial Reports section of our Investor Relations website.


In [None]:
# Example input for testing from the finetuned dataset
input_text = "What was the main reason for the termination of NVIDIA's Share Purchase Agreement with SoftBank?"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate a response
outputs = model.generate(**inputs)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What was the main reason for the termination of NVIDIA's Share Purchase Agreement with SoftBank? The answer is the COVID-19 pandemic. NVIDIA’s revenue is heavily dependent on the gaming market.


In [None]:
# Example input for testing from the finetuned dataset
input_text = "How much did NVIDIA record as an acquisition termination cost in fiscal year 2023 related to the Arm Share Purchase Agreement?"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate a response
outputs = model.generate(**inputs)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


How much did NVIDIA record as an acquisition termination cost in fiscal year 2023 related to the Arm Share Purchase Agreement? In the fiscal year 2023, NVIDIA recorded a $1.25 billion acquisition termination cost related


In [None]:
import pandas as pd

# Load the .parquet file
df = pd.read_parquet("/content/train-00000-of-00001.parquet")
print(df.head())  # Display the first few rows


                                            question  \
0  What area did NVIDIA initially focus on before...   
1  What are some of the recent applications of GP...   
2  What significant invention did NVIDIA create i...   
3  How does NVIDIA's platform strategy contribute...   
4  What does NVIDIA's CUDA programming model enable?   

                                              answer  \
0           NVIDIA initially focused on PC graphics.   
1  Recent applications of GPU-powered deep learni...   
2                   NVIDIA invented the GPU in 1999.   
3  NVIDIA's platform strategy brings together har...   
4  NVIDIA's CUDA programming model opened the par...   

                                             context ticker    filing  
0  Since our original focus on PC graphics, we ha...   NVDA  2023_10K  
1  Some of the most recent applications of GPU-po...   NVDA  2023_10K  
2  Our invention of the GPU in 1999 defined moder...   NVDA  2023_10K  
3  NVIDIA has a platform strategy, bri

In [None]:
# Print the columns of the DataFrame
print(df.columns)
# Check the columns and their data types
print(df.dtypes)


Index(['question', 'answer', 'context', 'ticker', 'filing'], dtype='object')
question    object
answer      object
context     object
ticker      object
filing      object
dtype: object


In [None]:
## Input data text pre processing

# Step 1: Create the input_text column (Context + Question)
df['input_text'] = 'Context: ' + df['context'] + ' Question: ' + df['question']

# Step 2: Create the label column (Answer)
df['label'] = df['answer']

# Step 3: Keep only the columns we need
df = df[['input_text', 'label']]

print(df)

                                             input_text  \
0     Context: Since our original focus on PC graphi...   
1     Context: Some of the most recent applications ...   
2     Context: Our invention of the GPU in 1999 defi...   
3     Context: NVIDIA has a platform strategy, bring...   
4     Context: With our introduction of the CUDA pro...   
...                                                 ...   
6995  Context: The 5.400% Senior Notes due in 2028 h...   
6996  Context: On January 30, 2023, LVSC entered int...   
6997  Context: Following the downgrades, each series...   
6998  Context: The amended and restated facility agr...   
6999  Context: As of December 31, 2023, SGD 3.69 bil...   

                                                  label  
0              NVIDIA initially focused on PC graphics.  
1     Recent applications of GPU-powered deep learni...  
2                      NVIDIA invented the GPU in 1999.  
3     NVIDIA's platform strategy brings together har...  
4

In [None]:
import pandas as pd

pd.set_option('display.max_colwidth', None)

print(df.iloc[0])

input_text    Context: Since our original focus on PC graphics, we have expanded to several other large and important computationally intensive fields. Question: What area did NVIDIA initially focus on before expanding to other computationally intensive fields?
label                                                                                                                                                                                                                        NVIDIA initially focused on PC graphics.
Name: 0, dtype: object


In [None]:
df.to_csv("finetune_dataset.csv", index=False)
## Saving the pre processed data locally


In [None]:
# Model / dataset
MODEL_NAME   = "meta-llama/Llama-3.1-8B-Instruct"      # change to your base model
DATASET_SPEC = "/content/train-00000-of-00001.parquet"  # HF repo id OR local path OR glob
OUTPUT_DIR   = "./output_adapter"              # where PEFT adapter checkpoints go

# Hugging Face Hub target (merged model + tokenizer)
HF_REPO_ID   = "iamAbhishek01/llama-finance-finetuned"  # change this

# Splits
VAL_FRAC  = 0.10
TEST_FRAC = 0.05
SEED      = 42

# Tokenization & training
MAX_LENGTH       = 512
EPOCHS           = 3
BATCH_SIZE       = 4
GRAD_ACCUM       = 1
LEARNING_RATE    = 2e-4
MAX_NEW_TOKENS   = 128          # for post-training generation eval
EVAL_BATCH_SIZE  = 8

# Weights & Biases
WANDB_PROJECT = "llama-finance-finetune_test1"
WANDB_ENTITY  = None
RUN_NAME      = ""


In [None]:
import os, glob
from typing import Tuple, Dict, Any
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer

# ---------------------------
# Spec detection (HF id, file, dir, glob)
# ---------------------------
def _is_path_like(s: str) -> bool:
    return any(p in s for p in ("/", "\\", "*", ".parquet", ".csv", ".json"))

def _detect_loader(spec: str) -> Tuple[str, Dict[str, Any]]:
    if not _is_path_like(spec):
        # HF hub dataset id
        return spec, {}

    # local path(s)
    def infer_ext(p):
        for ext in ("parquet", "csv", "json"):
            if p.endswith(f".{ext}"):
                return ext
        return None

    if os.path.isdir(spec):
        for ext in ("parquet", "csv", "json"):
            files = glob.glob(os.path.join(spec, f"*.{ext}"))
            if files:
                return ext, {"data_files": {"train": files}}
        raise FileNotFoundError(f"No parquet/csv/json files found in dir: {spec}")

    if "*" in spec:
        matches = glob.glob(spec)
        if not matches:
            raise FileNotFoundError(f"No files match glob: {spec}")
        ext = infer_ext(matches[0])
        if not ext:
            raise ValueError(f"Unsupported file type: {matches[0]}")
        return ext, {"data_files": {"train": matches}}

    ext = infer_ext(spec)
    if not ext:
        raise ValueError(f"Unsupported file type: {spec}")
    return ext, {"data_files": {"train": spec}}

# ---------------------------
# Load + split
# ---------------------------
def load_and_split(dataset_spec: str, val_frac=0.1, test_frac=0.05, seed=42):
    """
    Loads a dataset from HF Hub or local files and returns (train, val, test) Datasets.
    Prefers columns: context, question, answer.
    Falls back to input_text/label, then to a single text column.
    """
    loader, kwargs = _detect_loader(dataset_spec)
    raw = load_dataset(loader, split="train", **kwargs) if kwargs else load_dataset(loader, split="train")

    df = raw.to_pandas()

    # Canonicalize columns
    cols = {c.lower(): c for c in df.columns}
    has_cqa = all(k in cols for k in ("context", "question", "answer"))
    has_il = all(k in cols for k in ("input_text", "label"))

    if has_cqa:
        ctx = cols["context"]; q = cols["question"]; a = cols["answer"]
        df["context"]  = df[ctx].astype(str)
        df["question"] = df[q].astype(str)
        df["answer"]   = df[a].astype(str)
    elif has_il:
        it = cols["input_text"]; lb = cols["label"]
        # If user already built input_text/label, keep them, but also expose context/question/answer placeholders
        df["context"]  = ""
        df["question"] = df[it].astype(str)  # treat whole input_text as a 'question' block
        df["answer"]   = df[lb].astype(str)
    else:
        # last resort: single first column duplicated
        col = df.columns[0]
        df["context"]  = ""
        df["question"] = df[col].astype(str)
        df["answer"]   = df[col].astype(str)

    dataset = Dataset.from_pandas(df[["context", "question", "answer"]])

    frac = val_frac + test_frac
    if not (0 < frac < 0.9):
        raise ValueError("VAL_FRAC + TEST_FRAC must be between 0 and 0.9")

    split = dataset.train_test_split(test_size=frac, seed=seed, shuffle=True)
    train_ds, testval = split["train"], split["test"]
    val_prop = val_frac / frac if frac > 0 else 0.0
    tv = testval.train_test_split(test_size=(1 - val_prop), seed=seed, shuffle=True)
    val_ds, test_ds = tv["train"], tv["test"]
    return train_ds, val_ds, test_ds

# ---------------------------
# Tokenization for LLaMA chat SFT with masking
# ---------------------------
def make_tokenizer(model_id: str):
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    return tok

def tokenize_for_llama_chat(dataset: Dataset, tokenizer, max_length=1024):
    """
    Builds a chat-style prompt:
      user: "Context: ...\nQuestion: ..."
      assistant: "<answer>"
    Uses tokenizer.apply_chat_template and masks prompt tokens to -100.
    """
    def _encode(ex):
        msgs = [
            {"role": "user", "content": f"Context: {ex['context']}\nQuestion: {ex['question']}"},
            {"role": "assistant", "content": ex["answer"]},
        ]

        # full prompt + answer ids
        full = tokenizer.apply_chat_template(
            msgs, tokenize=True, add_generation_prompt=False, return_tensors=None
        )
        # prompt-only ids (stop before assistant content)
        prompt_only = tokenizer.apply_chat_template(
            msgs[:-1], tokenize=True, add_generation_prompt=True, return_tensors=None
        )

        input_ids = full[:max_length]
        prompt_len = min(len(prompt_only), len(input_ids))
        labels = [-100] * prompt_len + input_ids[prompt_len:]
        attention_mask = [1] * len(input_ids)

        # pad if needed
        if len(input_ids) < max_length:
            pad_id = tokenizer.pad_token_id
            pad_len = max_length - len(input_ids)
            input_ids += [pad_id] * pad_len
            attention_mask += [0] * pad_len
            labels += [-100] * pad_len

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

    return dataset.map(_encode, remove_columns=dataset.column_names)


In [62]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import (
    LoraConfig, get_peft_model, PeftModel,
    prepare_model_for_kbit_training
)

def load_tokenizer(model_name: str):
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "right"
    return tok

def _prefer_bf16() -> bool:
    # Ampere+ (sm>=80) supports bf16
    return torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

def load_quantized_model(model_name: str, use_sdpa: bool = True):
    """
    Load 4-bit NF4 model. Force SDPA to avoid FlashAttention issues in Colab.
    """
    dtype = torch.bfloat16 if _prefer_bf16() else torch.float16
    bnb = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=dtype,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        device_map="auto",
        attn_implementation=("sdpa" if use_sdpa else "eager"),
    )
    if hasattr(model.config, "use_cache"):
        model.config.use_cache = False
    return model

def apply_lora(
    model,
    r=16,
    alpha=32,
    target_modules=None,
    dropout=0.05,
    add_mlp=False,
    use_gradient_checkpointing=True,
):
    """
    Prepare 4-bit model for training, then wrap with LoRA.
    Default targets q/k/v/o. Set add_mlp=True to also adapt gate/up/down.
    """
    # 1) Prepare for k-bit training so grads flow
    model = prepare_model_for_kbit_training(
        model,
        use_gradient_checkpointing=use_gradient_checkpointing
    )
    if use_gradient_checkpointing:
        # extra safety for some builds
        try:
            model.gradient_checkpointing_enable()
        except Exception:
            pass
    if hasattr(model.config, "use_cache"):
        model.config.use_cache = False

    # 2) Choose LoRA targets
    if target_modules is None:
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
        if add_mlp:
            target_modules += ["gate_proj", "up_proj", "down_proj"]

    cfg = LoraConfig(
        r=r,
        lora_alpha=alpha,
        target_modules=target_modules,
        lora_dropout=dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, cfg)
    model.print_trainable_parameters()
    return model

def merge_and_save(
    base_model_name: str,
    adapter_dir: str,
    out_dir: str = "merged_model",
    push_to_hub: bool = False,
    hf_repo_id: str | None = None,
    tokenizer=None,
    commit_message: str = "merge LoRA",
    save_safetensors: bool = True,
):
    """
    Load the base in bf16/fp16, merge LoRA, save locally, optionally push.
    """
    dtype = torch.bfloat16 if _prefer_bf16() else torch.float16
    base = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        device_map="auto",
    )
    peft_model = PeftModel.from_pretrained(base, adapter_dir, from_transformers=True)
    merged = peft_model.merge_and_unload()

    merged.save_pretrained(out_dir, safe_serialization=save_safetensors)
    if tokenizer is not None:
        tokenizer.save_pretrained(out_dir)

    if push_to_hub:
        assert hf_repo_id, "Provide hf_repo_id when push_to_hub=True"
        merged.push_to_hub(hf_repo_id, commit_message=commit_message)
        if tokenizer is not None:
            tokenizer.push_to_hub(hf_repo_id)

    return out_dir


In [None]:
import math
import re
from typing import List
import evaluate

rouge_metric = evaluate.load("rouge")
berts_metric = evaluate.load("bertscore")

def perplexity_from_loss(loss: float):
    return math.exp(loss) if loss is not None else None

# Simple SQuAD-style normalization
_ARTICLES = {"a", "an", "the"}
def _normalize_text(s: str) -> str:
    s = s.lower().strip()
    s = re.sub(r"\s+", " ", s)                    # collapse spaces
    s = re.sub(r"[^a-z0-9\s]", "", s)             # drop punctuation (rough but effective)
    tokens = [t for t in s.split() if t not in _ARTICLES]
    return " ".join(tokens)

def _prep_pairs(preds: List[str], refs: List[str]):
    assert len(preds) == len(refs), "preds and refs must be same length"
    preds = [p if isinstance(p, str) else "" for p in preds]
    refs  = [r if isinstance(r, str) else "" for r in refs]
    return preds, refs

def compute_rouge(preds: List[str], refs: List[str]):
    preds, refs = _prep_pairs(preds, refs)
    preds_n = [_normalize_text(p) for p in preds]
    refs_n  = [_normalize_text(r) for r in refs]
    if not preds_n:  # empty list
        return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0, "rougeLsum": 0.0}
    return rouge_metric.compute(
        predictions=preds_n,
        references=refs_n,
        use_stemmer=True
    )

def compute_bertscore(
    preds: List[str],
    refs: List[str],
    lang: str = "en",
    model_type: str = "roberta-large",
    use_idf: bool = True,
):
    preds, refs = _prep_pairs(preds, refs)
    if not preds:
        return 0.0
    res = berts_metric.compute(
        predictions=preds,
        references=refs,
        lang=lang,
        model_type=model_type,
        idf=use_idf,
        rescale_with_baseline=True,
    )
    f1s = res.get("f1", [])
    return float(sum(f1s) / len(f1s)) if f1s else 0.0


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# ===
import torch
import wandb
from typing import List, Tuple
from transformers import (
    Trainer, TrainingArguments, default_data_collator, TrainerCallback
)

# import your metrics from the other cell/module
# that cell should define: perplexity_from_loss, compute_rouge, compute_bertscore
#from metrics import perplexity_from_loss, compute_rouge, compute_bertscore


class PerplexityLogger(TrainerCallback):
    """Logs eval_loss and perplexity to W&B at end of each evaluation (epoch)."""
    def on_evaluate(self, args, state, control, **kwargs):
        metrics = kwargs.get("metrics", {})
        ev_loss = metrics.get("eval_loss")
        if ev_loss is not None:
            ppl = perplexity_from_loss(ev_loss)
            print(f"[eval] loss={ev_loss:.4f}  ppl={ppl:.2f}")
            if wandb.run is not None:
                wandb.log({"eval_loss": ev_loss, "perplexity": ppl, "epoch": state.epoch})


def create_trainer(
    model, tokenizer, train_dataset, eval_dataset, output_dir,
    epochs, lr, batch_size, grad_accum=1, run_name=None, use_bf16=True
):
    args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=grad_accum,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        learning_rate=lr,
        lr_scheduler_type="cosine",
        warmup_ratio=0.03,
        weight_decay=0.05,
        logging_strategy="steps",
        logging_steps=100,
        report_to=("wandb" if wandb.run is not None else "none"),
        run_name=run_name,
        push_to_hub=False,
        remove_unused_columns=False,       # keep pre-tokenized columns
        gradient_checkpointing=True,
        bf16=use_bf16 and torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8,
        fp16=(not use_bf16) and torch.cuda.is_available(),
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=default_data_collator,
        compute_metrics=None
    )
    trainer.add_callback(PerplexityLogger())
    return trainer


@torch.no_grad()
def _batch_generate_chat(model, tokenizer, user_msgs: List[str], max_new_tokens=128, device=None) -> List[str]:
    """
    Build prompts via the model's chat template and return only the assistant continuations.
    """
    if device is None:
        device = next(model.parameters()).device

    # Build prompt ids per message
    prompt_id_seqs = []
    for m in user_msgs:
        msgs = [{"role": "user", "content": m}]
        ids = tokenizer.apply_chat_template(
            msgs, add_generation_prompt=True, tokenize=True, return_tensors="pt"
        ).squeeze(0)
        prompt_id_seqs.append(ids)

    # Pad & stack
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
    input_ids = torch.nn.utils.rnn.pad_sequence(prompt_id_seqs, batch_first=True, padding_value=pad_id).to(device)
    attention_mask = (input_ids != pad_id).long().to(device)

    gen = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Slice off the prompt portion to keep only the continuation
    outs = []
    for i, pref in enumerate(prompt_id_seqs):
        pref_len = pref.numel()
        continuation_ids = gen[i, pref_len:]
        outs.append(tokenizer.decode(continuation_ids, skip_special_tokens=True).strip())
    return outs


def run_post_training_eval(
    model, tokenizer, dataset, max_new_tokens=128, batch_size=8
) -> Tuple[List[str], dict]:
    """
    Expects dataset with columns: 'context', 'question', 'answer'.
    Builds user messages as:
        "Context: ...\nQuestion: ..."
    Generates answers and computes ROUGE + BERTScore using your metrics cell.
    """
    model.eval()
    preds, refs = [], []

    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i+batch_size]
        user_msgs = [f"Context: {x['context']}\nQuestion: {x['question']}" for x in batch]
        gens = _batch_generate_chat(model, tokenizer, user_msgs, max_new_tokens=max_new_tokens)
        preds.extend(gens)
        refs.extend([x["answer"] for x in batch])

    rouge = compute_rouge(preds, refs)
    bsf1  = compute_bertscore(preds, refs)

    metrics = {
        "rouge1": rouge.get("rouge1"),
        "rouge2": rouge.get("rouge2"),
        "rougeL": rouge.get("rougeL"),
        "bertscore_f1": bsf1,
        "n_samples": len(preds),
    }
    return preds, metrics


In [63]:
import os
import wandb
from transformers import set_seed

# from config import *  # make sure MODEL_NAME, DATASET_SPEC, etc. are defined
# from data_utils import load_and_split, make_tokenizer, tokenize_for_llama_chat
# from model_utils import load_quantized_model, apply_lora, merge_and_save
# from trainer_utils import create_trainer, run_post_training_eval

def main():
    # --- Repro ---
    set_seed(SEED)

    # --- W&B login/init (safe) ---
    wandb_enabled = False
    try:
        api = os.environ.get("WANDB_API_KEY")
        if api:
            wandb.login(key=api)
        else:
            wandb.login()
        wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY, name=RUN_NAME, reinit=True)
        wandb_enabled = True
    except Exception:
        print("W&B disabled (no/invalid key).")
        os.environ["WANDB_MODE"] = "disabled"

    # --- Data ---
    print("Loading dataset...")
    train_ds, val_ds, test_ds = load_and_split(DATASET_SPEC, VAL_FRAC, TEST_FRAC, seed=SEED)

    print("Loading tokenizer...")
    tokenizer = make_tokenizer(MODEL_NAME)  # sets pad_token, padding_side

    print("Tokenizing train/val with chat template + masking...")
    train_tok = tokenize_for_llama_chat(train_ds, tokenizer, max_length=MAX_LENGTH)
    val_tok   = tokenize_for_llama_chat(val_ds, tokenizer, max_length=MAX_LENGTH)

    # --- Model ---
    print("Loading quantized model...")
    model = load_quantized_model(MODEL_NAME)
    # recommended for grad checkpointing stability
    if hasattr(model.config, "use_cache"):
        model.config.use_cache = False

    print("Applying LoRA...")
    model = apply_lora(model)

    # --- Trainer (loss + ppl during training) ---
    print("Creating Trainer...")
    trainer = create_trainer(
        model, tokenizer,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        output_dir=OUTPUT_DIR,
        epochs=EPOCHS,
        lr=LEARNING_RATE,
        batch_size=BATCH_SIZE,
        grad_accum=GRAD_ACCUM,
        run_name=RUN_NAME
    )

    print("Training...")
    trainer.train()

    # Ensure final adapter checkpoint saved
    try:
        trainer.save_model(OUTPUT_DIR)  # saves PEFT adapter weights
    except Exception as e:
        print("save_model warning:", e)

    # --- Post-training eval: ROUGE & BERTScore on raw val/test (generation) ---
    print("Post-training evaluation on validation...")
    _, val_metrics = run_post_training_eval(
        model, tokenizer, val_ds,
        max_new_tokens=MAX_NEW_TOKENS, batch_size=EVAL_BATCH_SIZE
    )
    print("Validation metrics:", val_metrics)
    if wandb_enabled:
        wandb.log({f"post/val_{k}": v for k, v in val_metrics.items()})

    print("Post-training evaluation on test...")
    _, test_metrics = run_post_training_eval(
        model, tokenizer, test_ds,
        max_new_tokens=MAX_NEW_TOKENS, batch_size=EVAL_BATCH_SIZE
    )
    print("Test metrics:", test_metrics)
    if wandb_enabled:
        wandb.log({f"post/test_{k}": v for k, v in test_metrics.items()})

    # --- Merge & (optionally) push ---
    print("Merging LoRA adapter into base model...")
    merged_dir = merge_and_save(
        base_model_name=MODEL_NAME,
        adapter_dir=OUTPUT_DIR,
        out_dir="merged_model",
        push_to_hub=False,
        hf_repo_id=HF_REPO_ID,
        tokenizer=tokenizer,
        commit_message="merge LoRA + push tokenizer"
    )
    print("Merged model saved to:", merged_dir)

    if wandb_enabled:
        try:
            wandb.finish()
        except Exception:
            pass

if __name__ == "__main__":
    main()




Loading dataset...
Loading tokenizer...
Tokenizing train/val with chat template + masking...


Map:   0%|          | 0/5949 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Loading quantized model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Applying LoRA...
trainable params: 13,631,488 || all params: 8,043,892,736 || trainable%: 0.1695
Creating Trainer...
Training...


Epoch,Training Loss,Validation Loss
1,0.4516,0.498204
2,0.3391,0.524135
3,0.126,0.647145


[eval] loss=0.4982  ppl=1.65
[eval] loss=0.5241  ppl=1.69
[eval] loss=0.6471  ppl=1.91
Post-training evaluation on validation...


TypeError: string indices must be integers, not 'str'

In [67]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

BASE_ID = "meta-llama/Llama-3.1-8B-Instruct"
ADAPTER_DIR = "/content/output_adapter/checkpoint-2976"

tok = AutoTokenizer.from_pretrained(BASE_ID, use_fast=True)
if tok.pad_token is None: tok.pad_token = tok.eos_token

dtype = torch.bfloat16 if (torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8) else torch.float16
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=dtype,
)

base = AutoModelForCausalLM.from_pretrained(
    BASE_ID,
    quantization_config=bnb,
    device_map="auto",
    attn_implementation="sdpa",
    low_cpu_mem_usage=True,
)
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
model.eval()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [69]:
import torch

def chat_with_model(question, context=None, max_new_tokens=200):
    # Create chat-style prompt
    messages = []
    if context:
        messages.append({"role": "system", "content": context})
    messages.append({"role": "user", "content": question})

    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tok(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tok.eos_token_id
        )

    answer = tok.decode(outputs[0], skip_special_tokens=True)
    return answer


print(chat_with_model("What significant invention did NVIDIA create in 1999?"))


system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

What significant invention did NVIDIA create in 1999?assistant

In 1999, NVIDIA released the GeForce 256, which was the first consumer-grade graphics processing unit (GPU) to introduce a significant increase in performance and features.


In [70]:
import torch

def chat_with_model(question, context=None, max_new_tokens=200):

    messages = []
    if context:
        messages.append({"role": "system", "content": context})
    messages.append({"role": "user", "content": question})

    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tok(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tok.eos_token_id
        )

    answer = tok.decode(outputs[0], skip_special_tokens=True)
    return answer


print(chat_with_model("Where can NVIDIA's financial reports be accessed?"))


system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

Where can NVIDIA's financial reports be accessed?assistant

NVIDIA's financial reports can be accessed through their website and on the EDGAR database on the Securities and Exchange Commission's website.


In [71]:
print(chat_with_model("How much did NVIDIA record as an acquisition termination cost in fiscal year 2023 related to the Arm Share Purchase Agreement?"))

system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

How much did NVIDIA record as an acquisition termination cost in fiscal year 2023 related to the Arm Share Purchase Agreement?assistant

In fiscal year 2023, NVIDIA recorded an acquisition termination cost of $1,500 million related to the Arm Share Purchase Agreement.


In [78]:
## ------------------------------------------
## This lines of codes used form ChatGPT
## ------------------------------------------

import os
from huggingface_hub import HfApi, create_repo, login
from huggingface_hub.errors import RepositoryNotFoundError, HfHubHTTPError

# 0) Paste a REAL fine-grained token with Models: Read + Write (Create optional)
TOKEN =
assert TOKEN.startswith("hf_"), "Paste the actual token string, not its label."

# 1) Kill stale env tokens that Colab might be using
for k in ["HF_TOKEN", "HUGGINGFACE_HUB_TOKEN"]:
    os.environ.pop(k, None)
os.environ["HUGGINGFACE_HUB_TOKEN"] = TOKEN


login(token=TOKEN, add_to_git_credential=False)

api = HfApi()
print("whoami:", api.whoami(token=TOKEN))


ADAPTER_DIR = "/content/output_adapter/checkpoint-2976"       # must contain adapter_model.safetensors
REPO_ID     = "iamAbhishek01/llama-finance-adapter"           # exact name on your account

# quick local check
print("local files:", os.listdir(ADAPTER_DIR))

# 4) Ensure the repo exists (if your token lacks Create, pre-create it on the website once)
try:
    create_repo(repo_id=REPO_ID, repo_type="model", private=True, exist_ok=True, token=TOKEN)
    print("repo ensured:", REPO_ID)
except Exception as e:
    print("create_repo failed (likely no Create permission). If so, create it in the UI:", e)

# verify repo exists and you have WRITE
try:
    info = api.repo_info(REPO_ID, repo_type="model", token=TOKEN)
    print("repo exists, permission:", getattr(info, "permission", "unknown"))
except RepositoryNotFoundError:
    raise SystemExit(f"Repo {REPO_ID} does not exist. Create it at https://huggingface.co/new-model and rerun.")
except HfHubHTTPError as e:
    raise SystemExit(f"Repo exists but token lacks access. Check permissions on {REPO_ID}: {e}")

# 5) Upload, PASS THE TOKEN EXPLICITLY so it cannot fall back to a read-only one
api.upload_folder(
    folder_path=ADAPTER_DIR,
    repo_id=REPO_ID,
    repo_type="model",
    allow_patterns=["adapter_*","*.json","README.md","LICENSE*"],
    token=TOKEN,
)
print("Upload complete -> https://huggingface.co/" + REPO_ID)


whoami: {'type': 'user', 'id': '6735f3017e5caf2c6177235e', 'name': 'iamAbhishek01', 'fullname': 'Abhishek Das', 'email': 'dummy1cx@gmail.com', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/687f3443c12597972b1076e6a0c33054.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'LLaMA_Finetune', 'role': 'write', 'createdAt': '2025-08-12T16:27:25.444Z'}}}
local files: ['rng_state.pth', 'training_args.bin', 'README.md', 'optimizer.pt', 'adapter_config.json', 'scheduler.pt', 'trainer_state.json', 'adapter_model.safetensors']
repo ensured: iamAbhishek01/llama-finance-adapter
repo exists, permission: unknown


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...oint-2976/adapter_model.safetensors:   1%|1         |  555kB / 54.6MB            

✅ Upload complete -> https://huggingface.co/iamAbhishek01/llama-finance-adapter
