# Data download & loading

In [None]:
# Authenticate this runtime with the Hugging Face Hub.
# NOTE(review): presumably required to download the meta-llama checkpoint used later — confirm.
from huggingface_hub import login
# new_session=False: per the huggingface_hub docs this skips the token prompt when a
# token is already saved on the machine — confirm for the installed version.
login(new_session = False)

In [None]:
!pip install -U bitsandbytes trl peft

In [None]:
import os
import torch
import math

from datasets import load_dataset, load_from_disk, DatasetDict
from tqdm import tqdm
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, PeftModel
from peft.utils.other import prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig

In [None]:
# Mount Google Drive: the dataset copy, checkpoints and adapter below all live under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Drive location of the local PokerBench copy.
DATASET_DIR = '/content/drive/MyDrive/Fall2025/CSCI_544/Project/dataset'

if os.path.isdir(DATASET_DIR):
    # A previous run already saved the dataset: reuse it instead of downloading
    # and re-writing it. Delete the directory to force a fresh download.
    dataset = load_from_disk(DATASET_DIR)
else:
    # First run: fetch from the Hugging Face Hub and keep a copy on Drive.
    dataset = load_dataset("RZ412/PokerBench")
    dataset.save_to_disk(DATASET_DIR)

README.md: 0.00B [00:00, ?B/s]

postflop_500k_train_set_prompt_and_label(…):   0%|          | 0.00/561M [00:00<?, ?B/s]

preflop_60k_train_set_prompt_and_label.j(…):   0%|          | 0.00/59.2M [00:00<?, ?B/s]

postflop_10k_test_set_prompt_and_label.j(…):   0%|          | 0.00/11.2M [00:00<?, ?B/s]

(…)reflop_1k_test_set_prompt_and_label.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/563200 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11000 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/563200 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11000 [00:00<?, ? examples/s]

In [None]:
# Load the PokerBench copy previously written to Drive with save_to_disk.
dataset = load_from_disk("/content/drive/MyDrive/Fall2025/CSCI_544/Project/dataset")

In [None]:
# Per-street test-set files on Drive, keyed by the split name they are loaded under.
split_files = dict(
    preflop="/content/drive/MyDrive/Fall2025/CSCI_544/Project/split_test_dataset/preflop/preflop_1k_test_set_prompt_and_label.json",
    postflop="/content/drive/MyDrive/Fall2025/CSCI_544/Project/split_test_dataset/postflop/postflop_10k_test_set_prompt_and_label.json",
)

# Each key of `split_files` becomes one split of the loaded dataset.
split_dataset = load_dataset("json", data_files=split_files)

Generating preflop split: 0 examples [00:00, ? examples/s]

Generating postflop split: 0 examples [00:00, ? examples/s]

# Preparing the Data for the LLM
## Convert to an instruction-tuning template

In [None]:
def to_sft_text(ex):
  """Render one example as a single instruction/response training string.

  Reads the ``instruction`` and ``output`` fields of ``ex``, strips surrounding
  whitespace from each, and returns ``{"text": ...}`` using the
  ``### Instruction:`` / ``### Response:`` template.
  """
  instruction = ex['instruction'].strip()
  response = ex['output'].strip()
  rendered = "### Instruction:\n" + instruction + "\n\n### Response:\n" + response
  return {"text": rendered}

In [None]:
# Apply the instruction/response template to every split, keeping only the
# column(s) produced by to_sft_text (all original columns are dropped).
dataset_proc = DatasetDict()

sft_sources = {
    'train': dataset['train'],
    'test': dataset['test'],
    'preflop_test': split_dataset['preflop'],
    'postflop_test': split_dataset['postflop'],
}

for sft_split_name, sft_split in sft_sources.items():
    dataset_proc[sft_split_name] = sft_split.map(
        to_sft_text, remove_columns=sft_split.column_names
    )

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

# Train & Save Model

In [None]:
# Base checkpoint to fine-tune and the sequence-length budget for this experiment
MODEL_ID = 'meta-llama/Llama-3.2-1B-Instruct'
MAX_SEQ_LEN = 512

# Tokenizer: fall back to EOS as the padding token when none is defined
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; compute dtype is fp16
quant_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
bnb = BitsAndBytesConfig(**quant_options)

# Quantized base model; device_map='auto' leaves device placement to the library
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map='auto', quantization_config=bnb
)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
# QLoRA: attach rank-32 LoRA adapters to the attention and MLP projections
lora_target_modules = [
    'q_proj', 'k_proj', 'v_proj', 'o_proj',   # attention projections
    'gate_proj', 'up_proj', 'down_proj',      # MLP projections
]

lora_config = LoraConfig(
    task_type='CAUSAL_LM',
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    target_modules=lora_target_modules,
)

# Prepare the quantized model for training, then wrap it with the adapters.
# NOTE: this cell rebinds `model`; run it once per freshly loaded base model.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

In [None]:
from transformers.trainer_utils import get_last_checkpoint

In [None]:
# Training configuration for supervised fine-tuning of the LoRA-wrapped model.
args = SFTConfig(
    # Checkpoints are written to Drive.
    output_dir="/content/drive/MyDrive/Fall2025/CSCI_544/Project/fine_tuned_llama",
    # 4 sequences per device x 8 accumulation steps = 32 sequences per optimizer step per device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    warmup_ratio=0.05,
    max_steps=18000,
    # fp16 mixed precision; bf16 explicitly disabled.
    bf16=False, fp16=True,
    gradient_checkpointing=True,
    lr_scheduler_type="cosine",
    # Log every 10 steps; evaluate and checkpoint every 200 steps.
    logging_steps=10, save_steps=200, eval_steps=200,
    save_total_limit=2, report_to="none",
    optim="paged_adamw_8bit",
    # NOTE(review): the TRL docs describe completion-only loss as applying to
    # prompt/completion datasets, while dataset_proc only has a "text" column —
    # confirm whether the loss is actually restricted to the response here.
    # NOTE(review): no max length is passed, so MAX_SEQ_LEN defined earlier is
    # not applied by this config — confirm this is intended.
    completion_only_loss = True,

    # Early stopping / best-checkpoint selection: track eval_loss (lower is better)
    # on the same step schedule as checkpoint saving.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    eval_strategy="steps",
    save_strategy="steps",
  )

# NOTE(review): eval_dataset is the 'test' split, which is also the split scored
# in the evaluation cells below, so early stopping and best-model selection see
# the reported test data — consider a separate validation split.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset_proc['train'],
    eval_dataset=dataset_proc['test'],
    processing_class = tokenizer,
    # Stop once eval_loss has not improved for 10 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)]
)

In [None]:
# Resume from the newest checkpoint in the output directory when one exists
# (e.g. after a Colab disconnect); otherwise train from scratch.
# get_last_checkpoint returns None when the directory holds no checkpoint, and
# resume_from_checkpoint=None is the same as calling trainer.train() with no arguments.
# Clear the output directory to force a fresh run.
last_checkpoint = (
    get_last_checkpoint(args.output_dir) if os.path.isdir(args.output_dir) else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

# Save the trained (PEFT-wrapped) model and the tokenizer side by side so the
# loading cell can read both from ADAPTER_DIR.
ADAPTER_DIR = "/content/drive/MyDrive/Fall2025/CSCI_544/Project/llama_qlora_adapter_third_try"
trainer.model.save_pretrained(ADAPTER_DIR)
tokenizer.save_pretrained(ADAPTER_DIR)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
9200,0.0645,0.0644,0.073112,1721550.0,0.971174


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
9200,0.0645,0.0644,0.073112,1721550.0,0.971174
9400,0.0639,0.06465,0.075916,3445385.0,0.970969
9600,0.0643,0.064224,0.074972,5169065.0,0.971163
9800,0.0644,0.0643,0.075271,6895052.0,0.971133
10000,0.0645,0.064039,0.07591,8616867.0,0.971309
10200,0.0643,0.064219,0.075328,10339528.0,0.971182
10400,0.064,0.064018,0.075937,12063979.0,0.971183
10600,0.0648,0.063841,0.076687,13788199.0,0.971268
10800,0.0637,0.06389,0.075663,15511905.0,0.97126
11000,0.0636,0.063808,0.074484,17236962.0,0.97136


TrainOutput(global_step=18000, training_loss=0.031725343979067275, metrics={'train_runtime': 30283.3387, 'train_samples_per_second': 19.02, 'train_steps_per_second': 0.594, 'total_flos': 9.825571631162327e+17, 'train_loss': 0.031725343979067275, 'epoch': 1.0227272727272727})

# Load the saved model

In [None]:
# Where the fine-tuned adapter was saved, and the base checkpoint it was trained on
BASE_MODEL  = "meta-llama/Llama-3.2-1B-Instruct"
ADAPTER_DIR = "/content/drive/MyDrive/Fall2025/CSCI_544/Project/llama_qlora_adapter_third_try"

# Tokenizer files were saved alongside the adapter; fall back to EOS for padding
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.eos_token

# Same 4-bit NF4 / fp16-compute quantization as used for training
quant_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
bnb = BitsAndBytesConfig(**quant_options)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, device_map='auto', quantization_config=bnb
)

# Attach the saved adapter to the quantized base model and switch to eval mode
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)
model.eval()

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

## Total Action Accuracy (AA) / Exact Match (EM)

In [None]:
# Make sure a padding token exists, falling back to EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the left so that, in a batch, every prompt ends at the last input
# position and the generated tokens follow it directly.
tokenizer.padding_side = "left"

import re
import torch
from tqdm.auto import tqdm

def _normalize_text(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "").strip()).lower()

def evaluate_on_dataset(model, tokenizer, test_ds, batch_size=8, max_new_tokens=16, max_length=512):
    """Greedy-decode a response for every example and score it against the gold label.

    Each example may be given as ``prompt``/``completion``, as
    ``instruction``/``output``, or as a single ``text`` field in the
    ``### Instruction: ... ### Response: ...`` template. Examples in any other
    shape are skipped and not counted.

    Only the first line of the prediction and of the gold label is compared.

    While running, the tokenizer is switched to left padding and left
    truncation (so an over-long prompt keeps its trailing ``### Response:``
    cue); both settings are restored before returning.

    Args:
        model: causal LM exposing ``.device`` and ``.generate``.
        tokenizer: tokenizer matching ``model``; must have a pad token set.
        test_ds: indexable dataset supporting ``len()`` and integer indexing.
        batch_size: number of prompts generated per ``generate`` call.
        max_new_tokens: generation budget per prompt.
        max_length: maximum prompt length in tokens before truncation.

    Returns:
        dict with ``total`` (examples scored), ``actual_accuracy`` (fraction
        whose first action keyword matches the gold one) and ``exact_match``
        (fraction whose normalized first line equals the gold one).
    """
    device = model.device
    total = exact_correct = actual_correct = 0

    ACTIONS = {"check", "call", "fold", "bet", "raise", "allin"}
    # Collapses "all in", "all-in" and "all_in" into the single token "allin".
    ALLIN_RE = re.compile(r"\ball[\s_-]*in\b")

    def action_only(s: str):
        """Return the first action keyword found in `s`, or None if there is none."""
        if not s:
            return None
        lowered = ALLIN_RE.sub("allin", s.strip().lower())
        for tok in re.split(r"[\s:;/,]+", lowered):
            if tok in ACTIONS:
                return tok
        return None

    def first_line(s: str) -> str:
        """Return the stripped first line of `s` ('' for empty/None input)."""
        lines = (s or "").strip().splitlines()
        return lines[0].strip() if lines else ""

    def prompt_and_gold(ex):
        """Extract (prompt, gold) from one example, or None if its shape is unknown."""
        if "prompt" in ex and "completion" in ex:
            return ex["prompt"], ex["completion"]
        if "instruction" in ex and "output" in ex:
            return ex["instruction"], ex["output"]
        if "text" in ex and "### Response:" in ex["text"]:
            inst = ex["text"].split("### Instruction:", 1)[-1].split("### Response:", 1)[0].strip()
            gold = ex["text"].split("### Response:", 1)[-1].strip()
            return inst, gold
        return None

    # Do not rely on the caller for tokenizer state: decoder-only generation
    # needs left padding, and right truncation would cut off the response cue.
    prev_padding_side = tokenizer.padding_side
    prev_truncation_side = tokenizer.truncation_side
    tokenizer.padding_side = "left"
    tokenizer.truncation_side = "left"
    try:
        for i in tqdm(range(0, len(test_ds), batch_size), desc="Evaluating"):
            batch = [test_ds[j] for j in range(i, min(i + batch_size, len(test_ds)))]

            pairs = [pg for pg in map(prompt_and_gold, batch) if pg is not None]
            if not pairs:
                continue
            prompts = [p for p, _ in pairs]
            golds = [g for _, g in pairs]

            # Re-wrap every prompt in the template used for training.
            enc = tokenizer(
                [f"### Instruction:\n{p.strip()}\n\n### Response:\n" for p in prompts],
                return_tensors="pt", padding=True, truncation=True, max_length=max_length
            )
            enc = {k: v.to(device) for k, v in enc.items()}

            with torch.no_grad():
                outs = model.generate(
                    **enc,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,             # deterministic (greedy) decoding
                    top_p=1.0,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )

            # Everything after the (padded) prompt length is newly generated.
            gen_only = outs[:, enc["input_ids"].shape[1]:]
            preds = tokenizer.batch_decode(gen_only, skip_special_tokens=True)

            for pred_raw, gold_raw in zip(preds, golds):
                total += 1
                pred_line = first_line(pred_raw)
                gold_line = first_line(gold_raw)

                # Exact match on the whitespace/case-normalized first line
                if _normalize_text(pred_line) == _normalize_text(gold_line):
                    exact_correct += 1

                # Action accuracy: only the action keyword has to agree
                pa, ga = action_only(pred_line), action_only(gold_line)
                if pa is not None and ga is not None and pa == ga:
                    actual_correct += 1
    finally:
        tokenizer.padding_side = prev_padding_side
        tokenizer.truncation_side = prev_truncation_side

    return {
        "total": total,
        "actual_accuracy": actual_correct / max(1, total),
        "exact_match":     exact_correct   / max(1, total),
    }
# Score the fine-tuned model on the full 'test' split (the "Total" numbers).
metrics = evaluate_on_dataset(model, tokenizer, dataset_proc["test"], batch_size=8, max_new_tokens=16)
print(metrics)


Evaluating:   0%|          | 0/1375 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'total': 11000, 'actual_accuracy': 0.8306363636363636, 'exact_match': 0.8253636363636364}


## Preflop AA/EM

In [None]:
# Same tokenizer setup as the previous evaluation cell (pad token fallback, left
# padding), repeated here presumably so this cell can be run on its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"


# Score the preflop-only test file.
metrics = evaluate_on_dataset(model, tokenizer, dataset_proc["preflop_test"], batch_size=8, max_new_tokens=16)
print(metrics)


Evaluating:   0%|          | 0/125 [00:00<?, ?it/s]

{'total': 1000, 'actual_accuracy': 0.9, 'exact_match': 0.891}


## Postflop AA/EM

In [None]:
# Same tokenizer setup as the previous evaluation cells (pad token fallback, left
# padding), repeated here presumably so this cell can be run on its own.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"


# Score the postflop-only test file.
metrics = evaluate_on_dataset(model, tokenizer, dataset_proc["postflop_test"], batch_size=8, max_new_tokens=16)
print(metrics)


Evaluating:   0%|          | 0/1250 [00:00<?, ?it/s]

{'total': 10000, 'actual_accuracy': 0.8237, 'exact_match': 0.8188}
