In [1]:
import logging
import os
import random
import sys
import time
from copy import deepcopy

import datasets
import pandas as pd
import torch
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import get_cosine_schedule_with_warmup

# Project helper modules live in a sibling "scripts" directory; the path is
# relative to the current working directory (the notebook's location).
sys.path.append("../scripts")

try:
    from ai_dataset import AiDataset
    from ai_loader import AiCollator, AiCollatorTrain, show_batch
    from ai_model import AiModel
    from ai_optimizer import get_optimizer
    from metric_utils import compute_metrics
    from train_utils import AverageMeter, as_minutes, get_lr, save_checkpoint

except Exception as e:
    # Any failure while loading the project modules is surfaced as an
    # ImportError; chain the original exception so its traceback and message
    # are not lost.
    raise ImportError(f"could not import project modules from ../scripts: {e}") from e

logger = get_logger(__name__)


# Show long text columns (source code snippets) in full when printing frames.
pd.options.display.max_colwidth = 1000

# -------- Evaluation -------------------------------------------------------------#


def run_evaluation(accelerator, model, valid_dl, valid_ids):
    """Score the validation loader and package the results.

    Args:
        accelerator: Accelerator used to gather per-process outputs and to
            decide which process shows the progress bar.
        model: Callable as ``model(**batch)`` returning ``(logits, _)``.
        valid_dl: Validation dataloader; every batch must contain "labels".
        valid_ids: Ids stored in the "id" column of the result frames. They
            must be in the same order as the examples yielded by ``valid_dl``.

    Returns:
        dict with
            "scores": output of ``compute_metrics(predictions, truths)``,
            "result_df": frame with columns id / predictions / truths,
            "oof_df": frame with columns id / generated (the predictions).
    """
    model.eval()

    scores, labels = [], []
    bar = tqdm(range(len(valid_dl)), disable=not accelerator.is_local_main_process)

    for batch in valid_dl:
        with torch.no_grad():
            raw_logits, _ = model(**batch)
            probs = torch.sigmoid(raw_logits.reshape(-1))

        flat_labels = batch["labels"].to(torch.long).reshape(-1)
        # collect outputs from every process (duplicates from padding dropped)
        gathered_probs, gathered_labels = accelerator.gather_for_metrics(
            (probs, flat_labels)
        )

        scores += gathered_probs.cpu().numpy().tolist()
        labels += gathered_labels.cpu().numpy().tolist()

        bar.update(1)
    bar.close()

    # compute metric
    metrics = compute_metrics(scores, labels)

    result_df = pd.DataFrame({"id": valid_ids})
    result_df["predictions"] = scores
    result_df["truths"] = labels

    # out-of-fold view: only the id and the predicted probability
    oof_df = result_df[["id", "predictions"]].rename(
        columns={"predictions": "generated"}
    )

    return {
        "scores": metrics,
        "result_df": result_df,
        "oof_df": oof_df,
    }


# -------- Main Function ---------------------------------------------------------#
def _load_labelled_frame():
    """Read generated and real solutions and merge them into one labelled frame.

    Generated solutions get ``generated == 1`` and real attempts get
    ``generated == 0``. Rows without text are dropped.

    Returns:
        DataFrame with columns task / text / generated / id and a fresh
        0..n-1 index.
    """
    df_gen = (
        pd.read_csv("../../data/generated/gen_solutions.csv")[["spec", "solution"]]
        .rename(columns={"spec": "task", "solution": "text"})
        .assign(generated=1)
    )
    df_real = (
        pd.read_csv("../../data/db_attempts.csv")[["task", "programText"]]
        .rename(columns={"programText": "text"})
        .assign(generated=0)
    )

    df = pd.concat([df_gen, df_real], axis=0, ignore_index=True)
    df = df.dropna(subset=["text"])
    df["text"] = df["text"].astype(str)
    # ids come from the index *before* it is reset: unique, but with gaps
    # wherever rows were dropped.
    df["id"] = df.index + 1
    return df.reset_index(drop=True)


def _save_eval_frames(oof_df, result_df, output_dir, tag):
    """Write the out-of-fold and result frames as ``*_df_<tag>.csv``."""
    oof_df.to_csv(os.path.join(output_dir, f"oof_df_{tag}.csv"), index=False)
    result_df.to_csv(os.path.join(output_dir, f"result_df_{tag}.csv"), index=False)


def run_training():
    """Train the detector for one epoch with periodic validation.

    Pipeline: load and label the data, split it roughly 80/20 into
    train/validation, tokenize through ``AiDataset``, build the dataloaders,
    then train with a cosine schedule. Every ``eval_frequency`` optimizer
    updates the model is evaluated, predictions are written to CSV and a
    checkpoint is saved through ``save_checkpoint``. Training stops early
    after ``patience`` evaluations in a row without matching or beating the
    best score.

    Returns:
        None. Results are written to ``../models/r_ranking`` and to wherever
        ``save_checkpoint`` stores the model state.
    """
    seed = 42
    output_dir = "../models/r_ranking"
    grad_accumulation_steps = 1
    eval_frequency = 500  # evaluate every N optimizer updates
    patience = 10  # evaluations without improvement before stopping
    batch_size = 4

    # ------- Accelerator ---------------------------------------------------------------#

    accelerator = Accelerator(
        gradient_accumulation_steps=grad_accumulation_steps,
    )

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)

    def print_line():
        prefix, unit, suffix = "#", "~~", "#"
        accelerator.print(prefix + unit * 50 + suffix)

    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # ------- Runtime Configs -----------------------------------------------------------#
    print_line()
    accelerator.print(f"setting seed: {seed}")
    set_seed(seed)

    if accelerator.is_main_process:
        os.makedirs(output_dir, exist_ok=True)
    print_line()

    # ------- load data ----------------------------------------------------------#
    print_line()

    # generated (label 1) + real (label 0) solutions in a single frame
    df = _load_labelled_frame()

    # ------- Data Split ----------------------------------------------------------------#

    # Seeded row-wise draw: each row goes to train with probability 0.8.
    rng = random.Random(seed)
    df["fold"] = df["text"].apply(lambda x: "train" if rng.random() < 0.8 else "valid")
    train_df = df[df["fold"] == "train"].copy()
    valid_df = df[df["fold"] == "valid"].copy()

    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    # task -> ids of its training examples; handed to the training collator
    prompt_ids = train_df["task"].unique().tolist()
    gdf = train_df.groupby("task")["id"].apply(list).reset_index()
    prompt2ids = dict(zip(gdf["task"], gdf["id"]))

    accelerator.print(f"shape of train data: {train_df.shape}")
    accelerator.print(f"{train_df.head()}")
    accelerator.print(f"shape of validation data: {valid_df.shape}")
    accelerator.print(f"Prompts: {prompt_ids}")

    with accelerator.main_process_first():
        dataset_creator = AiDataset()

        train_ds = dataset_creator.get_dataset(train_df)
        valid_ds = dataset_creator.get_dataset(valid_df)

    tokenizer = dataset_creator.tokenizer

    # ------- data loaders ----------------------------------------------------------------#
    train_ds.set_format(
        type=None, columns=["id", "input_ids", "attention_mask", "generated"]
    )

    # sort valid dataset for faster evaluation
    valid_ds = valid_ds.sort("input_length")

    valid_ds.set_format(
        type=None, columns=["id", "input_ids", "attention_mask", "generated"]
    )
    # Take the ids from the *sorted* dataset: predictions are produced in this
    # order, so ids read from valid_df (unsorted) would be paired with the
    # wrong rows in the saved result frames.
    valid_ids = list(valid_ds["id"])

    # ---
    kwargs = dict(
        train_ds=train_ds,
        prompt_ids=prompt_ids,
        prompt2ids=prompt2ids,
    )

    data_collector_train = AiCollatorTrain(
        tokenizer=tokenizer,
        pad_to_multiple_of=64,
        kwargs=kwargs,
    )

    data_collector = AiCollator(tokenizer=tokenizer, pad_to_multiple_of=64)

    train_dl = DataLoader(
        train_ds,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=data_collector_train,
    )

    valid_dl = DataLoader(
        valid_ds,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=data_collector,
    )

    accelerator.print("data preparation done...")
    print_line()

    # --- show batch -------------------------------------------------------------------#
    print_line()

    for b in train_dl:
        break
    show_batch(b, tokenizer, task="training", print_fn=print, n_examples=4)

    print_line()

    for b in valid_dl:
        break
    show_batch(b, tokenizer, task="validation", print_fn=accelerator.print)

    print_line()

    # ------- Config -------------------------------------------------------------------#
    accelerator.print("config for the current run:")
    print_line()

    # ------- Model --------------------------------------------------------------------#
    print_line()
    accelerator.print("creating the LLM Detection model...")
    model = AiModel(accelerator.device)
    print_line()

    # ------- Optimizer ----------------------------------------------------------------#
    print_line()
    accelerator.print("creating the optimizer...")
    optimizer = get_optimizer(model)
    # ------- Prepare -------------------------------------------------------------------#

    model, optimizer, train_dl, valid_dl = accelerator.prepare(
        model, optimizer, train_dl, valid_dl
    )

    # ------- Scheduler -----------------------------------------------------------------#
    print_line()
    num_epochs = 1
    warmup_pct = 0.1

    num_update_steps_per_epoch = len(train_dl) // grad_accumulation_steps
    num_training_steps = num_epochs * num_update_steps_per_epoch
    num_warmup_steps = int(warmup_pct * num_training_steps)

    accelerator.print(f"# training updates per epoch: {num_update_steps_per_epoch}")
    accelerator.print(f"# training steps: {num_training_steps}")
    accelerator.print(f"# warmup steps: {num_warmup_steps}")

    scheduler = get_cosine_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )

    # ------- training setup --------------------------------------------------------------#
    best_lb = -1  # best validation "lb" score so far (reported as AUC below)

    patience_tracker = 0
    current_iteration = 0

    # ------- training  --------------------------------------------------------------------#
    start_time = time.time()
    accelerator.wait_for_everyone()

    for epoch in range(num_epochs):
        progress_bar = tqdm(
            range(num_update_steps_per_epoch),
            disable=not accelerator.is_local_main_process,
        )
        loss_meter = AverageMeter()

        # Training ------
        model.train()
        for step, batch in enumerate(train_dl):
            with accelerator.accumulate(model):
                _, loss = model(**batch)
                accelerator.backward(loss)

                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(model.parameters(), 1.0)

                    optimizer.step()  # gradient_state.sync_gradients check is performed inside optimizer.step
                    scheduler.step()
                    optimizer.zero_grad()

                loss_meter.update(loss.item())

            if accelerator.sync_gradients:
                progress_bar.set_description(
                    f"STEP: {current_iteration + 1:5}/{num_update_steps_per_epoch:5}. "
                    f"LR: {get_lr(optimizer):.4f}. "
                    f"Loss: {loss_meter.avg:.4f}. "
                )

                progress_bar.update(1)
                current_iteration += 1

            # >--------------------------------------------------|
            # >-- evaluation ------------------------------------|
            # >--------------------------------------------------|

            if accelerator.sync_gradients and current_iteration % eval_frequency == 0:
                # set model in eval mode
                model.eval()
                eval_response = run_evaluation(accelerator, model, valid_dl, valid_ids)

                scores_dict = eval_response["scores"]
                result_df = eval_response["result_df"]
                oof_df = eval_response["oof_df"]
                lb = scores_dict["lb"]

                print_line()
                et = as_minutes(time.time() - start_time)
                accelerator.print(
                    f">>> Epoch {epoch + 1} | Step {step} | Total Step {current_iteration} | Time: {et}"
                )
                print_line()
                accelerator.print(f">>> Current LB (AUC) = {round(lb, 4)}")

                print_line()

                # ties count as an improvement and reset the patience counter
                is_best = lb >= best_lb
                if is_best:
                    best_lb = lb
                    patience_tracker = 0
                else:
                    patience_tracker += 1
                    accelerator.print(
                        f">>> patience reached {patience_tracker}/{patience}"
                    )
                    accelerator.print(f">>> current best score: {round(best_lb, 4)}")

                # Only the main process touches the files: every process holds
                # the same gathered frames, and the directory was created on
                # the main process only.
                if accelerator.is_main_process:
                    if is_best:
                        _save_eval_frames(oof_df, result_df, output_dir, "best")
                    _save_eval_frames(oof_df, result_df, output_dir, "last")

                # saving -----
                accelerator.wait_for_everyone()
                unwrapped_model = accelerator.unwrap_model(model)
                model_state = {
                    "step": current_iteration,
                    "epoch": epoch + 1,
                    "state_dict": unwrapped_model.state_dict(),
                    "lb": lb,
                }

                if accelerator.is_main_process:
                    save_checkpoint(model_state, is_best=is_best)

                # -- post eval
                model.train()
                torch.cuda.empty_cache()
                print_line()

                # early stopping ----
                if patience_tracker >= patience:
                    accelerator.print("stopping early")
                    progress_bar.close()
                    model.eval()
                    accelerator.end_training()
                    return

        # close this epoch's bar (the final epoch's bar included)
        progress_bar.close()

    accelerator.end_training()


# Entry point: start training when this file/cell is executed directly.
if __name__ == "__main__":
    run_training()

  from .autonotebook import tqdm as notebook_tqdm
04/06/2025 20:50:50 - INFO - __main__ - Distributed environment: DistributedType.NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda

Mixed precision type: no



#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
setting seed: 42
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
shape of train data: (7430, 5)
                                   task  \
0  c0df7d49-26f5-451c-b44a-1e0bca60bca5   
1  c0df7d49-26f5-451c-b44a-1e0bca60bca5   
2  c0df7d49-26f5-451c-b44a-1e0bca60bca5   
3  4e5b21c0-e86f-4eac-82b6-1a0d00ae4199   
4  4e5b21c0-e86f-4eac-82b6-1a0d00ae4199   

                                                                                                                                                                                                                                                                          text  \
0                                                                                                             

loading file spm.model from cache at C:\Users\Kiaver\.cache\huggingface\hub\models--microsoft--deberta-v3-large\snapshots\64a8c8eab3e352a784c658aef62be1662607476f\spm.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\Kiaver\.cache\huggingface\hub\models--microsoft--deberta-v3-large\snapshots\64a8c8eab3e352a784c658aef62be1662607476f\tokenizer_config.json
loading file tokenizer.json from cache at None
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at C:\Users\Kiaver\.cache\huggingface\hub\models--microsoft--deberta-v3-large\snapshots\64a8c8eab3e352a784c658aef62be1662607476f\config.json
Model config DebertaV2Config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  

Column name ['__index_level_0__'] not in the dataset. Current columns in the dataset: ['task', 'text', 'generated', 'id', 'fold', 'input_ids', 'attention_mask', 'input_length']


Map: 100%|██████████| 1821/1821 [00:00<00:00, 6794.76 examples/s]
Map: 100%|██████████| 1821/1821 [00:00<00:00, 35020.00 examples/s]


Column name ['__index_level_0__'] not in the dataset. Current columns in the dataset: ['task', 'text', 'generated', 'id', 'fold', 'input_ids', 'attention_mask', 'input_length']
setting random seed in data collator as: 1743961879336
data preparation done...
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
################################################################################
batch size: 4
shape of input_ids: torch.Size([4, 128])
Showing 4 from a training batch...



Example 1
Input:

[CLS] n = int(input())\r\nm = list(map(int, input().split()))\r\nres = 0\r\nma = max(m)\r\n\r\nfor i in m:\r\n res += i\r\n\r\nif res % ma == 0:\r\n print(res//ma)\r\nelse:\r\n print((res//ma) +1)\r\n[SEP][PAD][PAD]
----------------------------------------
Label: 0
Example 2
Input:

[CLS] n = int(input())\nweights = sorted(list(map(int, input(

loading configuration file config.json from cache at C:\Users\Kiaver\.cache\huggingface\hub\models--microsoft--deberta-v3-large\snapshots\64a8c8eab3e352a784c658aef62be1662607476f\config.json
Model config DebertaV2Config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.51.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

loading weights file p

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
creating the optimizer...
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# training updates per epoch: 1858
# training steps: 1858
# warmup steps: 185


  0%|          | 0/1858 [00:00<?, ?it/s]Safetensors PR exists
100%|██████████| 456/456 [00:11<00:00, 38.68it/s]27%|██▋       | 500/1858 [01:22<03:20,  6.76it/s]


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
>>> Epoch 1 | Step 499 | Total Step 500 | Time: 1m33s
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
>>> Current LB (AUC) = 0.9999
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#


100%|██████████| 456/456 [00:12<00:00, 35.71it/s]54%|█████▍    | 1000/1858 [02:55<02:15,  6.32it/s] 


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
>>> Epoch 1 | Step 999 | Total Step 1000 | Time: 3m8s
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
>>> Current LB (AUC) = 0.9999
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#


100%|██████████| 456/456 [00:12<00:00, 35.62it/s]1%|████████  | 1500/1858 [04:30<00:56,  6.35it/s]   


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
>>> Epoch 1 | Step 1499 | Total Step 1500 | Time: 4m43s
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
>>> Current LB (AUC) = 1.0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#


STEP:  1858/ 1858. LR: 0.0000. Loss: 0.5436. : 100%|██████████| 1858/1858 [05:42<00:00,  5.43it/s]
