# Binary Human-Written vs Machine-Generated Text Classification

In [None]:
# Uncomment to run the notebook on Kaggle

# !pip install transformers==4.40.1
# !pip install evaluate
# !pip install bitsandbytes
# !pip install peft
# !pip install wandb

### Imports

In [None]:
# TODO: clean imports

from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed, BitsAndBytesConfig, AutoConfig, AdamW, get_cosine_schedule_with_warmup
import os
from scipy.special import softmax
import argparse
import logging
import datetime
import bitsandbytes as bnb
from peft import LoraConfig, PeftConfig, PeftModel, TaskType, AutoPeftModelForSequenceClassification, prepare_model_for_kbit_training, get_peft_model, PeftModelForSequenceClassification
import torch
import torch.nn.functional as F
from safetensors.torch import load_model
import os.path as path
import wandb

In [None]:
# Weights and biases login
#
# The API key is read from the WANDB_API_KEY environment variable instead of
# being hard-coded in the notebook: credentials must not be committed, and any
# key that was ever embedded in this cell should be treated as leaked and revoked.
# When the variable is unset, `key=None` lets wandb fall back to its own
# credential lookup / interactive prompt.

wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Project name and checkpoint-upload policy picked up by the wandb integration.
os.environ["WANDB_PROJECT"] = "SemEval8"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

In [None]:
# !rm -rf /kaggle/working/*
# TODO: remove this cell

In [None]:
# TODO: rename some variables / restructure the code so that it departs more clearly from the baseline implementation

### Data retrieval and processing

In [None]:
def preprocess_function(examples, **fn_kwargs):
    """Tokenize the ``"text"`` field of a batch of examples.

    The tokenizer is taken from the ``tokenizer`` keyword argument and is
    called with ``truncation=True`` and ``max_length=512``; its result is
    returned unchanged.
    """
    tokenizer = fn_kwargs['tokenizer']
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=512)


def get_data(train_path, val_path, test_path):
    """Load the train, validation and test splits from JSON-lines sources.

    Each argument is handed to ``pd.read_json(..., lines=True)`` in turn; the
    three resulting DataFrames are returned as a ``(train, validation, test)``
    tuple.
    """
    split_paths = (train_path, val_path, test_path)
    return tuple(pd.read_json(split_path, lines=True) for split_path in split_paths)

### Metrics

The official evaluation metric for Subtask A is **accuracy**. In the [challenge website](https://www.codabench.org/competitions/1752/#/pages-tab) the scorer also reports **macro-F1** and **micro-F1**.

Since this is a binary classification problem, we limited ourselves to computing the **F1-score**, as distinguishing between its macro- and micro-averaged variants did not seem necessary (also because the two classes are roughly balanced). (TODO: the classes are actually not perfectly balanced, so the definitions of micro- and macro-averaged F1 should be double-checked.)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and F1 scores from raw model outputs.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` holds one
            row of per-class scores per example (the arg-max over axis 1 is
            taken as the predicted class); ``labels`` holds the reference
            class ids.

    Returns:
        dict with the keys ``accuracy``, ``f1_micro``, ``f1_macro`` and ``f1``.
        ``f1`` is kept for backward compatibility and equals ``f1_macro``,
        which is the value the previous implementation ended up reporting.
    """
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    results = {}
    results.update(accuracy_metric.compute(predictions=predictions, references=labels))

    # The F1 metric always answers under the single key "f1", so merging the
    # micro and macro results into one dict made the second overwrite the
    # first. Store each variant under its own name instead.
    for average in ("micro", "macro"):
        score = f1_metric.compute(predictions=predictions, references=labels, average=average)["f1"]
        results[f"f1_{average}"] = score

    # Legacy key: previously the macro value was the one that survived.
    results["f1"] = results["f1_macro"]

    return results

### Utils

In [None]:
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Prints the number of elements in parameters with ``requires_grad`` set,
    the total number of parameter elements, and the former as a percentage
    of the latter.
    """
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}"
    )

In [None]:
def fine_tune_and_test(train_df, valid_df, test_df, checkpoints_path, model_name, hparams, predictions_path):
    """Fine-tune a sequence classifier, evaluate it on the test split and save its predictions.

    Args:
        train_df: training DataFrame. Its ``text`` column is tokenized; the
            ``label`` and ``source`` columns are read when
            ``hparams['train_on_subset']`` is set.
        valid_df: validation DataFrame, tokenized like ``train_df``.
        test_df: test DataFrame; its ``id`` column is written next to the
            predicted labels.
        checkpoints_path: output directory handed to ``TrainingArguments``.
        model_name: Hugging Face model identifier used for both the tokenizer
            and the classifier.
        hparams: dict with the keys ``per_device_train_batch_size``,
            ``gradient_accumulation_steps``, ``lr_backbone``, ``lr_classifier``,
            ``lr_scheduler_type``, ``save_steps``, ``lora`` and
            ``train_on_subset``; ``lora_rank`` and ``lora_alpha`` are also
            required when ``lora`` is true.
        predictions_path: path of the CSV file receiving the test predictions.

    Returns:
        None. Test metrics are printed and predictions are written to
        ``predictions_path``.
    """
    # mappings between class ids and class names, stored in the model config
    id2label = {0: "human", 1: "machine"}
    label2id = {"human": 0, "machine": 1}

    use_lora = hparams['lora']

    # if train_on_subset is True, then train on a balanced subset of the training data
    if hparams['train_on_subset']:
        # size of the smallest human-written (label 0) source; every
        # (label, source) group is down-sampled to this many rows
        # NOTE(review): sample() raises if some label-1 group is smaller than
        # this value - confirm against the data
        min_samples = train_df[train_df['label'] == 0]['source'].value_counts().min()
        train_df = train_df.groupby(['label', 'source']).sample(min_samples, random_state=42).reset_index(drop=True)

    # pandas dataframe to huggingface Dataset
    train_dataset = Dataset.from_pandas(train_df)
    valid_dataset = Dataset.from_pandas(valid_df)

    # load tokenizer and model from huggingface using the model name
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        trust_remote_code=True,
        device_map={"": 0},
        num_labels=len(label2id), id2label=id2label, label2id=label2id
    )

    # if lr_backbone == 0, then freeze the backbone.
    # `base_model` resolves to the architecture-specific encoder (`model.deberta`
    # for DeBERTa), so this no longer fails for other backbones.
    if not use_lora and hparams['lr_backbone'] == 0:
        for param in model.base_model.parameters():
            param.requires_grad = False

    # tokenize data for train/valid
    tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
    tokenized_valid_dataset = valid_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    if use_lora:
        # parameter-efficient fine-tuning (PEFT) configuration
        # NOTE: the target module names are DeBERTa-specific, so this branch
        # assumes a DeBERTa backbone.
        peft_config = LoraConfig(
            lora_alpha=hparams['lora_alpha'],
            lora_dropout=0.1,
            r=hparams['lora_rank'],
            bias="none",
            task_type=TaskType.SEQ_CLS,
            target_modules=['query_proj', 'value_proj', 'key_proj', 'dense'],
            modules_to_save=["score"]
        )

        model = get_peft_model(model, peft_config)

        # parameters inside the backbone that are still trainable after wrapping
        lora_parameters = [
            param
            for param in model.base_model.model.deberta.parameters()
            if param.requires_grad
        ]

    output_dir = checkpoints_path
    # batches & accumulation
    per_device_train_batch_size = hparams['per_device_train_batch_size']
    gradient_accumulation_steps = hparams['gradient_accumulation_steps']
    per_device_eval_batch_size = per_device_train_batch_size
    # number of training steps & validation frequency
    # NOTE(review): hard-codes 2 GPUs and does not account for
    # gradient_accumulation_steps - confirm this is the intended training length
    max_steps = len(train_df) // per_device_train_batch_size // 2  # because training on 2 GPUs
    save_steps = hparams['save_steps']
    # optimizer and learning rate
    optim = "adamw_torch"  # default for TrainingArguments class
    warmup_ratio = 0.2
    # two parameter groups: the classification head with its own learning rate,
    # and the backbone (or its LoRA adapters) with `lr_backbone`
    # NOTE(review): any trainable parameter outside these two groups is never
    # updated - verify that none exist for the chosen architecture
    if use_lora:
        optimizer = AdamW(
            [{'params': model.base_model.model.classifier.modules_to_save.parameters(), 'lr': hparams['lr_classifier']},
             {'params': lora_parameters}],
            lr=hparams['lr_backbone'])
    else:
        optimizer = AdamW(
            [{'params': model.classifier.parameters(), 'lr': hparams['lr_classifier']},
             {'params': model.base_model.parameters()}],
            lr=hparams['lr_backbone'])
    lr_scheduler = get_cosine_schedule_with_warmup(optimizer, max_steps * warmup_ratio, max_steps)
    optimizers = (optimizer, lr_scheduler)
    # NOTE(review): an explicit (optimizer, scheduler) pair is handed to the
    # Trainer below, so the learning-rate related TrainingArguments presumably
    # only matter for logging / the run name - confirm
    learning_rate = hparams['lr_backbone']
    lr_scheduler_type = hparams['lr_scheduler_type']
    # optimizations
    fp16 = True
    torch_compile = True
    # logging & checkpointing
    logging_steps = save_steps
    run_name = f"{model_name}_bs{per_device_train_batch_size}_lr{learning_rate}_{lr_scheduler_type}{'_lora' if hparams['lora'] else ''}{'_subset' if hparams['train_on_subset'] else '_full'}"

    print_trainable_parameters(model)

    # training phase
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        # number of training steps & validation frequency
        max_steps=max_steps,
        save_steps=save_steps,
        # batches & accumulation
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # optimizer and learning rate
        optim=optim,
        learning_rate=learning_rate,
        lr_scheduler_type=lr_scheduler_type,
        warmup_ratio=warmup_ratio,
        # optimizations
        fp16=fp16,
        torch_compile=torch_compile,
        # logging & checkpointing
        logging_steps=logging_steps,
        report_to="wandb",
        run_name=run_name,
        evaluation_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        save_total_limit=1,
    )

    trainer = Trainer(
        model=model,
        args=training_arguments,
        data_collator=data_collator,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_valid_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        optimizers=optimizers
    )

    # cast normalization layers to float32 for numerical stability
    # (Module.to converts the module in place, no re-binding needed)
    for name, module in trainer.model.named_modules():
        if "norm" in name:
            module.to(torch.float32)

    trainer.train()

    wandb.finish()

    # test phase
    test_dataset = Dataset.from_pandas(test_df)
    tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})

    predictions_test = trainer.predict(tokenized_test_dataset)
    preds_test = np.argmax(predictions_test.predictions, axis=-1)

    # TODO: decide whether to save the test results as a DataFrame
    results_test = compute_metrics((predictions_test.predictions, predictions_test.label_ids))
    print("Results using compute_metrics:", results_test)

    # TODO: log test results to wandb

    predictions_df = pd.DataFrame({'id': test_df['id'], 'label': preds_test})
    predictions_df.to_csv(predictions_path)

    # TODO: double check that wandb stores the best checkpoint (goal: keep the
    # checkpoints only on wandb); if it does not, save the model and tokenizer
    # locally here.



In [None]:
# Kaggle input locations of the subtask A monolingual train / dev / test splits (JSON-lines files).
train_path =  "/kaggle/input/subtaskA_train_monolingual.jsonl" 
val_path = "/kaggle/input/subtaskA_dev_monolingual.jsonl"
test_path =  "/kaggle/input/subtaskA_test_monolingual.jsonl"

# Read the three splits into DataFrames; the dev split is used as the validation set.
train_df, valid_df, test_df = get_data(train_path, val_path, test_path)

In [None]:
# Switch for the experiment loop: when it is not True, the loop only prints "Skipping training.".
train = True

In [None]:
# Seeds for the repeated runs of every experiment.
seeds = [6, 11, 79, 101, 253]   # TODO: every run will use ONE model and a different seed, ONE SEED ONLY, for the big models

# One entry per configuration in `hparams_list`, in the same order.
# The two DistilBERT baselines stay commented out for as long as their
# hyper-parameters are omitted below; otherwise zip() would silently pair
# them with the DeBERTa configurations.
model_names = [
    # 'distilbert-base-uncased',   # BASELINE 1
    # 'distilbert-base-uncased',   # BASELINE 2
    'microsoft/deberta-v3-large',
    'microsoft/deberta-v3-large',
]


hparams_list = [
    # BASELINE 1 ...
    # BASELINE 2 ...
    # FULLY FINETUNED DEBERTA
    {
        "per_device_train_batch_size": 1,
        "gradient_accumulation_steps": 4,
        "lr_backbone": 2e-5,
        "lr_classifier": 2e-3,
        "lr_scheduler_type": "cosine",
        "save_steps": 3500,
        "lora": False,
        "train_on_subset": True,
    },
    # LORA
    {
        "per_device_train_batch_size": 4,
        "gradient_accumulation_steps": 1,
        "lr_backbone": 8e-5,
        "lr_classifier": 8e-5,
        "lr_scheduler_type": "cosine",
        "save_steps": 1000,
        "lora": True,
        "lora_rank": 64,
        "lora_alpha": 16,
        "train_on_subset": False,
    },
]


if train:
    # zip() truncates to the shorter list, so a mismatch would silently drop
    # or mis-pair experiments; fail loudly instead.
    if len(model_names) != len(hparams_list):
        raise ValueError(
            f"model_names has {len(model_names)} entries but hparams_list has {len(hparams_list)}"
        )

    for model_name, hparams in zip(model_names, hparams_list):
        # Model name without the organisation prefix; also defined for names
        # that contain no "/" (e.g. 'distilbert-base-uncased').
        ref_model_name = model_name.split("/")[-1]

        for seed in seeds:
            print(f"====Training {ref_model_name} with seed {seed}====")
            set_seed(seed)

            # timestamp that keeps the checkpoint directory of every run distinct
            ct = datetime.datetime.now()

            predictions_path = f'/kaggle/working/subtaskA_predictions_{ref_model_name}_{seed}.csv'

            # train and test model
            fine_tune_and_test(train_df, valid_df, test_df, f"models/{ref_model_name}/{seed}/{ct}", model_name, hparams, predictions_path)
else:
    print("Skipping training.")