# Binary Human-Written vs Machine-Generated Text Classification - DistilBERT and DeBERTaV3-Large

In [None]:
# # Uncomment to run the notebook on Kaggle

# !pip install transformers==4.40.1
# !pip install evaluate
# !pip install peft
# !pip install wandb

### Imports

In [5]:
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed, AdamW, get_cosine_schedule_with_warmup
import os
import datetime
from peft import LoraConfig, TaskType, get_peft_model
import torch
import wandb

In [None]:
# Weights & Biases login

# The API key is read from the WANDB_API_KEY environment variable instead of
# being written in the notebook. Never commit the key itself: a key that has
# appeared in a shared notebook must be revoked and regenerated.
# When the variable is unset, key=None is passed and wandb.login() is left to
# resolve the credentials on its own.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# project under which the runs are grouped, and checkpoint upload policy
os.environ["WANDB_PROJECT"] = "SemEval8"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

### Data retrieval and processing

In [None]:
def preprocess_function(input, **fn_kwargs):
    """Tokenize the texts stored under the "text" key, truncating to 512 tokens

    Args:
        input: the examples to tokenize, indexable by the "text" key
        **fn_kwargs: must provide 'tokenizer', the callable applied to the texts

    Returns:
        The output of the tokenizer for the given texts
    """
    tokenize = fn_kwargs['tokenizer']
    texts = input["text"]
    return tokenize(texts, truncation=True, max_length=512)


def get_data(train_path, val_path, test_path):
    """Load the training, validation and test splits from JSON-lines files

    Args:
        train_path (str): the path to the json training dataset
        val_path (str): the path to the json validation dataset
        test_path (str): the path to the json test dataset

    Returns:
        (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame): the train,
        validation and test dataframes, in this order
    """
    split_paths = (train_path, val_path, test_path)
    # every split is stored with one json object per line
    train_df, val_df, test_df = (pd.read_json(path, lines=True) for path in split_paths)
    return train_df, val_df, test_df

### Metrics

The official evaluation metric for Subtask A is **accuracy**. In the [challenge website](https://www.codabench.org/competitions/1752/#/pages-tab) the scorer also reports **macro-F1** and **micro-F1**.

Since this is a binary classification problem, however, we limit ourselves to reporting **accuracy** together with the **F1-score** computed using "binary" as the averaging method.

In [None]:
# Metric objects already loaded through `evaluate`, keyed by metric name.
# compute_metrics is invoked at every evaluation, so the metrics are loaded
# once and reused instead of being reloaded on each call.
_LOADED_METRICS = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use"""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_pred):
    """Function to compute accuracy and F1-score metrics on the validation/test predictions

    Args:
        eval_pred (<np.ndarray, np.ndarray>): the pair (predictions, labels); the
            predicted class is taken as the argmax of the predictions along axis 1

    Returns:
        dict: a dictionary containing the accuracy and f1-score metrics
    """
    predictions, labels = eval_pred
    # from per-class scores to predicted class ids
    predictions = np.argmax(predictions, axis=1)

    results = {}
    results.update(_get_metric("accuracy").compute(predictions=predictions, references=labels))
    # "binary" averaging: the F1-score is the one of the positive class
    results.update(_get_metric("f1").compute(predictions=predictions, references=labels, average="binary"))

    return results

### Utils

In [None]:
def print_trainable_parameters(model):
    """Print how many parameters of the model are trainable, as a count and as a percentage

    Args:
        model (torch.nn.Module): the model whose parameters are counted
    """
    # (number of elements, is trainable) for every parameter of the model
    sizes = [(weights.numel(), weights.requires_grad) for _, weights in model.named_parameters()]
    total = sum(size for size, _ in sizes)
    trainable = sum(size for size, is_trainable in sizes if is_trainable)
    share = 100 * trainable / total
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {share}")

### Setup, training and testing

In [None]:
def _predict_and_save(trainer, tokenized_dataset, ids, split, ref_model_name, subset_tag, seed, predictions_dir):
    """Predict one split with the trainer, log its metrics to wandb and save the predicted labels to csv

    Args:
        trainer (Trainer): the trainer holding the model used for the predictions
        tokenized_dataset (Dataset): the tokenized split to predict
        ids (pandas.Series): the values written in the 'id' column of the csv file
        split (str): the name of the split ("test" or "val"), used in the wandb keys and in the file name
        ref_model_name (str): the string representing the model-hyperparameters combination
        subset_tag (str): 'subset' or 'full', depending on the data the model was trained on
        seed (int): the seed used for training
        predictions_dir (str): path of the directory in which to save the predictions
    """
    output = trainer.predict(tokenized_dataset)
    results = compute_metrics((output.predictions, output.label_ids))
    print("Results using compute_metrics:", results)

    # log the final results to the summary of the current wandb run
    wandb.run.summary[f"final_{split}_accuracy"] = round(results["accuracy"], 4)
    wandb.run.summary[f"final_{split}_f1"] = round(results["f1"], 4)

    # save the predicted labels to a csv file
    preds = np.argmax(output.predictions, axis=-1)
    predictions_df = pd.DataFrame({'id': ids, 'label': preds})
    file_name = f"subtaskA_{split}_predictions_{ref_model_name}_{subset_tag}_{seed}.csv"
    # os.path.join works whether or not predictions_dir ends with a separator
    predictions_df.to_csv(os.path.join(predictions_dir, file_name))


def fine_tune_and_test(train_df, valid_df, test_df, checkpoints_path, model_name, ref_model_name, seed, hparams, predictions_dir):
    """Main body of the notebook. Trains/fine-tunes the model and saves the validation and test set predictions to a csv file

    Args:
        train_df (pandas.DataFrame): training dataset
        valid_df (pandas.DataFrame): validation dataset
        test_df (pandas.DataFrame): test dataset
        checkpoints_path (str): the directory where the model checkpoints will be written
        model_name (str): the name of the model to train/fine-tune and test
        ref_model_name (str): the string representing the model-hyperparameters combination
        seed (int): the seed of the run, used here only to name the wandb run and the output files
        hparams (dict): the dictionary containing the set of hyperparameters. Keys read here:
            'epochs', 'per_device_train_batch_size', 'gradient_accumulation_steps', 'lr_backbone',
            'lr_classifier', 'lr_scheduler_type', 'save_steps', 'lora', 'train_on_subset' and,
            when 'lora' is True, 'lora_rank' and 'lora_alpha'
        predictions_dir (str): path of the directory in which to save the predictions

    Raises:
        ValueError: if the model is neither a DeBERTa nor a DistilBERT model, or if LoRA
            is requested for a model which is not a DeBERTa model
    """

    # make sure the predictions can be written before spending time on training
    if predictions_dir:
        os.makedirs(predictions_dir, exist_ok=True)

    # dictionaries to map labels to ids and vice versa
    id2label = {0: "human", 1: "machine"}
    label2id = {"human": 0, "machine": 1}

    # tag used in the run name and in the names of the prediction files
    subset_tag = 'subset' if hparams['train_on_subset'] else 'full'

    # if train_on_subset is True, then train on a balanced subset of the training data:
    # every (label, source) group is sampled down to the size of the smallest source of label 0
    if hparams['train_on_subset']:
        min_samples = train_df[train_df['label'] == 0]['source'].value_counts().min()
        train_df = train_df.groupby(['label', 'source']).sample(min_samples, random_state=42).reset_index(drop=True)

    # pandas dataframe to huggingface Dataset
    train_dataset = Dataset.from_pandas(train_df)
    valid_dataset = Dataset.from_pandas(valid_df)

    # load tokenizer and model from huggingface using the model name
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        trust_remote_code=True,
        device_map = {"": 0},
        num_labels=len(label2id), id2label=id2label, label2id=label2id
    )

    # if lr_backbone == 0 (and LoRA is not used), then freeze the backbone
    if not hparams['lora'] and hparams['lr_backbone'] == 0:
        if 'deberta' in model_name:
            for param in model.deberta.parameters():
                param.requires_grad = False
        elif 'distilbert' in model_name:
            for param in model.distilbert.parameters():
                param.requires_grad = False
        else:
            raise ValueError("Model not supported")

    # tokenize data for train/valid
    tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
    tokenized_valid_dataset = valid_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # get the LoRA model if lora hyperparameter is True
    if hparams['lora']:
        if 'deberta' not in model_name:
            raise ValueError("LoRA is only supported with DeBERTa models")

        # parameter-efficient fine-tuning (PEFT) configuration
        peft_config = LoraConfig(
            lora_alpha=hparams['lora_alpha'],
            lora_dropout=0.1,
            r=hparams['lora_rank'],
            bias="none",
            task_type=TaskType.SEQ_CLS,
            target_modules=['query_proj', 'value_proj', 'key_proj', 'dense'],
            modules_to_save=["score"]
        )

        model = get_peft_model(model, peft_config)


    # TRAINING ARGUMENTS extraction

    output_dir = checkpoints_path
    # batches & accumulation
    per_device_train_batch_size = hparams['per_device_train_batch_size']
    gradient_accumulation_steps = hparams['gradient_accumulation_steps']
    per_device_eval_batch_size = per_device_train_batch_size
    # number of training steps & validation frequency:
    # one optimizer step consumes per-device batch * accumulation steps * number of GPUs samples,
    # so the number of visible GPUs is queried instead of being assumed
    n_gpus = max(1, torch.cuda.device_count())
    max_steps = int(hparams['epochs'] * len(train_df)) // per_device_train_batch_size // gradient_accumulation_steps // n_gpus
    save_steps = hparams['save_steps']
    # optimizer and learning rate
    optim="adamw_torch"  # default for TrainingArguments class
    warmup_ratio = 0.2
    # two learning rates are used: lr_backbone for the backbone (or its LoRA adapters)
    # and lr_classifier for the classification head
    if hparams['lora']:
        # only the parameters left trainable in the backbone and in the pooler are optimized
        lora_parameters = []
        for param in model.base_model.model.deberta.parameters():
            if param.requires_grad:
                lora_parameters.append(param)

        for param in model.base_model.model.pooler.parameters():
            if param.requires_grad:
                lora_parameters.append(param)
        grouped_parameters = [{'params': model.base_model.model.classifier.modules_to_save.parameters(), 'lr': hparams['lr_classifier']},
                              {'params': lora_parameters, 'lr': hparams['lr_backbone']}]
    else:
        if 'deberta' in model_name:
            grouped_parameters = [{'params': model.deberta.parameters(), 'lr': hparams['lr_backbone']},
                                  {'params': model.pooler.parameters(), 'lr': hparams['lr_backbone']},
                                  {'params': model.classifier.parameters(), 'lr': hparams['lr_classifier']}]
        elif 'distilbert' in model_name:
            grouped_parameters = [{'params': model.distilbert.parameters(), 'lr': hparams['lr_backbone']},
                                  {'params': model.pre_classifier.parameters(), 'lr': hparams['lr_classifier']},
                                  {'params': model.classifier.parameters(), 'lr': hparams['lr_classifier']},]
        else:
            raise ValueError("Model not supported")
    optimizer=AdamW(grouped_parameters)
    lr_scheduler = get_cosine_schedule_with_warmup(optimizer, max_steps * warmup_ratio, max_steps)
    optimizers = (optimizer, lr_scheduler)
    # NOTE(review): an (optimizer, scheduler) pair is handed to the Trainer below, so the
    # optim/learning_rate/lr_scheduler_type/warmup_ratio training arguments presumably
    # only document the run configuration - confirm against the Trainer documentation
    learning_rate = hparams['lr_backbone']
    lr_scheduler_type = hparams['lr_scheduler_type']
    # optimizations
    fp16 = True
    torch_compile = True
    # logging & checkpointing
    logging_steps = save_steps
    run_name = f"{ref_model_name}_bs{per_device_train_batch_size}_{subset_tag}_{seed}"

    print_trainable_parameters(model)


    # training phase
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        # number of training steps & validation frequency
        max_steps=max_steps,
        save_steps=save_steps,
        # batches & accumulation
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # optimizer and learning rate
        optim=optim,
        learning_rate=learning_rate,
        lr_scheduler_type=lr_scheduler_type,
        warmup_ratio=warmup_ratio,
        # optimizations
        fp16=fp16,
        torch_compile=torch_compile,
        # logging & checkpointing
        logging_steps=logging_steps,
        report_to="wandb",
        run_name=run_name,
        evaluation_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        save_total_limit=1,
    )

    trainer = Trainer(
        model=model,
        args=training_arguments,
        data_collator=data_collator,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_valid_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        optimizers=optimizers
    )

    # cast normalization layers to float32 for numerical stability
    # (Module.to converts the module in place, no rebinding is needed)
    for name, module in trainer.model.named_modules():
        if "norm" in name:
            module.to(torch.float32)

    trainer.train()


    # test phase
    test_dataset = Dataset.from_pandas(test_df)
    tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})

    # test the model on the test set, compute and log the metrics, save the predictions
    _predict_and_save(trainer, tokenized_test_dataset, test_df['id'], "test",
                      ref_model_name, subset_tag, seed, predictions_dir)

    # repeat for the validation set, so that its predictions are saved as well
    _predict_and_save(trainer, tokenized_valid_dataset, valid_df['id'], "val",
                      ref_model_name, subset_tag, seed, predictions_dir)

    wandb.finish()

In [None]:
# locations of the Subtask A monolingual splits (one json object per line)
train_path = "/kaggle/input/subtaskA_train_monolingual.jsonl"
val_path = "/kaggle/input/subtaskA_dev_monolingual.jsonl"
test_path = "/kaggle/input/subtaskA_test_monolingual.jsonl"

# load the three splits as pandas dataframes
train_df, valid_df, test_df = get_data(
    train_path=train_path,
    val_path=val_path,
    test_path=test_path,
)

In [None]:
# switch read by the launch cell below: when it is not True, training is skipped
train = True

### Configurations and launch

In [None]:
# every configuration is trained once per seed
seeds = [
    42,
    91,
    184,
    333,
    647
]

# model_names, reference_names and hparams_list are parallel lists:
# the i-th entries of the three lists describe the i-th configuration

# huggingface names of the models to load
model_names = [
    'distilbert-base-uncased',
    'distilbert-base-uncased',
    'microsoft/deberta-v3-large',
    'microsoft/deberta-v3-large'
]

# names identifying each model-hyperparameters combination in run names and output files
reference_names = [
    'distilbert_frozen',
    'distilbert_finetuned',
    'deberta_finetuned',
    'deberta_LoRA'
]


hparams_list = [
#   BASELINE 1 - FROZEN BACKBONE DISTILBERT
    {"epochs": 3,
     "per_device_train_batch_size": 32,
     "gradient_accumulation_steps": 1,
     "lr_backbone": 0,
     "lr_classifier": 2e-2,
     "lr_scheduler_type": "cosine",
     "save_steps": 300,
     "lora": False,
     "train_on_subset": False,  # True
    },
#   BASELINE 2 - FULLY FINETUNED DISTILBERT
    {"epochs": 3,
     "per_device_train_batch_size": 32,
     "gradient_accumulation_steps": 1,
     "lr_backbone": 1e-5,
     "lr_classifier": 2e-2,
     "lr_scheduler_type": "cosine",
     "save_steps": 300,
     "lora": False,
     "train_on_subset": False,  # True
    },
#   FULLY FINETUNED DEBERTA
    {"epochs": 3,
     "per_device_train_batch_size": 1,
     "gradient_accumulation_steps": 4,
     "lr_backbone": 5e-6,
     "lr_classifier": 1e-2,
     "lr_scheduler_type": "cosine",
     "save_steps": 4000,
     "lora": False,
     "train_on_subset": True,
    },
#   LORA
    {"epochs": 1,
     "per_device_train_batch_size": 4,
     "gradient_accumulation_steps": 1,
     "lr_backbone": 8e-5,
     "lr_classifier": 8e-5,
     "lr_scheduler_type": "cosine",
     "save_steps": 1000,
     "lora": True,
     "lora_rank": 64,
     "lora_alpha": 16,
     "train_on_subset": False,  # True
    },
]

predictions_dir = '/kaggle/working/'
if train:
    # zip() stops at the shortest list, so a length mismatch would silently drop configurations
    if not (len(model_names) == len(reference_names) == len(hparams_list)):
        raise ValueError("model_names, reference_names and hparams_list must have the same length")

    for model_name, ref_model_name, hparams in zip(model_names, reference_names, hparams_list):
        for seed in seeds:
            print(f"==================Training {ref_model_name} with seed {seed}==================")
            set_seed(seed)

            # the launch time makes the checkpoint directory of each run unique
            ct = datetime.datetime.now()

            # train and test model
            fine_tune_and_test(train_df, valid_df, test_df, f"models/{ref_model_name}/{seed}/{ct}", model_name, ref_model_name, seed, hparams, predictions_dir)
else:
    print("Skipping training.")