### RoBERTa model training & validation

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import mean_absolute_error, median_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset
from datasets import concatenate_datasets
import json
import os
import shutil
import torch

# Experiment identifiers and the filesystem locations derived from them.
case = "Case_N"
code = f"20x1_{case}"
dataset_splitted_path = f"datasets/{case}"
output_dir = f"./models/{case}/{code}/roberta"

# Project keys, grouped into batches so a run can be limited to a subset.
batch_1 = ["APSTUD", "BAM", "CLOV", "DM"]
batch_2 = ["DURACLOUD", "JRESERVER", "MDL", "MESOS"]
batch_3 = ["MULE", "MULESTUDIO", "TIMOB", "USERGRID"]
batch_4 = ["TISTUD", "XD"]

# Edit this list to choose which batches are processed.
dataset_names = [*batch_1, *batch_2, *batch_3, *batch_4]

# Fine-tuning hyperparameters.
BASE_MODEL = "roberta-base"
LEARNING_RATE = 1e-4
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 20

# Tokenizer for BASE_MODEL; module-level so preprocess_function can use it.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

def compute_metrics_for_regression(eval_pred):
    """Score an evaluation pass with mean and median absolute error.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``.
            ``labels`` must support ``reshape``.

    Returns:
        A dict with the keys ``"mae"`` and ``"mdae"``.
    """
    predictions, targets = eval_pred
    # Turn the targets into a single column before scoring
    # (presumably to line up with an (n, 1) model output -- confirm).
    targets = targets.reshape(-1, 1)

    return {
        "mae": mean_absolute_error(targets, predictions),
        "mdae": median_absolute_error(targets, predictions),
    }


def preprocess_function(examples):
    """Tokenize the ``text`` column and pair it with the ``storypoint`` labels.

    Args:
        examples: Object indexable by ``'text'`` and ``'storypoint'``.

    Returns:
        A ``MakeTorchData`` wrapping the tokenizer output and the labels.
    """
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    return MakeTorchData(tokenized, examples['storypoint'])


class MakeTorchData(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings and one label per example."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the labels sequence as given."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example *idx*: one tensor per encoding field plus ``"labels"``.

        ``"labels"`` is a plain Python ``float``, obtained by passing the raw
        label through a one-element tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = float(torch.tensor([self.labels[idx]]))
        return sample

class RegressionTrainer(Trainer):
    """``Trainer`` variant that optimizes an L1 (mean absolute error) loss."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the L1 loss between the model's first output column and the labels.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch mapping. Its ``"labels"`` entry is removed before
                the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` rather than
                the loss alone.
            num_items_in_batch: Accepted for compatibility with newer
                ``transformers`` releases, whose ``Trainer`` passes this
                keyword to ``compute_loss``. It is unused: ``l1_loss``
                already averages over the batch.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        predictions = outputs[0][:, 0]
        # Flatten the labels so an (n, 1) label tensor cannot broadcast
        # against the (n,) predictions and silently yield an (n, n) loss.
        targets = labels.reshape(-1)
        loss = torch.nn.functional.l1_loss(predictions, targets)  # L1 == MAE
        return (loss, outputs) if return_outputs else loss
    
def rename_checkpoint_folder(output_dir):
    """Move each ``checkpoint-*`` entry of *output_dir* to ``<output_dir>/model``.

    Entries are handled in the order ``os.listdir`` returns them. Note that
    ``shutil.move`` places its source *inside* the destination when the
    destination is an existing directory, so if several entries match (or
    ``model`` already exists) the later ones end up nested under ``model``.

    Args:
        output_dir: Directory to scan for checkpoint entries.
    """
    target = os.path.join(output_dir, "model")
    checkpoint_entries = [
        entry for entry in os.listdir(output_dir) if entry.startswith("checkpoint-")
    ]
    for entry in checkpoint_entries:
        shutil.move(os.path.join(output_dir, entry), target)

# Marker that this setup cell (imports, configuration, helpers) ran to the end.
print('done')

In [None]:

# Fine-tune and validate one regression model per project in dataset_names.
# For each project: load the pre-split JSON files, train with early stopping,
# evaluate, write the metrics to JSON, and rename the checkpoint folder.
for dataset_name in dataset_names:

    print(f"start processing - {dataset_name}...")

    # Load the train / validation / test splits of this project.
    raw_train_data = Dataset.from_json(f'{dataset_splitted_path}/{dataset_name}/train.json')
    raw_val_data = Dataset.from_json(f'{dataset_splitted_path}/{dataset_name}/val.json')
    raw_test_data = Dataset.from_json(f'{dataset_splitted_path}/{dataset_name}/test.json')

    # Tokenize every split. NOTE(review): ds["test"] is built here but not
    # used anywhere else in this cell.
    ds = {"train": preprocess_function(raw_train_data), "validation": preprocess_function(raw_val_data), "test": preprocess_function(raw_test_data)}


    # Start from a fresh pretrained model for each project; num_labels=1 gives
    # a single output column, which RegressionTrainer reads as the prediction.
    model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)

    # Per-project directory for checkpoints and the metrics file.
    file_dir = f"{output_dir}/{dataset_name}"

    # Evaluate and save once per epoch; the checkpoint with the lowest
    # eval_loss is reloaded when training ends (load_best_model_at_end).
    training_args = TrainingArguments(
        output_dir= file_dir,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        metric_for_best_model="eval_loss",
        load_best_model_at_end=True,
        weight_decay=0.01,
        report_to="none",
        push_to_hub=False,
    )

    # Early stopping with a patience of 3 evaluations on the monitored metric.
    trainer = RegressionTrainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["validation"],
        compute_metrics=compute_metrics_for_regression,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    trainer.train()


    # Evaluate the fine-tuned model. No dataset is passed, so the trainer's
    # eval_dataset (the validation split) is used.
    metrics = trainer.evaluate()

    # Print evaluation results
    print(metrics)

    metrics_json = json.dumps(metrics, indent=2)

    # Metrics file sits next to the checkpoints: <file_dir>/<dataset_name>.json
    # (assumes file_dir already exists at this point -- presumably created by
    # the trainer when it saved a checkpoint; confirm).
    file_path = f"{file_dir}/{dataset_name}.json"

    # Write the JSON string to the file
    with open(file_path, 'w') as file:
        file.write(metrics_json)
        
    # Give the saved checkpoint folder the fixed name "model".
    rename_checkpoint_folder(file_dir)

    # Debugging aid (disabled): stop after the first dataset.
    # if 1 == 1:
    #     break