In [None]:
from pathlib import Path
import pickle

import datasets
import evaluate
import numpy as np
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments

In [None]:
# Load the two pickled parts of the prepared dataset and join them.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this cell on files that come from a trusted source.
with Path("prepared_role_domain_1000").open("rb") as f:
    dataset_p1 = pickle.load(f)
with Path("prepared_role_domain_1000_10000").open("rb") as f:
    dataset_p2 = pickle.load(f)
# Concatenate both parts (assumes each unpickles to a list of records — TODO confirm).
dataset = dataset_p1 + dataset_p2
# Display the last record as a quick look at the record schema.
dataset[-1]

{'function_definition': '\ndef forecast_sales(sales_data: pd.DataFrame, forecast_horizon: int) -> pd.DataFrame:\n    \\"\\"\\"\n    Forecast future sales using an ARIMA model.\n    \n    Args:\n        sales_data (pd.DataFrame): DataFrame containing historical sales data.\n        forecast_horizon (int): Number of periods to forecast.\n    \n    Returns:\n        pd.DataFrame: DataFrame containing forecasted sales.\n    \\"\\"\\"',
 'code': "\n    sales_data = sales_data.set_index('date')\n    \n    # Fit an ARIMA model to the sales data\n    model = ARIMA(sales_data['sales'], order=(1, 1, 1))\n    model_fit = model.fit()\n    \n    # Generate forecasts for the desired horizon\n    forecast = model_fit.forecast(steps=forecast_horizon)\n    \n    # Create a DataFrame with the forecasted sales\n    forecast_df = pd.DataFrame({'date': pd.date_range(start=sales_data.index[-1] + pd.Timedelta(days=1), \n                                                     periods=forecast_horizon, \n        

In [None]:
# Total number of records after joining both parts.
len(dataset)

9022

In [None]:
# Wrap the list of records in a Hugging Face Dataset; the bare `xy` displays its
# column names and row count.
xy = datasets.Dataset.from_list(dataset)
xy

Dataset({
    features: ['function_definition', 'code', 'comment', 'explanation', 'correct'],
    num_rows: 9022
})

In [None]:
# Checkpoint name shared by the tokenizer here and the classifier loaded in train().
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
sep = "[SEP]"


def prepare_input(example):
    """Tokenize one record and attach the tokenizer outputs to it.

    The function definition, the code body and the comment are joined into a
    single string with ``sep`` between them, then tokenized with truncation at
    1024 tokens. Every field returned by the tokenizer is written onto
    ``example``, which is modified in place and returned.
    """
    joined_text = sep.join(
        (example["function_definition"], example["code"], example["comment"])
    )
    encoding = tokenizer(joined_text, truncation=True, max_length=1024)
    for field, value in encoding.items():
        example[field] = value
    return example

In [None]:
# Tokenize every row and drop the raw text columns, leaving the tokenizer
# outputs plus the "correct" column.
prepared_xy = xy.map(prepare_input, remove_columns=["function_definition", "code", "comment", "explanation"])
# Turn "correct" into a two-class label feature, then rename it to "labels".
prepared_xy = prepared_xy.cast_column("correct", datasets.ClassLabel(num_classes=2))
prepared_xy = prepared_xy.rename_column("correct", "labels")
# Peek at the first encoded row: its token count and the remaining column names.
example = prepared_xy[0]
len(example["input_ids"]), list(example.keys())

Map:   0%|          | 0/9022 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/9022 [00:00<?, ? examples/s]

(149, ['labels', 'input_ids', 'attention_mask'])

In [None]:
# Token count of every encoded row, summarised as (shortest, mean, longest).
lengths = [len(row["input_ids"]) for row in prepared_xy]
min(lengths), sum(lengths) / len(lengths), max(lengths)

(26, 212.66005320328088, 476)

In [None]:
# Hold out 10% of the rows as the "test" split, using a fixed seed.
# NOTE: this rebinds `prepared_xy` to the split result (with "train" and "test"
# entries), so re-running this cell on its own would call train_test_split on
# the already-split object; re-run the preparation cell first.
prepared_xy = prepared_xy.train_test_split(test_size=0.1, seed=0)
prepared_xy

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 8119
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 903
    })
})

In [None]:
# Combine accuracy, F1, precision and recall into one metric object so a single
# compute() call reports all of them.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    """Score a batch of predictions with the combined classification metrics.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of each
    row is the index of its largest logit along the last axis; the result of
    ``clf_metrics.compute`` is returned unchanged.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return clf_metrics.compute(references=gold_labels, predictions=predicted_classes)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

In [None]:
# Collator that pads examples when batching, using the tokenizer's padding
# settings; padding=True pads to the longest sequence in each batch.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
)

In [None]:
def train(args, filename=None):
    """Fine-tune a freshly loaded two-class classifier and optionally save it.

    A new model is loaded from ``model_id`` on every call, trained on
    ``prepared_xy["train"]`` and evaluated on ``prepared_xy["test"]``.

    Args:
        args: The ``TrainingArguments`` that configure this run.
        filename: Where to save the trained model via ``Trainer.save_model``.
            Nothing is saved when this is ``None``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
    trainer = Trainer(
        model=model,
        # Use the arguments supplied by the caller rather than a notebook-level
        # global, so different configurations can be passed in and the function
        # does not depend on cell execution order.
        args=args,
        train_dataset=prepared_xy["train"],
        eval_dataset=prepared_xy["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    if filename is not None:
        trainer.save_model(filename)

In [None]:
# Training configuration for the run below. Checkpoints go to "comrel".
tr_args = TrainingArguments(
    output_dir="comrel",
    eval_strategy="epoch",  # evaluate once per epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=5e-6,
    adam_epsilon=1e-5,
    num_train_epochs=25,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    seed=0,
    fp16=True,
    dataloader_num_workers=0,
    label_names=["labels"],  # matches the column renamed from "correct"
    label_smoothing_factor=0.05,
    weight_decay=0.05,
    torch_compile=False,
    eval_on_start=True,  # also evaluate before the first training step
    group_by_length=False,
    logging_steps=10,
    # Checkpointing is tied to the F1 score from compute_metrics, and the best
    # checkpoint is reloaded when training finishes.
    save_strategy="best",
    metric_for_best_model="f1",
    load_best_model_at_end=True
)

# Run the fine-tuning and save the resulting model to "finetuned_bigger".
train(tr_args, "finetuned_bigger")
# Metrics recorded from a previous run (row for epoch 7):
# Epoch 	Training Loss 	Validation Loss 	Accuracy 	F1 	        Precision 	Recall
# 7 	    0.317800 	    0.472409 	        0.813953 	0.818575 	0.768763 	0.875289