In [None]:
from pathlib import Path
import pickle

import datasets
import evaluate
import numpy as np
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments

In [None]:
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load this from a trusted source.
dataset = pickle.loads(Path("prepared_role_domain_1000").read_bytes())
# Peek at the last record to check the expected fields are present.
dataset[-1]

{'function_definition': '\ndef calculate_portfolio_return(portfolio_values: pd.DataFrame) -> pd.Series:\n    "',
 'code': "\n    portfolio_values['daily_return'] = portfolio_values['portfolio_value'].pct_change()\n    portfolio_values['cumulative_return'] = (1 + portfolio_values['daily_return']).cumprod()\n    return portfolio_values['cumulative_return']",
 'comment': '\n# Calculate the daily and cumulative returns for the portfolio',
 'explanation': '\nThe comment accurately describes the purpose of the code, which is to calculate the daily and cumulative returns for a portfolio based on the provided portfolio values.',
 'correct': True}

In [None]:
# Turn the list of example dicts into a Hugging Face Dataset (one column per key).
xy = datasets.Dataset.from_list(dataset)
xy

Dataset({
    features: ['function_definition', 'code', 'comment', 'explanation', 'correct'],
    num_rows: 916
})

In [None]:
# Checkpoint name shared by the tokenizer here and the classifier loaded later.
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Show the tokenizer's special tokens together with their ids.
tokenizer.all_special_tokens, tokenizer.all_special_ids

(['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'],
 [50280, 50282, 50283, 50281, 50284])

In [None]:
# Literal separator inserted between the text fields; it is spelled like the
# tokenizer's "[SEP]" special token shown in the previous cell's output.
sep = "[SEP]"

In [None]:
# Take the first raw record to experiment with the input format.
example = xy[0]
example

{'function_definition': '"def prepare_visual_for_report(data, title, filename):"',
 'code': "\n    import matplotlib.pyplot as plt\n    \n    # Create a figure and axis\n    fig, ax = plt.subplots(figsize=(8, 6))\n    \n    # Plot the data\n    ax.plot(data)\n    \n    # Set the title and axis labels\n    ax.set_title(title)\n    ax.set_xlabel('X')\n    ax.set_ylabel('Y')\n    \n    # Save the figure to a file\n    plt.savefig(filename)",
 'comment': '"# Create a figure and axis"',
 'explanation': '"The comment is incorrect because it does not accurately describe the purpose of the code. The code is creating a figure and axis, but the comment does not mention the purpose of creating them."',
 'correct': False}

In [None]:
# Join definition, code and comment with the separator and look at the
# token ids the tokenizer produces for the combined string.
text = sep.join(
    [example["function_definition"], example["code"], example["comment"]]
)
tokens = tokenizer.encode(text)
print(tokens)

[50281, 3, 1545, 10347, 64, 34309, 64, 1542, 64, 16223, 9, 2203, 13, 4060, 13, 19722, 2262, 3, 50282, 187, 50274, 2948, 1111, 14095, 4658, 15, 4789, 14095, 347, 499, 85, 187, 50274, 187, 50274, 4, 13119, 247, 4677, 285, 7844, 187, 50274, 926, 13, 4589, 426, 499, 85, 15, 2377, 42045, 9, 926, 3281, 10190, 25, 13, 721, 1228, 187, 50274, 187, 50274, 4, 40185, 253, 941, 187, 50274, 991, 15, 14095, 9, 2203, 10, 187, 50274, 187, 50274, 4, 6618, 253, 4060, 285, 7844, 13301, 187, 50274, 991, 15, 1178, 64, 5564, 9, 5564, 10, 187, 50274, 991, 15, 1178, 64, 89, 1968, 2073, 57, 3401, 187, 50274, 991, 15, 1178, 64, 1190, 1492, 2073, 58, 3401, 187, 50274, 187, 50274, 4, 23017, 253, 4677, 281, 247, 1873, 187, 50274, 46150, 15, 15261, 926, 9, 17479, 10, 50282, 187, 50274, 2948, 1111, 14095, 4658, 15, 4789, 14095, 347, 499, 85, 187, 50274, 187, 50274, 4, 13119, 247, 4677, 285, 7844, 187, 50274, 926, 13, 4589, 426, 499, 85, 15, 2377, 42045, 9, 926, 3281, 10190, 25, 13, 721, 1228, 187, 50274, 187, 50274, 

In [None]:
sep = "[SEP]"


def prepare_input(example):
    """Tokenize one record and attach the tokenizer outputs to it.

    The function definition, code and comment are concatenated with ``sep``
    between them and tokenized with truncation at 1024 tokens. Every key the
    tokenizer returns is written onto ``example``, which is modified in place.

    Args:
        example: Mapping with "function_definition", "code" and "comment"
            string fields.

    Returns:
        The same ``example`` object with the tokenizer's fields added.
    """
    fields = (example["function_definition"], example["code"], example["comment"])
    encoded = tokenizer(sep.join(fields), truncation=True, max_length=1024)
    for key, value in encoded.items():
        example[key] = value
    return example

In [None]:
# Tokenize every record, drop the raw text columns, turn the boolean
# "correct" flag into a 2-class label and expose it under the name "labels".
#
# TODO: revisit the label handling ("hmmmmm" in the original note).
# NOTE(review): presumably the rename is motivated by this excerpt from the
# Trainer source, which looks up the "labels" key — confirm:
#     if (self.label_smoother is not None or self.compute_loss_func is not None) and "labels" in inputs:
#         labels = inputs.pop("labels")      # (line 3716)
#     else:
#         labels = None                      # (line 3718)
text_columns = ["function_definition", "code", "comment", "explanation"]
prepared_xy = (
    xy.map(prepare_input, remove_columns=text_columns)
    .cast_column("correct", datasets.ClassLabel(num_classes=2))
    .rename_column("correct", "labels")
)

# Check the first prepared record: token count and the remaining fields.
example = prepared_xy[0]
len(example["input_ids"]), list(example.keys())

Map:   0%|          | 0/916 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/916 [00:00<?, ? examples/s]

(261, ['labels', 'input_ids', 'attention_mask'])

In [None]:
# Inspect the first record's label and the dataset schema after the cast/rename.
example["labels"], prepared_xy.features

(0,
 {'labels': ClassLabel(names=['0', '1'], id=None),
  'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
  'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)})

In [None]:
# Token-count statistics (min, mean, max) over all prepared records.
lengths = [len(row["input_ids"]) for row in prepared_xy]
min(lengths), sum(lengths) / len(lengths), max(lengths)

(29, 341.23799126637556, 841)

In [None]:
# Hold out 10% of the records as the evaluation split (fixed seed for repeatability).
# NOTE(review): this rebinds `prepared_xy` from a single dataset to a dict of
# splits, so re-running this cell alone presumably fails — re-run the
# preparation cell first. Confirm.
prepared_xy = prepared_xy.train_test_split(test_size=0.1, seed=0)
prepared_xy

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 824
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 92
    })
})

In [None]:
# Load the pretrained checkpoint with a 2-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Bundle accuracy, F1, precision and recall into one metric object.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with ``clf_metrics``.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits along the last axis.

    Returns:
        Whatever ``clf_metrics.compute`` returns for the predicted classes
        and the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.asarray(scores).argmax(axis=-1)
    return clf_metrics.compute(predictions=predicted_classes, references=references)

In [None]:
# Baseline run: label smoothing disabled, weight_decay not set.
tr_args = TrainingArguments(
    # Output and bookkeeping
    output_dir="comrel",
    seed=0,
    logging_steps=10,
    # Evaluation schedule
    eval_strategy="epoch",
    eval_on_start=True,
    # Batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    group_by_length=True,
    dataloader_num_workers=0,
    # Optimisation
    num_train_epochs=5,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.2,
    adam_epsilon=1e-6,
    # Labels and loss
    label_names=["labels"],
    label_smoothing_factor=0.0,
    # Precision and compilation
    fp16=True,
    torch_compile=False,
)

In [None]:
# Pads the variable-length tokenized records when they are collated into a batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [None]:
# Wire model, data splits, collator and metrics together for the baseline run.
trainer = Trainer(
    args=tr_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=prepared_xy["train"],
    eval_dataset=prepared_xy["test"],
)

In [None]:
trainer.train()
# Results recorded from a previous run of this cell (epoch 5 row):
#   training loss 0.028600 | validation loss 0.779370 | accuracy 0.728261
#   F1 0.712644 | precision 0.756098 | recall 0.673913

In [None]:
# Save the baseline model under the "finetuned" path.
trainer.save_model("finetuned")

In [None]:
def train(args, filename=None):
    """Fine-tune a freshly loaded classifier with the given training arguments.

    A new model is loaded from ``model_id`` on every call. The notebook-level
    ``prepared_xy`` splits, ``data_collator`` and ``compute_metrics`` are used
    for training and evaluation.

    Args:
        args: ``TrainingArguments`` for this run.
        filename: Optional path passed to ``Trainer.save_model`` after
            training. Nothing is saved when it is ``None``.

    Returns:
        None.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
    trainer = Trainer(
        model=model,
        # Use the arguments passed by the caller. Reading the notebook-level
        # `tr_args` here would silently ignore the `args` parameter.
        args=args,
        train_dataset=prepared_xy["train"],
        eval_dataset=prepared_xy["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    if filename is not None:
        trainer.save_model(filename)

In [None]:
# Run 2: same schedule as the baseline, with label smoothing 0.05 and
# weight decay 0.03.
tr_args = TrainingArguments(
    # Output and bookkeeping
    output_dir="comrel",
    seed=0,
    logging_steps=10,
    # Evaluation schedule
    eval_strategy="epoch",
    eval_on_start=True,
    # Batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    group_by_length=True,
    dataloader_num_workers=0,
    # Optimisation
    num_train_epochs=5,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.2,
    adam_epsilon=1e-6,
    weight_decay=0.03,
    # Labels and loss
    label_names=["labels"],
    label_smoothing_factor=0.05,
    # Precision and compilation
    fp16=True,
    torch_compile=False,
)

train(tr_args)
# Results recorded from a previous run of this cell (epoch 5 row):
#   training loss 0.268400 | validation loss 0.514940 | accuracy 0.771739
#   F1 0.783505 | precision 0.745098 | recall 0.826087

In [None]:
# Run 3: 10 epochs, warmup ratio 0.1, label smoothing 0.08, weight decay 0.05.
tr_args = TrainingArguments(
    # Output and bookkeeping
    output_dir="comrel",
    seed=0,
    logging_steps=10,
    # Evaluation schedule
    eval_strategy="epoch",
    eval_on_start=True,
    # Batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    group_by_length=True,
    dataloader_num_workers=0,
    # Optimisation
    num_train_epochs=10,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    adam_epsilon=1e-6,
    weight_decay=0.05,
    # Labels and loss
    label_names=["labels"],
    label_smoothing_factor=0.08,
    # Precision and compilation
    fp16=True,
    torch_compile=False,
)

train(tr_args)
# Results recorded from a previous run of this cell (epoch 10 row):
#   training loss 0.172100 | validation loss 0.624975 | accuracy 0.760870
#   F1 0.760870 | precision 0.760870 | recall 0.760870

In [None]:
# Run 4: learning rate 8e-5, Adam epsilon 1e-5, 8 epochs, warmup ratio 0.2,
# label smoothing 0.05, weight decay 0.03.
tr_args = TrainingArguments(
    # Output and bookkeeping
    output_dir="comrel",
    seed=0,
    logging_steps=10,
    # Evaluation schedule
    eval_strategy="epoch",
    eval_on_start=True,
    # Batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    group_by_length=True,
    dataloader_num_workers=0,
    # Optimisation
    num_train_epochs=8,
    learning_rate=8e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.2,
    adam_epsilon=1e-5,
    weight_decay=0.03,
    # Labels and loss
    label_names=["labels"],
    label_smoothing_factor=0.05,
    # Precision and compilation
    fp16=True,
    torch_compile=False,
)

train(tr_args)
# Results recorded from a previous run of this cell (epoch 8 row):
#   training loss 0.137600 | validation loss 0.624936 | accuracy 0.706522
#   F1 0.715789 | precision 0.693878 | recall 0.739130

In [None]:
# Run 5: much lower learning rate (5e-6) over 25 epochs, no length grouping,
# and checkpointing on the best F1 with the best model reloaded at the end.
# NOTE(review): the best model is selected on the same split that is reported
# as the evaluation result, so the numbers below may be optimistic — confirm
# whether a separate validation split is wanted.
tr_args = TrainingArguments(
    # Output and bookkeeping
    output_dir="comrel",
    seed=0,
    logging_steps=10,
    # Evaluation schedule and checkpoint selection
    eval_strategy="epoch",
    eval_on_start=True,
    save_strategy="best",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    # Batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    group_by_length=False,
    dataloader_num_workers=0,
    # Optimisation
    num_train_epochs=25,
    learning_rate=5e-6,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    adam_epsilon=1e-5,
    weight_decay=0.05,
    # Labels and loss
    label_names=["labels"],
    label_smoothing_factor=0.05,
    # Precision and compilation
    fp16=True,
    torch_compile=False,
)

train(tr_args, "finetuned")
# Results recorded from a previous run of this cell (epoch 12 row):
#   training loss 0.200000 | validation loss 0.619146 | accuracy 0.793478
#   F1 0.804124 | precision 0.764706 | recall 0.847826