In [1]:
import numpy as np
import pandas as pd
from pandas_streaming.df import StreamingDataFrame
import os

from transformers import AutoTokenizer, TrainingArguments, Trainer, ModernBertForSequenceClassification
from datasets import Dataset, load_dataset


import evaluate
import torch

# Select PyTorch's "high" float32 matmul precision setting (trades some
# float32 matmul precision for speed on hardware that supports it).
torch.set_float32_matmul_precision('high')

# Accuracy metric from the `evaluate` library; compute_metrics reads this global.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    for each example is the index of its largest logit along the last axis.

    Returns the result of ``metric.compute`` for those predictions against
    the reference labels.
    """
    class_scores, gold_labels = eval_pred

    # Highest-scoring class per example.
    predicted_classes = np.argmax(class_scores, axis=-1)

    return metric.compute(predictions=predicted_classes, references=gold_labels)

# Fixed seed so the train/eval partition is the same on every kernel restart.
# This matters because training resumes from saved checkpoints: a different
# split on resume would let rows that were previously held out for evaluation
# end up in the training set.
SPLIT_SEED = 42

# Load the tab-separated pairwise file; the rows are accessed via the "train" split.
dataset = load_dataset("csv", data_files="../data/pairwise-model-data/pairedwise-tweets.tsv", delimiter="\t")

# No separate `dataset.shuffle()` call: it returns a new object instead of
# shuffling in place, so calling it without using the result does nothing.
# `train_test_split` shuffles before splitting by default; seed it instead.
split = dataset["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)

# 'Unnamed: 0' is presumably a leftover index column from the TSV export
# (TODO confirm); it is dropped from both partitions.
train_dataset = split["train"].remove_columns('Unnamed: 0')
eval_dataset = split["test"].remove_columns('Unnamed: 0')

In [2]:
def preprocess_function(examples):
    """Build the single text input for one row and tokenize it.

    The text is ``"[CLS] " + query``, followed by ``" [SEP] " + paper`` for
    each of ``paper1`` and ``paper2`` whose value is not ``None``.

    Args:
        examples: mapping with a ``"query"`` string and ``"paper1"`` /
            ``"paper2"`` entries that are strings or ``None``.

    Returns:
        The output of the module-level ``tokenizer`` for that text, with
        truncation enabled.

    NOTE(review): the tokenizer is called with its default settings, which
    normally add special tokens on their own; the literal "[CLS] " prefix
    therefore likely yields a duplicated leading token. The text format is
    kept as is so inputs stay compatible with already-trained checkpoints --
    confirm whether this is intended.
    """
    segments = ["[CLS] " + examples["query"]]
    segments.extend(
        examples[key] for key in ("paper1", "paper2") if examples[key] is not None
    )
    full_str = " [SEP] ".join(segments)

    # No `.to('cuda')` here: without `return_tensors` the tokenizer output holds
    # plain Python lists, so there is nothing to move to a device, and
    # Dataset.map stores the result as columns anyway. The Trainer moves
    # batches to the right device itself.
    return tokenizer(full_str, truncation=True)


# Binary task: class ids 0 and 1 map to the string names "0" and "1", and back.
id2label = {class_id: str(class_id) for class_id in (0, 1)}
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

# Tokenizer for the ModernBERT-base checkpoint; preprocess_function reads this global.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")


In [3]:
# Tokenize the training partition by mapping preprocess_function over it.
train_dataset_tokenized = train_dataset.map(preprocess_function)

Map:   0%|          | 0/61350 [00:00<?, ? examples/s]

In [4]:
# Tokenize the held-out evaluation partition with the same preprocessing.
test_dataset_tokenized = eval_dataset.map(preprocess_function)

Map:   0%|          | 0/15338 [00:00<?, ? examples/s]

In [None]:
# Pretrained ModernBERT-base encoder with a 2-class classification head
# (the head weights are newly initialized and learned during fine-tuning).
model = ModernBertForSequenceClassification.from_pretrained(
    "answerdotai/ModernBERT-base",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir="../models/pairwise-classifier-large",
    learning_rate=2e-6,
    # NOTE(review): no data collator or tokenizer is passed to the Trainer, so
    # examples are presumably not padded to a common length; a batch size of 1
    # sidesteps that -- confirm before raising it.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate and save on the same step schedule so that the best checkpoint
    # can be restored at the end (load_best_model_at_end).
    eval_strategy="steps",
    eval_steps=16000,
    save_strategy="steps",
    save_steps=16000,
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=True,
    optim="adamw_torch",
    save_total_limit=2,
    eval_on_start=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=test_dataset_tokenized,
    compute_metrics=compute_metrics,
)

# Resume only when the output directory already contains a "checkpoint-<step>"
# folder. Passing resume_from_checkpoint=True unconditionally raises on a fresh
# run because there is nothing to resume from.
checkpoint_root = training_args.output_dir
checkpoint_prefix = "checkpoint-"
has_checkpoint = os.path.isdir(checkpoint_root) and any(
    entry.startswith(checkpoint_prefix)
    and entry[len(checkpoint_prefix):].isdigit()
    and os.path.isdir(os.path.join(checkpoint_root, entry))
    for entry in os.listdir(checkpoint_root)
)

trainer.train(resume_from_checkpoint=has_checkpoint)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
W0510 15:41:53.255000 159379 torch/_inductor/utils.py:1250] [1/0] Not enough SMs to use max_autotune_gemm mode


Step,Training Loss,Validation Loss


: 