In [None]:
import numpy as np
import pandas as pd
import os

from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

import evaluate

metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # convert the logits to their predicted class
    predictions = np.argmax(logits, axis=-1)
    print(predictions, labels)
    return metric.compute(predictions=predictions, references=labels)

df = pd.read_csv( "../data/pairwise-model-data/pairedwise-tweets.tsv", sep = '\t')
df = pd.DataFrame(df[:1000])

dataset = Dataset.from_pandas(df)
split = dataset.train_test_split(test_size=0.2)
train_dataset = split["train"].remove_columns('Unnamed: 0')
eval_dataset = split["test"].remove_columns('Unnamed: 0')

1
0
0
1


In [56]:
def preprocess_function(examples):
    full_str = "[CLS] "
    full_str += examples["query"]
    
    for key in ["abs1", "abs2"]:
        if examples[key] != None:
            full_str += " [SEP] " + examples[key]
    
    return tokenizer(full_str, truncation=True, max_length=4056).to('cuda')


id2label = {0: "0", 1: "1"}
label2id = {"0": 0, "1": 1}

tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
model = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
train_dataset_tokenized = train_dataset.map(preprocess_function)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

1
0
0
1


In [58]:
test_dataset_tokenized = eval_dataset.map(preprocess_function)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:

training_args = TrainingArguments(
    output_dir="../models/pairwise-classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=400,
    save_strategy="steps",
    save_steps=800,
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=True,
    optim="adamw_torch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=test_dataset_tokenized,
    compute_metrics=compute_metrics
)

trainer.train(resume_from_checkpoint=False)

1
0
0
1


Step,Training Loss,Validation Loss,F1
400,No log,0.707217,0.675497
800,1.339200,0.043198,0.995074


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] [0 1 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1 1 1 0 0 1 1 1 0 0 0 0
 1 1 0 0 0 0 0 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 1 1 0 1 1 1 1
 1 0 1 1 1 1 1 0 1 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1 0 1
 0 0 1 0 1 0 1 1 1 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 1 1 1 1 0 0 1 1 0 1 1 1 0
 0 0 1 0 0 0 1 0 0 1 1 1 1 1 1 0 0 0 1 0 0 1 1 0 1 1 0 0 0 0 0 1 0 1 1 0 1
 1 0 1 1 1 1 0 0 0 1 1 1 0 1 1]
[0 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1 1 1 0 0 1 1 1 0 0 0 0
 1 1 0 0 0 0 0 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 1 1 0 1 1 1 1
 1 0 1 1 1 1 1 0 1 0 1 0 1 0 1 0 0 1

TrainOutput(global_step=800, training_loss=0.876290078163147, metrics={'train_runtime': 124.8324, 'train_samples_per_second': 6.409, 'train_steps_per_second': 6.409, 'total_flos': 378671815183956.0, 'train_loss': 0.876290078163147, 'epoch': 1.0})