In [7]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from src.transformer_funcs import CustomDataset, new_input_to_prediction
from src.utils import injury_codes
import torch
import pandas as pd
import numpy as np
import evaluate
import random

# Seed every RNG this notebook can touch, not just Python's `random`.
# The classification head is freshly initialised by torch when the model is
# built, so without a torch seed two runs of the notebook train different models.
SEED = 35418
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

MODEL = "answerdotai/ModernBERT-base"
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
TRAIN_DATA = "C:/Users/gioc4/Documents/blog/data/falls/neis.csv"
MAX_TOKEN_LENGTH = 128  # passed to CustomDataset / new_input_to_prediction
DATA_SIZE = 1500        # number of leading CSV rows used for the experiment
TRAIN_SIZE = .90        # fraction of rows given to the training split

# init some values
tokenizer = AutoTokenizer.from_pretrained(MODEL)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")
label_encoder = LabelEncoder()

# load data: only the first DATA_SIZE rows of the file are used
neis_data = pd.read_csv(TRAIN_DATA).head(DATA_SIZE)

# get top 5 diagnoses: count rows per Diagnosis, most frequent first.
# `.head(5)` is explicitly positional; the index holds the Diagnosis values.
values = neis_data.groupby('Diagnosis').size().sort_values(ascending=False).head(5)
# keep only the rows whose Diagnosis is one of those five
train_data = neis_data[neis_data['Diagnosis'].isin(values.index.values)]


In [3]:
# some local funcs
def prep_data(dataframe):
    """Turn a dataframe into model inputs: narrative texts and encoded labels.

    Reads the 'Narrative_1' column as the text and the 'Diagnosis' column,
    mapped through `injury_codes`, as the label source.

    Note: this calls `fit_transform` on the module-level `label_encoder`,
    so every call re-fits the encoder on the dataframe it is given.

    Returns:
        (texts, labels): a list of narratives and the encoded labels
        produced by `label_encoder.fit_transform`.
    """
    narratives = dataframe['Narrative_1'].to_list()

    # Diagnosis -> injury_codes lookup first, then encode the result
    mapped_diagnoses = dataframe['Diagnosis'].map(injury_codes)
    encoded_labels = label_encoder.fit_transform(mapped_diagnoses)

    return narratives, encoded_labels

def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level `accuracy` metric.

    Args:
        eval_pred: a pair that unpacks into (model scores, reference labels).
            # assumes scores are (n_samples, n_classes) -- argmax is over axis 1

    Returns:
        Whatever `accuracy.compute` returns for the predicted vs. reference labels.
    """
    scores, references = eval_pred
    # highest-scoring class per row is the predicted label
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


In [9]:
# Build the training inputs:
#   1. extract texts and encoded labels from the filtered dataframe
#   2. hold out (1 - TRAIN_SIZE) of the rows for evaluation
#   3. wrap each split in a CustomDataset

X, y = prep_data(train_data)

# fixed random_state keeps the split identical across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_SIZE, random_state=42)

# both splits share the same tokenizer and maximum token length
train_dataset, test_dataset = (
    CustomDataset(texts, labels, tokenizer, MAX_TOKEN_LENGTH)
    for texts, labels in ((X_train, y_train), (X_test, y_test))
)

In [11]:
# Sequence-classification model on top of the pretrained checkpoint,
# with one output per class (five diagnoses are kept in the data).
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=5)

training_args = TrainingArguments(
    # where checkpoints are written
    output_dir="models",
    # optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and checkpoint once per epoch, then reload the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    # data
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # tokenisation / batching helpers
    processing_class=tokenizer,
    data_collator=data_collator,
    # reports accuracy at each evaluation
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

[A
[A
[A
[A
[A
[A
[A
                                       
[A                                             

  0%|          | 0/170 [08:55<?, ?it/s]      
[A
[A

{'eval_loss': 0.8709285855293274, 'eval_accuracy': 0.7155172413793104, 'eval_runtime': 14.1741, 'eval_samples_per_second': 8.184, 'eval_steps_per_second': 0.564, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
                                       
[A                                              

  0%|          | 0/170 [17:16<?, ?it/s]      
[A
[A

{'eval_loss': 0.663345992565155, 'eval_accuracy': 0.75, 'eval_runtime': 15.0007, 'eval_samples_per_second': 7.733, 'eval_steps_per_second': 0.533, 'epoch': 2.0}


                                       
100%|██████████| 132/132 [16:25<00:00,  7.47s/it]

{'train_runtime': 985.5166, 'train_samples_per_second': 2.115, 'train_steps_per_second': 0.134, 'train_loss': 0.9155904018517697, 'epoch': 2.0}





TrainOutput(global_step=132, training_loss=0.9155904018517697, metrics={'train_runtime': 985.5166, 'train_samples_per_second': 2.115, 'train_steps_per_second': 0.134, 'total_flos': 177538530155520.0, 'train_loss': 0.9155904018517697, 'epoch': 2.0})

In [5]:
# Reload the fine-tuned artefacts from disk so inference does not require
# re-running training. The tokenizer and the model are read from the same
# checkpoint directory (step 132 is the final step in the training log).
tokenizer = AutoTokenizer.from_pretrained("models/checkpoint-132")
model = AutoModelForSequenceClassification.from_pretrained("models/checkpoint-132", num_labels=5)

In [8]:
# Pull rows the model has NOT seen. The training/eval data is the first
# DATA_SIZE rows of this file, so the previous slice (rows 1000-1250) overlapped
# it and made the "new" predictions look better than they are. Take the
# 250 rows immediately after the training window instead.
n_new_rows = 250
newdata = pd.read_csv(TRAIN_DATA).iloc[DATA_SIZE:DATA_SIZE + n_new_rows]

# keep only rows whose Diagnosis is one of the classes the model was trained on
newdata = newdata[newdata['Diagnosis'].isin(values.index)]
assert not newdata.empty, "no held-out rows with a known Diagnosis; is the CSV longer than DATA_SIZE?"

new_text_input = newdata['Narrative_1'].tolist()

In [9]:
# to get new preds we pass the input through the tokenizer
# and get the tokenized input and attention mask
outputs = new_input_to_prediction(model, new_text_input, tokenizer, MAX_TOKEN_LENGTH)

# convert predictions to probabilities, then get max probability as label
# assumes outputs.logits is (n_samples, n_classes) -- TODO confirm in new_input_to_prediction
predictions = torch.nn.functional.softmax(outputs.logits, dim=1)
# move to CPU / numpy before handing to pandas so this also works on a GPU model
pred_ids = predictions.argmax(1).detach().cpu().numpy()

# `pred` is a class index while `label` is a raw Diagnosis code, so the two
# cannot be compared directly. Decode both to the same injury-name space.
# The encoder is (re)fit on the training labels here so the index -> name
# mapping matches training even if the data-prep cell was not run in this kernel.
label_encoder.fit(train_data["Diagnosis"].map(injury_codes))

df_preds = pd.DataFrame(
    {
        "text": new_text_input,
        "label": newdata["Diagnosis"],                        # raw Diagnosis code
        "pred": pred_ids,                                     # predicted class index
        "label_name": newdata["Diagnosis"].map(injury_codes), # true class, decoded
        "pred_name": label_encoder.inverse_transform(pred_ids),  # predicted class, decoded
    }
)