In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from src.transformer_funcs import CustomDataset, new_input_to_prediction
from src.utils import injury_codes
import torch
import pandas as pd
import numpy as np
import evaluate
import random

random.seed(35418)

MODEL = "answerdotai/ModernBERT-base"
TRAIN_DATA = "C:/Users/gioc4/Documents/blog/data/falls/neis.csv"
MAX_TOKEN_LENGTH = 256
DATA_SIZE = 2000
TRAIN_SIZE = .90

# init some values
tokenizer = AutoTokenizer.from_pretrained(MODEL)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
label_encoder = LabelEncoder()

# load data
neis_data = pd.read_csv(TRAIN_DATA).head(DATA_SIZE)

# get top 5 diagnoses
values = neis_data.groupby('Diagnosis').size().sort_values(ascending=False)[:5]
train_data = neis_data[neis_data['Diagnosis'].isin(values.index.values)]


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# some local funcs
def prep_data(dataframe):
    X = dataframe['Narrative_1'].to_list()
    y = label_encoder.fit_transform(dataframe['Diagnosis'].map(injury_codes))

    # return dict of encoded labels
    keys = label_encoder.classes_
    values = label_encoder.transform(label_encoder.classes_)
    value_dict = dict(zip(keys, map(int, values)))


    return X, y, value_dict

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # get preds using just the max predicted value
    acc = accuracy.compute(predictions=predictions, references=labels)

    return acc


In [3]:
# set up data
# prepare the text and labels, train-test split, and init torch datasets

X, y, value_dict = prep_data(train_data)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=TRAIN_SIZE, random_state=42
)


train_dataset = CustomDataset(X_train, y_train, tokenizer, MAX_TOKEN_LENGTH)
test_dataset = CustomDataset(X_test, y_test, tokenizer, MAX_TOKEN_LENGTH)

In [4]:
# set up model

# set labels for inputs
id2label = dict((v,k) for k,v in value_dict.items())
label2id = value_dict

model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=5, id2label=id2label, label2id=label2id)

training_args = TrainingArguments(
    output_dir="models",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                
 50%|█████     | 87/174 [20:16<16:54, 11.66s/it]

{'eval_loss': 0.8524370193481445, 'eval_accuracy': 0.6753246753246753, 'eval_runtime': 37.0549, 'eval_samples_per_second': 4.156, 'eval_steps_per_second': 0.27, 'epoch': 1.0}


                                                 
100%|██████████| 174/174 [40:56<00:00, 11.84s/it]

{'eval_loss': 0.5316950678825378, 'eval_accuracy': 0.7727272727272727, 'eval_runtime': 37.2713, 'eval_samples_per_second': 4.132, 'eval_steps_per_second': 0.268, 'epoch': 2.0}


100%|██████████| 174/174 [40:58<00:00, 14.13s/it]

{'train_runtime': 2458.4675, 'train_samples_per_second': 1.126, 'train_steps_per_second': 0.071, 'train_loss': 0.8236960926275144, 'epoch': 2.0}





TrainOutput(global_step=174, training_loss=0.8236960926275144, metrics={'train_runtime': 2458.4675, 'train_samples_per_second': 1.126, 'train_steps_per_second': 0.071, 'total_flos': 471618667438080.0, 'train_loss': 0.8236960926275144, 'epoch': 2.0})

In [3]:
# load model
model = AutoModelForSequenceClassification.from_pretrained("models/checkpoint-174", num_labels=5)
tokenizer = AutoTokenizer.from_pretrained("models/checkpoint-174")

In [4]:
newdata = pd.read_csv(TRAIN_DATA)
newdata = newdata.iloc[2500:3000]
newdata = newdata[newdata['Diagnosis'].isin(values.index)]

new_text_input = newdata['Narrative_1'].tolist()

In [5]:
# to get new preds we pass the input through the tokenizer
# and get the tokenized input and attention mask
outputs = new_input_to_prediction(model, new_text_input, tokenizer, MAX_TOKEN_LENGTH)

# convert predictions to probabilities, then get max probability as label
predictions = torch.nn.functional.softmax(outputs.logits, dim=1)
df_preds = pd.DataFrame(
    {
        "text": new_text_input,
        "label": newdata["Diagnosis"],
        "pred": predictions.argmax(1),
    }
)

In [8]:
torch.nn.functional.softmax(outputs.logits, dim=1)

tensor([[2.3225e-02, 7.6291e-01, 1.9845e-01, 8.7046e-03, 6.7145e-03],
        [5.4486e-01, 1.5052e-01, 4.9559e-02, 6.9284e-02, 1.8578e-01],
        [3.4269e-02, 3.8225e-02, 8.2826e-01, 9.0455e-02, 8.7931e-03],
        ...,
        [3.1903e-03, 1.3829e-03, 1.7341e-03, 1.6485e-04, 9.9353e-01],
        [1.9445e-03, 9.5668e-01, 3.6274e-02, 4.3934e-03, 7.1027e-04],
        [9.6429e-05, 9.9820e-01, 8.8022e-04, 7.7958e-04, 4.7722e-05]])

In [6]:
df_preds['pred_LABEL'] = df_preds['pred'].map(model.config.id2label)
df_preds

Unnamed: 0,text,label,pred,pred_LABEL
2501,27 YOM FELL SKIING AND INJ HAND ON MOUNTAIN D...,57,1,Fracture
2502,7 YOF IN HOUSE AND FELL AND HIT HAND ON FURNIT...,57,0,"Contusions, Abrasions"
2503,5 MOM ROLLED OFF BED LANDING ON TILE FLOOR AND...,53,2,Internal organ injury
2504,26 YOM FELL SNOWBOARDING ONTO KNEE INJ IT DX...,57,2,Internal organ injury
2505,49 YOF PLAYING SOCCER AND KNOCKED TO GROUOND A...,53,0,"Contusions, Abrasions"
...,...,...,...,...
2992,2 YOM FELL INTO DOORINJURED LIP DX LACERATION LIP,59,3,Laceration
2994,25 YOM INJURED FINGER ON A BROKEN MIRROR DX LA...,59,3,Laceration
2995,45 YOF CO PAIN RIGHT ANKLE AND SWELLING HURTS ...,71,4,Other/Not Stated
2997,22 YOM FELL FROM A LADDER AND INJURED LEFT ANK...,57,1,Fracture
