In [None]:
from sentence_transformers import SentenceTransformer, util,  losses
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

from datasets import Dataset
import torch
import pandas as pd
import re
import numpy as np

MODEL = "answerdotai/ModernBERT-base"
TRAIN_DATA = "C:/Users/gioc4/Documents/blog/data/falls/falls.csv"
EVAL_DATA = "C:/Users/gioc4/Documents/blog/data/falls/neis.csv"
MAX_TOKEN_LENGTH = 256
DATA_SIZE = 1000
TRAIN_SIZE = .90

# load data
falls_data = pd.read_csv(TRAIN_DATA).head(DATA_SIZE)
neis_data = pd.read_csv(EVAL_DATA).head(DATA_SIZE)

# define a sentence transformer model
model = SentenceTransformer(MODEL)

No sentence-transformers model found with name answerdotai/ModernBERT-base. Creating a new one with mean pooling.


In [31]:
# first, we need to set up a training dataset based on the cosine similarity between
# observed falls (falls data) and general cases from the NEIS

# we want the observations to be agnostic to patient age, so we remove those
# define remappings of abbreviations
# and strings to remove from narratives

remap = {
    "FX": "FRACTURE",
    "INJ": "INJURY",
    "LAC": "LACERATION",
    "CONT": "CONTUSION",
    "CHI" : "CLOSED HEAD INJURY",
    "ETOH": "ALCOHOL",
    "SDH": "SUBDURAL HEMATOMA",
    "NH": "NURSING HOME",
    "PT": "PATIENT",
    "LT": "LEFT",
    "RT": "RIGHT",
    "&" : " AND "
}
str_remove = "YOM|YOF|MOM|MOF|C/O|S/P|H/O|DX"


def process_text(txt):
    words = txt.split()
    new_words = [remap.get(word, word) for word in words]
    txt = " ".join(new_words)

    txt = re.sub("[^a-zA-Z ]", "", txt)
    txt = re.sub(str_remove, "", txt)

    return re.sub(r"^\s+", "", txt)

In [32]:
falls = falls_data['narrative'].apply(process_text).tolist()
neis = neis_data['Narrative_1'].apply(process_text).tolist()

In [101]:
# encode verified falls, and neis narratives
embed_falls = model.encode(falls)
embed_neis = model.encode(neis)
cos_sim = util.cos_sim(embed_falls, embed_neis)

NameError: name 'torch' is not defined

In [103]:
# get just the pairwise comparisons for now
dists = torch.diagonal(cos_sim)
d_min, d_max = dists.min(), dists.max()

dists = (dists - d_min)/(d_max - d_min)
dists = np.array(dists).tolist()

  dists = np.array(dists).tolist()


In [107]:
# now convert to a train dataset

train_dataset = Dataset.from_dict({
    "sentence1": falls[0:899],
    "sentence2": neis[0:899],
    "score": dists[0:899]
})

eval_dataset = Dataset.from_dict({
    "sentence1": falls[900:1000],
    "sentence2": neis[900:1000],
    "score": dists[900:1000]
})

In [123]:
train_dataset[335]

{'sentence1': 'FELL OUT OF A CHAIR AT THE NURSING HOME HIT HER HIP CUT HER WRIST CONTUSION HIPLAC WRIST',
 'sentence2': 'ROLLED OUT BEDHIT HEAD ON TABLEFACIAL FXLAC',
 'score': 0.5013291835784912}

In [72]:
train_loss = losses.CosineSimilarityLoss(model=model)

dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=eval_dataset["sentence1"],
    sentences2=eval_dataset["sentence2"],
    scores=eval_dataset["score"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)

In [73]:
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models",
    # Optional training parameters:
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_ratio=0.1,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    run_name="sts",
)

# 6. Create the trainer & start training
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=train_loss,
    evaluator=dev_evaluator,
)
trainer.train()

100%|██████████| 58/58 [15:31<00:00, 16.07s/it]

{'train_runtime': 931.7774, 'train_samples_per_second': 1.93, 'train_steps_per_second': 0.062, 'train_loss': 0.09221398419347303, 'epoch': 2.0}





TrainOutput(global_step=58, training_loss=0.09221398419347303, metrics={'train_runtime': 931.7774, 'train_samples_per_second': 1.93, 'train_steps_per_second': 0.062, 'total_flos': 0.0, 'train_loss': 0.09221398419347303, 'epoch': 2.0})

In [77]:
preds = trainer.predict(eval_dataset)

  eval_pearson, _ = pearsonr(labels, scores)
  eval_spearman, _ = spearmanr(labels, scores)
100%|██████████| 4/4 [00:20<00:00,  5.01s/it]
