In [1]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder

import torch
import pandas as pd
import re
import numpy as np
from src.search_funcs import RetrieveReranker

MODEL = "answerdotai/ModernBERT-base"
CORPUS = "C:/Users/gioc4/Documents/blog/data/falls/neis.csv"
MAX_TOKEN_LENGTH = 256
CORPUS_SIZE = 1000

# load data
neis_data = pd.read_csv(CORPUS).head(CORPUS_SIZE)

# define a sentence transformer model
model = SentenceTransformer(MODEL)

  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name answerdotai/ModernBERT-base. Creating a new one with mean pooling.


In [2]:
# we want the observations to be agnostic to patient age, so we remove those
# define remappings of abbreviations
# and strings to remove from narratives

remap = {
    "FX": "FRACTURE",
    "INJ": "INJURY",
    "LAC": "LACERATION",
    "LOC": "LOSS OF CONCIOUSNESS",
    "CONT": "CONTUSION",
    "CHI" : "CLOSED HEAD INJURY",
    "ETOH": "ALCOHOL",
    "SDH": "SUBDURAL HEMATOMA",
    "AFIB": "ATRIAL FIBRILLATION",
    "NH": "NURSING HOME",
    "LTCF": "LONG TERM CARE FACILITY",
    "PT": "PATIENT",
    "LT": "LEFT",
    "RT": "RIGHT",
    "&" : " AND "
}
str_remove = "YOM|YOF|MOM|MOF|C/O|S/P|H/O|DX"


def process_text(txt):
    words = txt.split()
    new_words = [remap.get(word, word) for word in words]
    txt = " ".join(new_words)

    txt = re.sub("[^a-zA-Z ]", "", txt)
    txt = re.sub(str_remove, "", txt)

    return re.sub(r"^\s+", "", txt)

In [3]:
narrative_strings = neis_data['Narrative_1'].apply(process_text).tolist()

In [4]:
# define models and ranker
biencoder = SentenceTransformer("answerdotai/ModernBERT-base")
crossencoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")


ranker = RetrieveReranker(corpus=narrative_strings,bi_encoder_model=biencoder,cross_encoder_model=crossencoder)

No sentence-transformers model found with name answerdotai/ModernBERT-base. Creating a new one with mean pooling.


In [9]:
query = ["SLIPPED AND FELL IN BATHROOM","FELL DOWN STAIRS"]

for q in query:
    output = ranker.query(process_text(q), number_ranks=100)
    print(output)

[-11.441222  -10.746133   -6.4654574  -8.920775  -10.38837   -10.590063
  -7.100114   -8.685466   -8.27563    -7.040555  -11.220435  -10.094994
  -8.633861  -10.7748375 -11.1467905  -5.588657    3.4937973 -10.036702
  -8.446795   -8.041659  -11.057987  -11.380419  -11.407936   -8.145477
 -10.969235  -10.649743   -9.966466   -5.400455  -11.093788  -11.397966
 -11.461528   -4.868828  -10.828729  -11.1225605  -7.8962426 -10.278608
 -10.767406   -4.849986  -11.408037    3.2422495  -8.512106  -11.452318
 -11.200092   -9.420544  -11.482085   -5.6337714  -3.636729  -10.389117
 -11.429259  -11.458939  -11.514708   -8.438914  -10.978502  -11.233551
 -10.575564  -10.0953865 -11.349391   -7.647438  -11.159008  -11.475522
  -9.768885  -11.381157  -11.387488   -9.853773   -7.7164884 -10.423466
 -11.295868   -3.4322436 -11.428775   -8.303815  -11.374237  -10.180848
  -9.489268   -6.678531   -7.61709    -7.566039   -7.999158  -10.928313
 -11.482136   -9.574499  -10.517405   -6.181459  -11.466616   -7

In [232]:
# encode verified falls, and neis narratives
narrative_embed = model.encode(narrative_strings)

In [477]:
# rank re-rank method

# first we get the top n most semantically similar sentences
# then use a bi-encoder to re-rank them
query = ["SLIPPED AND FELL IN BATHROOM","FELL DOWN STAIRS"]
N = 100

query_embed = model.encode(process_text(query))
sims = model.similarity(query_embed,narrative_embed)
idx = np.array(torch.topk(sims, N).indices)[0]

  idx = np.array(torch.topk(sims, N).indices)[0]


In [478]:
ce_list = []

for i in idx:
    ce_list.append([query, narrative_strings[i]])

In [479]:
ce_model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2")
scores = ce_model.predict(ce_list)

In [480]:
ce_list[scores.argmax()][0]

'SLIPPED AND FELL IN BATHROOM'

In [481]:
ce_list[scores.argmax()][1]

'SLIPPED AND FELL ON WET BATHROOM FLOOR  LUMBAR PAIN'