In [33]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder

import numpy as np
import pandas as pd
import re

from src.search_funcs import RetrieveReranker

# local vars
BI_ENCODER_MODEL = "answerdotai/ModernBERT-base"
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-TinyBERT-L-2-v2"
CORPUS = "C:/Users/gioc4/Documents/blog/data/falls/neis.csv"
MAX_TOKEN_LENGTH = 512
CORPUS_SIZE = 50000

# we want the observations to be agnostic to patient age, so we remove those
# define remappings of abbreviations
# and strings to remove from narratives

remap = {
    "FX": "FRACTURE",
    "INJ": "INJURY",
    "LAC": "LACERATION",
    "LOC": "LOSS OF CONCIOUSNESS",
    "CONT": "CONTUSION",
    "CHI" : "CLOSED HEAD INJURY",
    "ETOH": "ALCOHOL",
    "SDH": "SUBDURAL HEMATOMA",
    "AFIB": "ATRIAL FIBRILLATION",
    "NH": "NURSING HOME",
    "LTCF": "LONG TERM CARE FACILITY",
    "C/O": "COMPLAINS OF",
    "H/O": "HISTORY OF",
    "S/P": "STATUS POST",
    "DX:": "DIAGNOSIS",
    "YOM": "YEAR OLD MALE",
    "YOF": "YEAR OLD FEMALE",
    "MOM": "MONTH OLD MALE",
    "MOF": "MONTH OLD FEMALE",
    "PT": "PATIENT",
    "LT": "LEFT",
    "RT": "RIGHT",
    "&" : " AND "
}

def process_text(txt):

    # remap leading age and sex info
    txt = re.sub(r"(\d+)(YOM|YOF|MOM|MOF)", lambda m: f"{m.group(1)} {remap[m.group(2)]}", txt)

    words = txt.split()
    new_words = [remap.get(word, word) for word in words]
    txt = " ".join(new_words)

    return re.sub(r"^\s+", "", txt)

In [34]:
# strings to encode as searchable

# load data
neis_data = pd.read_csv(CORPUS).head(CORPUS_SIZE)
narrative_strings = neis_data['Narrative_1'].apply(process_text).tolist()

# define models and ranker
biencoder = SentenceTransformer(BI_ENCODER_MODEL)
crossencoder = CrossEncoder(CROSS_ENCODER_MODEL)

No sentence-transformers model found with name answerdotai/ModernBERT-base. Creating a new one with mean pooling.


In [35]:
# set up a Retriveal-Ranker class
ranker = RetrieveReranker(
    corpus=narrative_strings,
    bi_encoder_model=biencoder,
    cross_encoder_model=crossencoder,
    save_corpus=True,
    corpus_path="C:/Users/gioc4/Documents/blog/data/corpus_large.pkl"
)

In [53]:
# rag-ish thing
# get the top 5 most similar cases, based on the query
query = "80 MALE FELT DIZZY AND HIT HEAD ON TOLIET"

idx, output = ranker.query(process_text(query), number_results=5)

In [54]:
output

['90 MALE WAS GETTING OUT OF BED AND FELL STRUCK FACE ON THE CLOSET DOOR DIAGNOSIS HEMATOMA TO HEAD',
 '92 MALE WITH A FALL DOWN THE STEPS STRIKING HIS HEAD +HEAD PAIN DX BRADYCARDIA, FALL',
 '16 MONTH OLD MALE WAS PLAYING AND FELL HITTING HIS HEAD ON A WOODEN CHAIR DX LACERATION OF HEAD',
 '3 MALE FELL OUT OF A CHAIR AND HIT HEAD ON A RADIATOR. DX FACE LACERATION',
 '85 MALE WITH FALL OUT OF CHAIR DIAGNOSIS CLOSED HEAD INJURY AND LACERATION TO FACE']

In [47]:
neis_data.iloc[idx]

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Hispanic,Body_Part,Diagnosis,Other_Diagnosis,...,Fire_Involvement,Product_1,Product_2,Product_3,Alcohol,Drug,Narrative_1,Stratum,PSU,Weight
545,220111585,1/2/2022,219,2,2,,2,76,53,,...,0,4076,0,0,0,0,"19 MOF FELL OFF BED. DX FACE CONTUSION, HEAD ...",C,31,5.8342
837,220114137,1/3/2022,205,2,2,,2,76,53,,...,0,4076,0,0,0,0,5 MOF FELL OFF BED. DX FACE CONTUSION,C,31,5.8342
831,220114126,1/3/2022,219,2,1,,2,76,59,,...,0,4076,0,0,0,0,19 MOF FELL AND HIT FACE ON BED FRAME. DX LAC...,C,31,5.8342
234,220108361,1/1/2022,219,1,2,,2,75,62,,...,0,1842,0,0,0,0,19 MOM FELL DOWN STEPS. DX HEAD INJURY,C,31,5.8342
228,220108350,1/1/2022,214,2,1,,2,75,62,,...,0,679,1807,0,0,0,"14 MOF STANDING ON A COUCH AND FELL OFF, HIT H...",C,31,5.8342
