In [1]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder

import numpy as np
import pandas as pd
import re

from src.search_funcs import RetrieveReranker

# local vars
BI_ENCODER_MODEL = "answerdotai/ModernBERT-base"
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-TinyBERT-L-2-v2"
CORPUS = "C:/Users/gioc4/Documents/blog/data/falls/neis.csv"
MAX_TOKEN_LENGTH = 256
CORPUS_SIZE = 30000

# we want the observations to be agnostic to patient age, so we remove those
# define remappings of abbreviations
# and strings to remove from narratives

remap = {
    "FX": "FRACTURE",
    "INJ": "INJURY",
    "LAC": "LACERATION",
    "LOC": "LOSS OF CONCIOUSNESS",
    "CONT": "CONTUSION",
    "CHI" : "CLOSED HEAD INJURY",
    "ETOH": "ALCOHOL",
    "SDH": "SUBDURAL HEMATOMA",
    "AFIB": "ATRIAL FIBRILLATION",
    "NH": "NURSING HOME",
    "LTCF": "LONG TERM CARE FACILITY",
    "PT": "PATIENT",
    "LT": "LEFT",
    "RT": "RIGHT",
    "&" : " AND "
}
str_remove = "YOM|YOF|MOM|MOF|C/O|S/P|H/O|DX"


def process_text(txt):
    words = txt.split()
    new_words = [remap.get(word, word) for word in words]
    txt = " ".join(new_words)

    txt = re.sub("[^a-zA-Z ]", "", txt)
    txt = re.sub(str_remove, "", txt)

    return re.sub(r"^\s+", "", txt)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# strings to encode as searchable

# load data
neis_data = pd.read_csv(CORPUS).head(CORPUS_SIZE)
narrative_strings = neis_data['Narrative_1'].apply(process_text).tolist()

# define models and ranker
biencoder = SentenceTransformer(BI_ENCODER_MODEL)
crossencoder = CrossEncoder(CROSS_ENCODER_MODEL)

No sentence-transformers model found with name answerdotai/ModernBERT-base. Creating a new one with mean pooling.


In [3]:
# set up a Retriveal-Ranker class
ranker = RetrieveReranker(
    corpus=narrative_strings,
    bi_encoder_model=biencoder,
    cross_encoder_model=crossencoder,
    save_corpus=True,
    corpus_path="C:/Users/gioc4/Documents/blog/data/corpus_medium.pkl"
)

In [12]:
# now pass queries 

query = ["57YOM HERE AFTER A FALL 2 DAYS AGO FORM THE STAIRS (10 STEPS) IN THE GROCERY STORE W/ MODERATE TO SEVERE PAIN IN RT HAND AND LT LOWER EXTREMITY LABS+ THC COCAINE DX: RT HALLUX FX POSSIBLE LT MEDIAL TALAR FX LT LOWER EXTREMITY CELLULITIS POLYSUBSTANCE ABUSE",
         "44YOF IS A HAITUAL NARCOTIC INJECTION DRUG USER WHO HAS TROUBLE FINDING HER VEINS AND OFTEN MISSES OR SKIN POPS, SHE HAS DEVELOPED SUBCUTANEOUS ABSCESSES ON BOTH THE LEFT AND RIGHT UPPER EXTREMITIES, NO UDS DX: MULTIPLE SUBCUTANEOUS INJECTION SITE ABSCESSES FROM IV DRUG USE",
         "22YOM HERE FOR LAC TO POSTERIOR LT SCALP PT STATES HE FELL ONTO A POLE PT HAD 2 BLUNTS AND 2 SHOTS OF WHISKEY AND FELT DIZZY AND FELL BAC NOT DONE DX: LAC OF HEAD"]

for q in query:
    output = ranker.query(process_text(q), number_ranks=100, number_results=3)
    print(output)

[(26643, 'PATIENT PRESENTS AFTER A FALL HE WAS SITTING IN THE BLEACHERS AND FELL DOWN  STEPS AT A BASKETBALL GAME POSTERIOR SCALP HEMATOMA  FRACTURE OF RIGHT TH RIB SOFT TISSUE CONTUSION AND HEMATOMA ALONG LEFT LOWER ABDOMINAL WALL RIGHT SUBOCCIPITAL SCALP HEMATOMA LEFT FRONTAL SUBDURAL HEMATOMA AND ASSOCIATED SCANT SAH RIGHT TEMPORAL SAH'), (1224, 'CO LEFT KNEE PAIN BEGAN  DAYS AGO SP MECHANICAL FALL PATIENT WAS REACHING FOR GROCERY CART WHOSE WHEELS GOT STUCK CAUSING HER TO FALL AND LAND ON LEFT KNEE  FEMUR FRACTURE'), (2009, 'TO ER FOR EVAL AFTER FALL  DAYS AGO PATIENT FELL IN HOME  AND  HAS RIGHT HIP PAIN FROM LANDING ON RIGHT SIDE ON FLOOR PATIENT SAYS HE HELPED HER UP  AND  PUT HER TO BED PATIENT UNABLE TO AMBULATE  FALL CLSD FRACTURE RIGHT HIP FECAL IMPACTIONACUTE CYSTITIS W HEMATURIA')]
[(19452, 'PRESENTS WITH RIGHT MIDDLE FINGER PAIN PATIENT STATES THAT  DAYS PRIOR SHE BELIEVED SHE STRUCK HER FINGER ON A DOOR PATIENT WAS AGITATED IN THE ED ENDORSES SMIOKING CRACK COCAINE HRS P

In [14]:
neis_data.iloc[[26643,19452,22824]]

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Hispanic,Body_Part,Diagnosis,Other_Diagnosis,...,Fire_Involvement,Product_1,Product_2,Product_3,Alcohol,Drug,Narrative_1,Stratum,PSU,Weight
26643,220222850,2/4/2022,62,1,2,,2,75,66,,...,0,1294,1842,0,0,0,"62YOM PT PRESENTS AFTER A FALL, HE WAS SITTING...",V,64,17.2223
19452,220231190,1/26/2022,48,2,2,,2,92,57,,...,0,1893,0,0,0,1,48YOF PRESENTS WITH RT MIDDLE FINGER PAIN. PT ...,V,41,17.2223
22824,220369472,1/17/2022,208,2,2,,2,75,59,,...,0,4076,676,474,0,0,8MOF PRESENTS WITH FACIAL LACERATION. PT FELL ...,C,10,5.8342
