In [1]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder

import numpy as np
import pandas as pd
import re

from src.search_funcs import RetrieveReranker

# local vars
BI_ENCODER_MODEL = "answerdotai/ModernBERT-base"
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-TinyBERT-L-2-v2"
CORPUS = "C:/Users/gioc4/Documents/blog/data/falls/neis.csv"
MAX_TOKEN_LENGTH = 256
CORPUS_SIZE = 30000

# we want the observations to be agnostic to patient age, so we remove those
# define remappings of abbreviations
# and strings to remove from narratives

remap = {
    "FX": "FRACTURE",
    "INJ": "INJURY",
    "LAC": "LACERATION",
    "LOC": "LOSS OF CONCIOUSNESS",
    "CONT": "CONTUSION",
    "CHI" : "CLOSED HEAD INJURY",
    "ETOH": "ALCOHOL",
    "SDH": "SUBDURAL HEMATOMA",
    "AFIB": "ATRIAL FIBRILLATION",
    "NH": "NURSING HOME",
    "LTCF": "LONG TERM CARE FACILITY",
    "PT": "PATIENT",
    "LT": "LEFT",
    "RT": "RIGHT",
    "&" : " AND "
}
str_remove = "YOM|YOF|MOM|MOF|C/O|S/P|H/O|DX"


def process_text(txt):
    words = txt.split()
    new_words = [remap.get(word, word) for word in words]
    txt = " ".join(new_words)

    txt = re.sub("[^a-zA-Z ]", "", txt)
    txt = re.sub(str_remove, "", txt)

    return re.sub(r"^\s+", "", txt)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# strings to encode as searchable

# load data
neis_data = pd.read_csv(CORPUS).head(CORPUS_SIZE)
narrative_strings = neis_data['Narrative_1'].apply(process_text).tolist()

# define models and ranker
biencoder = SentenceTransformer(BI_ENCODER_MODEL)
crossencoder = CrossEncoder(CROSS_ENCODER_MODEL)

No sentence-transformers model found with name answerdotai/ModernBERT-base. Creating a new one with mean pooling.


In [3]:
# set up a Retriveal-Ranker class
ranker = RetrieveReranker(
    corpus=narrative_strings,
    bi_encoder_model=biencoder,
    cross_encoder_model=crossencoder,
    save_corpus=True,
    corpus_path="C:/Users/gioc4/Documents/blog/data/corpus_medium.pkl"
)

In [6]:
# now pass queries 

query = ["66 YOM INJ HEAD FACE AND CHEST DRIVER OF SIDE-BY-SIDE LOST CONTROL WENT OFF THE ROAD IN DITCH ETOH INTOX BAC 103 DX CONCUSSION, LT RIB FX ;",
         "64YOF PT WAS DRINKING ALCOHOL AND FELL HITTING HEAD ON FLOOR AT HOME NO BAL DX CHI, ALCOHOL USE",
         "65 YOM REPORTS FELL ASLEEP IN HIS WHEELCHAIR AND THEN FELL OUT TO THE FLOOR. PT APPEARS INTOXICATED, BAC 200. DX: FALL, SHOULDER FX"]

for q in query:
    output = ranker.query(process_text(q), number_ranks=100, number_results=3)
    print(output)

[(17391, 'WAS DRINKING ALCOHOL WHILE RIDING HIS BICYCLE AND FELL OFF NO BAC DRAWN  CLOSED HEAD INJURY CONTUSION TO FACE'), (1459, 'PAIN ALL OVER BODY AND NECK WHEN FALL TO FLOOR FACE DOWN WHEN ALCOHOL INTOX BAC  FRACTURE C FALL'), (523, 'PATIENT REPORTS THAT YESTERDAY SHE LOST HER BALANCE WHEN GETTING OUT OF BED AND FELL BACKWARDS STRIKING A DESK WITH HER LOWER BACK ALCOHOL USE BAL   CLOSED FRACTURE OF ONE RIB OF LEFT SIDE FALL FROM GROUND LEVEL ALCOHOLIC INTOXICATON WITHOUT COMPLICATION')]
[(17634, 'PATIENT WAS AT HOME DRINKING ALCOHOL BAL  FELL BACKWARDS IN THE BATHROOM HITTING BACK OF HEAD ON FLOOR LOC  CHI LACERATION SCALP CM ALCOHOL USE'), (13003, 'PATIENT IS ALCOHOL INTOXICATED BAL  FELL AT  ONTO FLOOR HITTING HEAD LOC  CHI ALCOHOL INTOXICATION'), (5142, 'PATIENT IS ALCOHOL INTOXICATED BAL  WHEN HE FELL OFF THE SOFA HITTING HEAD ON FLOOR LOC  CHI ALCOHOL INTOXICATED')]
[(28447, 'PATIENT FELL OUT OF HER WHEELCHAIR ONTO FLOOR HITTING HEAD AND RIGHT ANKLE DEFORMITY IN THE GROCERY ST

In [7]:
neis_data.iloc[[17391,17634,28447]]

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Hispanic,Body_Part,Diagnosis,Other_Diagnosis,...,Fire_Involvement,Product_1,Product_2,Product_3,Alcohol,Drug,Narrative_1,Stratum,PSU,Weight
17391,220215250,1/19/2022,49,1,0,,0,75,62,,...,0,5040,0,0,1,0,49YOM WAS DRINKING ALCOHOL WHILE RIDING HIS BI...,M,63,72.873
17634,220216431,1/16/2022,67,2,1,,2,75,62,,...,0,1807,0,0,1,0,67YOF PT WAS AT HOME DRINKING ALCOHOL BAL 31 F...,L,89,57.5246
28447,220231809,2/2/2022,80,2,1,,2,75,62,,...,0,1807,0,0,0,0,80YOF PT FELL OUT OF HER WHEELCHAIR ONTO FLOOR...,L,89,57.5246
