In [1]:
%reset -f
!pip install --upgrade pip
!pip install nltk



In [2]:
import pandas as pd

# Columns available in the raw file:
# CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Hispanic,Body_Part,Diagnosis,Other_Diagnosis,Body_Part_2,Diagnosis_2,Other_Diagnosis_2,Disposition,Location,Fire_Involvement,Product_1,Product_2,Product_3,Alcohol,Drug,Narrative,Stratum,PSU,Weight,Year,Month,Day
# Columns kept here (later renamed to):
# year, month, age, gender, product, body_part, location, narrative_cleaned
KEEP_COLUMNS = ['Year', 'Month', 'Age', 'Sex', 'Product_1', 'Body_Part', 'Location', 'Narrative']

# Parse only the columns we keep instead of loading all of them and discarding
# most. `usecols` ignores the order of the list (columns come back in file
# order), so index with KEEP_COLUMNS again to get the intended column order.
data_fullset = pd.read_csv(
    '../data/code_replaced_neiss_2014_2023.csv',
    usecols=KEEP_COLUMNS,
)[KEEP_COLUMNS]

# Work on a reproducible 10% random sample with a fresh 0..n-1 index.
data = data_fullset.sample(frac=0.1, random_state=42).reset_index(drop=True)

  data_fullset = pd.read_csv('../data/code_replaced_neiss_2014_2023.csv')


In [3]:
def remove_after_dx(narrative):
  """Return the portion of ``narrative`` that precedes the first "DX".

  The match is a plain, case-sensitive substring search. Strings that do not
  contain "DX" come back unchanged, and non-string values (for example a
  missing narrative) are passed through untouched.
  """
  # Guard clause: nothing to trim on non-text values.
  if not isinstance(narrative, str):
    return narrative

  # partition() yields the whole string as the head when "DX" is absent.
  head, _separator, _tail = narrative.partition("DX")
  return head

# Trim each narrative at its first "DX", then preview the first ten rows.
data.loc[:, 'Narrative'] = data['Narrative'].map(remove_after_dx)
data.head(10)

Unnamed: 0,Year,Month,Age,Sex,Product_1,Body_Part,Location,Narrative
0,2022,9,14.0,MALE,"BASKETBALL, ACTIVITY AND RELATED EQUIPMENT",WRIST,UNK,14YOM REPORTS HE FELL 1 WEEK AND COMPLAINS OF ...
1,2018,10,28.0,MALE,"CONTAINERS, NOT SPECIFIED",LOWER TRUNK,HOME,A 28YOM BENT TO PICK UP CRATE AT HOME TO ED WI...
2,2020,10,35.0,MALE,MOUNTAIN OR ALL-TERRAIN BICYCLES AND ACCESSORIES,SHOULDER,SPORTS,35YOMRIDING ON MOUNTAIN BIKE PRACTICING FELL DOWN
3,2016,11,1.17,FEMALE,STAIRS OR STEPS,FACE,UNK,14 MONTH OLD FEMALE ABRASION FOR NOSE AND FORE...
4,2018,10,4.0,MALE,"PRETEND ELECTRONICS, TOOLS, HOUSEWARES & APPLI...",FINGER,HOME,4YR M PLAYING WITH TOY KITCHEN APPLIANCE AND ...
5,2020,1,0.58,MALE,"HAIR CURLERS, CURLING IRONS, CLIPS & HAIRPINS",HAND,HOME,7MOM SITTING ON THE COUNTER AND GRABBED A HOT ...
6,2014,9,12.0,MALE,"SPORTS AND RECREATIONAL ACTIVITY, N.E.C.",HAND,SCHOOL,12YOM FELL DURING PE ACTIVITY
7,2022,10,44.0,FEMALE,COMPUTERS (EQUIPMENT AND ELECTRONIC GAMES),LOWER TRUNK,UNK,44YOF CHILD WAS SWINGING A BACK PACK THAT HAD ...
8,2020,6,28.0,MALE,"SKATEBOARDS, UNPOWERED OR UNSPECIFIED",SHOULDER,UNK,28YOM FELL OFF SKATEBOARD LANDED ON L SIDE
9,2014,10,16.0,FEMALE,"VOLLEYBALL (ACTIVITY, APPAREL OR EQUIPMENT)",LOWER TRUNK,SPORTS,16YOF ACTIVE PLAYING VOLLEYBALL 7 DAYS A WEEK ...


In [4]:
# Abbreviation -> plain-English expansion used when translating narratives.
# Keys are lowercase because clean_text lowercases the narrative before looking
# for them. The symbol keys ("&", "***", ">>", "@") are matched anywhere in the
# text; every other key is matched as a standalone token.
medical_terms = {
    "&": "and",
    "***": "",
    ">>": "clinical diagnosis",
    "@": "at",
    "abd": "abdomen",
    "af": "accidental fall",
    "afib": "atrial fibrillation",
    "aki": "acute kidney injury",
    "am": "morning",
    "ams": "altered mental status",
    "bac": "blood alcohol content",
    "bal": "blood alcohol level",  # fixed: expansion used to end with a stray comma
    "biba": "brought in by ambulance",
    "c/o": "complains of",
    "chi": "closed-head injury",
    "clsd": "closed",
    "cpk": "creatine phosphokinase",
    "cva": "cerebral vascular accident",
    "dx": "clinical diagnosis",
    "ecf": "extended-care facility",
    "er": "emergency room",
    "etoh": "ethyl alcohol",
    "eval": "evaluation",
    "fd": "fall detected",
    "fx": "fracture",
    "fxs": "fractures",
    "glf": "ground level fall",
    "h/o": "history of",
    "htn": "hypertension",
    "hx": "history of",
    "inj": "injury",
    "inr": "international normalized ratio",
    "intox": "intoxication",
    "l": "left",
    "loc": "loss of consciousness",
    "lt": "left",
    "mech": "mechanical",
    "mult": "multiple",
    "n.h.": "nursing home",
    "nh": "nursing home",
    "p/w": "presents with",
    "pm": "afternoon",
    "pt": "patient",
    "pta": "prior to arrival",
    "pts": "patient's",
    "px": "physical examination", # not "procedure",
    "r": "right",
    "r/o": "rules out",
    "rt": "right",
    "s'd&f": "slipped and fell",
    "s/p": "after",
    "sah": "subarachnoid hemorrhage",
    "sdh": "acute subdural hematoma",
    "sts": "sit-to-stand",
    "t'd&f": "tripped and fell",
    "tr": "trauma",
    "uti": "urinary tract infection",
    "w/": "with",
    "w/o": "without",
    "wks": "weeks"
}

In [7]:
import multiprocessing as mp
import numpy as np
import re
import nltk
from tqdm import tqdm
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Fetch the NLTK resources one after another (nltk reports packages that are
# already up to date).
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Load the English Punkt sentence tokenizer once at module level so that
# clean_text can reuse it for every narrative.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Symbol abbreviations: matched anywhere and padded with spaces when expanded.
_SYMBOL_TERMS = ("@", ">>", "&", "***")

# Single-entry cache mapping a snapshot of the term table to its compiled regex,
# so the pattern is built once instead of once per term per narrative.
_TERM_REGEX_CACHE = {}


def _term_regex(terms):
    """Return one compiled regex matching any non-empty key of ``terms`` (None if there are none)."""
    snapshot = tuple(terms.items())
    if snapshot not in _TERM_REGEX_CACHE:
        word_alternatives = []
        symbol_alternatives = []
        # Longest keys first, so "w/o" wins over "w/", "r/o" over "r", and
        # "s'd&f" over the bare "&" symbol.
        for term in sorted(terms, key=len, reverse=True):
            if not term:
                continue
            escaped = re.escape(term)
            if term in _SYMBOL_TERMS:
                symbol_alternatives.append(escaped)
                continue
            # Lookarounds instead of \b: for keys that start/end with a word
            # character this is the same test as `(?<!-)\b ... \b(?!-)`, but it
            # also works for keys ending in punctuation ("w/", "n.h."), where a
            # trailing \b would demand a word character right after the key.
            left = r"(?<![\w-])" if re.match(r"\w", term) else r"(?<!-)"
            right = r"(?![\w-])" if re.search(r"\w\Z", term) else r"(?!-)"
            word_alternatives.append(left + escaped + right)
        alternatives = word_alternatives + symbol_alternatives
        _TERM_REGEX_CACHE.clear()  # keep only the regex for the current table
        _TERM_REGEX_CACHE[snapshot] = (
            re.compile("|".join(alternatives)) if alternatives else None
        )
    return _TERM_REGEX_CACHE[snapshot]


def _expand_term(match):
    """Return the expansion for the abbreviation captured by ``match``."""
    term = match.group(0)
    replacement = medical_terms[term]
    if term in _SYMBOL_TERMS:
        return f" {replacement} "
    # Keys ending in punctuation are often glued to the next word ("w/pain");
    # add a space so the expansion does not fuse with it ("with pain").
    next_char = match.string[match.end():match.end() + 1]
    if re.search(r"\W\Z", term) and re.match(r"\w", next_char):
        return replacement + " "
    return replacement


def clean_text(text):
    """
    Clean a single text entry.

    Steps, in order: lowercase; rewrite every "dx" (with the non-word
    characters around it) to ". dx: "; replace the first age/sex token such as
    "14yom" with "patient"; expand the abbreviations in ``medical_terms``;
    split into sentences and capitalise each one.

    Args:
        text: the narrative; anything that is not a ``str`` yields "".

    Returns:
        The cleaned narrative. If sentence tokenisation fails, the translated
        but un-capitalised text is returned instead.
    """
    if not isinstance(text, str):
        return ""

    # lowercase everything
    text = text.lower()

    # unglue DX
    # NOTE(review): the character class contains the modifier letter "ˆ", not
    # the ASCII "^", so it is not negated; kept verbatim to preserve behaviour.
    regex_dx = r"([ˆ\W]*(dx)[ˆ\W]*)"
    text = re.sub(regex_dx, r". dx: ", text)

    # remove age and sex identifications
    # Only the first regex match is replaced. Replacing the matched *string*
    # everywhere (str.replace) also rewrote it inside longer tokens, e.g. the
    # "1yom" inside "11yom".
    regex_age_sex = r"(\d+)\s*?(yof|yf|yo\s*female|yo\s*f|yom|ym|yr|yo\s*male|yo\s*m)"
    text = re.sub(regex_age_sex, "patient", text, count=1)

    # translate medical terms in a single pass
    term_regex = _term_regex(medical_terms)
    if term_regex is not None:
        text = term_regex.sub(_expand_term, text)

    # user-friendly format
    try:
        sentences = sent_tokenizer.tokenize(text)
    except Exception:
        # Best effort: fall back to the translated text if tokenisation fails.
        return text
    return " ".join(sentence.capitalize() for sentence in sentences)

def clean_text_wrapper(args):
    """
    Worker callable for the multiprocessing pool.

    Forwards its single argument unchanged to ``clean_text`` and returns
    whatever ``clean_text`` produces.
    """
    cleaned = clean_text(args)
    return cleaned

def process_texts(texts, use_parallel=True, n_jobs=None, chunksize=100):
    """
    Process texts either in parallel or sequentially.

    Args:
        texts: list, pandas Series or any other iterable of texts
        use_parallel: bool, whether to use parallel processing
        n_jobs: int, number of CPU cores to use (None = all cores)
        chunksize: int, number of texts handed to a worker at a time
            (parallel mode only)

    Returns:
        list with one cleaned text per input, in input order.
    """
    # Materialise once: generators have no len(), and truth-testing a pandas
    # Series raises, so normalise to a plain list up front.
    texts = list(texts)

    # Nothing to do: skip the progress bar and, more importantly, avoid
    # starting a whole pool of worker processes for no work.
    if not texts:
        return []

    if not use_parallel:
        # Sequential processing with progress bar
        return [clean_text(text) for text in tqdm(texts, desc="Processing texts")]

    # Parallel processing
    if n_jobs is None:
        n_jobs = mp.cpu_count()

    print(f"Processing {len(texts)} texts using {n_jobs} cores...")

    # NOTE(review): the workers call functions defined in the notebook itself.
    # This presumably relies on the "fork" start method; confirm before running
    # on a platform that defaults to "spawn".
    with mp.Pool(n_jobs) as pool:
        # imap keeps results in input order, so the progress bar tracks it
        results = list(tqdm(
            pool.imap(clean_text_wrapper, texts, chunksize=chunksize),
            total=len(texts),
            desc="Processing texts"
        ))

    return results

# Clean every narrative with a multi-process run on all available cores
# (n_jobs is left at its default).
# Alternatives: pass use_parallel=False for a single-process run, or e.g.
# n_jobs=4 to limit the number of cores used.
data['Narrative'] = process_texts(
    texts=data['Narrative'].tolist(),
    use_parallel=True,
)

Processing 352052 texts using 24 cores...


[nltk_data] Downloading package punkt to /home/eric/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/eric/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/eric/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/eric/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Processing texts: 100%|██████████| 352052/352052 [00:02<00:00, 161564.54it/s]


In [8]:
# Upper-case the cleaned narratives (the removal list applied in the following
# cell is written in upper case), then show just that column.
data['Narrative'] = data['Narrative'].str.upper()
data.loc[:, ['Narrative']]

Unnamed: 0,Narrative
0,PATIENT REPORTS HE FELL 1 WEEK AND COMPLAINS O...
1,A PATIENT BENT TO PICK UP CRATE AT HOME TO ED ...
2,PATIENTRIDING ON MOUNTAIN BIKE PRACTICING FELL...
3,14 MONTH OLD FEMALE ABRASION FOR NOSE AND FORE...
4,PATIENT M PLAYING WITH TOY KITCHEN APPLIANCE A...
...,...
352047,PATIENT HELPING LOAD BICYCLE IN VAN CUT HEAD
352048,PATIENT PUSHING DOWN FOOD IN BLENDER MAKING A ...
352049,PATIENT FELL DOWN STEPS
352050,PATIENT WAS RIDING A BIKE AND FELL WRIST INJURY


In [9]:
# Fragments stripped from the narratives, applied one after another in list
# order. Matching is plain substring matching, so:
#   * order matters ('CONTUSIONS' must be removed before 'CONTUSION');
#   * some later entries ('JURED', 'URED') look like they mop up leftovers of
#     earlier removals - presumably deliberate, verify before reordering;
#   * NOTE(review): short entries also hit the inside of unrelated words, e.g.
#     'ER' turns "COUNTER" into "COUNT". Moving to whole-word matching would
#     need the list to be revisited, so the behaviour is kept as is.
replace_list = [
  'INGESTION','ASPIRATION','BURN','ELECTRICAL','SCALD','CHEMICAL','AMPUTATION','THERMAL','CONCUSSION','CONTUSIONS','CRUSHING',
  'DISLOCATION','FOREIGN','FRACTURE','HEMATOMA','LACERATION','DENTAL','NERVE','DAMAGE','INTERNAL','PUNCTURE','STRAIN','SPRAIN', ' SPR ',
  'HEMORRHAGE','ELECTRIC','POISONING','SUBMERSION','AVULSION','RADIATION','DERMA','CONJUNCT','SWELL','WRIST','ABRASION','ACHE',
  'BREAK','CHIN','CUT','ER','FX','HIT','INJURY','LOSE','LOC','PAIN','TWIST','CONTUSION','LAC','YOM','YOF','YR','OLD','MALE','FEMALE',' YO ',' AFT ',
  'YO ',' F ','YF',' M ','MOF','MM ','MOM',' MO ','MO ','ACCIDENTALLY','PATIENT',' PT ','PT ',' P ',' Y ','INJURY','REPORT',' S ',' FE ','HURT','INJ',
  'FELL','INJURE','JURED','URED',' ED',' RT ',' LT '
]

# add body parts
# replace_list += [
#   'ANKLE', 'ARM', 'BODY_PART', 'CHEST', 'CONTUSION', 'CUT', 'EAR', 'ELBOW', 'EYE', 'FACE', 'FINGER', 'FOOT', 'FOREHEAD', 'FRACTURE', 'FX', 
#   'HAND', 'HEAD', 'HIP', 'KNEE', 'LAC', 'LACERATION', 'LEG', 'LOC', 'LOSE', 'NECK', 'PAIN', 'SHOULDER', 'SPRAIN', 'STRAIN', 'SWELL', 'THUMB', 
#   'TOE', 'WRIST','ABRASION', 'ACHE', 'BREAK', 'BURN', 'CHIN', 'CUT', 'ER', 'FRACTURE', 'FX', 'HIT', 'INJURY', 'LACERATION', 'LIP', 'LOSE', 'LOC',
#   'MOUTH', 'NOSE', 'PAIN', 'RIB', 'SCALP', 'SPRAIN', 'STRAIN', 'SWELL', 'TOE', 'TWIST', 'WRIST'
# ]

# regex=False pins literal matching: the default of `regex` differs between
# pandas versions, and these entries are meant as plain text, not patterns.
for fragment in replace_list:
  data['Narrative'] = data['Narrative'].str.replace(fragment, '', regex=False)

data.head(10)

Unnamed: 0,Year,Month,Age,Sex,Product_1,Body_Part,Location,Narrative
0,2022,9,14.0,MALE,"BASKETBALL, ACTIVITY AND RELATED EQUIPMENT",WRIST,UNK,HE 1 WEEK AND COMPLAINS OF HE HAS BEEN PLAY...
1,2018,10,28.0,MALE,"CONTAINERS, NOT SPECIFIED",LOWER TRUNK,HOME,A BENT TO PICK UP CRATE AT HOME TO WITH LOW B...
2,2020,10,35.0,MALE,MOUNTAIN OR ALL-TERRAIN BICYCLES AND ACCESSORIES,SHOULDER,SPORTS,RIDING ON MOUNTAIN BIKE PRACTICING DOWN
3,2016,11,1.17,FEMALE,STAIRS OR STEPS,FACE,UNK,14 MONTH FOR NOSE AND FOREHEAD WAS COMINGDOW...
4,2018,10,4.0,MALE,"PRETEND ELECTRONICS, TOOLS, HOUSEWARES & APPLI...",FINGER,HOME,PLAYING WITH TOY KITCHEN APPLIANCE AND GOT FI...
5,2020,1,0.58,MALE,"HAIR CURLERS, CURLING IRONS, CLIPS & HAIRPINS",HAND,HOME,7 SITTING ON THE COUNT AND GRABBED A HOT CURLI...
6,2014,9,12.0,MALE,"SPORTS AND RECREATIONAL ACTIVITY, N.E.C.",HAND,SCHOOL,DURING PE ACTIVITY
7,2022,10,44.0,FEMALE,COMPUTERS (EQUIPMENT AND ELECTRONIC GAMES),LOWER TRUNK,UNK,CHILD WAS SWINGING A BACK PACK THAT HAD A LAP...
8,2020,6,28.0,MALE,"SKATEBOARDS, UNPOWERED OR UNSPECIFIED",SHOULDER,UNK,OFF SKATEBOARD LANDED ON LEFT SIDE
9,2014,10,16.0,FEMALE,"VOLLEYBALL (ACTIVITY, APPAREL OR EQUIPMENT)",LOWER TRUNK,SPORTS,ACTIVE PLAYING VOLLEYBALL 7 DAYS A WEEK RUNNI...


In [10]:

# Switch to the lowercase column names used in the exported file.
# Rebind the result instead of using inplace=True: it has no performance
# benefit and blocks method chaining. Labels missing from the frame are
# ignored by rename, so re-running this cell is harmless.
data = data.rename(columns={
    'Year': 'year',
    'Month': 'month',
    'Age': 'age',
    'Sex': 'gender',
    'Product_1': 'product',
    'Body_Part': 'body_part',
    'Location': 'location',
    'Narrative': 'narrative_cleaned'
})

In [None]:
import pandas as pd
import uuid

# Write the cleaned 10% sample to disk without the index column, then preview
# the first ten rows.
data.to_csv(
    path_or_buf='../data/code_replaced_neiss_2014_2023_cleaned_10p.csv',
    index=False,
)
data.head(10)

Unnamed: 0,year,month,age,gender,product,body_part,location,narrative_cleaned
0,2022,9,14.0,MALE,"BASKETBALL, ACTIVITY AND RELATED EQUIPMENT",WRIST,UNK,HE 1 WEEK AND COMPLAINS OF HE HAS BEEN PLAY...
1,2018,10,28.0,MALE,"CONTAINERS, NOT SPECIFIED",LOWER TRUNK,HOME,A BENT TO PICK UP CRATE AT HOME TO WITH LOW B...
2,2020,10,35.0,MALE,MOUNTAIN OR ALL-TERRAIN BICYCLES AND ACCESSORIES,SHOULDER,SPORTS,RIDING ON MOUNTAIN BIKE PRACTICING DOWN
3,2016,11,1.17,FEMALE,STAIRS OR STEPS,FACE,UNK,14 MONTH FOR NOSE AND FOREHEAD WAS COMINGDOW...
4,2018,10,4.0,MALE,"PRETEND ELECTRONICS, TOOLS, HOUSEWARES & APPLI...",FINGER,HOME,PLAYING WITH TOY KITCHEN APPLIANCE AND GOT FI...
5,2020,1,0.58,MALE,"HAIR CURLERS, CURLING IRONS, CLIPS & HAIRPINS",HAND,HOME,7 SITTING ON THE COUNT AND GRABBED A HOT CURLI...
6,2014,9,12.0,MALE,"SPORTS AND RECREATIONAL ACTIVITY, N.E.C.",HAND,SCHOOL,DURING PE ACTIVITY
7,2022,10,44.0,FEMALE,COMPUTERS (EQUIPMENT AND ELECTRONIC GAMES),LOWER TRUNK,UNK,CHILD WAS SWINGING A BACK PACK THAT HAD A LAP...
8,2020,6,28.0,MALE,"SKATEBOARDS, UNPOWERED OR UNSPECIFIED",SHOULDER,UNK,OFF SKATEBOARD LANDED ON LEFT SIDE
9,2014,10,16.0,FEMALE,"VOLLEYBALL (ACTIVITY, APPAREL OR EQUIPMENT)",LOWER TRUNK,SPORTS,ACTIVE PLAYING VOLLEYBALL 7 DAYS A WEEK RUNNI...
