In [1]:
!pip install nltk xgboost imblearn jupyter ipywidgets torch



In [2]:
import pandas as pd
import gc

data = pd.read_csv('../data/neiss_sample_10p.csv')
data_new_columns = pd.read_csv('../data/neiss_sample_10p_new_columns.csv')
embedding=pd.read_csv(f'../data/neiss_sample_10p_embedding_v1.csv')

## Clean up Narrative

In [3]:
medical_terms = {
    "&": "and",
    "***": "",
    ">>": "clinical diagnosis",
    "@": "at",
    "abd": "abdomen",
    "af": "accidental fall",
    "afib": "atrial fibrillation",
    "aki": "acute kidney injury",
    "am": "morning",
    "ams": "altered mental status",
    "bac": "blood alcohol content",
    "bal": "blood alcohol level,",
    "biba": "brought in by ambulance",
    "c/o": "complains of",
    "chi": "closed-head injury",
    "clsd": "closed",
    "cpk": "creatine phosphokinase",
    "cva": "cerebral vascular accident",
    "dx": "clinical diagnosis",
    "ecf": "extended-care facility",
    "er": "emergency room",
    "etoh": "ethyl alcohol",
    "eval": "evaluation",
    "fd": "fall detected",
    "fx": "fracture",
    "fxs": "fractures",
    "glf": "ground level fall",
    "h/o": "history of",
    "htn": "hypertension",
    "hx": "history of",
    "inj": "injury",
    "inr": "international normalized ratio",
    "intox": "intoxication",
    "l": "left",
    "loc": "loss of consciousness",
    "lt": "left",
    "mech": "mechanical",
    "mult": "multiple",
    "n.h.": "nursing home",
    "nh": "nursing home",
    "p/w": "presents with",
    "pm": "afternoon",
    "pt": "patient",
    "pta": "prior to arrival",
    "pts": "patient's",
    "px": "physical examination", # not "procedure",
    "r": "right",
    "r/o": "rules out",
    "rt": "right",
    "s'd&f": "slipped and fell",
    "s/p": "after",
    "sah": "subarachnoid hemorrhage",
    "sdh": "acute subdural hematoma",
    "sts": "sit-to-stand",
    "t'd&f": "tripped and fell",
    "tr": "trauma",
    "uti": "urinary tract infection",
    "w/": "with",
    "w/o": "without",
    "wks": "weeks"
}

In [4]:
def remove_after_dx(narrative):
  if isinstance(narrative, str):
    parts = narrative.split("DX", 1)
    if len(parts) > 1:
      return parts[0]
    else:
      return narrative  # No "DX:" found, return the original string
  else:
    return narrative  # Not a string, return as is

data.loc[:, 'Narrative'] = data['Narrative'].apply(remove_after_dx)
data.head(10)

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Hispanic,Body_Part,Diagnosis,Other_Diagnosis,...,Product_3,Alcohol,Drug,Narrative,Stratum,PSU,Weight,Year,Month,Day
0,221032332,2022-09-24,14,1,0,0,0.0,34,71,PAIN,...,0,0.0,0.0,14YOM REPORTS HE FELL 1 WEEK AND COMPLAINS OF ...,V,77,17.2223,2022,9,24
1,181109464,2018-10-30,28,1,1,0,0.0,79,71,BACK PAIN,...,0,0.0,0.0,A 28YOM BENT TO PICK UP CRATE AT HOME TO ED WI...,V,25,17.5136,2018,10,30
2,210103105,2020-10-24,35,1,0,0,0.0,30,53,,...,0,0.0,0.0,35YOMRIDING ON MOUNTAIN BIKE PRACTICING FELL DOWN,S,27,76.0369,2020,10,24
3,161157997,2016-11-15,214,2,0,0,0.0,76,53,,...,0,0.0,0.0,14 MONTH OLD FEMALE ABRASION FOR NOSE AND FORE...,S,48,85.2143,2016,11,15
4,181107411,2018-10-21,4,1,0,0,0.0,92,72,,...,0,0.0,0.0,4YR M PLAYING WITH TOY KITCHEN APPLIANCE AND ...,C,20,4.9383,2018,10,21
5,200134239,2020-01-10,207,1,0,0,0.0,82,51,,...,0,0.0,0.0,7MOM SITTING ON THE COUNTER AND GRABBED A HOT ...,C,20,4.851,2020,1,10
6,140951498,2014-09-10,12,1,1,0,0.0,82,53,,...,0,0.0,0.0,12YOM FELL DURING PE ACTIVITY,C,20,5.7174,2014,9,10
7,221017396,2022-10-03,44,2,1,0,2.0,79,53,,...,0,0.0,0.0,44YOF CHILD WAS SWINGING A BACK PACK THAT HAD ...,V,21,17.2223,2022,10,3
8,200645623,2020-06-13,28,1,1,0,2.0,30,57,,...,0,0.0,0.0,28YOM FELL OFF SKATEBOARD LANDED ON L SIDE,S,82,76.0369,2020,6,13
9,141040420,2014-10-13,16,2,1,0,0.0,79,71,GROIN PAIN,...,0,0.0,0.0,16YOF ACTIVE PLAYING VOLLEYBALL 7 DAYS A WEEK ...,C,8,5.7174,2014,10,13


In [5]:
import multiprocessing as mp
import numpy as np
import re
import nltk
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Load tokenizer globally
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def clean_text(text):
    """
    Clean a single text entry.
    """
    if not isinstance(text, str):
        return ""
        
    # lowercase everything
    text = text.lower()
    
    # unglue DX
    regex_dx = r"([ˆ\W]*(dx)[ˆ\W]*)"
    text = re.sub(regex_dx, r". dx: ", text)
    
    # remove age and sex identifications
    regex_age_sex = r"(\d+)\s*?(yof|yf|yo\s*female|yo\s*f|yom|ym|yr|yo\s*male|yo\s*m)"
    age_sex_match = re.search(regex_age_sex, text)
    
    if age_sex_match:
        text = text.replace(age_sex_match.group(0), "patient")
    
    # translate medical terms
    for term, replacement in medical_terms.items():
        if term in ["@", ">>", "&", "***"]:
            pattern = fr"({re.escape(term)})"
            text = re.sub(pattern, f" {replacement} ", text)
        else:
            pattern = fr"(?<!-)\b({re.escape(term)})\b(?!-)"
            text = re.sub(pattern, replacement, text)
    
    # user-friendly format
    try:
        sentences = sent_tokenizer.tokenize(text)
        sentences = [sent.capitalize() for sent in sentences]
        return " ".join(sentences)
    except Exception as e:
        return text

def clean_text_wrapper(args):
    """
    Wrapper function for multiprocessing that unpacks arguments
    """
    return clean_text(args)

def process_narrative(texts, use_parallel=True, n_jobs=None):
    """
    Process texts either in parallel or sequentially.
    
    Args:
        texts: list or pandas Series of texts
        use_parallel: bool, whether to use parallel processing
        n_jobs: int, number of CPU cores to use (None = all cores)
    """
    if not use_parallel:
        # Sequential processing with progress bar
        return [clean_text(text) for text in tqdm(texts, desc="Processing texts")]
    
    # Parallel processing
    if n_jobs is None:
        n_jobs = mp.cpu_count()
        
    print(f"Processing {len(texts)} texts using {n_jobs} cores...")
    
    # Create a pool with the specified number of processes
    with mp.Pool(n_jobs) as pool:
        # Process texts in parallel with progress bar
        results = list(tqdm(
            pool.imap(clean_text_wrapper, texts, chunksize=100),
            total=len(texts),
            desc="Processing texts"
        ))
    
    return results

# Example usage:
# Single process version:
# data['Narrative'] = process_narrative(data['Narrative'].tolist(), use_parallel=False)

# Multi-process version:
data['Narrative'] = process_narrative(data['Narrative'].tolist(), use_parallel=True)

# Or specify number of cores:
# data['Narrative'] = process_narrative(data['Narrative'].tolist(), use_parallel=True, n_jobs=4)

Processing 352052 texts using 24 cores...


Processing texts:   0%|          | 0/352052 [00:00<?, ?it/s]

### Remove the injuery terms from the narrative

In [6]:
replace_list = [
  'INGESTION','ASPIRATION','BURN','ELECTRICAL','SCALD','CHEMICAL','AMPUTATION','THERMAL','CONCUSSION','CONTUSIONS','CRUSHING',
  'DISLOCATION','FOREIGN','FRACTURE','HEMATOMA','LACERATION','DENTAL','NERVE','DAMAGE','INTERNAL','PUNCTURE','STRAIN','SPRAIN', ' SPR ',
  'HEMORRHAGE','ELECTRIC','POISONING','SUBMERSION','AVULSION','RADIATION','DERMA','CONJUNCT','SWELL','WRIST','ABRASION','ACHE',
  'BREAK','CHIN','CUT','ER','FX','HIT','INJURY','LOSE','LOC','PAIN','TWIST','CONTUSION','LAC','YOM','YOF','YR','OLD','MALE','FEMALE',' YO ',' AFT ',
  'YO ',' F ','YF',' M ','MOF','MM ','MOM',' MO ','MO ','ACCIDENTALLY','PATIENT',' PT ','PT ',' P ',' Y ','INJURY','REPORT',' S ',' FE ','HURT','INJ',
  'FELL','INJURE','JURED','URED',' ED',' RT ',' LT '
]

data['Narrative']=data['Narrative'].str.upper()
for i in replace_list:
  data['Narrative'] = data['Narrative'].str.replace(i, '')

data.head(10)

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Hispanic,Body_Part,Diagnosis,Other_Diagnosis,...,Product_3,Alcohol,Drug,Narrative,Stratum,PSU,Weight,Year,Month,Day
0,221032332,2022-09-24,14,1,0,0,0.0,34,71,PAIN,...,0,0.0,0.0,HE 1 WEEK AND COMPLAINS OF HE HAS BEEN PLAY...,V,77,17.2223,2022,9,24
1,181109464,2018-10-30,28,1,1,0,0.0,79,71,BACK PAIN,...,0,0.0,0.0,A BENT TO PICK UP CRATE AT HOME TO WITH LOW B...,V,25,17.5136,2018,10,30
2,210103105,2020-10-24,35,1,0,0,0.0,30,53,,...,0,0.0,0.0,RIDING ON MOUNTAIN BIKE PRACTICING DOWN,S,27,76.0369,2020,10,24
3,161157997,2016-11-15,214,2,0,0,0.0,76,53,,...,0,0.0,0.0,14 MONTH FOR NOSE AND FOREHEAD WAS COMINGDOW...,S,48,85.2143,2016,11,15
4,181107411,2018-10-21,4,1,0,0,0.0,92,72,,...,0,0.0,0.0,PLAYING WITH TOY KITCHEN APPLIANCE AND GOT FI...,C,20,4.9383,2018,10,21
5,200134239,2020-01-10,207,1,0,0,0.0,82,51,,...,0,0.0,0.0,7 SITTING ON THE COUNT AND GRABBED A HOT CURLI...,C,20,4.851,2020,1,10
6,140951498,2014-09-10,12,1,1,0,0.0,82,53,,...,0,0.0,0.0,DURING PE ACTIVITY,C,20,5.7174,2014,9,10
7,221017396,2022-10-03,44,2,1,0,2.0,79,53,,...,0,0.0,0.0,CHILD WAS SWINGING A BACK PACK THAT HAD A LAP...,V,21,17.2223,2022,10,3
8,200645623,2020-06-13,28,1,1,0,2.0,30,57,,...,0,0.0,0.0,OFF SKATEBOARD LANDED ON LEFT SIDE,S,82,76.0369,2020,6,13
9,141040420,2014-10-13,16,2,1,0,0.0,79,71,GROIN PAIN,...,0,0.0,0.0,ACTIVE PLAYING VOLLEYBALL 7 DAYS A WEEK RUNNI...,C,8,5.7174,2014,10,13


## Process Body Part

In [7]:
# bdpt_dict={}
# bdpt_dict[0]='INTERNAL'
# bdpt_dict[30]='SHOULDER'
# bdpt_dict[31]='UPPERTRUNK'
# bdpt_dict[32]='ELBOW'
# bdpt_dict[33]='LOWERARM'
# bdpt_dict[34]='WRIST'
# bdpt_dict[35]='KNEE'
# bdpt_dict[36]='LOWERLEG'
# bdpt_dict[37]='ANKLE'
# bdpt_dict[38]='PUBICREGION'
# bdpt_dict[75]='HEAD'
# bdpt_dict[76]='FACE'
# bdpt_dict[77]='EYEBALL'
# bdpt_dict[78]='UPPERTRUNK(OLD)'
# bdpt_dict[79]='LOWERTRUNK'
# bdpt_dict[80]='UPPERARM'
# bdpt_dict[81]='UPPERLEG'
# bdpt_dict[82]='HAND'
# bdpt_dict[83]='FOOT'
# bdpt_dict[84]='25-50% OF BODY'
# bdpt_dict[85]='ALLPARTSBODY'
# bdpt_dict[86]='OTHER(OLD)'
# bdpt_dict[87]='NOTSTATED/UNK'
# bdpt_dict[88]='MOUTH'
# bdpt_dict[89]='NECK'
# bdpt_dict[90]='LOWERARM(OLD)'
# bdpt_dict[91]='LOWERLEG(OLD)'
# bdpt_dict[92]='FINGER'
# bdpt_dict[93]='TOE'
# bdpt_dict[94]='EAR'

# data['body_string'] = data['Body_Part'].map(bdpt_dict)

data = data[(data['Body_Part']!=0) & (data['Body_Part']!=84) & (data['Body_Part']!=85) & (data['Body_Part']!=86) & (data['Body_Part']!=87)]

In [8]:
import numpy as np
data['Disposition_recode']=np.nan
data.loc[((data['Disposition']==1)), 'Disposition_recode'] = 0
data.loc[((data['Disposition']==2)), 'Disposition_recode'] = 1
data.loc[((data['Disposition']==4)), 'Disposition_recode'] = 2
data.loc[((data['Disposition']==5)), 'Disposition_recode'] = 3
data.loc[((data['Disposition']==8)), 'Disposition_recode'] = 4
data=data[data['Disposition_recode'].notna()]

data['Disposition_recode_2']=0
data.loc[((data['Disposition_recode']>0)), 'Disposition_recode_2'] = 1

In [9]:
data.head(10)

Unnamed: 0,CPSC_Case_Number,Treatment_Date,Age,Sex,Race,Other_Race,Hispanic,Body_Part,Diagnosis,Other_Diagnosis,...,Drug,Narrative,Stratum,PSU,Weight,Year,Month,Day,Disposition_recode,Disposition_recode_2
0,221032332,2022-09-24,14,1,0,0,0.0,34,71,PAIN,...,0.0,HE 1 WEEK AND COMPLAINS OF HE HAS BEEN PLAY...,V,77,17.2223,2022,9,24,0.0,0
1,181109464,2018-10-30,28,1,1,0,0.0,79,71,BACK PAIN,...,0.0,A BENT TO PICK UP CRATE AT HOME TO WITH LOW B...,V,25,17.5136,2018,10,30,0.0,0
2,210103105,2020-10-24,35,1,0,0,0.0,30,53,,...,0.0,RIDING ON MOUNTAIN BIKE PRACTICING DOWN,S,27,76.0369,2020,10,24,0.0,0
3,161157997,2016-11-15,214,2,0,0,0.0,76,53,,...,0.0,14 MONTH FOR NOSE AND FOREHEAD WAS COMINGDOW...,S,48,85.2143,2016,11,15,0.0,0
4,181107411,2018-10-21,4,1,0,0,0.0,92,72,,...,0.0,PLAYING WITH TOY KITCHEN APPLIANCE AND GOT FI...,C,20,4.9383,2018,10,21,0.0,0
5,200134239,2020-01-10,207,1,0,0,0.0,82,51,,...,0.0,7 SITTING ON THE COUNT AND GRABBED A HOT CURLI...,C,20,4.851,2020,1,10,0.0,0
6,140951498,2014-09-10,12,1,1,0,0.0,82,53,,...,0.0,DURING PE ACTIVITY,C,20,5.7174,2014,9,10,0.0,0
7,221017396,2022-10-03,44,2,1,0,2.0,79,53,,...,0.0,CHILD WAS SWINGING A BACK PACK THAT HAD A LAP...,V,21,17.2223,2022,10,3,0.0,0
8,200645623,2020-06-13,28,1,1,0,2.0,30,57,,...,0.0,OFF SKATEBOARD LANDED ON LEFT SIDE,S,82,76.0369,2020,6,13,0.0,0
9,141040420,2014-10-13,16,2,1,0,0.0,79,71,GROIN PAIN,...,0.0,ACTIVE PLAYING VOLLEYBALL 7 DAYS A WEEK RUNNI...,C,8,5.7174,2014,10,13,0.0,0


## Sample the data for different training

In [10]:
data_sample_1 = data[['CPSC_Case_Number', 'Age', 'Sex', 'Body_Part', 'Location', 'Product_1', 'Disposition_recode_2', "Narrative"]]
data_sample_2 = data[['CPSC_Case_Number', 'Age', 'Sex', 'Body_Part', 'Location', 'Product_1', 'Disposition_recode_2']].merge(data_new_columns,how='inner',on='CPSC_Case_Number').reset_index(drop=True)
data_sample_3 = data[['CPSC_Case_Number', 'Age', 'Sex', 'Body_Part', 'Location', 'Product_1', 'Disposition_recode_2']].merge(embedding,how='inner',on='CPSC_Case_Number').reset_index(drop=True)
# del data
# del embedding
# del data_new_columns
# gc.collect()
print(data_sample_1.shape)
print(data_sample_2.shape)
print(data_sample_3.shape)

(329102, 8)
(329101, 10)
(329102, 391)


In [11]:
## Process the Narrative field for data sample 1

In [12]:
# import nltk
# import pandas as pd
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import TfidfVectorizer

# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')

# def preprocess_text(text):
#     tokens = word_tokenize(text.lower())
#     tokens = [token for token in tokens if token.isalpha() and token not in stopwords.words('english')]
#     tagged_tokens = nltk.pos_tag(tokens)
#     lemmatizer = WordNetLemmatizer()
#     lemmatized_tokens = [lemmatizer.lemmatize(token, pos='v') for token, pos in tagged_tokens]
#     return ' '.join(lemmatized_tokens)

# def apply_tfidf(data, field_name):
#     corpus = data[field_name].fillna('')

#     data['Processed_Narrative'] = corpus.apply(preprocess_text)

#     # Create a TfidfVectorizer object
#     vectorizer = TfidfVectorizer(max_features=384, stop_words='english')

#     # Fit and transform the processed text
#     tfidf_matrix = vectorizer.fit_transform(data['Processed_Narrative'])

#     # Convert the TF-IDF matrix to a DataFrame
#     tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

#     # fill NaN values with 0
#     tfidf_df.fillna(0.0, inplace=True)
#     return pd.concat([data, tfidf_df], axis=1) #.drop(columns=[field_name, 'Processed_Narrative'])

In [None]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_text(text: str) -> str:
    """
    Preprocess text with strict error handling.
    """
    # Convert to lowercase and tokenize
    tokens = word_tokenize(str(text).lower())
    
    # Remove non-alphabetic tokens and stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    
    # POS tagging
    tagged_tokens = nltk.pos_tag(tokens)
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token, pos='v') for token, pos in tagged_tokens]
    
    return ' '.join(lemmatized_tokens)

def apply_tfidf(data: pd.DataFrame, 
                field_name: str, 
                max_features: int = 384,
                min_df: float = 0.01,
                max_df: float = 0.95) -> pd.DataFrame:
    # Store original index for validation
    original_index = data.index
    
    # Download NLTK dependencies if needed
    for resource in ['punkt', 'averaged_perceptron_tagger', 'stopwords', 'wordnet']:
        try:
            nltk.data.find(f'tokenizers/{resource}')
        except LookupError:
            nltk.download(resource)
    
    # Preprocess with progress tracking
    print("\nPreprocessing text...")
    corpus = data[field_name].fillna('')
    processed_texts = corpus.apply(preprocess_text)
    
    # Create and apply TF-IDF
    print("\nApplying TF-IDF transformation...")
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        stop_words='english',
        min_df=min_df,
        max_df=max_df
    )
    
    # Fit and transform the processed text
    tfidf_matrix = vectorizer.fit_transform(processed_texts)
    
    # Create DataFrame with TF-IDF features
    feature_names = vectorizer.get_feature_names_out()
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        index=data.index,  # Preserve original index
        columns=[f'tfidf_{name}' for name in feature_names]
    )
    
    # Combine with original data, maintaining index
    result = pd.concat([data, tfidf_df], axis=1)
    return result

In [14]:
data_sample_1_processed = apply_tfidf(data_sample_1, 'Narrative', max_features=384, min_df=0.01, max_df=0.95)
data_sample_1_processed.drop(columns=['Narrative'], inplace=True)


Validation at input:
Shape: (329102, 8)
Number of NaN values: 0
Memory usage: 22.60 MB

Preprocessing text...


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/eric/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/eric/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eric/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Applying TF-IDF transformation...

Validation at TF-IDF output:
Shape: (329102, 102)
Number of NaN values: 0
Memory usage: 266.68 MB

Validation at final output:
Shape: (329102, 110)
Number of NaN values: 0
Memory usage: 286.77 MB


In [15]:
print(data_sample_1_processed.head(10))

   CPSC_Case_Number  Age  Sex  Body_Part  Location  Product_1  \
0         221032332   14    1         34         0       1205   
1         181109464   28    1         79         1       1141   
2         210103105   35    1         30         9       5033   
3         161157997  214    2         76         0       1842   
4         181107411    4    1         92         1       5020   
5         200134239  207    1         82         1       1682   
6         140951498   12    1         82         8       1200   
7         221017396   44    2         79         0        557   
8         200645623   28    1         30         0       1333   
9         141040420   16    2         79         9       1266   

   Disposition_recode_2  tfidf_accily  tfidf_ago  tfidf_alcohol  ...  \
0                     0           0.0        0.0            0.0  ...   
1                     0           0.0        0.0            0.0  ...   
2                     0           0.0        0.0            0.0  ...

## Process LLM fields for data sample 2

Combine the 3 LLM columns to one and vectorize the fields with TF-IDF. 
Replace the 3 LLM columns with the new TF-IDF columns

In [16]:
def clean_field(value):
    value_str = str(value).lower()
    return '' if 'unknown' in value_str else value_str

# Apply the function to each field and concatenate
data_sample_2['Narrative_LLM'] = (
    data_sample_2["activity_at_injury"].apply(clean_field) + ' ' +
    data_sample_2["injury_mechanism"].apply(clean_field) + ' ' +
    data_sample_2["object_involved"].apply(clean_field)
)
data_sample_2_processed = apply_tfidf(data_sample_2, 'Narrative_LLM', max_features=384, min_df=0.01, max_df=0.95)
data_sample_2_processed.drop(columns=['activity_at_injury','injury_mechanism','object_involved', 'Narrative_LLM'], inplace=True)


Validation at input:
Shape: (329101, 11)
Number of NaN values: 3
Memory usage: 27.62 MB

Preprocessing text...


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/eric/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/eric/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eric/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Applying TF-IDF transformation...

Validation at TF-IDF output:
Shape: (329101, 46)
Number of NaN values: 0
Memory usage: 115.50 MB

Validation at final output:
Shape: (329101, 57)
Number of NaN values: 3
Memory usage: 143.12 MB


In [17]:
data_sample_2_processed.head(100)

Unnamed: 0,CPSC_Case_Number,Age,Sex,Body_Part,Location,Product_1,Disposition_recode_2,activity_at_injury,injury_mechanism,object_involved,...,tfidf_step,tfidf_strain,tfidf_strike,tfidf_table,tfidf_trampoline,tfidf_trip,tfidf_twist,tfidf_use,tfidf_walk,tfidf_wall
0,221032332,14,1,34,0,1205,0,playing basketball,fell,basketball,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,181109464,28,1,79,1,1141,0,pick up crate,bent,crate,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,210103105,35,1,30,9,5033,0,riding on mountain bike practicing,fell,mountain bike,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,161157997,214,2,76,0,1842,0,coming down stairs,fell,stairs,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,181107411,4,1,92,1,5020,0,playing with toy kitchen appliance,caught,toy kitchen appliance,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,210651263,9,1,30,9,1244,0,unknown,fell,playground equipment,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,221128059,76,1,31,0,4076,0,unknown,fall,bed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,170343009,66,1,92,0,464,0,cutting chicken,cut,knife,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,170735358,35,1,32,9,3299,0,hiking,fell,rock,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
data_sample_3_processed = data_sample_3
data_sample_3_processed.head(10)

Unnamed: 0,CPSC_Case_Number,Age,Sex,Body_Part,Location,Product_1,Disposition_recode_2,0,1,2,...,374,375,376,377,378,379,380,381,382,383
0,221032332,14,1,34,0,1205,0,-0.030642,-0.022758,0.04937,...,0.014122,-0.030197,-0.002454,0.026395,-0.048185,-0.001542,-0.017451,-0.05088,-0.008124,0.030714
1,181109464,28,1,79,1,1141,0,-0.017625,-0.0345,0.065491,...,0.024239,-0.000142,0.007269,-0.015545,-0.040856,-0.042023,-0.05038,0.00876,-0.002266,0.043031
2,210103105,35,1,30,9,5033,0,-0.047855,-0.018055,0.025419,...,0.040865,0.019481,0.005293,-0.059021,-0.047235,0.024601,-0.067565,-0.025657,-0.013521,-0.015571
3,161157997,214,2,76,0,1842,0,-0.030252,-0.054146,0.042925,...,0.013746,-0.022505,-0.026468,-0.053529,-0.033781,0.023896,-0.037096,-0.075717,0.052297,0.016317
4,181107411,4,1,92,1,5020,0,-0.018278,-0.007797,0.042058,...,0.023318,-0.049932,0.014847,0.037483,0.025916,0.035806,-0.044568,0.014808,-0.021472,0.001994
5,200134239,207,1,82,1,1682,0,-0.011073,0.017136,0.011212,...,0.026607,-0.051627,0.009689,0.01888,-0.001498,-0.021843,-0.073475,0.022928,-0.010549,0.009171
6,140951498,12,1,82,8,1200,0,-0.025121,0.012701,0.044701,...,0.069417,0.006998,-0.016011,-0.011223,-0.006756,0.047259,-0.013498,-0.033564,0.013709,0.049572
7,221017396,44,2,79,0,557,0,-0.032237,-0.021063,0.026084,...,0.016179,-0.031867,-0.007946,-0.036613,-0.030827,0.035943,-0.010132,-0.029518,-0.000366,0.040291
8,200645623,28,1,30,0,1333,0,-0.006542,-0.033106,0.048509,...,0.057451,-0.029053,-0.016805,0.008903,-0.002621,0.01693,-0.115614,-0.006699,-0.102164,0.057578
9,141040420,16,2,79,9,1266,0,0.019674,-0.015743,0.015878,...,0.000669,0.020707,-0.02307,0.03214,-0.013655,0.035791,-0.008253,-0.096477,0.032906,0.039108


## Split dataset

In [22]:
def split_data(data):
    total_rows, n_columns = data.shape

    test_size = int(total_rows * 0.2)
    train_size = total_rows - test_size

    print(f"Splitting data into train and test sets...")
    print(f"\tOriginal dataset shape: ({total_rows}, {n_columns})")
    print(f"\tTrain set size: {train_size} rows")
    print(f"\tTest set size: {test_size} rows")
    data_sample = data.sample(frac=1,random_state=42).reset_index(drop=True)

    data_ready = data_sample.tail(train_size).reset_index(drop=True)
    data_ready_test = data_sample.head(test_size).reset_index(drop=True)
    return data_ready, data_ready_test

data_sample_1_ready, data_sample_1_test = split_data(data_sample_1_processed)
data_sample_2_ready, data_sample_2_test = split_data(data_sample_2_processed)
data_sample_3_ready, data_sample_3_test = split_data(data_sample_3_processed)

Splitting data into train and test sets...
	Original dataset shape: (329102, 109)
	Train set size: 263282 rows
	Test set size: 65820 rows
Splitting data into train and test sets...
	Original dataset shape: (329101, 56)
	Train set size: 263281 rows
	Test set size: 65820 rows
Splitting data into train and test sets...
	Original dataset shape: (329102, 391)
	Train set size: 263282 rows
	Test set size: 65820 rows


## Train the models

1. 'Age', 'Sex', 'Body_Part', 'Location', 'Product_1', 'Disposition' + TF-IDF(384 features) on the normalized&cleaned narrative
2. 'Age', 'Sex', 'Body_Part', 'Location', 'Product_1', 'Disposition' + TF-IDF(384 features) on the normalized three new LLM fields
3. 'Age', 'Sex', 'Body_Part', 'Location', 'Product_1', 'Disposition' + embedding on normalized&cleaned narrative

In [23]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from xgboost import DMatrix, train
from sklearn.metrics import f1_score, classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

drop_list=[
 'CPSC_Case_Number',
 'Disposition_recode_2',
]

def train_model(data_ready, data_ready_test):
    # Separate features (X) and target variable (y)
    X = data_ready.drop(drop_list, axis=1)
    y = data_ready['Disposition_recode_2']

    # Encode target variable if it's categorical
    le = LabelEncoder()
    y = le.fit_transform(y)

    # Split data into training and testing sets
    # X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    X_test=data_ready_test.drop(drop_list, axis=1)
    y_test=data_ready_test['Disposition_recode_2']
    y_test = le.fit_transform(y_test)

    # Create a scaler for numerical features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X.select_dtypes(include=['number']))

    X_test_scaled = scaler.transform(X_test.select_dtypes(include=['number']))

    # Convert scaled features back to DataFrame
    X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.select_dtypes(include=['number']).columns)
    X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.select_dtypes(include=['number']).columns)

    # fill NaN values with 0
    X_train_scaled_df.fillna(0.0, inplace=True)

    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled_df, y)

    # Combine scaled numerical features with categorical features
    X_train_final = X_resampled.copy()
    X_test_final = X_test_scaled_df.copy()
    y_train_final=y_resampled.copy()

    # Convert datasets to DMatrix format (required for xgb.train)
    dtrain = DMatrix(data=X_train_final, label=y_train_final)
    dtest = DMatrix(data=X_test_final, label=y_test)

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'learning_rate': 0.1,
        'max_depth': 7,
        'scale_pos_weight': 1,
        'tree_method': 'hist',
        'device': 'cuda',
        'max_bin': 256,
        'nthread': 16
    }

    model = train(params, dtrain, num_boost_round=5000, evals=[(dtest, 'eval')], verbose_eval=False)
    y_prob = model.predict(dtest)
    y_pred = (y_prob > 0.5).astype(int)
    auc = roc_auc_score(y_test, y_prob)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"AUC: {auc}")
    print(classification_report(y_test, y_pred))

# Train on the three datasets
print("Training on data_sample_1")
train_model(data_sample_1_ready, data_sample_1_test)

print("Training on data_sample_2")
train_model(data_sample_2_ready, data_sample_2_test)

print("Training on data_sample_3")
train_model(data_sample_3_ready, data_sample_3_test)

Training on data_sample_1
Accuracy: 0.8987237921604375
F1 Score: 0.36994328922495273
AUC: 0.8344708279841482
              precision    recall  f1-score   support

           0       0.92      0.97      0.94     59150
           1       0.50      0.29      0.37      6670

    accuracy                           0.90     65820
   macro avg       0.71      0.63      0.66     65820
weighted avg       0.88      0.90      0.89     65820

Training on data_sample_2
Accuracy: 0.8917046490428441
F1 Score: 0.3873130479628675
AUC: 0.8270368695882746
              precision    recall  f1-score   support

           0       0.93      0.95      0.94     59231
           1       0.45      0.34      0.39      6589

    accuracy                           0.89     65820
   macro avg       0.69      0.65      0.66     65820
weighted avg       0.88      0.89      0.89     65820

Training on data_sample_3
Accuracy: 0.9094196292920085
F1 Score: 0.38976458546571136
AUC: 0.8605408567905396
              precis