# Importing of All Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
from sklearn.inspection import permutation_importance
from tqdm import tqdm

# Scenario: Creating a Set of Machine Learning Friendly Features from EHR Data to Predict Diabetes Onset

First we will load in the necessary data files

In [2]:
def load_data_for_file(filename):
    print(f"Loading data for {filename}")
    df = pd.concat([ # use pd.concat to append/concatenate the data for all states together into a single frame
        pd.read_parquet(f"https://dicbworkshops.s3.amazonaws.com/{output_dir}/parquet/{filename}") # use read_csv to load the data from each output directory
        for output_dir in tqdm(['output_hi', 'output_ma', 'output_tx', 'output_wa'], leave=True, position=0) # loop over each output directory
    ])
    return df

In [3]:
# load in the conditions
conditions = load_data_for_file('conditions.parquet')
# load in the observations
observations = load_data_for_file('observations.parquet')
# load in the medications
medications = load_data_for_file('medications.parquet')
# load in the procedures
procedures = load_data_for_file('procedures.parquet')
# load in the patients
patients = load_data_for_file('patients.parquet')

Loading data for conditions.parquet


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.60s/it]


Loading data for observations.parquet


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:24<00:00,  6.12s/it]


Loading data for medications.parquet


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.04it/s]


Loading data for procedures.parquet


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:27<00:00,  6.94s/it]


Loading data for patients.parquet


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.48it/s]


# Definition of Helper Functions Used to Compose the Larger Pipeline

In [34]:
# function to construct a lookup table of condition onset dates for patients based on the conditions table and a provided code
def get_patient_onset_dates(conditions, code):
    patients_with_condition = (
        conditions.query('CODE == @code') # get all patients diagnosed with the code
        .sort_values(by=['PATIENT', 'START']) # sort the data by patient ID and then start date
        .drop_duplicates(subset=['PATIENT', 'START'], keep='first') # drop duplicates, keeping the instance with the earliest start date
    )

    # now build a lookup table/dictionary to map each patient's ID to the date of their earliest onset
    patient_onset_dates = {
        row['PATIENT']: row['START']
        for _, row in patients_with_condition.iterrows()
    }
    return patient_onset_dates

In [39]:
# function to construct simplified date columns for the observations, medications, and procedures tables
def get_simplified_data(df, date_col, simplified_col='DATE_SIMPLE'):
    return df.assign(**{
        simplified_col: lambda x: pd.to_datetime(x[date_col]).dt.date.astype('str')
    })

In [42]:
# function to filter out post diagnosis records from a table based on patient onset dates
def filter_data_by_onset_dates(df, patient_onset_dates, date_column='DATE_SIMPLE'):
    data_filtered = []
    for _, row in tqdm(df.iterrows(), total=len(df), position=0, leave=True):
        patient = row['PATIENT']
        date = row[date_column]
        if patient in patient_onset_dates and patient_onset_dates[patient] > date:
            data_filtered.append(row)
    return pd.DataFrame(data_filtered)

In [43]:
# function to unify the records for the four different types of events/ecounters into a single table
def get_unified_records(conditions, observations, medications, procedures):
    return pd.concat([
        conditions[['PATIENT', 'START', 'CODE', 'DESCRIPTION']].assign(
            EVENT_TYPE='CONDITION',
        ).rename(columns={'START': 'DATE'}),
        observations[['PATIENT', 'DATE_SIMPLE', 'CODE', 'DESCRIPTION']].assign(
            EVENT_TYPE='OBSERVATION',
        ).rename(columns={'DATE_SIMPLE': 'DATE'}),
        medications[['PATIENT', 'DATE_SIMPLE', 'CODE', 'DESCRIPTION']].assign(
            EVENT_TYPE='MEDICATION',
        ).rename(columns={'DATE_SIMPLE': 'DATE'}),
        procedures[['PATIENT', 'DATE_SIMPLE', 'CODE', 'DESCRIPTION']].assign(
            EVENT_TYPE='PROCEDURE',
        ).rename(columns={'DATE_SIMPLE': 'DATE'})
])

In [44]:
# function to vectorize the unified record data into binary occurence format
def get_multihot_vector_representation(unified_records, vectorizer, train=True):
    # condense the records into a pipe-delimited string of event tokens per patient, 
    # where each token is of the form <EVENT_TYPE>::<CODE>
    records_condensed = unified_records.assign(
        EVENT_TOKEN=lambda x: x['EVENT_TYPE'] + '::' + x['CODE'].astype(str) + '|'
    ).groupby(['PATIENT'])['EVENT_TOKEN'].sum().reset_index()
    # now get the multi-hot representation from the vectorizer
    if train:
        # if this is the training set, fit before transforming
        return vectorizer.fit_transform(records_condensed['EVENT_TOKEN'])
    else:
        #otherwise, just transform
        return vectorizer.transform(records_condensed['EVENT_TOKEN'])
    

In [None]:
# function to add patient ages to a given dataframe, and compute age bins from those ages
def get_aged_patient_data(events_df, patients_df, 

## Filtering Out Patients with Diagnoses of Type-2 Diabetes
For this exercise, we are interested in studying patients with a diagnosis of Type-2 diabetes \
We select these from the conditions table based on the SNOMED code `44054006`

In [29]:
type2_patients = conditions.query('CODE == 44054006').sort_values(by=['PATIENT', 'START']).drop_duplicates(subset=['PATIENT', 'START'], keep='first')

In [30]:
# now we create a dictionary/lookup table to map each patient's ID to the date of their earliest Type 2 diagnosis
patient_diagnosis_dates = {
    row['PATIENT']: row['START']
    for _, row in type2_patients.iterrows()
}

In [7]:
# add simplified date columns to the observations, medications, and procedures 
observations_simplified = observations.assign(
    DATE_SIMPLE=lambda x: pd.to_datetime(x['DATE']).dt.date.astype('str')
)
medications_simplified = medications.assign(
    DATE_SIMPLE=lambda x: pd.to_datetime(x['START']).dt.date.astype('str')
)
procedures_simplified = procedures.assign(
    DATE_SIMPLE=lambda x: pd.to_datetime(x['START']).dt.date.astype('str')
)

## Filtering Out Post-diagnosis Conditions, Observations, Medications, and Procedures and Unifying Into a Shared Representation
First we need to filter out all EHR data from encounters that took place after the Type-2 diabetes diagnosis for the type 2 patients

In [9]:
conditions_filtered = filter_data(conditions, patient_diagnosis_dates, 'START') 

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 182373/182373 [00:02<00:00, 78934.01it/s]


In [10]:
observations_subset = observations_simplified[observations_simplified['PATIENT'].isin(patient_diagnosis_dates)]
observations_filtered = filter_data(observations_subset, patient_diagnosis_dates)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 789669/789669 [00:10<00:00, 78187.63it/s]


In [11]:
medications_subset = medications_simplified[medications_simplified['PATIENT'].isin(patient_diagnosis_dates)]
medications_filtered = filter_data(medications_subset, patient_diagnosis_dates)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55269/55269 [00:00<00:00, 75217.60it/s]


In [12]:
procedures_subset = procedures_simplified[procedures_simplified['PATIENT'].isin(patient_diagnosis_dates)]
procedures_filtered = filter_data(procedures_subset, patient_diagnosis_dates)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108374/108374 [00:01<00:00, 75648.63it/s]


In [17]:
# now we save the unique patients who have a pre-diagnosis record
type2_prediag_patients = pd.concat([
    conditions_filtered['PATIENT'],
    observations_filtered['PATIENT'],
    medications_filtered['PATIENT'],
    procedures_filtered['PATIENT']
]).unique()

Now we will label the EHR data for the type 2 and non-type 2 patients and unify everything into a single event set

In [14]:
# starting with the Type 2 diabetes patients, we label and unify the data from medications, procedures, observations, and conditions into a single recordset
all_records_type2 = get_unified_records(conditions_filtered, observations_filtered, medications_filtered, procedures_filtered)

In [15]:
# now we will do the same for the non type2 patients
all_records_non_type2 = get_unified_records(
    conditions[~conditions['PATIENT'].isin(type2_patients['PATIENT'])],
    observations_simplified[~observations_simplified['PATIENT'].isin(type2_patients['PATIENT'])],
    medications_simplified[~medications_simplified['PATIENT'].isin(type2_patients['PATIENT'])],
    procedures_simplified[~procedures_simplified['PATIENT'].isin(type2_patients['PATIENT'])],
)

In [16]:
all_records_non_type2

Unnamed: 0,PATIENT,DATE,CODE,DESCRIPTION,EVENT_TYPE
0,8f8229e6-00be-a033-bb16-42781f9d208a,2001-07-18,473461003,Educated to high school level (finding),CONDITION
1,8f8229e6-00be-a033-bb16-42781f9d208a,2001-07-18,160903007,Full-time employment (finding),CONDITION
2,8f8229e6-00be-a033-bb16-42781f9d208a,2011-07-25,161744009,Past pregnancy history of miscarriage (situation),CONDITION
3,8f8229e6-00be-a033-bb16-42781f9d208a,2014-08-13,73595000,Stress (finding),CONDITION
4,8f8229e6-00be-a033-bb16-42781f9d208a,2014-12-03,72892002,Normal pregnancy (finding),CONDITION
...,...,...,...,...,...
212079,3df4eaa0-3234-0118-df9e-5cbe4659744e,2024-10-28,866148006,Screening for domestic abuse (procedure),PROCEDURE
212080,3df4eaa0-3234-0118-df9e-5cbe4659744e,2024-10-28,171207006,Depression screening (procedure),PROCEDURE
212081,3df4eaa0-3234-0118-df9e-5cbe4659744e,2024-10-28,171207006,Depression screening (procedure),PROCEDURE
212082,3df4eaa0-3234-0118-df9e-5cbe4659744e,2024-10-28,428211000124100,Assessment of substance use (procedure),PROCEDURE


## Now We will split our Type 2 and Non-Type 2 patients into Training and Hold-out sets


In [17]:
type2_patients = all_records_type2['PATIENT'].unique()
non_type2_patients = all_records_non_type2['PATIENT'].unique()
labels = np.concatenate([np.ones(type2_patients.shape), np.zeros(non_type2_patients.shape)])
train_patients, test_patients, train_labels, test_labels = train_test_split(np.concatenate([type2_patients, non_type2_patients]), labels, test_size=0.2, stratify=labels)

In [18]:
# now split the records accordingly
all_records = pd.concat([all_records_type2, all_records_non_type2])
train_records = all_records[all_records['PATIENT'].isin(train_patients)]
test_records = all_records[all_records['PATIENT'].isin(test_patients)]

## Option 1: Bag of Labeled Clinical Encounters (Many-hot/multi-hot encoding)
The simplest feature representation we can create and test is a binary vector (many-hot/multi-hot) representation \
which encodes the occurence or lack-therof of different clinical encounters/event in each patient's EHR record \
To construct this representation, we can use the scikit-learn package's `CountVectorizer` class

In [20]:
vectorizer = CountVectorizer(
    binary=True,
    tokenizer=lambda x: x.split('|'),
    token_pattern=None,
    lowercase=False
)

The vectorizer expects a `'|'` pipe delimited string of coded encounters, \
so we will construct this representation now for our training patients

In [21]:
# condense the records into a pipe-delimited string of event tokens per patient, 
# where each token is of the form <EVENT_TYPE>::<CODE>
train_records_condensed = train_records.assign(
    EVENT_TOKEN=lambda x: x['EVENT_TYPE'] + '::' + x['CODE'].astype(str) + '|'
).groupby(['PATIENT'])['EVENT_TOKEN'].sum().reset_index()

In [22]:
train_data_final = train_records_condensed.assign(
    LABEL=lambda x: x['PATIENT'].isin(type2_patients).astype(int)
)

In [23]:
train_data_final

Unnamed: 0,PATIENT,EVENT_TOKEN,LABEL
0,0007008e-e6b0-8ad7-b5b8-c5d6c0c731f4,CONDITION::314529007|CONDITION::314529007|COND...,0
1,00127d79-8f35-5109-fe9b-961386b57f99,CONDITION::314529007|CONDITION::66383009|CONDI...,0
2,0029cec6-9b3b-ef64-8361-3cef09ec8439,CONDITION::314529007|CONDITION::314529007|COND...,0
3,0031b28b-6096-9263-57d1-56383bea4ba4,CONDITION::314529007|CONDITION::314529007|COND...,0
4,0033e953-ab8f-26c6-af87-39cdede3b388,CONDITION::160968000|CONDITION::224299000|COND...,0
...,...,...,...
3655,ffa278a3-bf3c-4437-fe60-a0d8d83979a7,CONDITION::160968000|CONDITION::473461003|COND...,0
3656,ffa71780-67c4-7174-729c-8a3611d0385f,CONDITION::160968000|CONDITION::224295006|COND...,1
3657,ffa95e2b-cbcd-96c9-a14f-2de40f6be332,CONDITION::224295006|CONDITION::714628002|COND...,0
3658,ffd9ad6d-3975-439a-d549-109f33e7a1f2,CONDITION::428251008|CONDITION::714628002|COND...,1


### Now that we have constructed a Pipe Delimited Event Representation, We Can Vectorize

In [24]:
event_occurence_vectors = vectorizer.fit_transform(train_data_final['EVENT_TOKEN'])

In [25]:
event_occurence_vectors.shape

(3660, 1200)

### Now We Will Test the performance of this representation on the Training Set Using KFold Cross Validation

In [27]:
kfold = StratifiedKFold(n_splits=5)
clf = RandomForestClassifier(max_depth=20, n_estimators=50)

In [28]:
metrics = []
for i, (train_index, test_index) in tqdm(enumerate(kfold.split(event_occurence_vectors, train_data_final['LABEL'])), total=5, position=0, leave=True):
    train_x, train_y = event_occurence_vectors[train_index], train_data_final['LABEL'].to_numpy()[train_index]
    test_x, test_y = event_occurence_vectors[test_index], train_data_final['LABEL'].to_numpy()[test_index]
    # fit the model on the training fold
    clf.fit(train_x, train_y)
    # evaluate the model on the validation fold
    preds = clf.predict(test_x)
    scores = clf.predict_proba(test_x)[:, 1]
    # get the AUROC
    fpr, tpr, _ = roc_curve(test_y, scores)
    auroc = auc(fpr, tpr)
    # get the confusion matrix
    cm = confusion_matrix(test_y, preds)
    # save the metrics
    metrics.append({
        'AUROC': auroc,
        'Precision': cm[1, 1] / cm[:, 1].sum(),
        'Recall': cm[1, 1] / cm[1].sum(),
        'Specificity': cm[0, 0] / cm[0].sum()
    })
pd.DataFrame(metrics)
    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  8.79it/s]


Unnamed: 0,AUROC,Precision,Recall,Specificity
0,0.974861,1.0,0.75,1.0
1,0.922368,1.0,0.688889,1.0
2,0.962656,1.0,0.8,1.0
3,0.944008,1.0,0.8,1.0
4,0.954617,1.0,0.733333,1.0


### The model appears to be performing decently so far, but is that where the story ends, or does the plot thicken?
One thing that we can do is audit the way our current model is behaving, and what its predictions are based \
on, by looking at feature importance rankings. Here we compute the permutation importance of the model features \
on a validation set, and focus in on the top 20 features ranked by importance.

In [29]:
# split our training dataset into "train" and "test" sets to fit the model for explainability purposes
train_x, test_x, train_y, test_y = train_test_split(event_occurence_vectors, train_data_final['LABEL'].to_numpy(), test_size=0.2, stratify=train_data_final['LABEL'])

In [30]:
train_x.shape

(2928, 1200)

In [31]:
test_x.shape

(732, 1200)

In [32]:
clf.fit(train_x, train_y)

In [33]:
print(classification_report(test_y, clf.predict(test_x)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       687
           1       1.00      0.73      0.85        45

    accuracy                           0.98       732
   macro avg       0.99      0.87      0.92       732
weighted avg       0.98      0.98      0.98       732



In [34]:
reverse_lookup = {
    value: key for key, value in vectorizer.vocabulary_.items()
}
top_50_indices = np.argsort(clf.feature_importances_)[-50:]
top50_features = [reverse_lookup[idx] for idx in top_50_indices]
top50_importances = clf.feature_importances_[top_50_indices]
top_features_df = pd.DataFrame({
    'FEATURE_NAME': top50_features,
    'FEATURE_IMPORTANCE': top50_importances
}).assign(
    CODE=lambda x: x['FEATURE_NAME'].str.split('::').apply(lambda pair: pair[1])
).merge(
    all_records[['CODE', 'DESCRIPTION']].drop_duplicates().astype({'CODE': str}),
    on='CODE',
).sort_values(by='FEATURE_IMPORTANCE', ascending=False)

In [37]:
top_features_df.head(20)

Unnamed: 0,FEATURE_NAME,FEATURE_IMPORTANCE,CODE,DESCRIPTION
50,OBSERVATION::4544-3,0.056458,4544-3,Hematocrit [Volume Fraction] of Blood by Autom...
49,OBSERVATION::21000-5,0.055867,21000-5,Erythrocyte distribution width [Entitic volume...
48,OBSERVATION::787-2,0.047015,787-2,MCV [Entitic volume] by Automated count
47,OBSERVATION::8480-6,0.038169,8480-6,Systolic Blood Pressure
46,OBSERVATION::32623-1,0.037299,32623-1,Platelet mean volume [Entitic volume] in Blood...
45,OBSERVATION::777-3,0.037056,777-3,Platelets [#/volume] in Blood by Automated count
44,OBSERVATION::29463-7,0.036824,29463-7,Body Weight
43,OBSERVATION::718-7,0.035714,718-7,Hemoglobin
42,OBSERVATION::718-7,0.035714,718-7,Hemoglobin [Mass/volume] in Blood
41,OBSERVATION::8462-4,0.031748,8462-4,Diastolic Blood Pressure


## Option 2: Binary Occurrence with Inclusion of Discretized Numeric Features
While the feature importance rankings revealed known co-morbidities (e.g., Diastolic Blood Pressure) and known risk-factors for diabetes (e.g., Tobacco smoking status) \
we have left out entirely the numeric data from the observations (lab and vital sign measures) which are likely to contain important information. \
Here we look at one technique for incorporating this information while maintaining the binary occurence vector representation that we used previously

In [38]:
# first we revisit the observations table, and we pull the data for all numeric observations for the training patients
observations_train = pd.concat([
    observations_filtered[observations_filtered['PATIENT'].isin(train_patients)],
    observations_simplified[
        ~observations_simplified['PATIENT'].isin(type2_patients) &
        observations_simplified['PATIENT'].isin(train_patients)
    ]
]).query('TYPE == "numeric"')


In [39]:
observations_train

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CATEGORY,CODE,DESCRIPTION,VALUE,UNITS,TYPE,DATE_SIMPLE
11038,2015-02-17T19:31:01Z,6dc3a549-a3d4-8fee-1a15-b986dbe9b1d2,5f4b2a43-7930-44d9-cb4b-4eaa92c9ccd2,laboratory,4548-4,Hemoglobin A1c/Hemoglobin.total in Blood,6.3,%,numeric,2015-02-17
11040,2015-02-17T19:31:01Z,6dc3a549-a3d4-8fee-1a15-b986dbe9b1d2,5f4b2a43-7930-44d9-cb4b-4eaa92c9ccd2,vital-signs,8302-2,Body Height,178.3,cm,numeric,2015-02-17
11042,2015-02-17T19:31:01Z,6dc3a549-a3d4-8fee-1a15-b986dbe9b1d2,5f4b2a43-7930-44d9-cb4b-4eaa92c9ccd2,vital-signs,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,2.0,{score},numeric,2015-02-17
11288,2015-02-17T19:31:01Z,6dc3a549-a3d4-8fee-1a15-b986dbe9b1d2,5f4b2a43-7930-44d9-cb4b-4eaa92c9ccd2,vital-signs,29463-7,Body Weight,94.1,kg,numeric,2015-02-17
11289,2015-02-17T19:31:01Z,6dc3a549-a3d4-8fee-1a15-b986dbe9b1d2,5f4b2a43-7930-44d9-cb4b-4eaa92c9ccd2,vital-signs,39156-5,Body mass index (BMI) [Ratio],29.6,kg/m2,numeric,2015-02-17
...,...,...,...,...,...,...,...,...,...,...
1096054,2020-06-20T17:09:05Z,3df4eaa0-3234-0118-df9e-5cbe4659744e,,,QOLS,QOLS,1.0,{score},numeric,2020-06-20
1096055,2021-06-20T17:09:05Z,3df4eaa0-3234-0118-df9e-5cbe4659744e,,,QOLS,QOLS,1.0,{score},numeric,2021-06-20
1096056,2022-06-20T17:09:05Z,3df4eaa0-3234-0118-df9e-5cbe4659744e,,,QOLS,QOLS,0.7,{score},numeric,2022-06-20
1096057,2023-06-20T17:09:05Z,3df4eaa0-3234-0118-df9e-5cbe4659744e,,,QOLS,QOLS,0.7,{score},numeric,2023-06-20


In [None]:
# now we will attach the age of the patient at teh time the numeric observation was taken to the observation data
observations_with_age = observations_train.merge(
    

## Option 2: Bag of Character N-Grams
One way that we can potentially improve on our earlier feature representation is to use a bag of N-grams \
which encodes the occurence or lack-therof of different substrings from clinical encounter descriptions \
To construct this representation, we can again make use of the scikit-learn package's `CountVectorizer` class \
but instead of using the default `"word"` analyzer, we will instead use the `"char_wb"` analyzer

In [40]:
vectorizer_ngram = CountVectorizer(
    binary=True,
    lowercase=True,
    ngram_range=(5, 5),
    analyzer='char_wb'
)

In [41]:
# condense the records into a concatenated description per patient, 
# wherein we take all event descriptions and combine them together into a single
# description that encompasses all of the clinical events for that patient
train_records_condensed = train_records.assign(
    DESCRIPTION_WS=lambda x: x['DESCRIPTION'] + ' '
).groupby('PATIENT')['DESCRIPTION_WS'].sum().reset_index()

In [42]:
train_records_condensed.iloc[0]['DESCRIPTION_WS']

'Medication review due (situation) Medication review due (situation) Medication review due (situation) Medication review due (situation) Medication review due (situation) Otitis media (disorder) Medication review due (situation) Medication review due (situation) Laceration - injury (disorder) Facial laceration (disorder) Medication review due (situation) Viral sinusitis (disorder) Childhood asthma (disorder) Medication review due (situation) Sprain (morphologic abnormality) Sprain of ankle (disorder) Medication review due (situation) Body Height Pain severity - 0-10 verbal numeric rating [Score] - Reported Body Weight Weight-for-length Per age and sex Head Occipital-frontal circumference Percentile Head Occipital-frontal circumference Diastolic Blood Pressure Systolic Blood Pressure Heart rate Respiratory rate Leukocytes [#/volume] in Blood by Automated count Erythrocytes [#/volume] in Blood by Automated count Hemoglobin [Mass/volume] in Blood Hematocrit [Volume Fraction] of Blood by A

In [43]:
train_data_final = train_records_condensed.assign(
    LABEL=lambda x: x['PATIENT'].isin(type2_patients).astype(int)
)

### Now that we have a condensed representation, we will vectorize

In [44]:
ngram_vectors = vectorizer_ngram.fit_transform(train_data_final['DESCRIPTION_WS'])

### Now We Will Test the performance of this representation on the Training Set Using KFold Cross Validation

In [45]:
kfold = StratifiedKFold(n_splits=5)
clf = RandomForestClassifier()

metrics = []
for i, (train_index, test_index) in tqdm(enumerate(kfold.split(ngram_vectors, train_data_final['LABEL'])), total=5, position=0, leave=True):
    train_x, train_y = ngram_vectors[train_index], train_data_final['LABEL'].to_numpy()[train_index]
    test_x, test_y = ngram_vectors[test_index], train_data_final['LABEL'].to_numpy()[test_index]
    # fit the model on the training fold
    clf.fit(train_x, train_y)
    # evaluate the model on the validation fold
    preds = clf.predict(test_x)
    scores = clf.predict_proba(test_x)[:, 1]
    # get the AUROC
    fpr, tpr, _ = roc_curve(test_y, scores)
    auroc = auc(fpr, tpr)
    # get the confusion matrix
    cm = confusion_matrix(test_y, preds)
    # save the metrics
    metrics.append({
        'AUROC': auroc,
        'Precision': cm[1, 1] / cm[:, 1].sum(),
        'Recall': cm[1, 1] / cm[1].sum(),
        'Specificity': cm[0, 0] / cm[0].sum()
    })
pd.DataFrame(metrics)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.37it/s]


Unnamed: 0,AUROC,Precision,Recall,Specificity
0,0.949739,1.0,0.795455,1.0
1,0.900388,1.0,0.688889,1.0
2,0.952515,1.0,0.777778,1.0
3,0.95693,1.0,0.822222,1.0
4,0.950752,1.0,0.711111,1.0


### Let's Again Look at the Top 20 Features Ranked by Importance

In [46]:
# split our training dataset into "train" and "test" sets to fit the model for explainability purposes
train_x, test_x, train_y, test_y = train_test_split(ngram_vectors, train_data_final['LABEL'].to_numpy(), test_size=0.2, stratify=train_data_final['LABEL'])

In [47]:
clf.fit(train_x, train_y)

In [48]:
print(classification_report(test_y, clf.predict(test_x)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       687
           1       1.00      0.82      0.90        45

    accuracy                           0.99       732
   macro avg       0.99      0.91      0.95       732
weighted avg       0.99      0.99      0.99       732



In [53]:
r = permutation_importance(clf, np.asarray(train_x.todense()), train_y, n_repeats=2, n_jobs=-1, scoring='recall')

In [54]:
importances_df = pd.DataFrame({
    'Feature': vectorizer_ngram.get_feature_names_out(),
    'mean_importance': r.importances_mean
})

In [55]:
top20 = importances_df.sort_values(by='mean_importance', ascending=False).head(20)

In [56]:
top20

Unnamed: 0,Feature,mean_importance
5423,oriny,0.005587
2959,ence,0.005587
1324,vict,0.005587
4034,iral,0.005587
7116,ture,0.005587
3979,inusi,0.005587
2395,colon,0.002793
6756,table,0.002793
2327,cilia,0.002793
1210,tabl,0.002793


In [None]:
for _, row in tqdm(top20.iterrows(), total=20, position=0, leave=True):
    
    