In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm

# Scenario: Creating a Set of Machine Learning Friendly Features from EHR Data to Predict Diabetes Onset

First we will load in the necessary data files

In [2]:
def load_data_for_file(filename):
    print(f"Loading data for {filename}")
    df = pd.concat([ # use pd.concat to append/concatenate the data for all states together into a single frame
        pd.read_parquet(f"https://dicbworkshops.s3.amazonaws.com/{output_dir}/parquet/{filename}") # use read_csv to load the data from each output directory
        for output_dir in tqdm(['output_hi', 'output_ma', 'output_tx', 'output_wa'], leave=True, position=0) # loop over each output directory
    ])
    return df

In [3]:
# load in the conditions
conditions = load_data_for_file('conditions.parquet')
observations = load_data_for_file('observations.parquet')
medications = load_data_for_file('medications.parquet')
procedures = load_data_for_file('procedures.parquet')

Loading data for conditions.parquet


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.42it/s]


Loading data for observations.parquet


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.29s/it]


Loading data for medications.parquet


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.30it/s]


Loading data for procedures.parquet


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.13it/s]


## Filtering Out Patients with Diagnoses of Type-2 Diabetes
For this exercise, we are interested in studying patients with a diagnosis of Type-2 diabetes \
We select these from the conditions table based on the SNOMED code `44054006`

In [4]:
type2_patients = conditions.query('CODE == 44054006').sort_values(by=['PATIENT', 'START']).drop_duplicates(subset=['PATIENT', 'START'], keep='first')

In [5]:
# now we create a dictionary/lookup table to map each patient's ID to the date of their earliest Type 2 diagnosis
patient_diagnosis_dates = {
    row['PATIENT']: row['START']
    for _, row in type2_patients.iterrows()
}

In [6]:
conditions

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,SYSTEM,CODE,DESCRIPTION
0,2001-07-18,,8f8229e6-00be-a033-bb16-42781f9d208a,249a2734-060e-3b6e-de43-c5c2c97e888b,SNOMED-CT,473461003,Educated to high school level (finding)
1,2001-07-18,,8f8229e6-00be-a033-bb16-42781f9d208a,249a2734-060e-3b6e-de43-c5c2c97e888b,SNOMED-CT,160903007,Full-time employment (finding)
2,2011-07-25,,8f8229e6-00be-a033-bb16-42781f9d208a,7570e7f5-b7a8-3848-0208-07e278d3754e,SNOMED-CT,161744009,Past pregnancy history of miscarriage (situation)
3,2014-08-13,2015-07-22,8f8229e6-00be-a033-bb16-42781f9d208a,366aac19-54b2-b4af-2b03-8d35c03ae2ba,SNOMED-CT,73595000,Stress (finding)
4,2014-12-03,2015-07-15,8f8229e6-00be-a033-bb16-42781f9d208a,a047e3c3-9e86-dd54-f026-e28929aa1e6b,SNOMED-CT,72892002,Normal pregnancy (finding)
...,...,...,...,...,...,...,...
47332,2024-12-31,2025-01-14,e4f7d897-4345-a297-0ae3-ff6fbe5a71aa,f9215af9-3736-0599-7ceb-4485b8af96e7,SNOMED-CT,80583007,Severe anxiety (panic) (finding)
47333,2025-01-07,2025-01-21,e4f7d897-4345-a297-0ae3-ff6fbe5a71aa,c9444afc-a873-a18c-1e49-eb3a92c4dc01,SNOMED-CT,314529007,Medication review due (situation)
47334,2025-01-14,2025-01-28,e4f7d897-4345-a297-0ae3-ff6fbe5a71aa,a68b03e2-9449-782b-47d4-da72c2cfecda,SNOMED-CT,423315002,Limited social contact (finding)
47335,2025-01-28,2025-02-04,e4f7d897-4345-a297-0ae3-ff6fbe5a71aa,8b0ae691-9e4c-b1ce-0cc2-da9f273db9f9,SNOMED-CT,314529007,Medication review due (situation)


In [7]:
# add simplified date columns to the observations, medications, and procedures 
observations_simplified = observations.assign(
    DATE_SIMPLE=lambda x: pd.to_datetime(x['DATE']).dt.date.astype('str')
)
medications_simplified = medications.assign(
    DATE_SIMPLE=lambda x: pd.to_datetime(x['START']).dt.date.astype('str')
)
procedures_simplified = procedures.assign(
    DATE_SIMPLE=lambda x: pd.to_datetime(x['START']).dt.date.astype('str')
)

## Filtering Out Post-diagnosis Conditions, Observations, Medications, and Procedures and Unifying Into a Shared Representation
First we need to filter out all EHR data from encounters that took place after the Type-2 diabetes diagnosis for the type 2 patients

In [8]:
def filter_data(df, patients, date_column='DATE_SIMPLE'):
    data_filtered = []
    for _, row in tqdm(df.iterrows(), total=len(df), position=0, leave=True):
        patient = row['PATIENT']
        date = row[date_column]
        if patient in patients and patients[patient] > date:
            data_filtered.append(row)
    return pd.DataFrame(data_filtered)

In [9]:
conditions_filtered = filter_data(conditions, patient_diagnosis_dates, 'START') 

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 182373/182373 [00:02<00:00, 78771.90it/s]


In [10]:
observations_subset = observations_simplified[observations_simplified['PATIENT'].isin(patient_diagnosis_dates)]
observations_filtered = filter_data(observations_subset, patient_diagnosis_dates)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 789669/789669 [00:10<00:00, 76936.18it/s]


In [11]:
medications_subset = medications_simplified[medications_simplified['PATIENT'].isin(patient_diagnosis_dates)]
medications_filtered = filter_data(medications_subset, patient_diagnosis_dates)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55269/55269 [00:00<00:00, 75076.53it/s]


In [12]:
procedures_subset = procedures_simplified[procedures_simplified['PATIENT'].isin(patient_diagnosis_dates)]
procedures_filtered = filter_data(procedures_subset, patient_diagnosis_dates)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108374/108374 [00:01<00:00, 73671.74it/s]


In [13]:
procedures_filtered

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,SYSTEM,CODE,DESCRIPTION,BASE_COST,REASONCODE,REASONDESCRIPTION,DATE_SIMPLE
39,2016-02-23T02:53:46Z,2016-02-23T03:46:34Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,799c717a-401a-e1e7-2b4f-ab040641b2b6,SNOMED-CT,710824005,Assessment of health and social care needs (pr...,600.50,,,2016-02-23
40,2016-02-23T03:46:34Z,2016-02-23T04:02:52Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,799c717a-401a-e1e7-2b4f-ab040641b2b6,SNOMED-CT,710841007,Assessment of anxiety (procedure),600.50,,,2016-02-23
41,2016-02-23T04:02:52Z,2016-02-23T04:28:16Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,799c717a-401a-e1e7-2b4f-ab040641b2b6,SNOMED-CT,866148006,Screening for domestic abuse (procedure),600.50,,,2016-02-23
42,2016-02-23T04:28:16Z,2016-02-23T04:41:42Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,799c717a-401a-e1e7-2b4f-ab040641b2b6,SNOMED-CT,171207006,Depression screening (procedure),600.50,,,2016-02-23
43,2016-02-23T04:41:42Z,2016-02-23T05:04:24Z,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,799c717a-401a-e1e7-2b4f-ab040641b2b6,SNOMED-CT,171207006,Depression screening (procedure),600.50,,,2016-02-23
...,...,...,...,...,...,...,...,...,...,...,...
188353,2000-01-17T17:24:45Z,2000-01-17T18:12:42Z,66de5451-b446-367b-821d-265b4cfda04f,f653d6cc-e7d0-1170-f770-49aff5d33dbe,SNOMED-CT,710824005,Assessment of health and social care needs (pr...,463.95,,,2000-01-17
188354,2000-01-17T18:12:42Z,2000-01-17T18:34:51Z,66de5451-b446-367b-821d-265b4cfda04f,f653d6cc-e7d0-1170-f770-49aff5d33dbe,SNOMED-CT,710841007,Assessment of anxiety (procedure),463.95,,,2000-01-17
188355,2000-01-17T18:34:51Z,2000-01-17T19:08:18Z,66de5451-b446-367b-821d-265b4cfda04f,f653d6cc-e7d0-1170-f770-49aff5d33dbe,SNOMED-CT,866148006,Screening for domestic abuse (procedure),463.95,,,2000-01-17
188356,2000-01-17T19:08:18Z,2000-01-17T19:20:13Z,66de5451-b446-367b-821d-265b4cfda04f,f653d6cc-e7d0-1170-f770-49aff5d33dbe,SNOMED-CT,171207006,Depression screening (procedure),463.95,,,2000-01-17


Now we will label the EHR data for the type 2 and non-type 2 patients and unify everything into a single event set

In [14]:
def get_unified_records(conditions, observations, medications, procedures):
    return pd.concat([
    conditions[['PATIENT', 'START', 'CODE', 'DESCRIPTION']].assign(
        EVENT_TYPE='CONDITION',
    ).rename(columns={'START': 'DATE'}),
    observations[['PATIENT', 'DATE_SIMPLE', 'CODE', 'DESCRIPTION']].assign(
        EVENT_TYPE='OBSERVATION',
    ).rename(columns={'DATE_SIMPLE': 'DATE'}),
    medications[['PATIENT', 'DATE_SIMPLE', 'CODE', 'DESCRIPTION']].assign(
        EVENT_TYPE='MEDICATION',
    ).rename(columns={'DATE_SIMPLE': 'DATE'}),
    procedures[['PATIENT', 'DATE_SIMPLE', 'CODE', 'DESCRIPTION']].assign(
        EVENT_TYPE='PROCEDURE',
    ).rename(columns={'DATE_SIMPLE': 'DATE'})
    ])

In [15]:
# starting with the Type 2 diabetes patients, we label and unify the data from medications, procedures, observations, and conditions into a single recordset
all_records_type2 = get_unified_records(conditions_filtered, observations_filtered, medications_filtered, procedures_filtered)

In [16]:
# now we will do the same for the non type2 patients
all_records_non_type2 = get_unified_records(
    conditions[~conditions['PATIENT'].isin(type2_patients['PATIENT'])],
    observations_simplified[~observations_simplified['PATIENT'].isin(type2_patients['PATIENT'])],
    medications_simplified[~medications_simplified['PATIENT'].isin(type2_patients['PATIENT'])],
    procedures_simplified[~procedures_simplified['PATIENT'].isin(type2_patients['PATIENT'])],
)

In [17]:
all_records_non_type2

Unnamed: 0,PATIENT,DATE,CODE,DESCRIPTION,EVENT_TYPE
0,8f8229e6-00be-a033-bb16-42781f9d208a,2001-07-18,473461003,Educated to high school level (finding),CONDITION
1,8f8229e6-00be-a033-bb16-42781f9d208a,2001-07-18,160903007,Full-time employment (finding),CONDITION
2,8f8229e6-00be-a033-bb16-42781f9d208a,2011-07-25,161744009,Past pregnancy history of miscarriage (situation),CONDITION
3,8f8229e6-00be-a033-bb16-42781f9d208a,2014-08-13,73595000,Stress (finding),CONDITION
4,8f8229e6-00be-a033-bb16-42781f9d208a,2014-12-03,72892002,Normal pregnancy (finding),CONDITION
...,...,...,...,...,...
212079,3df4eaa0-3234-0118-df9e-5cbe4659744e,2024-10-28,866148006,Screening for domestic abuse (procedure),PROCEDURE
212080,3df4eaa0-3234-0118-df9e-5cbe4659744e,2024-10-28,171207006,Depression screening (procedure),PROCEDURE
212081,3df4eaa0-3234-0118-df9e-5cbe4659744e,2024-10-28,171207006,Depression screening (procedure),PROCEDURE
212082,3df4eaa0-3234-0118-df9e-5cbe4659744e,2024-10-28,428211000124100,Assessment of substance use (procedure),PROCEDURE


## Now We will split our Type 2 and Non-Type 2 patients into Training and Hold-out sets


In [18]:
from sklearn.model_selection import train_test_split
type2_patients = all_records_type2['PATIENT'].unique()
non_type2_patients = all_records_non_type2['PATIENT'].unique()
labels = np.concatenate([np.ones(type2_patients.shape), np.zeros(non_type2_patients.shape)])
train_patients, test_patients, train_labels, test_labels = train_test_split(np.concatenate([type2_patients, non_type2_patients]), labels, test_size=0.2, stratify=labels)

In [19]:
# now split the records accordingly
all_records = pd.concat([all_records_type2, all_records_non_type2])
train_records = all_records[all_records['PATIENT'].isin(train_patients)]
test_records = all_records[all_records['PATIENT'].isin(test_patients)]

## Option 1: Bag of Labeled Clinical Encounters
The simplest feature representation we can create and test is a binary vector representation \
which encodes the occurence or lack-therof of different clinical encounters/events \
To construct this representation, we can use the scikit-learn package's `CountVectorizer` class

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
vectorizer = CountVectorizer(
    binary=True,
    tokenizer=lambda x: x.split('|'),
    token_pattern=None,
    lowercase=False
)

The vectorizer expects a `'|'` pipe delimited string of coded encounters, \
so we will construct this representation now for our training patients

In [22]:
# condense the records into a pipe-delimited string of event tokens per patient, 
# where each token is of the form <EVENT_TYPE>::<CODE>
train_records_condensed = train_records.assign(
    EVENT_TOKEN=lambda x: x['EVENT_TYPE'] + '::' + x['CODE'].astype(str) + '|'
).groupby(['PATIENT'])['EVENT_TOKEN'].sum().reset_index()

In [23]:
train_data_final = train_records_condensed.assign(
    LABEL=lambda x: x['PATIENT'].isin(type2_patients).astype(int)
)

In [24]:
train_data_final

Unnamed: 0,PATIENT,EVENT_TOKEN,LABEL
0,0007008e-e6b0-8ad7-b5b8-c5d6c0c731f4,CONDITION::314529007|CONDITION::314529007|COND...,0
1,00127d79-8f35-5109-fe9b-961386b57f99,CONDITION::314529007|CONDITION::66383009|CONDI...,0
2,001d8f80-cfca-0bfe-8c98-3fcaaccd1655,CONDITION::224299000|CONDITION::446654005|COND...,0
3,0029cec6-9b3b-ef64-8361-3cef09ec8439,CONDITION::314529007|CONDITION::314529007|COND...,0
4,0031b28b-6096-9263-57d1-56383bea4ba4,CONDITION::314529007|CONDITION::314529007|COND...,0
...,...,...,...
3655,ffa278a3-bf3c-4437-fe60-a0d8d83979a7,CONDITION::160968000|CONDITION::473461003|COND...,0
3656,ffa71780-67c4-7174-729c-8a3611d0385f,CONDITION::160968000|CONDITION::224295006|COND...,1
3657,ffa95e2b-cbcd-96c9-a14f-2de40f6be332,CONDITION::224295006|CONDITION::714628002|COND...,0
3658,fff1fcf6-208c-2cf7-b058-5fc63bbc22d1,CONDITION::224295006|CONDITION::422650009|COND...,1


### Now that we have constructed a Pipe Delimited Event Representation, We Can Vectorize

In [25]:
event_occurence_vectors = vectorizer.fit_transform(train_data_final['EVENT_TOKEN'])

In [26]:
event_occurence_vectors.shape

(3660, 1192)

### Now We Will Test the performance of this representation on the Training Set Using KFold Cross Validation

In [27]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
from sklearn.inspection import permutation_importance

In [28]:
kfold = StratifiedKFold(n_splits=5)
clf = RandomForestClassifier()

In [29]:
metrics = []
for i, (train_index, test_index) in tqdm(enumerate(kfold.split(event_occurence_vectors, train_data_final['LABEL'])), total=5, position=0, leave=True):
    train_x, train_y = event_occurence_vectors[train_index], train_data_final['LABEL'].to_numpy()[train_index]
    test_x, test_y = event_occurence_vectors[test_index], train_data_final['LABEL'].to_numpy()[test_index]
    # fit the model on the training fold
    clf.fit(train_x, train_y)
    # evaluate the model on the validation fold
    preds = clf.predict(test_x)
    scores = clf.predict_proba(test_x)[:, 1]
    # get the AUROC
    fpr, tpr, _ = roc_curve(test_y, scores)
    auroc = auc(fpr, tpr)
    # get the confusion matrix
    cm = confusion_matrix(test_y, preds)
    # save the metrics
    metrics.append({
        'AUROC': auroc,
        'Precision': cm[1, 1] / cm[:, 1].sum(),
        'Recall': cm[1, 1] / cm[1].sum(),
        'Specificity': cm[0, 0] / cm[0].sum()
    })
pd.DataFrame(metrics)
    

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  4.67it/s]


Unnamed: 0,AUROC,Precision,Recall,Specificity
0,0.951754,1.0,0.795455,1.0
1,0.941032,1.0,0.666667,1.0
2,0.966343,1.0,0.755556,1.0
3,0.972845,1.0,0.822222,1.0
4,0.928303,1.0,0.711111,1.0


### The model appears to be performing well so far, but is that where the story ends, or does the plot thicken?
One thing that we can do is audit the way our current model is behaving, and what its predictions are based \
on, by looking at feature importance rankings. Here we compute the permutation importance of the model features \
on a validation set, and focus in on the top 20 features ranked by importance.

In [30]:
# split our training dataset into "train" and "test" sets to fit the model for explainability purposes
train_x, test_x, train_y, test_y = train_test_split(event_occurence_vectors, train_data_final['LABEL'].to_numpy(), test_size=0.2, stratify=train_data_final['LABEL'])

In [31]:
train_x.shape

(2928, 1192)

In [32]:
test_x.shape

(732, 1192)

In [33]:
clf.fit(train_x, train_y)

In [34]:
print(classification_report(test_y, clf.predict(test_x)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       687
           1       1.00      0.64      0.78        45

    accuracy                           0.98       732
   macro avg       0.99      0.82      0.89       732
weighted avg       0.98      0.98      0.98       732



In [35]:
r = permutation_importance(clf, np.asarray(train_x.todense()), train_y, n_repeats=5, n_jobs=-1, scoring='recall')

In [36]:
r

{'importances_mean': array([0.        , 0.00558659, 0.        , ..., 0.        , 0.        ,
        0.        ]),
 'importances_std': array([0.        , 0.00353327, 0.        , ..., 0.        , 0.        ,
        0.        ]),
 'importances': array([[0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.00558659, 0.00558659, 0.00558659, 0.        , 0.01117318],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        ...,
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ]])}

In [37]:
importances_df = pd.DataFrame({
    'Feature': vectorizer.get_feature_names_out(),
    'mean_importance': r.importances_mean
})

In [38]:
top20 = importances_df.sort_values(by='mean_importance', ascending=False).head(20)

In [39]:
top20.assign(
    feature_code=lambda x: x['Feature'].apply(lambda feat: feat.split('::')[1])
).merge(all_records[['CODE', 'DESCRIPTION']].astype({'CODE': 'str'}).drop_duplicates(), left_on='feature_code', right_on='CODE').sort_values(by='mean_importance', ascending=False)

Unnamed: 0,Feature,mean_importance,feature_code,CODE,DESCRIPTION
0,CONDITION::444814009,0.014525,444814009,444814009,Viral sinusitis (disorder)
1,CONDITION::239873007,0.011173,239873007,239873007,Osteoarthritis of knee (disorder)
2,CONDITION::423315002,0.010056,423315002,423315002,Limited social contact (finding)
3,CONDITION::161744009,0.010056,161744009,161744009,Past pregnancy history of miscarriage (situation)
4,OBSERVATION::70274-6,0.010056,70274-6,70274-6,Generalized anxiety disorder 7 item (GAD-7) to...
5,CONDITION::706893006,0.008939,706893006,706893006,Victim of intimate partner abuse (finding)
6,PROCEDURE::1259293006,0.008939,1259293006,1259293006,Application of composite dental filling materi...
7,PROCEDURE::73761001,0.008939,73761001,73761001,Colonoscopy (procedure)
8,PROCEDURE::866148006,0.008939,866148006,866148006,Screening for domestic abuse (procedure)
12,OBSERVATION::2085-9,0.006704,2085-9,2085-9,Cholesterol in HDL [Mass/volume] in Serum or P...


## Option 2: Bag of Character N-Grams
One way that we can potentially improve on our earlier feature representation is to use a bag of N-grams \
which encodes the occurence or lack-therof of different substrings from clinical encounter descriptions \
To construct this representation, we can again make use of the scikit-learn package's `CountVectorizer` class \
but instead of using the default `"word"` analyzer, we will instead use the `"char_wb"` analyzer

In [40]:
vectorizer_ngram = CountVectorizer(
    binary=True,
    lowercase=True,
    ngram_range=(5, 5),
    analyzer='char_wb'
)

In [41]:
# condense the records into a concatenated description per patient, 
# wherein we take all event descriptions and combine them together into a single
# description that encompasses all of the clinical events for that patient
train_records_condensed = train_records.assign(
    DESCRIPTION_WS=lambda x: x['DESCRIPTION'] + ' '
).groupby('PATIENT')['DESCRIPTION_WS'].sum().reset_index()

In [42]:
train_records_condensed.iloc[0]['DESCRIPTION_WS']

'Medication review due (situation) Medication review due (situation) Medication review due (situation) Medication review due (situation) Medication review due (situation) Otitis media (disorder) Medication review due (situation) Medication review due (situation) Laceration - injury (disorder) Facial laceration (disorder) Medication review due (situation) Viral sinusitis (disorder) Childhood asthma (disorder) Medication review due (situation) Sprain (morphologic abnormality) Sprain of ankle (disorder) Medication review due (situation) Body Height Pain severity - 0-10 verbal numeric rating [Score] - Reported Body Weight Weight-for-length Per age and sex Head Occipital-frontal circumference Percentile Head Occipital-frontal circumference Diastolic Blood Pressure Systolic Blood Pressure Heart rate Respiratory rate Leukocytes [#/volume] in Blood by Automated count Erythrocytes [#/volume] in Blood by Automated count Hemoglobin [Mass/volume] in Blood Hematocrit [Volume Fraction] of Blood by A

In [43]:
train_data_final = train_records_condensed.assign(
    LABEL=lambda x: x['PATIENT'].isin(type2_patients).astype(int)
)

### Now that we have a condensed representation, we will vectorize

In [44]:
ngram_vectors = vectorizer_ngram.fit_transform(train_data_final['DESCRIPTION_WS'])

### Now We Will Test the performance of this representation on the Training Set Using KFold Cross Validation

In [45]:
kfold = StratifiedKFold(n_splits=5)
clf = RandomForestClassifier()

metrics = []
for i, (train_index, test_index) in tqdm(enumerate(kfold.split(ngram_vectors, train_data_final['LABEL'])), total=5, position=0, leave=True):
    train_x, train_y = ngram_vectors[train_index], train_data_final['LABEL'].to_numpy()[train_index]
    test_x, test_y = ngram_vectors[test_index], train_data_final['LABEL'].to_numpy()[test_index]
    # fit the model on the training fold
    clf.fit(train_x, train_y)
    # evaluate the model on the validation fold
    preds = clf.predict(test_x)
    scores = clf.predict_proba(test_x)[:, 1]
    # get the AUROC
    fpr, tpr, _ = roc_curve(test_y, scores)
    auroc = auc(fpr, tpr)
    # get the confusion matrix
    cm = confusion_matrix(test_y, preds)
    # save the metrics
    metrics.append({
        'AUROC': auroc,
        'Precision': cm[1, 1] / cm[:, 1].sum(),
        'Recall': cm[1, 1] / cm[1].sum(),
        'Specificity': cm[0, 0] / cm[0].sum()
    })
pd.DataFrame(metrics)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.37it/s]


Unnamed: 0,AUROC,Precision,Recall,Specificity
0,0.949739,1.0,0.795455,1.0
1,0.900388,1.0,0.688889,1.0
2,0.952515,1.0,0.777778,1.0
3,0.95693,1.0,0.822222,1.0
4,0.950752,1.0,0.711111,1.0


### Let's Again Look at the Top 20 Features Ranked by Importance

In [46]:
# split our training dataset into "train" and "test" sets to fit the model for explainability purposes
train_x, test_x, train_y, test_y = train_test_split(ngram_vectors, train_data_final['LABEL'].to_numpy(), test_size=0.2, stratify=train_data_final['LABEL'])

In [47]:
clf.fit(train_x, train_y)

In [48]:
print(classification_report(test_y, clf.predict(test_x)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       687
           1       1.00      0.82      0.90        45

    accuracy                           0.99       732
   macro avg       0.99      0.91      0.95       732
weighted avg       0.99      0.99      0.99       732



In [53]:
r = permutation_importance(clf, np.asarray(train_x.todense()), train_y, n_repeats=2, n_jobs=-1, scoring='recall')

In [54]:
importances_df = pd.DataFrame({
    'Feature': vectorizer_ngram.get_feature_names_out(),
    'mean_importance': r.importances_mean
})

In [55]:
top20 = importances_df.sort_values(by='mean_importance', ascending=False).head(20)

In [56]:
top20

Unnamed: 0,Feature,mean_importance
5423,oriny,0.005587
2959,ence,0.005587
1324,vict,0.005587
4034,iral,0.005587
7116,ture,0.005587
3979,inusi,0.005587
2395,colon,0.002793
6756,table,0.002793
2327,cilia,0.002793
1210,tabl,0.002793


In [None]:
for _, row in tqdm(top20.iterrows(), total=20, position=0, leave=True):
    
    