# Importing of All Packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
from sklearn.metrics import RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance
from tqdm import tqdm
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# register tqdm's pandas integration so that `progress_apply` (used further below) is available
tqdm.pandas()

# Scenario: Creating a Set of Machine Learning Friendly Features from EHR Data to Predict Type 2 Diabetes Onset

First we will load in the necessary data files

In [None]:
def load_data_for_file(filename):
    """Load one parquet table from every output directory and stack the results.

    Parameters
    ----------
    filename : str
        Name of the parquet file to fetch from each output directory
        (for example ``'conditions.parquet'``).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the per-directory frames. The index of each
        frame is kept as-is, so index labels may repeat in the result.
    """
    print(f"Loading data for {filename}")
    output_dirs = ['output_hi', 'output_ma', 'output_tx', 'output_wa']
    frames = []
    # read the parquet file from each output directory in turn, with a progress bar
    for output_dir in tqdm(output_dirs, leave=True, position=0):
        url = f"https://dicbworkshops.s3.amazonaws.com/{output_dir}/parquet/{filename}"
        frames.append(pd.read_parquet(url))
    # append/concatenate the data for all directories together into a single frame
    return pd.concat(frames)

In [None]:
# Each call below downloads one table from all four output directories and
# concatenates the pieces into a single DataFrame (see load_data_for_file).
# load in the conditions
conditions = load_data_for_file('conditions.parquet')
# load in the observations
observations = load_data_for_file('observations.parquet')
# load in the medications
medications = load_data_for_file('medications.parquet')
# load in the procedures
procedures = load_data_for_file('procedures.parquet')
# load in the patients
patients = load_data_for_file('patients.parquet')

## Extracting Patients with Type 2 Diabetes Diagnoses
For this exercise, we are interested in studying patients with a diagnosis of Type-2 diabetes \
We select these from the conditions table based on the SNOMED code `44054006`

In [None]:
# SNOMED code used to select Type 2 diabetes diagnoses from the conditions table
type2_code = 44054006

## Split the Data into 80/20 Training/Hold out Sets

In [None]:
# unique IDs of every patient with at least one condition row carrying the type 2 code
type2_patients = conditions.loc[conditions['CODE'] == type2_code, 'PATIENT'].unique()

In [None]:
# add a binary `label` column to the patients table: 1 if the patient ID appears
# among the type 2 patients, 0 otherwise
patients_labeled = patients.assign(
    label=patients['Id'].isin(type2_patients).astype('int')
)

In [None]:
# split the patients 80/20 into training and test sets, stratifying on the label
# so both sets keep the same type 2 / non-type 2 proportions (fixed seed for reproducibility)
patients_train, patients_test = train_test_split(
    patients_labeled,
    test_size=0.2,
    stratify=patients_labeled['label'],
    random_state=913,
)

In [None]:
# now we split the rest of the tables into train and test sets: a row goes to the
# set that its patient was assigned to in the patient-level split above
conditions_train = conditions[conditions['PATIENT'].isin(patients_train['Id'])]
conditions_test = conditions[conditions['PATIENT'].isin(patients_test['Id'])]

observations_train = observations[observations['PATIENT'].isin(patients_train['Id'])]
observations_test = observations[observations['PATIENT'].isin(patients_test['Id'])]

medications_train = medications[medications['PATIENT'].isin(patients_train['Id'])]
medications_test = medications[medications['PATIENT'].isin(patients_test['Id'])]

procedures_train = procedures[procedures['PATIENT'].isin(patients_train['Id'])]
procedures_test = procedures[procedures['PATIENT'].isin(patients_test['Id'])]

In [None]:
# now collate the per-table splits into dictionaries keyed by table name,
# which is the shape the preprocessing functions below expect
train_data = dict(
    conditions=conditions_train,
    observations=observations_train,
    medications=medications_train,
    procedures=procedures_train,
)

test_data = dict(
    conditions=conditions_test,
    observations=observations_test,
    medications=medications_test,
    procedures=procedures_test,
)

# Definition of Helper Functions Used to Compose the Larger Pipeline

In [None]:
# function to construct a lookup table of condition onset dates for patients based on the conditions table and a provided code
def get_patient_onset_dates(conditions, code):
    """Map each patient diagnosed with `code` to their earliest diagnosis date.

    Parameters
    ----------
    conditions : pandas.DataFrame
        Conditions table with at least the columns ``PATIENT``, ``START`` and ``CODE``.
    code :
        Condition code to look up; compared with ``==`` against the ``CODE`` column.

    Returns
    -------
    dict
        ``{patient_id: earliest START value}`` for every patient with at least
        one matching row. Empty when no row matches.
    """
    earliest_diagnoses = (
        conditions[conditions['CODE'] == code]  # all rows diagnosing the code
        .sort_values(by=['PATIENT', 'START'])  # earliest start date first within each patient
        # keep ONE row per patient (the earliest). De-duplicating on PATIENT alone matters:
        # de-duplicating on (PATIENT, START) would keep every distinct date and the
        # dictionary below would then end up holding the latest date instead.
        .drop_duplicates(subset=['PATIENT'], keep='first')
    )

    # now build a lookup table/dictionary to map each patient's ID to the date of their earliest onset
    return dict(zip(earliest_diagnoses['PATIENT'], earliest_diagnoses['START']))

In [None]:
# function to construct simplified date columns for the observations, medications, and procedures tables
def get_simplified_data(df, date_col, simplified_col='DATE_SIMPLE'):
    """Return a copy of `df` with a date-only string version of `date_col`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing `date_col`.
    date_col : str
        Name of the column holding the values to parse with ``pd.to_datetime``.
    simplified_col : str
        Name of the column to add (or overwrite) with the date part, as strings.

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is not modified.
    """
    # parse the timestamps, keep only the calendar date, and store it as a string
    date_only = pd.to_datetime(df[date_col]).dt.date.astype('str')
    return df.assign(**{simplified_col: date_only})

In [None]:
# function to filter out post diagnosis records from a table based on patient onset dates
def filter_data_by_onset_dates(df, patient_onset_dates, date_column='DATE_SIMPLE'):
    """Keep only the rows dated strictly before their patient's onset date.

    Rows whose patient has no entry in `patient_onset_dates` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``PATIENT`` column and the column named by `date_column`.
    patient_onset_dates : dict
        ``{patient_id: onset_date}`` lookup (see get_patient_onset_dates).
    date_column : str
        Column whose values are compared against the onset dates with ``>``;
        the two must therefore be mutually comparable.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `df`, with the original index labels, columns and
        dtypes. When nothing matches, the result is empty but keeps all columns.
    """
    # build a boolean mask row by row; zipping the two columns avoids constructing
    # a Series per row (as iterrows does) and leaves the frame's dtypes untouched
    keep = [
        patient in patient_onset_dates and patient_onset_dates[patient] > date
        for patient, date in zip(df['PATIENT'], df[date_column])
    ]
    # convert to an explicit boolean array: an empty plain list would be read by
    # pandas as an (empty) selection of labels rather than as a row mask
    return df.loc[np.asarray(keep, dtype=bool)]

In [None]:
# function to unify the records for the four different types of events/encounters into a single table
def get_unified_records(conditions, observations, medications, procedures):
    """Stack the four event tables into one table sorted by patient and date.

    Parameters
    ----------
    conditions : pandas.DataFrame
        Needs ``PATIENT``, ``START``, ``CODE`` and ``DESCRIPTION`` columns.
    observations, medications, procedures : pandas.DataFrame
        Each needs ``PATIENT``, ``DATE_SIMPLE``, ``CODE`` and ``DESCRIPTION`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``PATIENT``, ``DATE``, ``CODE``, ``DESCRIPTION`` and ``EVENT_TYPE``,
        sorted by ``PATIENT`` then ``DATE``, with a fresh 0..n-1 index.
    """
    def _as_events(frame, date_col, event_type):
        # keep the shared columns, tag the rows with their source table,
        # and give the date column a common name
        selected = frame[['PATIENT', date_col, 'CODE', 'DESCRIPTION']]
        return selected.assign(EVENT_TYPE=event_type).rename(columns={date_col: 'DATE'})

    stacked = pd.concat([
        _as_events(conditions, 'START', 'CONDITION'),
        _as_events(observations, 'DATE_SIMPLE', 'OBSERVATION'),
        _as_events(medications, 'DATE_SIMPLE', 'MEDICATION'),
        _as_events(procedures, 'DATE_SIMPLE', 'PROCEDURE'),
    ])
    return stacked.sort_values(by=['PATIENT', 'DATE']).reset_index(drop=True)

In [None]:
# function to get condensed record data from unified records
def get_condensed_record_data(unified_records):
    """Condense each patient's events into one pipe-delimited token string.

    Each token has the form ``<EVENT_TYPE>::<CODE>``; tokens appear in the row
    order of `unified_records` within each patient.

    Parameters
    ----------
    unified_records : pandas.DataFrame
        Needs ``PATIENT``, ``EVENT_TYPE`` and ``CODE`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per patient with columns ``PATIENT`` and ``EVENT_TOKEN``.
    """
    event_tokens = unified_records['EVENT_TYPE'] + '::' + unified_records['CODE'].astype(str)
    # join the tokens once per patient instead of summing strings pairwise
    # (repeated concatenation) and stripping a trailing separator afterwards
    records_condensed = (
        unified_records.assign(EVENT_TOKEN=event_tokens)
        .groupby('PATIENT')['EVENT_TOKEN']
        .agg('|'.join)
        .reset_index()
    )
    return records_condensed

In [None]:
# function to get condensed record data from unified records for n-gram input representation
def get_condensed_record_data_for_ngrams(unified_records):
    """Condense each patient's event descriptions into one space-delimited string.

    Descriptions appear in the row order of `unified_records` within each
    patient. Missing descriptions are skipped, and trailing whitespace is
    removed from the final string.

    Parameters
    ----------
    unified_records : pandas.DataFrame
        Needs ``PATIENT`` and ``DESCRIPTION`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per patient with columns ``PATIENT`` and ``EVENT_DESCRIPTION``.
        A patient whose descriptions are all missing gets an empty string.
    """
    def _join_descriptions(descriptions):
        # a single join per patient replaces pairwise string summation;
        # dropna keeps missing descriptions from breaking the join
        return ' '.join(descriptions.dropna()).rstrip()

    records_condensed = (
        unified_records.assign(EVENT_DESCRIPTION=unified_records['DESCRIPTION'])
        .groupby('PATIENT')['EVENT_DESCRIPTION']
        .agg(_join_descriptions)
        .reset_index()
    )
    return records_condensed

In [None]:
# function to vectorize the unified record data into binary occurrence format
def get_multihot_vector_representation(condensed_records, vectorizer, feature_col='EVENT_TOKEN', train=True):
    """Vectorize one column of the condensed records with `vectorizer`.

    Parameters
    ----------
    condensed_records : pandas.DataFrame
        Table containing the column named by `feature_col`.
    vectorizer :
        Object exposing ``fit_transform`` and ``transform``.
    feature_col : str
        Name of the column passed to the vectorizer.
    train : bool
        When truthy the vectorizer is fitted on the data before transforming
        it; otherwise the vectorizer is only applied.

    Returns
    -------
    Whatever the chosen vectorizer method returns.
    """
    documents = condensed_records[feature_col]
    # fit only on training data; other data reuses the already fitted vectorizer
    vectorize = vectorizer.fit_transform if train else vectorizer.transform
    return vectorize(documents)
    

In [None]:
# function to add patient ages to a given dataframe, and compute age bins from those ages
def get_aged_patient_data(events_df, patients_df, bin_width=5, date_col='DATE_SIMPLE'):
    """Attach the patient's age at each event, plus an age bin and a readable label.

    Parameters
    ----------
    events_df : pandas.DataFrame
        Event table with a ``PATIENT`` column and the column named by `date_col`.
    patients_df : pandas.DataFrame
        Patient table with ``Id`` and ``BIRTHDATE`` columns.
    bin_width : int
        Width of each age bin.
    date_col : str
        Column of `events_df` holding the event date.

    Returns
    -------
    pandas.DataFrame
        The inner merge of the events with the patients' ``Id``/``BIRTHDATE``,
        extended with ``AGE`` (elapsed days floor-divided by 365), ``AGE_BIN``
        (the ``pd.cut`` interval) and ``AGE_RANGE`` (a label such as ``"6-10"``).
    """
    def _readable_range(interval):
        # the lowest interval starts at (or just below) zero and is shown from its
        # own lower edge; the others are right-closed, so they start one above it
        upper = int(interval.right)
        if interval.left <= 0:
            return f"{int(interval.left)}-{upper}"
        return f"{int(interval.left)+1}-{upper}"

    # first merge in the birthdates from the patients dataframe
    birthdates = patients_df[['Id', 'BIRTHDATE']]
    result = events_df.merge(birthdates, left_on='PATIENT', right_on='Id')

    # now calculate the age from the birthdate and the date column
    elapsed = pd.to_datetime(result[date_col]) - pd.to_datetime(result['BIRTHDATE'])
    result['AGE'] = elapsed.dt.days // 365

    # bin edges run from 0 up to (at least) the largest observed age
    bin_edges = list(np.arange(0, result['AGE'].max() + bin_width, bin_width))
    result['AGE_BIN'] = pd.cut(result['AGE'], bins=bin_edges, include_lowest=True)

    # now get the human readable age bin
    result['AGE_RANGE'] = result['AGE_BIN'].apply(_readable_range)
    return result

In [None]:
# function to build age_binned distributions for numeric observations
def get_age_binned_distributions(observations_df):
    """Collect the sorted values of every numeric observation per code and age range.

    Parameters
    ----------
    observations_df : pandas.DataFrame
        Observations with ``TYPE``, ``CODE``, ``AGE_RANGE`` and ``VALUE`` columns.
        Only rows whose ``TYPE`` equals ``"numeric"`` are used.

    Returns
    -------
    dict
        ``{code: {age_range: [values sorted ascending]}}`` with one entry per
        code that has at least one numeric row.
    """
    # first make sure that we are only considering numeric observations
    numeric_obs = observations_df[observations_df['TYPE'] == 'numeric']
    # build the datastructure to store the distributions
    observation_distributions = {
        code: {}
        for code in numeric_obs['CODE'].unique()
    }
    # now walk the three needed columns in lockstep and append each value in place;
    # appending (rather than rebuilding the list with `+ [value]` for every row)
    # keeps the pass linear in the number of observations
    rows = zip(numeric_obs['CODE'], numeric_obs['AGE_RANGE'], numeric_obs['VALUE'])
    for code, age_range, value in rows:
        observation_distributions[code].setdefault(age_range, []).append(value)
    # now sort all of the distributions (the percentile ranking relies on sorted input)
    for age_bin_dists in observation_distributions.values():
        for age_bin_dist in age_bin_dists.values():
            age_bin_dist.sort()
    return observation_distributions
        

In [None]:
# Fast implementation of percentile ranking, assume group is sorted
def fast_percentile_ranking(group, value):
    """Return the percentile rank (0-100) of `value` within the sorted `group`.

    Returns ``np.nan`` when `group` is empty.
    """
    size = len(group)
    if size == 0:
        return np.nan
    return recursive_ranking(group, value, 0, size - 1)


def recursive_ranking(group, value, start, end):
    """Binary-search `group[start:end + 1]` for `value` and return its percentile rank.

    When `value` is absent, the rank is the share of elements that sort below
    it. When it is present, the rank is the mean of the ranks of the first and
    last elements of the run of values equal to it. `group` must be sorted
    ascending. (The name is kept for callers; the search itself is iterative.)
    """
    total = len(group)
    lo, hi = start, end
    while lo <= hi:
        mid = (lo + hi) // 2
        pivot = group[mid]
        if value > pivot:
            lo = mid + 1
            continue
        if value < pivot:
            hi = mid - 1
            continue

        # neither above nor below the pivot: widen to the full run of equal
        # values that lies inside the current search window
        first = mid
        while first > lo and group[first - 1] == pivot:
            first -= 1
        last = mid
        while last < hi and group[last + 1] == pivot:
            last += 1

        if not (group[first] == value):
            first += 1
        if not (group[last] == value):
            last -= 1

        # average the percentiles of the two ends of the run of identical values
        first_percentile = (first+1) / total * 100
        last_percentile = (last+1) / total * 100
        return (first_percentile + last_percentile) / 2

    # window exhausted without a match: `lo` elements sort below the value
    return lo / total * 100

In [None]:
# function to compute the percentile score of all numeric observations against those in their age cohort
def get_percentile_ranked_observations(observations_df, observation_distributions):
    """Add a ``PERCENTILE_RANK`` column ranking each value within its code/age cohort.

    Parameters
    ----------
    observations_df : pandas.DataFrame
        Observations with ``CODE``, ``AGE_RANGE`` and ``VALUE`` columns.
    observation_distributions : dict
        ``{code: {age_range: sorted values}}`` as built by
        get_age_binned_distributions.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``PERCENTILE_RANK`` column. A row whose code
        or age range has no distribution is ranked against an empty list.
    """
    def _rank_row(row):
        # fall back to an empty cohort when the code or age range is unknown
        cohort_values = observation_distributions.get(row['CODE'], {}).get(row['AGE_RANGE'], [])
        return fast_percentile_ranking(cohort_values, row['VALUE'])

    # progress_apply is the tqdm-enabled counterpart of DataFrame.apply
    return observations_df.assign(
        PERCENTILE_RANK=lambda frame: frame.progress_apply(_rank_row, axis=1)
    )

In [None]:
# function to add the label for the percentile score of the ranked numeric observations to the CODE column
def get_percentile_labeled_observations(observations_df, percentiles=4):
    """Suffix each observation's ``CODE`` with the percentile bin of its rank.

    Parameters
    ----------
    observations_df : pandas.DataFrame
        Observations with ``CODE`` (string) and ``PERCENTILE_RANK`` (0-100) columns.
    percentiles : int
        Number of equal-width bins to split the 0-100 range into.

    Returns
    -------
    pandas.DataFrame
        A copy without the rows whose ``PERCENTILE_RANK`` is missing, with a
        fresh index, a ``PERCENTILE`` interval column, and ``CODE`` rewritten to
        e.g. ``"<code>__0th - 25th Percentile"``.
    """
    # make sure we drop any observations for which the percentile_rank is nan
    observations_no_nan = observations_df.dropna(subset=['PERCENTILE_RANK']).reset_index(drop=True).copy()
    # use the pd.cut function to bin the PERCENTILE_RANK into percentiles many bins;
    # linspace gives exactly `percentiles` bins ending at 100 even when `percentiles`
    # does not divide 100 (integer steps would overshoot and add a bin past 100)
    bin_edges = np.linspace(0, 100, percentiles + 1)
    observations_no_nan['PERCENTILE'] = pd.cut(
        observations_no_nan['PERCENTILE_RANK'],
        bins=bin_edges,
        include_lowest=True
    )
    # edges are truncated to whole numbers for the label; the lowest interval's
    # left edge sits just below zero and therefore truncates to 0
    bin_labels = observations_no_nan['PERCENTILE'].map(
        lambda interval: f"{int(interval.left)}th - {int(interval.right)}th Percentile"
    ).astype('str')
    observations_no_nan['CODE'] = observations_no_nan['CODE'] + '__' + bin_labels
    return observations_no_nan
    

In [None]:
def train_and_evaluate_classifier_kfold(clf, X, y, k=5):
    """Fit and score `clf` on `k` stratified, shuffled folds of ``(X, y)``.

    Parameters
    ----------
    clf :
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``. It is
        refitted in place for every fold, so afterwards it holds the fit from
        the final fold.
    X, y :
        Features and binary labels; both are indexed with the fold index arrays.
    k : int
        Number of folds.

    Returns
    -------
    tuple
        ``(metrics, fpr, tpr, cm)`` where ``metrics`` is a DataFrame with one
        row per fold (AUROC, Precision, Recall, Specificity) and ``fpr``,
        ``tpr`` and ``cm`` come from the last fold only.
    """
    splitter = StratifiedKFold(n_splits=k, random_state=913, shuffle=True)
    fold_metrics = []
    folds = tqdm(enumerate(splitter.split(X, y)), total=k, position=0, leave=True)
    for _fold_number, (train_index, test_index) in folds:
        train_x, train_y = X[train_index], y[train_index]
        test_x, test_y = X[test_index], y[test_index]

        # fit the model on the training fold
        clf.fit(train_x, train_y)

        # evaluate the model on the validation fold
        preds = clf.predict(test_x)
        scores = clf.predict_proba(test_x)[:, 1]

        # get the ROC curve and the area under it
        fpr, tpr, _thresholds = roc_curve(test_y, scores)
        auroc = auc(fpr, tpr)

        # confusion matrix: rows are the true classes, columns the predicted ones
        cm = confusion_matrix(test_y, preds)
        true_positives = cm[1, 1]
        true_negatives = cm[0, 0]

        # save the metrics
        fold_metrics.append({
            'AUROC': auroc,
            'Precision': true_positives / cm[:, 1].sum(),
            'Recall': true_positives / cm[1].sum(),
            'Specificity': true_negatives / cm[0].sum()
        })

    return pd.DataFrame(fold_metrics), fpr, tpr, cm

In [None]:
# gets feature importance rankings assuming a tree-based model as clf (DecisionTree, RandomForest, ExtraTrees, etc.)
def get_feature_importance_rankings(clf, vectorizer, all_records):
    """Rank the vectorizer's features by ``clf.feature_importances_``.

    Parameters
    ----------
    clf :
        Fitted model exposing ``feature_importances_``.
    vectorizer :
        Fitted vectorizer exposing ``vocabulary_`` (token -> column index);
        tokens are expected to look like ``<EVENT_TYPE>::<CODE>``.
    all_records : pandas.DataFrame
        Records with ``CODE`` and ``DESCRIPTION`` columns, used to attach a
        description to each feature's code.

    Returns
    -------
    pandas.DataFrame
        Columns ``FEATURE_NAME``, ``FEATURE_IMPORTANCE``, ``CODE`` and
        ``DESCRIPTION``, sorted by importance, largest first. Features whose
        code has no description are dropped by the inner merge.
    """
    # invert the vocabulary so that a column index gives back its token
    index_to_token = {position: token for token, position in vectorizer.vocabulary_.items()}
    importances = clf.feature_importances_
    feature_names = [index_to_token[position] for position in range(len(importances))]

    ranking = pd.DataFrame({
        'FEATURE_NAME': feature_names,
        'FEATURE_IMPORTANCE': importances
    })
    # the code is the part of the token after the '::' separator
    ranking['CODE'] = [name.split('::')[1] for name in feature_names]

    code_descriptions = all_records[['CODE', 'DESCRIPTION']].drop_duplicates().astype({'CODE': str})
    merged = ranking.merge(code_descriptions, on='CODE')
    return merged.sort_values(by='FEATURE_IMPORTANCE', ascending=False)

In [None]:
def get_feature_importance_rankings_lr(clf, vectorizer, all_records):
    """Rank the vectorizer's features by the magnitude of a linear model's coefficients.

    Parameters
    ----------
    clf :
        Fitted model exposing ``coef_``; the first row of coefficients is used.
    vectorizer :
        Fitted vectorizer exposing ``vocabulary_`` (token -> column index);
        tokens are expected to look like ``<EVENT_TYPE>::<CODE>``.
    all_records : pandas.DataFrame
        Records with ``CODE`` and ``DESCRIPTION`` columns, used to attach a
        description to each feature's code.

    Returns
    -------
    pandas.DataFrame
        Columns ``FEATURE_NAME``, ``FEATURE_IMPORTANCE`` (signed coefficient),
        ``ABSOLUTE_IMPORTANCE``, ``CODE`` and ``DESCRIPTION``, sorted by
        absolute importance, largest first. Features whose code has no
        description are dropped by the inner merge.
    """
    # invert the vocabulary so that a column index gives back its token
    index_to_token = {position: token for token, position in vectorizer.vocabulary_.items()}
    importances = clf.coef_[0]
    feature_names = [index_to_token[position] for position in range(len(importances))]

    ranking = pd.DataFrame({
        'FEATURE_NAME': feature_names,
        'FEATURE_IMPORTANCE': importances,
        'ABSOLUTE_IMPORTANCE': np.abs(importances)
    })
    # the code is the part of the token after the '::' separator
    ranking['CODE'] = [name.split('::')[1] for name in feature_names]

    code_descriptions = all_records[['CODE', 'DESCRIPTION']].drop_duplicates().astype({'CODE': str})
    merged = ranking.merge(code_descriptions, on='CODE')
    return merged.sort_values(by='ABSOLUTE_IMPORTANCE', ascending=False)

In [None]:
def get_feature_importance_rankings_ngram_lr(clf, vectorizer):
    """Rank n-gram features by the magnitude of a linear model's coefficients.

    Parameters
    ----------
    clf :
        Fitted model exposing ``coef_``; the first row of coefficients is used.
    vectorizer :
        Fitted vectorizer exposing ``vocabulary_`` (n-gram -> column index).

    Returns
    -------
    pandas.DataFrame
        Columns ``FEATURE_NAME``, ``FEATURE_IMPORTANCE`` (signed coefficient)
        and ``ABSOLUTE_IMPORTANCE``, sorted by absolute importance, largest
        first. The index still holds each feature's column position.
    """
    # invert the vocabulary so that a column index gives back its n-gram
    index_to_ngram = {position: ngram for ngram, position in vectorizer.vocabulary_.items()}
    importances = clf.coef_[0]
    feature_names = [index_to_ngram[position] for position in range(len(importances))]

    ranking = pd.DataFrame({
        'FEATURE_NAME': feature_names,
        'FEATURE_IMPORTANCE': importances,
        'ABSOLUTE_IMPORTANCE': np.abs(importances)
    })
    return ranking.sort_values(by='ABSOLUTE_IMPORTANCE', ascending=False)

# Common Data Preprocessing Steps
All of the pipelines that we look at today will depend on the same set of preprocessing steps, \
so we have broken those out into a separate function that can be run once so the processed data can \
be reused by all of the pipelines.

In [None]:
def get_preprocessed_data(data):
    """Run the preprocessing shared by all pipelines on one data split.

    Parameters
    ----------
    data : dict
        Holds the ``'conditions'``, ``'observations'``, ``'medications'`` and
        ``'procedures'`` DataFrames of one split.

    Returns
    -------
    tuple of dict
        ``(type2_data, non_type2_data)``, each keyed like `data`.
        ``type2_data`` holds only the rows dated before the patient's type 2
        onset date. ``non_type2_data`` holds all rows of every patient that
        does not appear in ``type2_data``. Cause-of-death observations
        (code ``69453-9``) are removed from both.
    """
    # unpack the four tables for less verbose access
    conditions, observations, medications, procedures = (
        data[table] for table in ('conditions', 'observations', 'medications', 'procedures')
    )

    print("Getting onset dates...")
    # onset dates per patient for the module-level type 2 diabetes code
    type2_onset_dates = get_patient_onset_dates(conditions, type2_code)

    print("Simplifying dates...")
    # add simplified date columns to the observations, medications, and procedures
    observations_simple = get_simplified_data(observations, 'DATE')
    medications_simple = get_simplified_data(medications, 'START')
    procedures_simple = get_simplified_data(procedures, 'START')

    # drop all cause of death observations from the data
    is_not_cause_of_death = observations_simple['CODE'] != '69453-9'
    observations_non_cod = observations_simple[is_not_cause_of_death]

    print("Filtering out postdiagnosis events...")
    # keep only pre-diagnosis information for the type 2 patients
    type2_data = {
        'conditions': filter_data_by_onset_dates(conditions, type2_onset_dates, 'START'),
        'observations': filter_data_by_onset_dates(observations_non_cod, type2_onset_dates),
        'medications': filter_data_by_onset_dates(medications_simple, type2_onset_dates),
        'procedures': filter_data_by_onset_dates(procedures_simple, type2_onset_dates)
    }

    # the set of unique type2 patients who have a pre-diagnosis record in any table
    type2_patients = pd.concat([table['PATIENT'] for table in type2_data.values()]).unique()

    def _without_type2_patients(table):
        # every row whose patient is not among the type 2 patients found above
        return table[~table['PATIENT'].isin(type2_patients)]

    # now extract the records for everyone else
    non_type2_data = {
        'conditions': _without_type2_patients(conditions),
        'observations': _without_type2_patients(observations_non_cod),
        'medications': _without_type2_patients(medications_simple),
        'procedures': _without_type2_patients(procedures_simple)
    }

    return type2_data, non_type2_data

In [None]:
# run the shared preprocessing once on the training split; the results are reused by the pipelines below
type2_data, non_type2_data = get_preprocessed_data(train_data)

# Pipeline/Feature Engineering Alternative Specifications and Evaluation

## Option 1: Bag of Labeled Clinical Encounters (Many-hot/multi-hot encoding)
The simplest feature representation we can create and test is a binary vector (many-hot/multi-hot) representation, \
which encodes the occurrence, or lack thereof, of different clinical encounters/events in each patient's EHR record. \
To construct this representation, we can use the scikit-learn package's `CountVectorizer` class.

In [None]:
def pipeline_option1(type2_data, non_type2_data):
    """Pipeline 1: multi-hot encoding of event tokens + logistic regression.

    Parameters
    ----------
    type2_data, non_type2_data : dict
        Preprocessed tables (``'conditions'``, ``'observations'``,
        ``'medications'``, ``'procedures'``) for the positive and negative class.

    Returns
    -------
    tuple
        ``(results, clf, vectorizer, all_records)``: the per-fold metrics, the
        classifier (refitted in every fold, so it holds the last fold's fit),
        the fitted vectorizer, and the unified records of both classes.
        The ROC curve and confusion matrix of the last fold are plotted.
    """
    tables = ('conditions', 'observations', 'medications', 'procedures')

    print("Unifying data for type 2 and non-type2 patients...")
    # unify the four tables of each class into a single set of records
    unified_records_type2 = get_unified_records(*(type2_data[table] for table in tables))
    unified_records_non_type2 = get_unified_records(*(non_type2_data[table] for table in tables))

    print("Converting to condensed representation...")
    # condense to one token string per patient, then stack the two classes with their labels
    type2_condensed = get_condensed_record_data(unified_records_type2)
    non_type2_condensed = get_condensed_record_data(unified_records_non_type2)
    labeled_frames = [
        type2_condensed.assign(LABEL=1),
        non_type2_condensed.assign(LABEL=0),
    ]
    all_data_condensed = pd.concat(labeled_frames)

    print("Vectorizing data...")
    # binary vectorizer that treats each pipe-delimited token as one feature
    vectorizer = CountVectorizer(
        binary=True,
        tokenizer=lambda x: x.split('|'),
        token_pattern=None,
        lowercase=False
    )
    multi_hot_vectors = get_multihot_vector_representation(all_data_condensed, vectorizer)

    # print out the dimensionality of the multi_hot_vectors
    print(f"Multi hot vectors have {multi_hot_vectors.shape[1]} features")

    print("Fitting and evaluating classifier...")
    # construct the logistic regression classifier and evaluate it using KFold cross-validation
    clf = LogisticRegression(
        solver='lbfgs',
        max_iter=1000,
        random_state=913
    )
    labels = all_data_condensed['LABEL'].to_numpy()
    results, fpr, tpr, cm = train_and_evaluate_classifier_kfold(clf, multi_hot_vectors, labels)

    # create and display the ROC curve plot (curve and AUROC of the last fold)
    roc_plot = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=results['AUROC'].iloc[-1], estimator_name='Logistic Regression - Pipeline 1')
    roc_plot.plot()
    plt.show()

    # create and display the confusion matrix plot
    cm_plot = ConfusionMatrixDisplay(cm, display_labels=np.array(['Non-Type 2', 'Type 2']))
    cm_plot.plot(cmap=plt.cm.Blues)
    plt.show()

    # now return the results, trained classifier, vectorizer, and all unified records
    all_records = pd.concat([unified_records_type2, unified_records_non_type2])
    return results, clf, vectorizer, all_records

    
    

In [None]:
# run pipeline 1 on the preprocessed training data (also shows the ROC and confusion matrix plots)
results, clf, vectorizer, all_records = pipeline_option1(type2_data, non_type2_data)

In [None]:
# display the per-fold cross-validation metrics of pipeline 1
results

In [None]:
# mean AUROC across the cross-validation folds of pipeline 1
results['AUROC'].mean()

### Auditing the Model with Feature Importance Rankings
One thing that we can do is audit the way our current model is behaving, and what its predictions are based \
on, by looking at feature importance rankings. Here we look at the values of the coefficients of the logistic regression model  \
to get a sense for how each binary feature is contributing to the log odds of the binary target variable.

In [None]:
# rank the features of pipeline 1 by the absolute value of their logistic regression coefficient
feature_importances = get_feature_importance_rankings_lr(clf, vectorizer, all_records)

In [None]:
# show the 50 highest-ranked features
feature_importances.head(50)

## Option 2: Binary Occurrence with Inclusion of Discretized Numeric Features
While the feature importance rankings revealed known co-morbidities (e.g., tooth loss) and known risk factors for diabetes (e.g., Prediabetes, BMI), \
we have left out entirely the numeric data from the observations (lab and vital sign measures), which are likely to contain important information. \
Here we look at one technique for incorporating this information while maintaining the binary occurrence vector representation that we used previously.

In [None]:
def additional_preprocessing_steps(type2_data, non_type2_data, observation_distributions=None):
    """Discretize numeric observations into age-cohort percentile bins.

    Numeric observations get their ``CODE`` suffixed with the percentile bin of
    their value relative to same-code observations in the same age range;
    non-numeric observations are kept and only gain the age columns.

    Parameters
    ----------
    type2_data, non_type2_data : dict
        Preprocessed tables for the positive and negative class.
    observation_distributions : dict, optional
        Precomputed ``{code: {age_range: sorted values}}``. When ``None`` the
        distributions are computed from the numeric observations of both
        classes passed in.

    Returns
    -------
    tuple
        ``(type2_data_processed, non_type2_data_processed, observation_distributions)``
        where the two dicts equal the inputs except for ``'observations'``.

    Notes
    -----
    Birthdates are read from the module-level ``patients`` table.
    """
    print("Adding patient ages to observations...")
    ###### NEW STEP: Add ages to the observations data and compute age bins ######
    observations_type2_aged = get_aged_patient_data(type2_data['observations'], patients)
    observations_non_type2_aged = get_aged_patient_data(non_type2_data['observations'], patients)

    numeric_only = 'TYPE == "numeric"'
    observations_type2_numeric = observations_type2_aged.query(numeric_only)
    observations_non_type2_numeric = observations_non_type2_aged.query(numeric_only)

    if observation_distributions is None:
        print("Computing age-binned numeric observation distributions...")
        ###### NEW STEP: Compute the age-binned numeric observation distributions ######
        all_numeric = pd.concat([observations_type2_numeric, observations_non_type2_numeric])
        observation_distributions = get_age_binned_distributions(all_numeric)

    def _discretize(aged, numeric):
        # rank each numeric value within its age cohort, fold the percentile bin into CODE,
        # then recombine with the non-numeric observations
        ranked = get_percentile_ranked_observations(numeric, observation_distributions)
        labeled = get_percentile_labeled_observations(ranked)
        return pd.concat([
            labeled.drop(columns=['PERCENTILE', 'PERCENTILE_RANK']),
            aged.query('TYPE != "numeric"')
        ])

    print("Computing percentile rank of numeric observations...")
    ###### NEW STEP: Compute the percentile rank of each observation against those for patients in the same age cohort ######
    observations_type2_final = _discretize(observations_type2_aged, observations_type2_numeric)
    observations_non_type2_final = _discretize(observations_non_type2_aged, observations_non_type2_numeric)

    # copy the input dictionaries, swapping in the processed observations
    type2_data_processed = {**type2_data, 'observations': observations_type2_final}
    non_type2_data_processed = {**non_type2_data, 'observations': observations_non_type2_final}

    return type2_data_processed, non_type2_data_processed, observation_distributions
    

In [None]:
# discretize the numeric observations of the training data; no distributions are passed in,
# so they are computed here and kept in `observation_distributions` for later reuse
type2_data_p2, non_type2_data_p2, observation_distributions = additional_preprocessing_steps(type2_data, non_type2_data)

In [None]:
def pipeline_option2(type2_data, non_type2_data):
    """Pipeline 2: multi-hot encoding of event tokens + logistic regression.

    The steps match pipeline 1; the difference lies in the input, which is
    expected to carry percentile-labeled observation codes.

    Parameters
    ----------
    type2_data, non_type2_data : dict
        Processed tables (``'conditions'``, ``'observations'``,
        ``'medications'``, ``'procedures'``) for the positive and negative class.

    Returns
    -------
    tuple
        ``(results, clf, vectorizer, all_records)``: the per-fold metrics, the
        classifier (refitted in every fold, so it holds the last fold's fit),
        the fitted vectorizer, and the unified records of both classes.
        The ROC curve and confusion matrix of the last fold are plotted.
    """
    tables = ('conditions', 'observations', 'medications', 'procedures')

    print("Unifying and condensing records...")
    # unify the four tables of each class into a single set of records
    unified_records_type2 = get_unified_records(*(type2_data[table] for table in tables))
    unified_records_non_type2 = get_unified_records(*(non_type2_data[table] for table in tables))

    # condense to one token string per patient, then stack the two classes with their labels
    type2_condensed = get_condensed_record_data(unified_records_type2)
    non_type2_condensed = get_condensed_record_data(unified_records_non_type2)
    labeled_frames = [
        type2_condensed.assign(LABEL=1),
        non_type2_condensed.assign(LABEL=0),
    ]
    all_data_condensed = pd.concat(labeled_frames)

    print("Vectorizing data...")
    # binary vectorizer that treats each pipe-delimited token as one feature
    vectorizer = CountVectorizer(
        binary=True,
        tokenizer=lambda x: x.split('|'),
        token_pattern=None,
        lowercase=False
    )
    multi_hot_vectors = get_multihot_vector_representation(all_data_condensed, vectorizer)

    print(f"Multi hot vectors have {multi_hot_vectors.shape[1]} features")

    print("Fitting and evaluating classifier...")
    # construct the logistic regression classifier and evaluate it using KFold cross-validation
    clf = LogisticRegression(
        solver='lbfgs',
        max_iter=1000,
        random_state=913
    )
    labels = all_data_condensed['LABEL'].to_numpy()
    results, fpr, tpr, cm = train_and_evaluate_classifier_kfold(clf, multi_hot_vectors, labels)

    # create and display the ROC curve plot (curve and AUROC of the last fold)
    roc_plot = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=results['AUROC'].iloc[-1], estimator_name='Logistic Regression - Pipeline 2')
    roc_plot.plot()
    plt.show()

    # create and display the confusion matrix plot
    cm_plot = ConfusionMatrixDisplay(cm, display_labels=np.array(['Non-Type 2', 'Type 2']))
    cm_plot.plot(cmap=plt.cm.Blues)
    plt.show()

    # now return the results, trained classifier, vectorizer, and all unified records
    all_records = pd.concat([unified_records_type2, unified_records_non_type2])
    return results, clf, vectorizer, all_records

In [None]:
# run pipeline 2 on the training data with percentile-labeled observations
results_p2, clf_p2, vectorizer_p2, all_records_p2 = pipeline_option2(type2_data_p2, non_type2_data_p2)

In [None]:
# display the per-fold cross-validation metrics of pipeline 2
results_p2

In [None]:
# mean AUROC across the cross-validation folds of pipeline 2
results_p2['AUROC'].mean()

In [None]:
# rank the features of pipeline 2 by the absolute value of their logistic regression coefficient
feature_importances_p2 = get_feature_importance_rankings_lr(clf_p2, vectorizer_p2, all_records_p2)

In [None]:
# show the 50 highest-ranked features of pipeline 2
feature_importances_p2.head(50)

## Option 3: Bag of Character N-Grams
One way that we can potentially improve on our earlier feature representation is to use a bag of N-grams, \
which encodes the occurrence, or lack thereof, of different substrings from clinical encounter descriptions. \
To construct this representation, we can again make use of the scikit-learn package's `CountVectorizer` class, \
but instead of using the default `"word"` analyzer, we will use the `"char_wb"` analyzer.

In [None]:
def pipeline_option3(type2_data, non_type2_data):
    """Pipeline 3: bag of character 5-grams over event descriptions + logistic regression.

    Parameters
    ----------
    type2_data, non_type2_data : dict
        Processed tables (``'conditions'``, ``'observations'``,
        ``'medications'``, ``'procedures'``) for the positive and negative class.

    Returns
    -------
    tuple
        ``(results, clf, vectorizer, all_records)``: the per-fold metrics, the
        classifier (refitted in every fold, so it holds the last fold's fit),
        the fitted vectorizer, and the unified records of both classes.
        The ROC curve and confusion matrix of the last fold are plotted.
    """
    tables = ('conditions', 'observations', 'medications', 'procedures')

    print("Unifying and condensing records...")
    # unify the four tables of each class into a single set of records
    unified_records_type2 = get_unified_records(*(type2_data[table] for table in tables))
    unified_records_non_type2 = get_unified_records(*(non_type2_data[table] for table in tables))

    # condense to one description string per patient
    ##### MODIFICATION: Here we use spaces as a separator instead of '|' characters as we are working with n-grams
    type2_condensed = get_condensed_record_data_for_ngrams(unified_records_type2)
    non_type2_condensed = get_condensed_record_data_for_ngrams(unified_records_non_type2)

    # stack the two classes together with their labels
    labeled_frames = [
        type2_condensed.assign(LABEL=1),
        non_type2_condensed.assign(LABEL=0),
    ]
    all_data_condensed = pd.concat(labeled_frames)

    print("Vectorizing data...")
    # binary vectorizer over character n-grams
    vectorizer = CountVectorizer(
        binary=True,
        analyzer='char_wb', ##### MODIFICATION: Use character word boundaries as the analyzer mode #####
        ngram_range=(5, 5), ##### MODIFICATION: We want character 5-grams #####
        lowercase=True ##### MODIFICATION: We want to treat "Blood" the same way as "blood" for example #####
    )
    multi_hot_vectors = get_multihot_vector_representation(
        all_data_condensed, vectorizer, feature_col='EVENT_DESCRIPTION'
    )

    print(f"Multi hot vectors have {multi_hot_vectors.shape[1]} features")

    print("Fitting and evaluating classifier...")
    # construct the logistic regression classifier and evaluate it using KFold cross-validation
    clf = LogisticRegression(
        solver='lbfgs',
        max_iter=1000,
        random_state=913,
    )
    labels = all_data_condensed['LABEL'].to_numpy()
    results, fpr, tpr, cm = train_and_evaluate_classifier_kfold(clf, multi_hot_vectors, labels)

    # create and display the ROC curve plot (curve and AUROC of the last fold)
    roc_plot = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=results['AUROC'].iloc[-1], estimator_name='Logistic Regression - Pipeline 3')
    roc_plot.plot()
    plt.show()

    # create and display the confusion matrix plot
    cm_plot = ConfusionMatrixDisplay(cm, display_labels=np.array(['Non-Type 2', 'Type 2']))
    cm_plot.plot(cmap=plt.cm.Blues)
    plt.show()

    # now return the results, trained classifier, vectorizer, and all unified records
    all_records = pd.concat([unified_records_type2, unified_records_non_type2])
    return results, clf, vectorizer, all_records

In [None]:
# run pipeline 3 on the same processed training data that pipeline 2 used
results_p3, clf_p3, vectorizer_p3, all_records_p3 = pipeline_option3(type2_data_p2, non_type2_data_p2)

In [None]:
# display the per-fold cross-validation metrics of pipeline 3
results_p3

In [None]:
# mean AUROC across the cross-validation folds of pipeline 3
results_p3['AUROC'].mean()

In [None]:
# rank the character n-gram features of pipeline 3 by the absolute value of their coefficient
feature_importances_p3 = get_feature_importance_rankings_ngram_lr(clf_p3, vectorizer_p3)

In [None]:
# show the 50 highest-ranked n-gram features of pipeline 3
feature_importances_p3.head(50)

In [None]:
# list the distinct event descriptions that contain the substring 'cont' (case-insensitive, plain-text match)
all_records_p3[all_records_p3['DESCRIPTION'].str.contains('cont', regex=False, case=False)]['DESCRIPTION'].unique()

# Final Evaluation of Modeling Approaches on Held Out Data

In [None]:
# apply the shared preprocessing to the held-out test split
type2_test, non_type2_test = get_preprocessed_data(test_data)

In [None]:
# add additional preprocessing steps for the second pipeline; the distributions computed on the
# training data are passed in, so test observations are ranked against the training cohorts
# rather than against distributions recomputed from the test data
type2_test_p2, non_type2_test_p2, _ = additional_preprocessing_steps(type2_test, non_type2_test, observation_distributions)

In [None]:
def test_pipeline(type2_data, non_type2_data, clf, vectorizer, condenser, feature_col='EVENT_TOKEN', estimator_name=None):
    """Evaluate an already-fitted classifier on a held-out dataset and plot the results.

    The type 2 and non-type 2 records are unified, condensed with ``condenser``,
    labeled (1 = type 2, 0 = non-type 2), vectorized with ``vectorizer`` and scored
    with ``clf``. A ROC curve and a confusion matrix are displayed as a side effect.

    Parameters
    ----------
    type2_data, non_type2_data : mapping
        Must provide the keys 'conditions', 'observations', 'medications' and
        'procedures'; each value is forwarded to ``get_unified_records``.
    clf : classifier
        Object exposing ``predict`` and ``predict_proba``; it is not re-fitted here.
    vectorizer : object
        Forwarded to ``get_multihot_vector_representation`` with ``train=False``
        (presumably so that the vocabulary fitted on the training data is reused — confirm).
    condenser : callable
        Turns unified records into the condensed frame that is vectorized.
    feature_col : str, default 'EVENT_TOKEN'
        Column of the condensed frame handed to the vectorizer.
    estimator_name : str, optional
        Label shown on the ROC plot. Defaults to ``"<classifier class> - Holdout Set"``
        so the plot is not mislabeled when the function is reused across pipelines.

    Returns
    -------
    dict
        Keys 'AUROC', 'Precision', 'Recall' and 'Specificity'.
    """
    if estimator_name is None:
        # derive a truthful label instead of hard-coding one pipeline's name
        estimator_name = f"{type(clf).__name__} - Holdout Set"

    print("Unifying data for type 2 and non-type2 patients...")
    # unify the per-table data into a single set of records for each cohort
    unified_records_type2 = get_unified_records(
        type2_data['conditions'],
        type2_data['observations'],
        type2_data['medications'],
        type2_data['procedures']
    )

    unified_records_non_type2 = get_unified_records(
        non_type2_data['conditions'],
        non_type2_data['observations'],
        non_type2_data['medications'],
        non_type2_data['procedures']
    )

    print("Converting to condensed representation...")
    # condense the records for the type 2 and non-type 2 patients
    type2_condensed = condenser(unified_records_type2)
    non_type2_condensed = condenser(unified_records_non_type2)

    # concatenate the two cohorts and attach the ground-truth label
    all_data_condensed = pd.concat([
        type2_condensed.assign(LABEL=1),
        non_type2_condensed.assign(LABEL=0)
    ])

    print("Vectorizing data...")
    # vectorize the held-out data with the supplied vectorizer
    multi_hot_vectors = get_multihot_vector_representation(all_data_condensed, vectorizer, train=False, feature_col=feature_col)

    # print out the dimensionality of the multi_hot_vectors
    print(f"Multi hot vectors have {multi_hot_vectors.shape[1]} features")

    print("Evaluating classifier...")
    # score the held-out set: hard predictions for the confusion matrix,
    # column 1 of predict_proba for the ROC curve (the LABEL=1 class, assuming
    # the classifier was fitted on 0/1 labels)
    preds = clf.predict(multi_hot_vectors)
    scores = clf.predict_proba(multi_hot_vectors)[:, 1]
    test_y = all_data_condensed['LABEL'].to_numpy()

    # get the AUROC
    fpr, tpr, _ = roc_curve(test_y, scores)
    auroc = auc(fpr, tpr)

    # pin the label order so the matrix is always 2x2 (rows = true, columns = predicted),
    # even when only one class appears among the labels and predictions
    cm = confusion_matrix(test_y, preds, labels=[0, 1])

    # derive the summary metrics from the confusion matrix; a ratio with an empty
    # denominator (e.g. no positive predictions) comes out as nan
    metrics = {
        'AUROC': auroc,
        'Precision': cm[1, 1] / cm[:, 1].sum(),
        'Recall': cm[1, 1] / cm[1].sum(),
        'Specificity': cm[0, 0] / cm[0].sum()
    }

    # create and display the ROC curve plot
    roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=metrics['AUROC'], estimator_name=estimator_name)
    roc_display.plot()
    plt.show()

    # create and display the confusion matrix plot
    cm_display = ConfusionMatrixDisplay(cm, display_labels=np.array(['Non-Type 2', 'Type 2']))
    cm_display.plot(cmap=plt.cm.Blues)
    plt.show()

    # return the held-out metrics
    return metrics

In [None]:
# score `clf` / `vectorizer` on the held-out data; the result is stored as the pipeline 1 test metrics
results_p1_test = test_pipeline(
    type2_data=type2_test,
    non_type2_data=non_type2_test,
    clf=clf,
    vectorizer=vectorizer,
    condenser=get_condensed_record_data,
)

In [None]:
# show the held-out metrics dict (AUROC, Precision, Recall, Specificity) for pipeline 1
results_p1_test

In [None]:
# for comparison: mean of `results` (presumably pipeline 1's cross-validation metrics,
# defined earlier in the notebook — confirm)
results.mean()

In [None]:
# score pipeline 2's classifier and vectorizer on the `*_p2` held-out data
results_p2_test = test_pipeline(
    type2_data=type2_test_p2,
    non_type2_data=non_type2_test_p2,
    clf=clf_p2,
    vectorizer=vectorizer_p2,
    condenser=get_condensed_record_data,
)

In [None]:
# show the held-out metrics dict (AUROC, Precision, Recall, Specificity) for pipeline 2
results_p2_test

In [None]:
# for comparison: mean of `results_p2` (presumably pipeline 2's cross-validation metrics,
# defined earlier in the notebook — confirm)
results_p2.mean()

In [None]:
# score pipeline 3 on the held-out data: it uses the n-gram condenser and
# vectorizes the 'EVENT_DESCRIPTION' column instead of the default 'EVENT_TOKEN'
results_p3_test = test_pipeline(
    type2_data=type2_test_p2,
    non_type2_data=non_type2_test_p2,
    clf=clf_p3,
    vectorizer=vectorizer_p3,
    condenser=get_condensed_record_data_for_ngrams,
    feature_col='EVENT_DESCRIPTION',
)

In [None]:
# show the held-out metrics dict (AUROC, Precision, Recall, Specificity) for pipeline 3
results_p3_test

In [None]:
# for comparison: mean of the pipeline 3 k-fold evaluation results
results_p3.mean()

In [None]:
# one row of held-out metrics per pipeline, tagged with a human-readable pipeline name
test_results_collated = pd.DataFrame(
    [results_p1_test, results_p2_test, results_p3_test]
).assign(
    pipeline=[
        'Pipeline 1 - Bag of Labeled Encounters',
        'Pipeline 2 - BOLE with Numeric Discretization',
        'Pipeline 3 - Bag of Description n-grams',
    ]
)

In [None]:
# show the side-by-side held-out metrics for the three pipelines
test_results_collated

## Visualizing the Difference in the Distribution of the Number of Distinct Events Between Positive (Type 2) and Negative (Non-Type 2) Patients

In [None]:
# label each record 1 when its patient is in `type2_patients` (0 otherwise) and
# build an event token of the form "<CODE>::<EVENT_TYPE>"
all_records_labeled = all_records.assign(
    LABEL=lambda records: records['PATIENT'].isin(type2_patients).astype('int'),
    EVENT_TOKEN=lambda records: records['CODE'].astype('str') + '::' + records['EVENT_TYPE'],
)

In [None]:
# per patient: the number of distinct event tokens and the patient's label
patient_encounter_type_counts = (
    all_records_labeled
    .groupby('PATIENT')
    .agg({'EVENT_TOKEN': 'nunique', 'LABEL': 'max'})
    .reset_index()
    .rename(columns={'EVENT_TOKEN': '# of distinct events'})
)

In [None]:
# show the per-patient distinct-event counts alongside their labels
patient_encounter_type_counts

In [None]:
# render at a higher resolution
plt.rcParams['figure.dpi'] = 200
# histogram of distinct-event counts per label; common_norm=False normalizes each
# label group independently so the two distributions are comparable
sns.histplot(
    data=patient_encounter_type_counts,
    x='# of distinct events',
    hue='LABEL',
    stat='probability',
    common_norm=False,
)