In [1]:
import json

import numpy as np
import pandas as pd


def _make_code_feature(
    X: pd.DataFrame,
    code: str,
    code_columns: list,
) -> pd.Series:
    '''
    Make a feature representing an ICD-9 code.

    Args:
        X: Pandas DataFrame; its columns must include everything in
            code_columns
        code: string, ICD-9 code
        code_columns: list of strings

    Returns:
        Pandas boolean Series, with same index as X, indicating which
            patients have this code in one of the code_columns
    '''
    if code.startswith('250'):
        col = code_columns[0]
        feature = X[col].str.startswith(code)
        for col in code_columns[1:]:
            feature = feature | X[col].str.startswith(code)
    else:
        # we can't use startswith for all the codes, because some of them
        # are less than 3 characters (the leading zeroes are missing), so
        # startswith would pick up other 3-digit codes
        col = code_columns[0]
        feature = X[col] == code
        for col in code_columns[1:]:
            feature = feature | (X[col] == code)
    return feature

In [2]:
def _decode_nominal_columns(df, drop=False):
    '''
    Add columns for admission_type, admission_source, and discharge_disposition

    diabetic_data.csv has columns 'admission_type_id', 'admission_source_id',
    and 'discharge_disposition_id'. This function creates columns
    'admission_type', 'admission_source', and 'discharge_disposition' containing
    the corresponding string values, which are found in IDS_mapping.csv.

    Args:
        df: a pandas DataFrame with columns 'admission_type_id',
            'admission_source_id', and 'discharge_disposition_id'
        drop: (Optional) boolean. If True, the columns 'admission_type_id',
            'admission_source_id', and 'discharge_disposition_id' will
            be removed from the output

    Returns:
        the input pandas DataFrame, with the added columns 'admission_type',
        'admission_source', and 'discharge_disposition'
    '''
    # Read the codes and their meanings from IDS_mapping.csv
    admission_type = pd.read_csv(
        'IDS_mapping.csv',
        index_col=0,
        nrows=8,
    )
    discharge_disposition = pd.read_csv(
        'IDS_mapping.csv',
        index_col=0,
        skiprows=10,
        nrows=30,
    )
    admission_source = pd.read_csv(
        'IDS_mapping.csv',
        index_col=0,
        skiprows=42,
    )

    # Create dicts to make codes to meanings
    admission_type_dict = admission_type.description.to_dict()
    discharge_disposition_dict = discharge_disposition.description.to_dict()
    admission_source_dict = admission_source.description.to_dict()

    # Add columns to df that have the meanings of the codes
    df['admission_type'] = df.admission_type_id.replace(
        admission_type_dict,
        inplace=False,
    )
    df['discharge_disposition'] = df.discharge_disposition_id.replace(
        discharge_disposition_dict,
        inplace=False,
    )
    df['admission_source'] = df.admission_source_id.replace(
        admission_source_dict,
        inplace=False,
    )

    if drop:
        df = df.drop(columns=[
            'admission_type_id',
            'admission_source_id',
            'discharge_disposition_id'
        ])

    return df

In [3]:
def _extract_code_features(df, feature_data_dict):
    '''
    Create features for individual ICD-9 codes

    Args:
        df: a Pandas DataFrame containing patient data formatted like
            diabetic_data.csv
        feature_data_dict: a dict with keys 'common_codes' and
            'common_primary_codes'

    Returns:
        a Pandas DataFrame with the same index as df
    '''
    code_features = dict()

    for code in feature_data_dict['common_codes']:
        code_features['ICD9_' + code] = _make_code_feature(
            df,
            code,
            code_columns=['diag_1', 'diag_2', 'diag_3'],
        )

    for code in feature_data_dict['common_primary_codes']:
        code_features['primary_ICD9_' + code] = _make_code_feature(
            df,
            code,
            code_columns=['diag_1'],
        )

    return pd.DataFrame(code_features)

In [4]:
def _extract_code_range_features(df, feature_data_dict):
    '''
    Create features for ICD-9 codes ranges

    Args:
        df: a Pandas DataFrame containing patient data formatted like
            diabetic_data.csv
        feature_data_dict: a dict with keys 'code_ranges' and
            'primary_code_ranges'

    Returns:
        a Pandas DataFrame with the same index as df
    '''
    code_nums = df[['diag_1', 'diag_2', 'diag_3']].apply(
        pd.to_numeric,
        errors='coerce',
    )

    range_features = dict()
    for low, high in feature_data_dict['code_ranges']:
        mask = code_nums.apply(
            lambda col: col.between(low, high),
            axis=0
        )
        range_features[f'ICD9_{low}-{high}'] = mask.any(axis=1)

    for low, high in feature_data_dict['primary_code_ranges']:
        range_features[f'primary_ICD9_{low}-{high}'] = (
            code_nums['diag_1'].between(low, high)
        )

    return pd.DataFrame(range_features)

In [5]:
def _extract_features(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Extracts features from patient data encoded in a specific format

    The input DataFrame must be formatted in the same way as
    diabetic_data.csv. A few variations are acceptable:
    1. The columns 'encounter_id', 'patient_nbr', and 'readmitted' may be
    omitted. If present, they will be ignored.
    2. Any columns with unexpected headings will be ignored.
    3. The 'citoglipton' column may be called 'sitagliptin' instead.
    4. Missing values may be represented with '?' or in standard Pandas ways

    The features are extracted by the same rules used in Readmission.ipynb

    Args:
        df: a Pandas DataFrame containing patient data formatted like
            diabetic_data.csv

    Returns:
        a Pandas DataFrame with 475 feature columns, and the same number of
        rows as df
    '''
    with open('feature_data.json', 'r') as file:
        feature_data_dict = json.load(file)

    if 'citoglipton' in df.columns:
        df.rename(columns={'citoglipton': 'sitagliptin'}, inplace=True)
    df.replace({'?': np.nan}, inplace=True)

    df = _decode_nominal_columns(df)

    features = pd.DataFrame(
        index = df.index,
        columns = feature_data_dict['all_columns'],
        dtype = float,
    )

    code_features = _extract_code_features(df, feature_data_dict)
    code_range_features = _extract_code_range_features(df, feature_data_dict)


    category_features_dict = dict()

    category_fields = [
        ('races', 'race', 'race_'),
        ('common_payers', 'payer_code', 'payer_'),
        ('common_specialties', 'medical_specialty', 'specialty_'),
        ('glucose_results', 'max_glu_serum', 'glucose_'),
        ('a1c_results', 'A1Cresult', 'a1c_'),
        ('common_admit_types', 'admission_type', 'admit_type_'),
        ('common_admit_sources', 'admission_source', 'admit_source_'),
        ('common_discharges', 'discharge_disposition', 'discharge_'),
    ]
    for data_dict_key, df_col, feature_stem in category_fields:
        for feature in feature_data_dict[data_dict_key]:
            category_features_dict[feature_stem + feature] = (
                df[df_col] == feature
            ).astype(float)

    category_features = pd.DataFrame(category_features_dict)

    # Make changes to medical specialty features:
    # 1 A "Hematology" feature that includes "Hematology/Oncology"
    category_features['specialty_Hematology'] = (
        (df.medical_specialty == 'Hematology')
        | (df.medical_specialty == 'Hematology/Oncology')
    )
    # 2 An "Oncology" feature that includes "Hematology/Oncology"
    category_features['specialty_Oncology'] = (
        (df.medical_specialty == 'Oncology')
        | (df.medical_specialty == 'Hematology/Oncology')
    )
    # 3 A "Hematology/Oncology" feature that has value 1 for
    # "Hematology/Oncology", 0.5 for "Hematology" and "Oncology",
    # and 0 for everything else.
    category_features.loc[
        df.medical_specialty == 'Hematology',
        'specialty_Hematology/Oncology'
    ] = 0.5
    category_features.loc[
        df.medical_specialty == 'Hematology',
        'specialty_Hematology/Oncology'
    ] = 0.5
    # 4 include "Orthopedics-Reconstructive" patients under "Orthopedics"
    category_features.specialty_Orthopedics = (
        (df.medical_specialty == 'Orthopedics')
        | (df.medical_specialty == 'Orthopedics-Reconstructive')
    )
    # 5 The "Pediatrics" feature will include all values that contain "Pediat"
    # case insensitive
    category_features.specialty_Pediatrics = (
        ~df.medical_specialty.isna()
        & df.medical_specialty.str.lower().str.contains('pediat')
    )
    # 6 "Pediatrics-Endocrinology" patients will also be included in an
    # "Endocrinology" feature; there will be no "Pediatrics-Endocrinology"
    # feature
    category_features.drop(
        columns='specialty_Pediatrics-Endocrinology',
        inplace=True
    )
    category_features['specialty_Endocrinology'] = (
        ~df.medical_specialty.isna()
        & df.medical_specialty.str.lower().str.contains('endocrin')
    )
    # 7 The "Psychiatry" feature will include "Psychology" patients
    category_features.specialty_Psychiatry = (
        (df.medical_specialty == 'Psychiatry')
        | (df.medical_specialty == 'Psychology')
    )
    # 8 The "Surgery-General" feature will include "Surgeon" and
    # "SurgicalSpecialty"
    category_features['specialty_Surgery-General'] = (
        (df.medical_specialty == 'Surgery-General')
        | (df.medical_specialty == 'Surgeon')
        | (df.medical_specialty == 'SurgicalSpecialty')
    )
    # 9. add a "Surgery" feature, including all values that contain "Surg"
    category_features['specialty_Surgery'] = (
        ~df.medical_specialty.isna()
        & df.medical_specialty.str.lower().str.contains('surg')
    )
    # 10. The "ObstetricsandGynecology" feature will include "Gynecology"
    category_features.specialty_ObstetricsandGynecology = (
        ~df.medical_specialty.isna()
        & df.medical_specialty.str.lower().str.contains('gynec')
    )
    # 11. a "Surgery-Thoracic" feature that includes
    # "Surgery-Cardiovascular/Thoracic"
    category_features['specialty_Surgery-Thoracic'] = (
        (df.medical_specialty == 'Surgery-Thoracic')
        | (df.medical_specialty == 'Surgery-Cardiovascular/Thoracic')
    )
    # 12. There will be a "Surgery-Cardiovascular" feature that includes
    # "Surgery-Cardiovascular/Thoracic"
    category_features['specialty_Surgery-Cardiovascular'] = (
        (df.medical_specialty == 'Surgery-Cardiovascular')
        | (df.medical_specialty == 'Surgery-Cardiovascular/Thoracic')
    ).astype(float)
    # 13. The "Surgery-Cardiovascular/Thoracic" feature will have value 0.5
    # for "Surgery-Thoracic" and "Surgery-Cardiovascular"
    category_features.loc[
        df.medical_specialty == 'Surgery-Thoracic',
        'specialty_Surgery-Cardiovascular/Thoracic'
    ] = 0.5
    category_features.loc[
        df.medical_specialty == 'Surgery-Cardiovascular',
        'specialty_Surgery-Cardiovascular/Thoracic'
    ] = 0.5
    # 14. The "Radiologist" feature will include "Radiology".
    category_features.specialty_Radiologist = (
        ~df.medical_specialty.isna()
        & df.medical_specialty.str.lower().str.contains('radiol')
    )

    # Specific medication features
    for col in feature_data_dict['med_features']:
        parts = col.split('_')
        if len(parts) == 2:
            category_features[col] = df[parts[0]].str.lower() == parts[1]
        else:
            category_features[col] = df[col] != 'No'

    for group, keywords in feature_data_dict['discharge_keywords'].items():
        # Initially create feature as all False
        category_features['discharge_' + group] = False
        # Then find patients with discharge_disposition containing a keyword
        for keyword in keywords:
            category_features['discharge_' + group] = (
                category_features['discharge_' + group]
                | df.discharge_disposition.str.contains(keyword)
            )

    numeric_features = pd.DataFrame(index=df.index, columns=['gender'])
    numeric_features.gender = np.nan
    numeric_features.loc[df.gender == 'Female', 'gender'] = 0
    numeric_features.loc[df.gender == 'Male', 'gender'] = 1

    for age in range(0, 100, 10):
        age_string = f'[{age}-{age + 10})'
        numeric_features.loc[df.age == age_string, 'age'] = age + 5
    numeric_features.age.value_counts().sort_index()

    for weight in range(0, 200, 25):
        weight_string = f'[{weight}-{weight + 25})'
        numeric_features.loc[
            df.weight == weight_string, 'weight'] = weight + 12.5
    numeric_features.loc[df.weight == '>200', 'weight'] = 212.5
    for col in [
        'time_in_hospital',
        'num_lab_procedures',
        'num_procedures',
        'num_medications',
        'number_outpatient',
        'number_emergency',
        'number_inpatient',
        'number_diagnoses',
    ]:
        numeric_features[col] = df[col]

    specific_med_columns = [
        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
        'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
        'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol',
        'troglitazone', 'tolazamide', 'examide', 'sitagliptin', 'insulin',
        'glyburide-metformin', 'glipizide-metformin',
        'glimepiride-pioglitazone', 'metformin-rosiglitazone',
        'metformin-pioglitazone',
    ]
    numeric_features['num_specific_meds'] = (
        df[specific_med_columns] != 'No'
    ).sum(axis=1)

    numeric_features['change'] = (df.change == 'Ch').astype(int)
    return pd.concat(
        [
            code_features,
            code_range_features,
            category_features,
            numeric_features,
        ],
        axis=1,
    ).astype(float)

In [6]:
df = pd.read_csv(
    'diabetic_data.csv',
    na_values='?',
    keep_default_na=False,
    dtype={
        'diag_1': 'object',
        'diag_2': 'object',
        'diag_3': 'object',
        'payer_code': 'object'}
    )
discharge_disposition = pd.read_csv(
    'IDS_mapping.csv',
    index_col=0,
    skiprows=10,
    nrows=30)
discharge_disposition_dict = discharge_disposition.description.to_dict()
df['discharge_disposition'] = df.discharge_disposition_id.replace(
    discharge_disposition_dict,
    inplace=False)
df = df[~df['discharge_disposition'].str.contains(
    'expired',
    case=False,
    na=False,
)]
df.drop_duplicates(subset=['patient_nbr'], inplace=True)

In [7]:
new_features = _extract_features(df)
new_features.shape

(70439, 475)

In [8]:
X_test = pd.read_csv('features_test.csv', index_col=0)
new_features_test = new_features.loc[X_test.index, :]
for col in new_features.columns:
    if not new_features_test[col].astype(float).equals(X_test[col].astype(float)):
        print(col, X_test[col].dtype, new_features_test[col].dtype)

In [9]:
X_train = pd.read_csv('features_train.csv', index_col=0)
new_features_train = new_features.loc[X_train.index, :]
for col in new_features.columns:
    if not new_features_train[col].astype(float).equals(X_train[col].astype(float)):
        print(col, X_train[col].dtype, new_features_train[col].dtype)

In [10]:
X_train.astype(float).equals(new_features_train.astype(float))

False