In [1]:
import json
from sys import version

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

In [2]:
# Show the version string of the Python interpreter running this notebook
print('Python version: ', version)

Python version:  3.11.13 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:03:15) [MSC v.1929 64 bit (AMD64)]


In [3]:
# Show the versions of the main libraries this notebook relies on
for label, module in (('numpy', np), ('pandas', pd), ('scikit-learn', sklearn)):
    print(label, module.__version__)

numpy 2.0.1
pandas 2.3.2
scikit-learn 1.7.2


# 1 Data load

In [4]:
# Columns that must be read as plain Python objects (strings) instead of
# letting pandas infer a numeric dtype for them
string_columns = ['diag_1', 'diag_2', 'diag_3', 'payer_code']

# Only '?' is treated as missing; with keep_default_na=False every other
# string that pandas would normally read as NaN is kept as an ordinary value
df = pd.read_csv(
    'diabetic_data.csv',
    na_values='?',
    keep_default_na=False,
    dtype=dict.fromkeys(string_columns, 'object'),
)
df.shape

(101766, 50)

In [5]:
# IDS_mapping.csv is read three times with different skiprows/nrows offsets,
# once for each lookup table (admission type, discharge disposition,
# admission source)
mapping_file = 'IDS_mapping.csv'
admission_type = pd.read_csv(mapping_file, index_col=0, nrows=8)
discharge_disposition = pd.read_csv(
    mapping_file, index_col=0, skiprows=10, nrows=30)
admission_source = pd.read_csv(mapping_file, index_col=0, skiprows=42)

# Code -> description lookups
admission_type_dict = admission_type['description'].to_dict()
discharge_disposition_dict = discharge_disposition['description'].to_dict()
admission_source_dict = admission_source['description'].to_dict()

# Swap each '<name>_id' code column for a '<name>' column that holds the
# description of the code
code_lookups = {
    'admission_type': admission_type_dict,
    'discharge_disposition': discharge_disposition_dict,
    'admission_source': admission_source_dict,
}
for name, lookup in code_lookups.items():
    df[name] = df[f'{name}_id'].replace(lookup)
df.drop(columns=[f'{name}_id' for name in code_lookups], inplace=True)

# Three columns added and three removed: the column count is unchanged
df.shape

(101766, 50)

In [6]:
# 'readmitted' is the target. Only a readmission within 30 days ('<30')
# counts as the positive class; every other value becomes 0.
y = df['readmitted'].eq('<30').astype(int)
df = df.drop(columns='readmitted')
(df.shape, y.shape)

((101766, 49), (101766,))

# 2 Scoping

In [7]:
# Leave out every encounter whose discharge description mentions 'expired'
# (case-insensitive); rows with a missing description are kept
is_expired = df['discharge_disposition'].str.contains(
    'expired', case=False, na=False)
df = df.loc[~is_expired]
# Keep the target aligned with the remaining rows
y = y.loc[df.index]
df.shape, y.shape

((100114, 49), (100114,))

# 3 Preventing leakage

In [8]:
# Keep only the first row of each patient_nbr (presumably so that one
# patient cannot end up in both the training and the test set)
df = df.drop_duplicates(subset='patient_nbr')
y = y.loc[df.index]
df.shape, y.shape

((70439, 49), (70439,))

# 4 Separating training data from the test set
I will hold out 5,000 samples for testing. Holding out 20% is common, but 5,000 samples should be enough to evaluate the model accurately.

In [9]:
# Hold out a fixed 5000 rows for testing; random_state makes the split
# reproducible
splits = train_test_split(df, y, test_size=5000, random_state=0)
X_train, X_test, y_train, y_test = splits
[part.shape for part in splits]

[(65439, 49), (5000, 49), (65439,), (5000,)]

# 6 Removing inconsistent data from the training set

In [10]:
# The three diagnosis (ICD-9 code) columns, in order
code_columns_list = [f'diag_{position}' for position in (1, 2, 3)]

In [11]:
def fill_pattern(row: pd.Series) -> str:
    '''
    Indicate which of the ICD-9 columns are filled for a row of X_train

    Args:
        row: pandas Series; a row of X_train
    Returns:
        string
    '''
    filled = ['Blank' if row.isna().loc[col] else 'Filled'
              for col in code_columns_list]
    return ', '.join(filled)

In [12]:
# Label every training row with its filled/blank pattern of diagnosis codes
X_train = X_train.assign(diag_filled=X_train.apply(fill_pattern, axis=1))

In [13]:
def diag_consistent(row: pd.Series) -> bool:
    '''
    Tell whether the filled diagnosis columns of a row agree with its
    number_diagnoses value

    Args:
        row: pandas Series, a row of X_train; it must have the entries
            number_diagnoses and diag_filled
    Returns:
        boolean; True when diag_filled equals the pattern expected for the
            row's number_diagnoses
    '''
    # With one or two diagnoses only the leading columns should be filled;
    # for any other count all three are expected to be filled
    expected_patterns = {
        1: 'Filled, Blank, Blank',
        2: 'Filled, Filled, Blank',
    }
    expected = expected_patterns.get(
        row.number_diagnoses, 'Filled, Filled, Filled')
    return row.diag_filled == expected

In [14]:
# Keep only the rows whose diagnosis columns agree with number_diagnoses
is_consistent = X_train.apply(diag_consistent, axis=1)
y_train = y_train.loc[is_consistent]
# diag_filled was only needed for the consistency check, so drop it again
X_train = X_train.loc[is_consistent].drop(columns=['diag_filled'])
(X_train.shape, y_train.shape)

((65251, 49), (65251,))

In [15]:
# When the same code appears twice in a row, keep the earlier column and
# blank the later one. The pairs are processed in this order, so the last
# comparison sees diag_2 as it is after the first step.
for kept, repeated in [
        ('diag_1', 'diag_2'),
        ('diag_1', 'diag_3'),
        ('diag_2', 'diag_3')]:
    is_repeat = X_train[kept].notna() & (X_train[kept] == X_train[repeated])
    X_train.loc[is_repeat, repeated] = np.nan

In [16]:
# Newborn admissions recorded for a patient outside the youngest age band
newborn_wrong_age = (
    X_train.admission_type.eq('Newborn') & X_train.age.ne('[0-10)'))
# Neonatal discharge / admission-source values; rows carrying them are
# removed whatever their recorded age
neonate_discharge = X_train.discharge_disposition.eq(
    'Neonate discharged to another hospital for neonatal aftercare')
baby_source = X_train.admission_source.isin(
    [' Sick Baby', ' Extramural Birth'])

age_inconsistent = newborn_wrong_age | neonate_discharge | baby_source
y_train = y_train.loc[~age_inconsistent]
X_train = X_train.loc[~age_inconsistent]
(X_train.shape, y_train.shape)

((65236, 49), (65236,))

# 7 Feature selection

In [17]:
# Collects the feature definitions chosen below; written to JSON at the end
feature_data_dict = {}

## 7.1 Diagnosis codes

In [18]:
# Repeated codes within a row were already blanked out while cleaning the
# training set (section 6), and nothing since then writes to the diag_*
# columns, so repeating those assignments here would change nothing.
# Check the invariant instead of copy-pasting the mutation.
for first, second in [
        ('diag_1', 'diag_2'),
        ('diag_1', 'diag_3'),
        ('diag_2', 'diag_3')]:
    still_repeated = (
        X_train[first].notna() & (X_train[first] == X_train[second]))
    assert not still_repeated.any(), (
        f'{second} still repeats {first} in some rows')

In [19]:
# Number of occurrences of each ICD-9 code over the three diagnosis columns
all_codes = X_train[code_columns_list].stack()
code_counts = all_codes.value_counts()

# '250' and '250.0' ... '250.9' are counted by prefix, so each of them also
# includes every longer code that starts with it
codes_with_children = ['250', *(f'250.{digit}' for digit in range(10))]
for code in codes_with_children:
    prefix_matches = all_codes.str.startswith(code)
    code_counts.loc[code] = prefix_matches.sum()

In [20]:
def make_code_feature(
    X_train: pd.DataFrame,
    code: str,
    code_columns: list = code_columns_list
) -> pd.Series:
    '''
    Make a feature representing an ICD-9 code.

    Args:
        X_train: Pandas DataFrame; its columns must include everything in
            code_columns
        code: string, ICD-9 code
        code_columns: list of strings

    Returns:
        Pandas boolean Series, with same index as X_train, indicating which
            patients have this code in one of the code_columns
    '''
    if code.startswith('250'):
        col = code_columns[0]
        feature = X_train[col].str.startswith(code)
        for col in code_columns[1:]:
            feature = feature | X_train[col].str.startswith(code)
    else:
        # we can't use startswith for all the codes, because some of them
        # are less than 3 characters (the leading zeroes are missing), so
        # this would pick up other 3-digit codes
        col = code_columns[0]
        feature = X_train[col] == code
        for col in code_columns[1:]:
            feature = feature | (X_train[col] == code)
    return feature

In [21]:
# Codes that occur at least 100 times in the diagnosis columns
is_common = code_counts >= 100
feature_data_dict['common_codes'] = code_counts[is_common].index.tolist()

In [22]:
# Same counting as for all diagnosis columns, restricted to diag_1.
# Note that this rebinds code_counts.
primary_codes = X_train['diag_1']
code_counts = primary_codes.value_counts()

# The '250' family is again counted by prefix
for code in codes_with_children:
    code_counts.loc[code] = primary_codes.str.startswith(code).sum()

is_common = code_counts >= 100
feature_data_dict['common_primary_codes'] = (
    code_counts[is_common].index.tolist())

In [23]:
# (first, last) bounds of the numeric code ranges used to group ICD-9 codes
all_ranges = [
    (1, 139), (140, 239), (240, 279), (280, 289), (290, 319), (320, 389),
    (390, 459), (460, 519), (520, 579), (580, 629), (630, 679), (680, 709),
    (710, 739), (740, 759), (760, 779), (780, 799), (800, 999),
]
# The (760, 779) range is not used as a feature
ranges = [bounds for bounds in all_ranges if bounds != (760, 779)]
# Store a copy, so later changes to `ranges` do not alter this entry
feature_data_dict['code_ranges'] = list(ranges)

In [24]:
# For the primary diagnosis, (740, 759) is left out as well. Filtering
# (rather than list.remove) keeps this cell safe to re-run: remove() raises
# ValueError once the entry is already gone.
ranges = [bounds for bounds in ranges if bounds != (740, 759)]
feature_data_dict['primary_code_ranges'] = ranges

## 7.2 Other categorical features

In [25]:
# Every race value seen in the training set, except 'Other' (presumably
# the catch-all that needs no feature of its own)
races = X_train['race'].dropna().unique().tolist()
races.remove('Other')
feature_data_dict['races'] = races

In [26]:
# Payer codes that occur at least 100 times in the training set
payer_counts = X_train['payer_code'].value_counts()
frequent_payers = payer_counts[payer_counts >= 100]
feature_data_dict['common_payers'] = frequent_payers.index.tolist()

In [27]:
# Medical specialties that occur at least 100 times in the training set
specialty_counts = X_train['medical_specialty'].value_counts()
frequent_specialties = specialty_counts[specialty_counts >= 100]
feature_data_dict['common_specialties'] = frequent_specialties.index.tolist()

In [28]:
# Result categories that get a feature, for the glucose and the A1c tests
feature_data_dict.update(
    glucose_results=['Norm', '>200', '>300'],
    a1c_results=['Norm', '>7', '>8'],
)

In [29]:
# One column per medication
specific_med_columns = [
    'metformin',
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'acetohexamide',
    'glipizide',
    'glyburide',
    'tolbutamide',
    'pioglitazone',
    'rosiglitazone',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'examide',
    'sitagliptin',
    'insulin',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
]
# The data calls this column 'citoglipton' (presumably a misspelling of
# sitagliptin); rename it so that it matches the list above
X_train = X_train.rename(columns={'citoglipton': 'sitagliptin'})

In [30]:
med_values = X_train[specific_med_columns]

# Since there are no missing values, any value other than
# 'No' is a patient who took this medication
took_med_train = med_values.ne('No')

# Number of the listed medications each patient took
num_specific_meds_train = took_med_train.sum(axis=1)

# Indicators for a dose that went up or down, one column per medication
med_features_up_train = med_values.eq('Up').add_suffix('_up')
med_features_down_train = med_values.eq('Down').add_suffix('_down')

# Combine all med categorical features
med_features_train = pd.concat(
    [took_med_train, med_features_up_train, med_features_down_train],
    axis=1,
    copy=False,
)
print(med_features_train.shape)

(65236, 69)


In [31]:
# Keep the medication features that are True for at least 100 patients
med_feature_sums_train = med_features_train.sum()
frequent_med_features = med_feature_sums_train[med_feature_sums_train >= 100]
feature_data_dict['med_features'] = frequent_med_features.index.tolist()

In [32]:
# Admission types that occur at least 100 times in the training set
admit_type_counts = X_train['admission_type'].value_counts()
frequent_admit_types = admit_type_counts[admit_type_counts >= 100]
feature_data_dict['common_admit_types'] = frequent_admit_types.index.tolist()

In [33]:
# Admission sources that occur at least 100 times in the training set
admit_source_counts = X_train['admission_source'].value_counts()
frequent_admit_sources = admit_source_counts[admit_source_counts >= 100]
feature_data_dict['common_admit_sources'] = (
    frequent_admit_sources.index.tolist())

In [34]:
# Discharge dispositions that occur at least 100 times in the training set
discharge_counts = X_train['discharge_disposition'].value_counts()
frequent_discharges = discharge_counts[discharge_counts >= 100]
feature_data_dict['common_discharges'] = frequent_discharges.index.tolist()

# Text fragments of the discharge descriptions, grouped under three broader
# destinations (home, hospital, facility)
home_keywords = ['home']
hospital_keywords = ['hospital', 'swing bed']
facility_keywords = [
    'to SNF',
    'to another type of inpatient care institution',
    'to ICF',
    'Hospice / medical facility',
    'to a nursing facility',
    'to a federal health care facility',
]
discharge_keywords = {
    'home': home_keywords,
    'hospital': hospital_keywords,
    'facility': facility_keywords,
}
feature_data_dict['discharge_keywords'] = discharge_keywords

In [35]:
# Save the selected feature definitions as JSON
serialized = json.dumps(feature_data_dict)
with open('feature_data.json', 'w') as json_file:
    json_file.write(serialized)