In [1]:
import pandas as pd
import numpy as np
from scipy.stats import randint

import dataframe_image as dfi

import joblib

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler



from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline



from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [2]:


def get_metrics(y_test, y_pred):
    predicted_for_discharge = list(np.where(y_pred == False)[0])
    wrongful_discharge = y_test.reset_index(drop=True).iloc[predicted_for_discharge].sum()/len(predicted_for_discharge)
    print(f"WRONGFUL DISCHARGE RATE: {wrongful_discharge}")

    print(f"F1_SCORE: {f1_score(y_test, y_pred)}")
    print(f"RECALL: {recall_score(y_test, y_pred)}")
    print(f"PRECISION: {precision_score(y_test, y_pred)}")

    

    try:
        print(f"ROC AUC: {roc_auc_score(y_test, y_pred)}")
    except: "ROC AUC curve could not be calculated"

def add_numerical_features(data, column):
    data[f"{column}_squared"] = np.square(data[column]-data[column].mean())
    data[f"{column}_sqrt"] = np.sqrt(data[column])


In [3]:
data = pd.read_csv("data/cleaned_data.csv")

## change target value to boolean
data.readmitted = data.readmitted.replace(["Yes", "No"], [True, False])

data.head()

Unnamed: 0,admission_id,patient_id,race,gender,age,weight,admission_type_code,discharge_disposition_code,admission_source_code,time_in_hospital,...,blood_type,hemoglobin_level,blood_transfusion,max_glu_serum,A1Cresult,diuretics,insulin,change,diabetesMed,readmitted
0,0,199042938,Caucasian,Male,[50-60),,3.0,1.0,1.0,1,...,A+,14.5,False,,,No,No,No,Yes,False
1,1,91962954,Caucasian,Male,[80-90),,2.0,1.0,7.0,3,...,B+,15.7,False,,>7,No,No,No,No,True
2,3,157495374,African American,Female,[70-80),,,1.0,,2,...,AB-,13.5,False,,>8,No,No,No,Yes,False
3,4,82692360,Caucasian,Female,,,1.0,22.0,7.0,12,...,A+,13.0,False,,,No,No,No,No,False
4,5,218016576,Caucasian,Female,[70-80),,2.0,1.0,1.0,4,...,A+,13.1,False,,,No,No,No,Yes,True


In [6]:
#dummy model: predicting everything as true
get_metrics(data.readmitted, [True]*len(data))

WRONGFUL DISCHARGE RATE: nan
F1_SCORE: 0.20393259701086341
RECALL: 1.0
PRECISION: 0.11354395535015278
ROC AUC: 0.5


In [7]:
#dummy model2 - predict same rate of readmission as in dataset
predict_same_rate = data.readmitted.sample(frac=1).reset_index(drop=True)
get_metrics(data.readmitted, predict_same_rate)

WRONGFUL DISCHARGE RATE: 0.11360075211532435
F1_SCORE: 0.11310053380782918
RECALL: 0.11310053380782918
PRECISION: 0.11310053380782918
ROC AUC: 0.49974989084625243


In [8]:
def diagnosis_decoder(code):
    if "V" in str(code): 
        return "External causes of injury and supplemental classification"
    elif "E" in str(code):
        return "External causes of injury and supplemental classification"
    else:
        try:
        
            code = int(code)
            if code<140: return "infectious and parasitic diseases"
            if code<240: return "neoplasms"
            if code<280: return "endocrine, nutritional and metabolic diseases, and immunity disorders"
            if code<290: return "diseases of the blood and blood-forming organs"
            if code<320: return "mental disorders"
            if code<390: return "diseases of the nervous system and sense organs"
            if code<460: return "diseases of the circulatory system"
            if code<520: return "diseases of the respiratory system"
            if code<580: return "diseases of the digestive system"
            if code<630: return "diseases of the genitourinary system"
            if code<680: return "complications of pregnancy, childbirth, and the puerperium"
            if code<710: return "diseases of the skin and subcutaneous tissue"
            if code<740: return "diseases of the musculoskeletal system and connective tissue"
            if code<760: return "congenital anomalies"
            if code<780: return "certain conditions originating in the perinatal period"
            if code<800: return "symptoms, signs, and ill-defined conditions"
            if code<1000: return "injury and poisoning"
        except:
            return(np.nan)
        




#### Feature engineering

In [9]:
#code age groups as integers
data["age_as_int"] = data.age.replace(['[50-60)', '[80-90)', '[60-70)', '[70-80)', '[40-50)', '[30-40)',
 '[90-100)', '[20-30)', '[10-20)', '[0-10)'], [50, 80, 60, 70, 40, 30, 90, 20, 10, 0])


In [10]:
def was_test_done(data, column_name, not_done="None"):
    data[column_name+"_done"] = np.where(data[column_name]==not_done, "No", "Yes")
    data[column_name+"_done"] = data[column_name].replace("nan", np.nan)
    return data


In [13]:
#is patient insured
payer_codes = list(data.payer_code.dropna().unique())
payer_codes.remove("SP")

data["isInsured"] = data.payer_code.replace(list(payer_codes), True)
data["isInsured"] = data.isInsured.replace("SP", False)
data["isInsured"].unique()


array([nan, True, False], dtype=object)

In [14]:
def filter_common_categories(data, column_name, threshold):
    common_categories = list(data[column_name].value_counts()[data[column_name].value_counts()>threshold].index.values)
    common_categories.append(np.nan)
    data[column_name] = np.where(data[column_name].isin(common_categories), data[column_name], 'Other')
    data[column_name] = data[column_name].replace("nan", np.nan)
    return common_categories


threshold=100



In [15]:
#keep only common values for payer_code, set others as "Other"
column_name = "payer_code"
filter_common_categories(data, column_name, threshold)


['MC',
 'HM',
 'SP',
 'BC',
 'MD',
 'CP',
 'UN',
 'CM',
 'OG',
 'PO',
 'DM',
 'CH',
 'WC',
 nan]

In [16]:
#keep only common values for admission_type_code, set others as "Other"
column_name = "admission_type_code"
filter_common_categories(data, column_name, threshold)


[1.0, 3.0, 2.0, nan]

In [17]:
#keep only common values for discharge disposition, set others as "Other"
column_name = "discharge_disposition_code"
filter_common_categories(data, column_name, threshold)


[1.0, 3.0, 6.0, 2.0, 22.0, 5.0, 4.0, 7.0, 23.0, 28.0, nan]

In [18]:
#keep only common values for admission_source_code, set others as "Other"
column_name = "admission_source_code"
filter_common_categories(data, column_name, threshold)


[7.0, 1.0, 4.0, 6.0, 2.0, 5.0, 3.0, nan]

In [19]:
#keep only common values for medical_specialty set others as "Other"
column_name = "medical_specialty"
filter_common_categories(data, column_name, threshold)
#data[column_name].unique()

['InternalMedicine',
 'Emergency/Trauma',
 'Family/GeneralPractice',
 'Cardiology',
 'Surgery-General',
 'Nephrology',
 'Orthopedics',
 'Orthopedics-Reconstructive',
 'Radiologist',
 'Pulmonology',
 'Psychiatry',
 'Urology',
 'ObstetricsandGynecology',
 'Surgery-Cardiovascular/Thoracic',
 'Gastroenterology',
 'Surgery-Vascular',
 'Surgery-Neuro',
 'PhysicalMedicineandRehabilitation',
 'Oncology',
 'Pediatrics',
 'Neurology',
 'Hematology/Oncology',
 'Pediatrics-Endocrinology',
 'Otolaryngology',
 nan]

In [20]:
#simplify diagnosis codes
diag_columns = ['diag_1','diag_2','diag_3']
for col in diag_columns:
    data[f"{col}_simplified"] = data[col].str.replace(r"\.(.*)", "")  #remove any numbers that come after .
    data[f"{col}_simplified"] = data.apply(lambda row: diagnosis_decoder(row[f"{col}_simplified"]),axis=1)

    column_name = f"{col}_simplified"


In [21]:
data = data.drop(["admission_id", "patient_id", "age", "weight", "diag_1", "diag_2", "diag_3", "blood_type", "payer_code", "medical_specialty", "isInsured"], axis=1)

In [23]:
features_df = pd.DataFrame(list(zip((data.isnull().sum()/len(data)*100).values, data.nunique().values, data.dtypes.values)),  
    columns=["% of missing values", "Number of unique values", "Data type"], index=data.columns).drop("readmitted")

In [24]:
dfi.export(features_df, "features_used.png")


[0301/114238.567917:ERROR:gpu_init.cc(441)] Passthrough is not supported, GL is swiftshader
[0301/114239.114742:INFO:headless_shell.cc(648)] Written to file /tmp/tmpd2i_tyot/temp.png.


In [27]:
data.to_csv("data/data1_to_model", index=False)
new_data = pd.read_csv("data/first_round_to_model")

In [387]:
#data = pd.concat([data, new_data], ignore_index=True)

In [28]:
data.readmitted = data.readmitted.astype(bool)

In [29]:
all_features = data.columns
numerical_features = data.select_dtypes(include=['float64', 'int64']).columns
categorical_features = data.select_dtypes(include=['O', 'bool']).drop(["readmitted"], axis=1).columns

data[numerical_features] = data[numerical_features].astype(float)

In [30]:
df_train, df_test = train_test_split(data, test_size=0.3, random_state=42)

In [395]:
classifiers = [
    LogisticRegression(random_state=42, n_jobs=-1, max_iter=10000),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42, n_jobs=-1),
    GradientBoostingClassifier(random_state=42),
    SVC(random_state=42)

    
]

parameters = [
              {"C": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100], "penalty": ['none', 'l1', 'l2', 'elasticnet']},

              {"max_depth": [1, 3, 5, 7, 9]},

              {'n_estimators': [100, 200, 500, 1000], 'max_depth' : [1, 3, 5, 7, 9]},

              {'n_estimators': [100, 200, 500, 1000], 'max_depth' : [1, 3, 5, 7, 9]},

               {'kernel': ["linear", "poly", "rbf", "sigmoid"], 'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]},


                

              
             ]


In [37]:



def define_pipeline(classifier, params):


    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        #('scaler', RobustScaler())])

        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=np.nan)),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)])




    feature_selection = Pipeline(steps = [
    ('feature_selection', SelectFromModel(LogisticRegression())),
    ])

    pipeline = make_pipeline(
        preprocessor,
        #feature_selection,
        GridSearchCV(classifier,
                    param_grid=params,
                    cv=5, scoring="f1",
                    refit=True)

    )

    return pipeline




In [32]:
df_train = df_train.dropna()
X_train = df_train[all_features]
y_train = df_train[target]


# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')
# fit and apply the transform
X_over, y_over = oversample.fit_resample(X_train, y_train)


# define undersampling strategy
undersample = RandomUnderSampler(sampling_strategy='majority')
# fit and apply the transform
X_under, y_under = undersample.fit_resample(X_train, y_train)




In [33]:
print(X_train.shape, df_test.shape)

(40123, 28) (23759, 28)


In [None]:
##save sets
all_data = pd.read_csv("data/cleaned_data.csv")

#is patient insured
payer_codes = list(all_data.payer_code.dropna().unique())
payer_codes.remove("SP")

all_data["isInsured"] = all_data.payer_code.replace(list(payer_codes), True)
all_data["isInsured"] = all_data.isInsured.replace("SP", False)
all_data["isInsured"].unique()


df_train_to_save = df_train.merge(all_data.iloc[df_train.index][["age", "medical_specialty", "isInsured"]], left_index=True, right_index=True)
df_test_to_save = df_test.merge(all_data.iloc[df_test.index][["age", "medical_specialty", "isInsured"]], left_index=True, right_index=True)


#df_train_to_save.to_csv("train_set.csv")
#X_under.to_csv("train_set_under_X.csv")
#y_under.to_csv("train_set_under_y.csv")
#df_test_to_save.to_csv("test_set.csv")


In [299]:
#X_under = pd.read_csv("train_set_under_X.csv")
#y_under = pd.read_csv("train_set_under_y.csv")

In [300]:
#X_under.drop(['Unnamed: 0'], axis=1, inplace=True)
#y_under.drop(['Unnamed: 0'], axis=1, inplace=True)

In [None]:
#df_test = pd.read_csv("test_set.csv")
#df_train = pd.read_csv("train_set.csv")

In [None]:
#all_features = X_under.columns
#target = y_under.columns
#X_train = df_train[all_features]
#y_train = df_train[target]


In [35]:
#classifiers = [RandomForestClassifier(n_jobs=-1, random_state=42)]
#parameters = [{'n_estimators': [500], 'max_depth' : [5]}]

In [38]:
best_params = []
for classifier, params in zip(classifiers, parameters):
    pipeline = define_pipeline(classifier, params)

    print(classifier)
    #pipeline.fit(X_under, y_under)
    #pipeline.fit(pd.DataFrame.sparse.from_spmatrix(X_under), pd.DataFrame.sparse.from_spmatrix(y_under))
    pipeline.fit(X_train, y_train)

    print(pipeline.named_steps['gridsearchcv'].best_params_)
     
    #make predictions
    X_test = df_test[all_features]

    y_test = df_test[target]

    y_pred = pipeline.predict(X_test)
    preds_proba = pipeline.predict_proba(X_test)[:, 1]

    get_metrics(y_test, y_pred)
    
    best_params.append(pipeline.named_steps['gridsearchcv'].best_params_)

    #joblib.dump(pipeline, f'pipeline_{str(classifier)}.pickle')





RandomForestClassifier(n_jobs=-1, random_state=42)
{'max_depth': 5, 'n_estimators': 500}
WRONGFUL DISCHARGE RATE: 0.11401995033461004
F1_SCORE: 0.0
RECALL: 0.0
PRECISION: 0.0
ROC AUC: 0.5


In [303]:
get_metrics(y_test, y_pred)
    
best_params.append(pipeline.named_steps['gridsearchcv'].best_params_)

joblib.dump(pipeline, f'pipeline_{str(classifier)}.pickle')

WRONGFUL DISCHARGE RATE: readmitted    0.071043
dtype: float64
F1_SCORE: 0.26649650927107577
RECALL: 0.6552233296419343
PRECISION: 0.16726347531096872
ROC AUC: 0.6177066766974517


['pipeline_RandomForestClassifier(n_jobs=-1, random_state=42).pickle']

In [None]:
hyperparameters_df = pd.DataFrame([best_params], columns=["LogisticRegression", "DecisionTreeClassifier", "RandomForestClassifier", "GradientBoostingClassifier", "SVC"])

In [None]:
hyperparameters_df.to_csv("hyperparameters.csv")