In [1]:
import matplotlib.pyplot as plt
import sklearn
import pandas as pd
import seaborn as sns
%matplotlib inline

In [77]:
from sklearn.model_selection import train_test_split



In [2]:
pd.set_option('display.max_columns', None)

In [3]:
d = pd.read_csv("../data/data.csv")

In [64]:
categorical_columns = ['gender',
                       'ecg',
                       'chestpain',
                       'restwma',
                       'posSE',
                       'newMI',
                       'newPTCA',
                       'newCABG',
                       'death',
                       'hxofCig',
                       'hxofHT',
                      'hxofDM',
                      'hxofMI',
                      'hxofPTCA',
                      'hxofCABG',
                       'any.event',
                      ]
numeric_columns = ['sbp',
                 'baseEF',
                 'dpmaxdo',
                 'mbp',
                 'basebp',
                 'basedp',
                 'pctMphr',
                 'dobdose',
                 'dp',
                 'maxhr',
                 'pkhr',
                 'age',
                 'bhr',
                  ]

targets = ['any.event',
           'death',
           'newMI',
           'newPTCA',
           'newCABG']

In [89]:
def coerce_heartrate(df) -> pd.DataFrame:
    """
    Negative heartrates get coerced to 60
    """
    bhr = df['bhr'].apply(lambda x: x if x > 0 else 60)
    df = df.copy()
    df['bhr'] = bhr
    return df

In [88]:
def coerce_age(df:pd.DataFrame) -> pd.DataFrame:
    """
    Ages above 100 get coerced to 100
    """
    age = df['age'].apply(lambda x: x if x < 100 else 100)
    df = df.copy()
    df['age'] = age
    return df

In [57]:
clean_data = pd.get_dummies(                           
    d.set_index(['PatientIdentifier']) 
    .fillna(d[categorical_columns].mode().iloc[0]) 
    .fillna(d[numeric_columns].mean()) 
    .pipe(coerce_heartrate)
    .pipe(coerce_age)
)

In [58]:
clean_data.describe()

Unnamed: 0,bhr,basebp,basedp,pkhr,sbp,dp,maxhr,pctMphr,mbp,dpmaxdo,dobdose,age,baseEF,chestpain,restwma,posSE,newMI,newPTCA,newCABG,death,hxofHT,hxofDM,hxofMI,hxofPTCA,hxofCABG,any.event,gender_female,gender_male,hxofCig_heavy,hxofCig_moderate,hxofCig_non-smoker,ecg_MI,ecg_equivocal,ecg_normal
count,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0,558.0
mean,75.130895,135.293165,10223.066298,120.643243,147.035185,17653.733333,119.306422,78.561798,155.615527,18547.702899,30.237226,67.989001,55.511111,0.299283,0.453405,0.236559,0.050179,0.048387,0.05914,0.039427,0.715054,0.369176,0.274194,0.073477,0.157706,0.159498,0.605735,0.394265,0.218638,0.247312,0.53405,0.12724,0.315412,0.557348
std,15.413446,20.762192,2553.73148,22.523132,36.085236,5157.10451,21.756273,14.895535,31.21185,4885.634142,9.457375,12.568604,10.251149,0.458355,0.498271,0.425351,0.21851,0.214775,0.236098,0.194782,0.451794,0.483015,0.446507,0.261151,0.364792,0.366469,0.489131,0.489131,0.413693,0.431837,0.499287,0.333541,0.465097,0.497146
min,42.0,85.0,5000.0,52.0,40.0,5100.0,58.0,38.0,84.0,7130.0,5.0,26.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,64.0,120.0,8467.0,107.0,122.0,14220.0,105.0,70.0,134.0,15261.0,25.0,61.0,52.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,73.0,133.0,9940.5,122.0,143.0,17243.0,120.0,78.561798,151.0,18202.0,30.0,68.91744,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
75%,84.0,150.0,11652.0,135.0,170.0,20568.0,133.0,88.0,174.0,21197.5,40.0,76.0,62.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
max,210.0,203.0,27300.0,210.0,309.0,45114.0,200.0,133.0,309.0,45114.0,40.0,100.0,83.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [60]:
clean_data['any.event'].value_counts()

0.0    469
1.0     89
Name: any.event, dtype: int64

In [72]:
feature_columns = list(set(clean_data.columns) - set(targets))

In [75]:
print(f"Feature column names: {feature_columns}")

Feature column names: ['basedp', 'pctMphr', 'dobdose', 'baseEF', 'restwma', 'dp', 'gender_male', 'hxofCig_heavy', 'hxofHT', 'sbp', 'gender_female', 'ecg_normal', 'mbp', 'pkhr', 'basebp', 'dpmaxdo', 'bhr', 'hxofCig_moderate', 'hxofCig_non-smoker', 'maxhr', 'ecg_MI', 'hxofDM', 'posSE', 'chestpain', 'hxofPTCA', 'ecg_equivocal', 'age', 'hxofMI', 'hxofCABG']


In [85]:
def get_split_data(df:pd.DataFrame, feature_columns:list, target_column:str) -> tuple:
    """
    
    """
    
    X_train, X_test, y_train, y_test = train_test_split(df[feature_columns],
                                                    df[target_column],
                                                    stratify=df[target_column],
                                                    test_size=0.25,
                                                    random_state=134)
    
    return X_train, X_test, y_train, y_test

In [86]:
X_train, X_test, y_train, y_test = get_split_data(clean_data, feature_columns, 'any.event')

In [87]:
# Assert that test and train have the same ratio of death events
round(y_test.mean(),2) == round(y_train.mean(),2)

True