In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from imblearn.over_sampling import SMOTE
import tools

In [2]:
# Load the dataset CSV into `df`; every later cell derives its data from this frame.
df = pd.read_csv('duke_vital_model_imputed.csv')
df.head()
# Class balance of the PostCond label. As the cell's last expression, this is
# the only value that gets displayed.
df.loc[:, "PostCond"].value_counts()

1    62257
0    21063
Name: PostCond, dtype: int64

In [3]:
# Group the rows per subject, then build a 2-column array:
# column 0 = subject id, column 1 = PostCond of that subject's first record.
subj_dict = tools.create_subj_dict(df)
subj_labels = np.array([
    np.array([subj_id, records[0]["PostCond"]])
    for subj_id, records in subj_dict.items()
])

# Stratified 10-fold split over subjects (not rows), so that all records of a
# subject end up on the same side of a fold.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
patient_split = skf.split(subj_labels[:, 0], subj_labels[:, 1])

# fold number -> positional indices into `subj_labels`
train_patient_index = {}
test_patient_index = {}
for cnt, (train_i, test_i) in enumerate(patient_split):
    train_patient_index[cnt] = train_i
    test_patient_index[cnt] = test_i
# Leave `cnt` holding the number of folds produced.
cnt = len(train_patient_index)

In [4]:
def filterer(pair):
    """Predicate for ``filter`` over ``dict.items()``.

    ``pair`` is a ``(key, value)`` tuple; the key is ignored. Returns True
    when ``value`` holds at least two entries, False otherwise.
    """
    _, entries = pair
    return len(entries) >= 2

In [5]:
# Build the resampled training set `X` and the matching test set `X_test` for
# one cross-validation fold:
#   split subjects -> impute -> per-subject linear fits -> SMOTE (train only)
#   -> one-hot encode -> align test columns to train columns.

categorical_columns = ["RACE_G", "GENDER", "HXCOPD", "HXDIAB", "HXHTN", "HXHYL", "HXSMOKE"]
# Columns handed to tools.return_trends / tools.dict_to_linfit.
TREND_COLS = ["DIASBP_R", "PULSE_R", "SYSBP_R", "WEIGHT_R", "CREATININE_R", "HDL_R", "LDL_R", "TOTCHOL_R", "wbc", "gluc", "tprot", "alb", "ast", "alt", "tbil", "cr", "ptinr"]
ID_AND_LABEL = ["RSUBJID", "PostCond"]
FOLD = 0   # which of the folds built by the split cell to export
SEED = 42  # shared by every stochastic step so the export is reproducible

# --- subject-level train/test split ------------------------------------------
# Set lookups instead of scanning the numpy id column once per patient.
train_ids = set(subj_labels[train_patient_index[FOLD], 0].tolist())
test_ids = set(subj_labels[test_patient_index[FOLD], 0].tolist())
train_subj_dict = {patient: records for patient, records in subj_dict.items() if patient in train_ids}
test_subj_dict = {patient: records for patient, records in subj_dict.items() if patient in test_ids}

train_df = tools.subj_dict_to_df(train_subj_dict)
test_df = tools.subj_dict_to_df(test_subj_dict)

# Trends are derived from the training subjects only and reused for the test set.
trends = tools.return_trends(train_df, [[group] for group in range(2, 14)], list(TREND_COLS))

# --- imputation (fit on train, applied to test) ------------------------------
train_features = train_df.drop(labels=ID_AND_LABEL, axis=1)
test_features = test_df.drop(labels=ID_AND_LABEL, axis=1)
train_col_names = list(train_features.keys())
test_col_names = list(test_features.keys())
# The imputer works on positional numpy columns, so the two frames must agree
# on column order or test values would be imputed against the wrong features.
assert train_col_names == test_col_names, "train/test feature columns differ"

imputer = IterativeImputer(random_state=SEED)

X_train_for_imputed = imputer.fit_transform(train_features.to_numpy())
X_test_for_imputed = imputer.transform(test_features.to_numpy())

imputed_train_df = pd.DataFrame(X_train_for_imputed, columns=train_col_names)
imputed_test_df = pd.DataFrame(X_test_for_imputed, columns=test_col_names)

# Re-attach id and label positionally: the imputed frames carry a fresh
# RangeIndex, so index-aligned assignment would produce NaN if the source
# frames had any other index.
imputed_train_df["RSUBJID"] = train_df["RSUBJID"].to_numpy()
imputed_train_df["PostCond"] = train_df["PostCond"].to_numpy()
imputed_test_df["RSUBJID"] = test_df["RSUBJID"].to_numpy()
imputed_test_df["PostCond"] = test_df["PostCond"].to_numpy()

# --- per-subject features ----------------------------------------------------
imputed_train_subj_dict = tools.create_subj_dict(imputed_train_df)
imputed_test_subj_dict = tools.create_subj_dict(imputed_test_df)

# Keep only subjects with at least two entries (see `filterer`).
filtered_imputed_train_subj_dict = dict(filter(filterer, imputed_train_subj_dict.items()))
filtered_imputed_test_subj_dict = dict(filter(filterer, imputed_test_subj_dict.items()))

normalized_train_dict = tools.dist_from_age_mean(filtered_imputed_train_subj_dict, trends)
normalized_test_dict = tools.dist_from_age_mean(filtered_imputed_test_subj_dict, trends)

linearized_train_dict = tools.dict_to_linfit(normalized_train_dict, trend_cols=list(TREND_COLS))
linearized_test_dict = tools.dict_to_linfit(normalized_test_dict, trend_cols=list(TREND_COLS))

linearized_train_df = tools.lin_dict_to_df(linearized_train_dict)
linearized_test_df = tools.lin_dict_to_df(linearized_test_dict)

# --- categorical handling ----------------------------------------------------
linearized_train_df[categorical_columns] = linearized_train_df[categorical_columns].astype(int)
linearized_test_df[categorical_columns] = linearized_test_df[categorical_columns].astype(int)

# Category values observed in the test fold, per categorical column.
unique_vals = dict()
for col in categorical_columns:
    unique_vals[col] = np.unique(linearized_test_df[col])


def filter_func(row):
    """Return True when every categorical value of `row` also occurs in the test fold."""
    for col in categorical_columns:
        if row[col] not in unique_vals[col]:
            return False
    return True


# NOTE(review): this drops training rows based on what the test fold contains,
# which uses test-set information during training-set construction - confirm
# this is intended.
linearized_train_df = linearized_train_df[linearized_train_df.apply(filter_func, axis=1)]
# The original discarded the result of reset_index(), leaving gaps in the index.
linearized_train_df = linearized_train_df.reset_index(drop=True)

print(linearized_train_df.shape)
print(linearized_test_df.shape)

print("linearized_train_df")
for column in linearized_train_df.columns:
    print(column)

print("linearized_test_df")
for column in linearized_test_df.columns:
    print(column)

# --- oversample the training fold --------------------------------------------
X_feats = linearized_train_df.drop(labels=["RSUBJID", "PostCond"], axis=1)
X_labels = linearized_train_df["PostCond"]

# Seeded so that re-running the notebook writes the same resampled dataset.
# NOTE(review): plain SMOTE interpolates the categorical columns too and the
# int cast below truncates the result; imblearn's SMOTENC is designed for
# mixed categorical/continuous features - consider switching.
# NOTE(review): the printed columns include "Unnamed: 0" / "Unnamed: 0.1",
# which look like saved CSV indices being used as features - confirm.
resampler = SMOTE(random_state=SEED)

print(X_feats.shape)
print(X_labels.shape)

X_feats_resampled, X_labels_resampled = resampler.fit_resample(X_feats, X_labels)

# From here on the label travels with the features; `X` and `X_test` both
# contain the PostCond column.
X_feats_resampled["PostCond"] = X_labels_resampled

print("X_feats_resampled")
for column in X_feats_resampled.columns:
    print(column)

X_feats_resampled[categorical_columns] = X_feats_resampled[categorical_columns].astype(int)

# --- one-hot encode and align ------------------------------------------------
X = pd.get_dummies(X_feats_resampled, columns=categorical_columns)
X_test = pd.get_dummies(linearized_test_df.drop(labels=["RSUBJID"], axis=1), columns=categorical_columns)

# Give the test frame exactly the training columns, in training order: dummy
# columns absent from the test fold are added as all-zero, extra ones are
# dropped. Plain `X_test[X.columns]` raised KeyError on a missing column
# before the zero-fill could happen.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

print(X.shape)
print(X_test.shape)

assert list(X.columns) == list(X_test.columns), "train/test column mismatch"

print(X.columns)
print(X.shape)
print(X_test.columns)
print(X_test.shape)

# trends = tools.return_trends(df, [[2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13]], ["DIASBP_R", "PULSE_R", "SYSBP_R", "WEIGHT_R", "CREATININE_R", "HDL_R", "LDL_R", "TOTCHOL_R", "wbc", "gluc", "tprot", "alb", "ast", "alt", "tbil", "cr", "ptinr"])
# train_col_names = list(df.drop(labels=["RSUBJID", "PostCond"], axis=1).keys())
# imputer = IterativeImputer(random_state=42)
# X_train_for_imputed = imputer.fit_transform(df.drop(labels=["RSUBJID", "PostCond"], axis=1).to_numpy())
# imputed_train_df = pd.DataFrame(X_train_for_imputed, columns=train_col_names)
# imputed_train_df["RSUBJID"] = df["RSUBJID"]
# imputed_train_df["PostCond"] = df["PostCond"]
# imputed_train_subj_dict = tools.create_subj_dict(imputed_train_df)
# filtered_imputed_train_subj_dict = dict(filter(filterer, imputed_train_subj_dict.items()))
# normalized_train_dict = tools.dist_from_age_mean(filtered_imputed_train_subj_dict, trends)
# linearized_train_dict = tools.dict_to_linfit(normalized_train_dict, trend_cols=["DIASBP_R", "PULSE_R", "SYSBP_R", "WEIGHT_R", "CREATININE_R", "HDL_R", "LDL_R", "TOTCHOL_R", "wbc", "gluc", "tprot", "alb", "ast", "alt", "tbil", "cr", "ptinr"])
# linearized_train_df = tools.lin_dict_to_df(linearized_train_dict)
# X = linearized_train_df.drop(labels=["RSUBJID"], axis=1)


  linearized_train_dict = tools.dict_to_linfit(normalized_train_dict, trend_cols=["DIASBP_R", "PULSE_R", "SYSBP_R", "WEIGHT_R", "CREATININE_R", "HDL_R", "LDL_R", "TOTCHOL_R", "wbc", "gluc", "tprot", "alb", "ast", "alt", "tbil", "cr", "ptinr"])
  linearized_train_dict = tools.dict_to_linfit(normalized_train_dict, trend_cols=["DIASBP_R", "PULSE_R", "SYSBP_R", "WEIGHT_R", "CREATININE_R", "HDL_R", "LDL_R", "TOTCHOL_R", "wbc", "gluc", "tprot", "alb", "ast", "alt", "tbil", "cr", "ptinr"])
  linearized_train_dict = tools.dict_to_linfit(normalized_train_dict, trend_cols=["DIASBP_R", "PULSE_R", "SYSBP_R", "WEIGHT_R", "CREATININE_R", "HDL_R", "LDL_R", "TOTCHOL_R", "wbc", "gluc", "tprot", "alb", "ast", "alt", "tbil", "cr", "ptinr"])
  linearized_train_dict = tools.dict_to_linfit(normalized_train_dict, trend_cols=["DIASBP_R", "PULSE_R", "SYSBP_R", "WEIGHT_R", "CREATININE_R", "HDL_R", "LDL_R", "TOTCHOL_R", "wbc", "gluc", "tprot", "alb", "ast", "alt", "tbil", "cr", "ptinr"])
  linearized_train_dict 

(15309, 71)
(1701, 71)
linearized_train_df
RSUBJID
Unnamed: 0.1
Unnamed: 0
RDAYSFROMINDEX
RACE_G
GENDER
HXCOPD
HXDIAB
HXHTN
HXHYL
HXSMOKE
HEIGHT_R
preop_sao2
bmi
PostCond
start_age
end_age
AGE_G_0
AGE_G_1
AGE_G_interval
DIASBP_R_0
DIASBP_R_1
DIASBP_R_interval
PULSE_R_0
PULSE_R_1
PULSE_R_interval
SYSBP_R_0
SYSBP_R_1
SYSBP_R_interval
WEIGHT_R_0
WEIGHT_R_1
WEIGHT_R_interval
CREATININE_R_0
CREATININE_R_1
CREATININE_R_interval
HDL_R_0
HDL_R_1
HDL_R_interval
LDL_R_0
LDL_R_1
LDL_R_interval
TOTCHOL_R_0
TOTCHOL_R_1
TOTCHOL_R_interval
alb_0
alb_1
alb_interval
alt_0
alt_1
alt_interval
ast_0
ast_1
ast_interval
cr_0
cr_1
cr_interval
gluc_0
gluc_1
gluc_interval
ptinr_0
ptinr_1
ptinr_interval
tbil_0
tbil_1
tbil_interval
tprot_0
tprot_1
tprot_interval
wbc_0
wbc_1
wbc_interval
linearized_test_df
RSUBJID
Unnamed: 0.1
Unnamed: 0
RDAYSFROMINDEX
RACE_G
GENDER
HXCOPD
HXDIAB
HXHTN
HXHYL
HXSMOKE
HEIGHT_R
preop_sao2
bmi
PostCond
start_age
end_age
AGE_G_0
AGE_G_1
AGE_G_interval
DIASBP_R_0
DIASBP_R_1
DIASBP_R_in

In [6]:
# Persist the resampled training set and the aligned test set as CSV.
# NOTE(review): to_csv writes the row index by default, so reading these files
# back without index_col adds an "Unnamed: 0" column - confirm the downstream
# reader accounts for that.
print(X.shape)
for frame, out_path in ((X, "train_ds_resampled2.csv"), (X_test, "test_ds_resampled2.csv")):
    frame.to_csv(out_path)

(23394, 78)


In [7]:
# Final check: print the (rows, columns) shape of the train frame, then of the
# test frame.
for frame in (X, X_test):
    print(frame.shape)

(23394, 78)
(1701, 78)
