In [1]:
%load_ext autoreload
%autoreload 2

In [11]:
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
import xgboost as xgb
from utils import data_handler, feature_selection
import pickle

## Read Train Data and Aggregate

In [71]:
# X, y = data_handler.get_model_prepared_dataset('./data/train/')

100%|██████████| 20000/20000 [01:08<00:00, 290.51it/s]


In [72]:
# with open('data/X_y.pkl', 'wb') as f:
#     pickle.dump((X, y), f)

In [3]:
with open('data/X_y.pkl', 'rb') as f:
        X, y = pickle.load(f)

### Fit Logistic Regression and use only features with betas that are not zero
### TODO: Decide if we want to play with coef_cutoff

In [73]:
remove_cols = feature_selection.get_remove_cols_from_logistic_regression(X, y, coef_cutoff=0.005)

Columns removed are: Index(['pH_7', 'Chloride_7', 'Potassium_5', 'Potassium_7', 'Hgb_7', 'SOFA_3',
       'SOFA_6', 'SOFA_8', 'SOFA_9', 'O2Sat_nanstd', 'BaseExcess_skew',
       'SaO2_skew', 'Calcium_nanmean', 'Calcium_nanmedian', 'Chloride_skew',
       'Creatinine_nanmean', 'Creatinine_nanmedian', 'Magnesium_nanmean',
       'Phosphate_nanstd', 'WBC_nanmean'],
      dtype='object')


# XGBoost Model

In [74]:
# args = {'gamma': 1, 'learning_rate': 0.001, 'max_depth': 4,'subsample': 0.6}
# args = {'gamma': 1, 'learning_rate': 0.1, 'max_depth': 9, 'subsample': 0.6}
args = {'gamma': 0.5, 'learning_rate': 0.15, 'max_depth': 11, 'subsample': 0.6}
pipe = Pipeline([
    ('remove_cols', data_handler.RemoveColsTransformer(remove_cols=remove_cols)),
    ('impute', data_handler.CustomImputerTransformer()),
    ('xgboost', xgb.XGBClassifier(**args))
])

In [75]:
pipe.fit(X, y)

Pipeline(steps=[('remove_cols',
                 RemoveColsTransformer(remove_cols=Index(['pH_7', 'Chloride_7', 'Potassium_5', 'Potassium_7', 'Hgb_7', 'SOFA_3',
       'SOFA_6', 'SOFA_8', 'SOFA_9', 'O2Sat_nanstd', 'BaseExcess_skew',
       'SaO2_skew', 'Calcium_nanmean', 'Calcium_nanmedian', 'Chloride_skew',
       'Creatinine_nanmean', 'Creatinine_nanmedian', 'Magnesium_nanmean',
       'Phosphate_nanstd', 'WBC...
                               gamma=0.5, gpu_id=-1, grow_policy='depthwise',
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.15, max_bin=256,
                               max_cat_to_onehot=4, max_delta_step=0,
                               max_depth=11, max_leaves=0, min_child_weight=1,
                               missing=nan, monotone_constraints='()',
                               n_estimators=100, n_jobs=0, num_parallel_tree=1,
                               predictor='auto', random_st

In [76]:
pipe.steps[2][1].n_features_in_

399

In [85]:
with open('./model.pkl', 'wb') as f:
    pickle.dump(pipe, f)

## Cross-Validation

In [17]:
scores = cross_val_score(pipe, X, y, cv=5, scoring='f1')

In [20]:
import numpy as np
np.mean([np.around(score, 3) for score in scores])

0.7236

## Testing model

In [77]:
# X_test, ytrue = data_handler.get_model_prepared_dataset('./data/test/')

100%|██████████| 10000/10000 [00:34<00:00, 291.34it/s]


In [78]:
# with open('data/X_test_y_ture.pkl', 'wb') as f:
#     pickle.dump((X_test, ytrue), f)

In [8]:
with open('data/X_test_y_ture.pkl', 'rb') as f:
    X_test, ytrue = pickle.load(f)

In [79]:
ypreds = pipe.predict(X_test)
f1_score(ytrue, ypreds)

0.7335968379446641

In [84]:
set(X_real_test.columns) - set(X_test.columns)

{'SepsisLabel_nanmax',
 'SepsisLabel_nanmean',
 'SepsisLabel_nanmedian',
 'SepsisLabel_nanmin',
 'SepsisLabel_nanstd',
 'SepsisLabel_skew'}

In [10]:
ypreds = pipe.predict(X_test)
f1_score(ytrue, ypreds)

0.7335968379446641

In [11]:
with open('./model.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [10]:
ypreds = pipe.predict(X_test)
f1_score(ytrue, ypreds)

0.7335968379446641

In [21]:
train_ypreds = pipe.predict(X)
f1_score(y, train_ypreds)

0.9996465182043125

# Random Forest


In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

args = {'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'n_estimators': 100}
rf_pipe = Pipeline([
    ('remove_cols', data_handler.RemoveColsTransformer(remove_cols=remove_cols)),
    ('impute', data_handler.CustomImputerTransformer()),
    ('RF', RandomForestClassifier(random_state=42, **args))
])
rf_pipe.fit(X, y)


Pipeline(steps=[('remove_cols',
                 RemoveColsTransformer(remove_cols=Index(['pH_7', 'Chloride_7', 'Potassium_5', 'Potassium_7', 'Hgb_7', 'SOFA_3',
       'SOFA_6', 'SOFA_8', 'SOFA_9', 'O2Sat_nanstd', 'BaseExcess_skew',
       'SaO2_skew', 'Calcium_nanmean', 'Calcium_nanmedian', 'Chloride_skew',
       'Creatinine_nanmean', 'Creatinine_nanmedian', 'Magnesium_nanmean',
       'Phosphate_nanstd', 'WBC_nanmean'],
      dtype='object'))),
                ('impute', CustomImputerTransformer()),
                ('RF', RandomForestClassifier(random_state=42))])

In [23]:
ypreds = rf_pipe.predict(X_test)
f1_score(ytrue, ypreds)

0.6267029972752044

In [27]:
with open('./rf_model.pkl', 'wb') as f:
    pickle.dump(rf_pipe, f)

## Cross-Validation

In [24]:
scores = cross_val_score(rf_pipe, X, y, cv=5, scoring='f1')
np.mean([np.around(score, 3) for score in scores])

0.6326

In [25]:
train_ypreds = rf_pipe.predict(X)
f1_score(y, train_ypreds)

0.9996465182043125