### Import Libraries

In [9]:
import pandas as pd
import joblib
import numpy as np
import json

from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
#In case I need to update datarobot-drum
!pip install datarobot-drum --upgrade

### Import Data

In [10]:
train = pd.read_csv('../data/readmissions_train.csv')

# Pull the target out of `train` (pop removes the column from the frame itself).
y = train.pop('readmitted')

# Features: every remaining column except the diagnosis columns.
X = train.drop(columns=['diag_1_desc', 'diag_1', 'diag_2', 'diag_3'])

### Define Preprocessing step per type of column

In [11]:
# Preprocessing for numerical features.
# NOTE: only int64 columns are selected here; a column with any other numeric
# dtype (e.g. float64) lands in neither list and is dropped by the
# ColumnTransformer's default remainder handling.
numeric_features = list(X.select_dtypes('int64').columns)
numeric_transformer = Pipeline(steps=[
    # Imputation lives inside the pipeline so the pickled preprocessing step
    # handles missing values on its own at scoring time.
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())])

# Preprocessing for categorical features.
categorical_features = list(X.select_dtypes('object').columns)
categorical_transformer = Pipeline(steps=[
    # Same fill value as before ('missing'), so the one-hot categories learned
    # during fit are unchanged.
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'))])

# Keep X itself filled as well, so any cell that inspects X sees the same values
# as before. A single non-in-place fillna replaces the per-column loops.
X = X.fillna({
    **dict.fromkeys(numeric_features, 0),
    **dict.fromkeys(categorical_features, 'missing'),
})

# Preprocessor with all of the steps: each transformer is applied only to the
# columns listed for it.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

### Fit the Preprocessing Pipeline

In [12]:
# Wrap the column-wise preprocessor in a one-step Pipeline so it is saved and
# loaded as a single object.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessing on the training data, then encode that same data.
preprocessed = pipeline.fit(X, y).transform(X)

# The model could also be trained on the sparse matrix directly. It is converted
# to a (sparse) pandas DataFrame because, per the original author's note, the
# hook function in custom.py expects a pandas DataFrame when scoring.
preprocessed = pd.DataFrame.sparse.from_spmatrix(preprocessed)

### Train XGBoost Classifier
Normally, the XGBoost classifier could be part of the final scikit-learn pipeline. I am opting to keep them separate in order to create a more complicated example with different pkl files for preprocessing and scoring.

In [13]:
# Hyperparameters for the gradient-boosted classifier, gathered in one place.
xgb_params = {
    'colsample_bylevel': 0.2,
    'max_depth': 10,
    'learning_rate': 0.02,
    'n_estimators': 300,
}
model = XGBClassifier(**xgb_params)

# Train on the already-preprocessed features; fit() is left as the last
# expression so the fitted estimator is displayed as the cell output.
model.fit(preprocessed, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.2,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [14]:
# Persist the two artifacts separately: the fitted preprocessing pipeline and
# the trained classifier each get their own pkl file inside custom_model/.
joblib.dump(value=pipeline, filename='custom_model/preprocessing.pkl')
joblib.dump(value=model, filename='custom_model/model.pkl')

['custom_model/model.pkl']

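### Save Custom Model files

The two artifacts are written to `custom_model/` with the calls below; the next cell validates that directory with DRUM.

```python
joblib.dump(pipeline,'custom_model/preprocessing.pkl')
joblib.dump(model, 'custom_model/model.pkl')
```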

In [15]:
# Run DRUM's `validation` command against the code in ./custom_model, feeding it
# the test CSV as input. The target is declared as binary, with "True" as the
# positive class label and "False" as the negative one.
!drum validation --code-dir ./custom_model --input ../data/readmissions_test.csv --target-type binary --positive-class-label True --negative-class-label False

         True     False
0    0.629378  0.370622
1    0.680044  0.319956
2    0.705098  0.294902
3    0.628927  0.371073
4    0.731312  0.268688
..        ...       ...
495  0.629820  0.370180
496  0.547186  0.452814
497  0.614164  0.385836
498  0.710958  0.289042
499  0.593922  0.406078

[500 rows x 2 columns]
         True     False
0    0.589259  0.410741
1    0.673874  0.326126
2    0.709850  0.290150
3    0.631032  0.368968
4    0.730789  0.269211
..        ...       ...
495  0.637431  0.362569
496  0.548625  0.451375
497  0.610076  0.389924
498  0.681181  0.318819
499  0.551630  0.448370

[500 rows x 2 columns]
         True     False
0    0.626385  0.373615
1    0.682972  0.317028
2    0.706777  0.293223
3    0.645164  0.354836
4    0.734685  0.265315
..        ...       ...
495  0.635711  0.364289
496  0.549577  0.450423
497  0.623242  0.376758
498  0.688680  0.311320
499  0.554167  0.445833

[500 rows x 2 columns]
         True     False
0    0.589950  0.410050
1    0.619063  0

         True     False
0    0.587933  0.412067
1    0.680044  0.319956
2    0.706742  0.293258
3    0.630109  0.369891
4    0.729791  0.270209
..        ...       ...
495  0.614772  0.385228
496  0.550981  0.449019
497  0.612251  0.387749
498  0.691305  0.308695
499  0.553573  0.446427

[500 rows x 2 columns]
         True     False
0    0.584919  0.415081
1    0.683350  0.316650
2    0.722666  0.277334
3    0.631181  0.368819
4    0.729842  0.270158
..        ...       ...
495  0.609256  0.390744
496  0.547374  0.452626
497  0.622893  0.377107
498  0.691575  0.308425
499  0.552402  0.447598

[500 rows x 2 columns]
         True     False
0    0.587304  0.412696
1    0.679898  0.320102
2    0.704834  0.295166
3    0.630109  0.369891
4    0.730609  0.269391
..        ...       ...
495  0.621755  0.378245
496  0.550094  0.449906
497  0.619549  0.380451
498  0.690891  0.309109
499  0.553154  0.446846

[500 rows x 2 columns]
         True     False
0    0.587933  0.412067
1    0.680044  0

### Continue working