### Import Libraries

In [2]:
import pandas as pd
import joblib
import numpy as np
import json
# !pip install xgboost
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer

Collecting xgboost
  Downloading xgboost-1.4.2-py3-none-macosx_10_14_x86_64.macosx_10_15_x86_64.macosx_11_0_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 2.4 MB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2


XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/Users/joel.gongora/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/joel.gongora/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


In [3]:
#In case I need to update datarobot-drum
!pip install datarobot-drum --upgrade

Collecting datarobot-drum
  Using cached datarobot_drum-1.5.11-py3-none-any.whl (8.8 MB)
Collecting datarobot==2.24.0
  Using cached datarobot-2.24.0-py3-none-any.whl (418 kB)
Collecting progress
  Using cached progress-1.6-py3-none-any.whl
Collecting strictyaml==1.4.2
  Using cached strictyaml-1.4.2-py3-none-any.whl
Collecting docker>=4.2.2<5.0.0
  Using cached docker-5.0.0-py2.py3-none-any.whl (146 kB)
Collecting texttable
  Using cached texttable-1.6.4-py2.py3-none-any.whl (10 kB)
Collecting memory-profiler<1.0.0
  Using cached memory_profiler-0.58.0-py3-none-any.whl
Collecting julia==0.5.6
  Using cached julia-0.5.6-py2.py3-none-any.whl (67 kB)
Collecting py4j~=0.10.9.0
  Using cached py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
Collecting mlpiper~=2.4.0
  Using cached mlpiper-2.4.2-py2.py3-none-any.whl (781 kB)
Collecting argcomplete==1.11.1
  Using cached argcomplete-1.11.1-py2.py3-none-any.whl (36 kB)
Collecting pyarrow==2.0.0
  Using cached pyarrow-2.0.0-cp38-cp38-macosx_10_13_x

### Import Data

In [4]:
# Load the raw training set
train = pd.read_csv('../data/readmissions_train.csv')

# Split off the target first (pop removes it from `train`), then build the
# feature frame as a new DataFrame without the diagnosis columns.
y = train.pop('readmitted')
X = train.drop(columns=['diag_1_desc', 'diag_1', 'diag_2', 'diag_3'])

### Define Preprocessing step per type of column

In [5]:
# Missing values are imputed INSIDE each transformer rather than by mutating X.
# This way the fitted (and later pickled) preprocessor carries the imputation
# rules with it and applies them to any frame it transforms, instead of relying
# on the caller having pre-filled the data.

# Preprocessing for numerical features: constant-fill with 0, then standardize
numeric_features = list(X.select_dtypes('int64').columns)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())])

# Preprocessing for categorical features: constant-fill with 'missing', then one-hot encode
# (handle_unknown='ignore' keeps categories unseen during fit from raising at transform time)
categorical_features = list(X.select_dtypes('object').columns)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'))])

# Preprocessor with all of the steps; each transformer only sees its own column list
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

NameError: name 'Pipeline' is not defined

### Fit the Preprocessing Pipeline

In [4]:
# Wrap the column preprocessor in a pipeline of its own
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the pipeline on the training data and transform that same data
# (fit returns the pipeline, so the two calls can be chained).
# The model could also be trained on the sparse matrix directly; it is converted
# to a pandas DataFrame because the hook function in custom.py expects a pandas
# DataFrame to be used for scoring.
preprocessed = pd.DataFrame.sparse.from_spmatrix(pipeline.fit(X, y).transform(X))

### Train XGBoost Classifier
Normally, the XGBoost classifier could be part of the final scikit-learn pipeline. I am opting to keep them separate in order to create a more complicated example with different pkl files for preprocessing and scoring.

In [5]:
# Hyper-parameters for the classifier, gathered in one place
xgb_params = {
    'colsample_bylevel': 0.2,
    'max_depth': 10,
    'learning_rate': 0.02,
    'n_estimators': 300,
}

# Train on the preprocessed feature frame; the fitted estimator is the cell output
model = XGBClassifier(**xgb_params)
model.fit(preprocessed, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.2,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### Save Custom Model files

In [6]:
# Persist the fitted preprocessing pipeline and the trained model as two
# separate pickle files inside the custom_model directory.
joblib.dump(value=pipeline, filename='custom_model/preprocessing.pkl')
joblib.dump(value=model, filename='custom_model/model.pkl')

['custom_model/model.pkl']

In [7]:
# Run DRUM's `validation` command against the code in ./custom_model, feeding it
# the test CSV as input. The target is declared as binary with class labels
# True (positive) and False (negative).
!drum validation --code-dir ./custom_model --input ../data/readmissions_test.csv --target-type binary --positive-class-label True --negative-class-label False

         True     False
0    0.615392  0.384608
1    0.716724  0.283276
2    0.652516  0.347484
3    0.614488  0.385512
4    0.819258  0.180742
..        ...       ...
495  0.616276  0.383724
496  0.451008  0.548992
497  0.470648  0.529352
498  0.518132  0.481868
499  0.430164  0.569836

[500 rows x 2 columns]
         True     False
0    0.535153  0.464847
1    0.704384  0.295616
2    0.662020  0.337980
3    0.618700  0.381300
4    0.818212  0.181788
..        ...       ...
495  0.631497  0.368503
496  0.453886  0.546114
497  0.462472  0.537528
498  0.458577  0.541423
499  0.345581  0.654419

[500 rows x 2 columns]
         True     False
0    0.609406  0.390594
1    0.722578  0.277422
2    0.655875  0.344125
3    0.646963  0.353037
4    0.826004  0.173996
..        ...       ...
495  0.628056  0.371943
496  0.455790  0.544210
497  0.488803  0.511197
498  0.473575  0.526425
499  0.350654  0.649346

[500 rows x 2 columns]
         True     False
0    0.536535  0.463465
1    0.594760  0

### Validate that the model can work as a `Custom Training Model`

In [8]:
# Run DRUM's `fit` command on the code in ./custom_model using the training CSV,
# with `readmitted` as the binary target (class labels True / False). This checks
# that the custom model code can be fit to the data.
!drum fit --code-dir ./custom_model --input ../data/readmissions_train.csv --target-type binary --target readmitted --positive-class-label True --negative-class-label False

Files were overwritten: {'/var/folders/v3/1cwf1zz90_nczrfzhvxwyjfr0000gp/T/tmpvnquru3s/model.pkl', '/var/folders/v3/1cwf1zz90_nczrfzhvxwyjfr0000gp/T/tmpvnquru3s/preprocessing.pkl'}
Validation Complete 🎉 Your model can be fit to your data,  and predictions can be made on the fit model! 
 You're ready to add it to DataRobot. 


In [9]:
# Score the test CSV with the code and artifacts in ./custom_model via DRUM's
# `score` command; the output is a table of per-row probabilities for the
# True and False class labels.
!drum score --code-dir ./custom_model --input ../data/readmissions_test.csv --target-type binary --positive-class-label True --negative-class-label False

         True     False
0    0.532502  0.467498
1    0.716724  0.283276
2    0.655804  0.344196
3    0.616853  0.383147
4    0.816218  0.183782
..        ...       ...
495  0.586178  0.413822
496  0.458597  0.541403
497  0.466822  0.533178
498  0.478825  0.521175
499  0.349466  0.650534

[500 rows x 2 columns]
