### Import Libraries

In [1]:
import pandas as pd
import joblib
import numpy as np
import json

from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Optional: upgrade datarobot-drum.
# %pip (rather than !pip) installs into the environment of the running kernel,
# which is not guaranteed for a `!pip` subshell call.
%pip install --upgrade datarobot-drum

Requirement already up-to-date: datarobot-drum in /Users/joel.gongora/opt/anaconda3/envs/drum_corrected/lib/python3.7/site-packages (1.5.11)
Processing /Users/joel.gongora/Library/Caches/pip/wheels/4b/ec/62/dc7dab8452e83d6487282f0eed4fc0a01b94fefe50ceede3aa/strictyaml-1.4.2-cp37-none-any.whl
Collecting argcomplete==1.11.1
  Using cached argcomplete-1.11.1-py2.py3-none-any.whl (36 kB)
Collecting datarobot==2.24.0
  Using cached datarobot-2.24.0-py3-none-any.whl (418 kB)
Collecting julia==0.5.6
  Using cached julia-0.5.6-py2.py3-none-any.whl (67 kB)
Collecting pyarrow==2.0.0
  Using cached pyarrow-2.0.0-cp37-cp37m-macosx_10_13_x86_64.whl (13.4 MB)
Collecting Pillow==8.2.0
  Using cached Pillow-8.2.0-cp37-cp37m-macosx_10_10_x86_64.whl (2.8 MB)
Collecting ruamel.yaml==0.17.4
  Using cached ruamel.yaml-0.17.4-py3-none-any.whl (101 kB)
Collecting importlib-metadata<2,>=0.23; python_version == "3.7"
  Using cached importlib_metadata-1.7.0-py2.py3-none-any.whl (31 kB)
Collecting flask-cors
  U

### Import Data

In [77]:
# Read the raw train / test splits.
train = pd.read_csv('../data/readmissions_train.csv')
test = pd.read_csv('../data/readmissions_test.csv')

# Feature frame: everything except the row id, the target and the two
# diagnosis-description columns.
X = train.drop(columns=['id', 'readmitted', 'diag_2_desc', 'diag_3_desc'])

# The test split gets the same treatment (it is only stripped of the id and
# description columns here).
test = test.drop(columns=['id', 'diag_2_desc', 'diag_3_desc'])

# pop() returns the target column and removes it from `train`.
y = train.pop('readmitted')

In [78]:
# Number of columns in the test frame.
test.shape[1]

48

In [79]:
# Number of columns in the training feature frame.
X.shape[1]

48

### Define Preprocessing step per type of column

In [80]:
# Preprocessing for numerical features.
# Missing values are imputed INSIDE the pipeline (instead of patching X by hand)
# so that the pickled preprocessor applies the same fill values at scoring time.
# NOTE(review): only 'int64' columns are selected; with ColumnTransformer's
# default remainder='drop', any float column in X would be left out of the
# model entirely -- confirm that is intended.
numeric_features = list(X.select_dtypes('int64').columns)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())])

# Preprocessing for categorical features: fill gaps with the literal category
# 'missing', then one-hot encode. Categories unseen during fit are ignored
# (encoded as all zeros) rather than raising at scoring time.
categorical_features = list(X.select_dtypes('object').columns)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessor with all of the steps: each transformer is applied to its own
# list of columns and the results are concatenated.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)]
)

### Fit the Preprocessing Pipeline

In [81]:
# Wrap the column-wise preprocessor in a (single-step) pipeline so it can be
# pickled and reused as one object.
pipeline = Pipeline([('preprocessor', preprocessor)])

# Fit the preprocessing pipeline on the training features, transform those same
# features, and wrap the resulting sparse matrix in a pandas DataFrame.
# The model could also be trained on the sparse matrix directly; it is converted
# to pandas because the hook function in custom.py expects a pandas DataFrame
# to be used for scoring.
preprocessed = pd.DataFrame.sparse.from_spmatrix(
    pipeline.fit(X, y).transform(X)
)

### Train XGBoost Classifier
Normally, the XGBoost classifier could be part of the final scikit-learn pipeline. I am opting to keep them separate in order to create a more complicated example with different pkl files for preprocessing and scoring.

In [68]:
# Gradient-boosted tree classifier, trained on the preprocessed feature frame.
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.02,
    max_depth=10,
    colsample_bylevel=0.2,
)
model.fit(preprocessed, y)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.2,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [69]:
# Show the working directory; the relative 'custom_model/' and '../data/' paths
# used in this notebook are resolved against it.
%pwd

'/Users/joel.gongora/DataRobot/repos/FORKED/custom-models/custom_inference/python/readmissions/Readmission_level_3'

In [70]:
# Persist the fitted preprocessing pipeline and the trained classifier as two
# separate pickle files inside the custom model folder.
joblib.dump(value=pipeline, filename='custom_model/preprocessing.pkl')
joblib.dump(value=model, filename='custom_model/model.pkl')

['custom_model/model.pkl']

### Save Custom Model files

# Write both artifacts (preprocessing pipeline, then classifier) to custom_model/.
joblib.dump(value=pipeline, filename='custom_model/preprocessing.pkl')
joblib.dump(value=model, filename='custom_model/model.pkl')

In [82]:
# Run DRUM's validation command against the code in ./custom_model, using the
# test CSV as input and declaring a binary target with labels True / False.
# NOTE(review): the output saved with this cell is a Python traceback, i.e. the
# command did not complete successfully when it was last run.
!drum validation --code-dir ./custom_model --input ../data/readmissions_test.csv --target-type binary --positive-class-label True --negative-class-label False

Traceback (most recent call last):
  File "/Users/joel.gongora/opt/anaconda3/envs/drum_corrected/bin/drum", line 6, in <module>
    main()
  File "/Users/joel.gongora/opt/anaconda3/envs/drum_corrected/lib/python3.7/site-packages/datarobot_drum/drum/main.py", line 168, in main
    CMRunner(runtime).run()
  File "/Users/joel.gongora/opt/anaconda3/envs/drum_corrected/lib/python3.7/site-packages/datarobot_drum/drum/drum.py", line 457, in run
    self._run_fit_or_predictions_pipelines_in_mlpiper()
  File "/Users/joel.gongora/opt/anaconda3/envs/drum_corrected/lib/python3.7/site-packages/datarobot_drum/drum/drum.py", line 753, in _run_fit_or_predictions_pipelines_in_mlpiper
    _pipeline_executor.run_pipeline(cleanup=False)
  File "/Users/joel.gongora/opt/anaconda3/envs/drum_corrected/lib/python3.7/site-packages/mlpiper/pipeline/executor.py", line 261, in run_pipeline
    self._run_pipeline()
  File "/Users/joel.gongora/opt/anaconda3/envs/drum_corrected/lib/python3.7/site-packages/mlpiper/pip

### Validate model can work as `Custom Training Model`

In [45]:
# Run DRUM's fit command against the code in ./custom_model, using the training
# CSV with `readmitted` as the binary target (labels True / False).
# NOTE(review): the output saved with this cell reports a predict-server failure:
# the transform hook received 45 features while the fitted ColumnTransformer
# expects 48. Presumably the columns reaching the hook differ from those used to
# fit preprocessing.pkl -- verify against custom.py.
!drum fit --code-dir ./custom_model --input ../data/readmissions_train.csv --target-type binary --target readmitted --positive-class-label True --negative-class-label False

Files were overwritten: {'/var/folders/nz/89yd1h313yn8t9lr6lw41zf80000gq/T/tmpwj6xao3u/model.pkl', '/var/folders/nz/89yd1h313yn8t9lr6lw41zf80000gq/T/tmpwj6xao3u/preprocessing.pkl'}

Failure in predict server: {"message":"ERROR: Model transform hook failed to transform dataset: X has 45 features, but ColumnTransformer is expecting 48 features as input."}

