# From Model to BentoML

**Dataset**

In [1]:
#!wget https://github.com/gastonstat/CreditScoring/raw/master/CreditScoring.csv --quiet

**Libraries**

In [2]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_selector as selector
import xgboost as xgb 

  from pandas import MultiIndex, Int64Index


In [3]:
df = pd.read_csv('CreditScoring.csv')

In [4]:
# make all columns in lower
df.columns = df.columns.str.lower()

In [5]:
# mapping the target
df['status'] = df['status'].map({
    1:'ok',
    2:'default',
    0:'unk'
})

In [6]:
# mapping categorical features
def mapping_categorical(df, cat, cat_lst):
  to_lst = df[cat].value_counts().sort_index().index.to_list()
  cat_lst = cat_lst

  df[cat] = (
      df[cat].map({
          k:v for (k,v) in zip(to_lst, cat_lst)
      })
   )

cols = ['home', 'marital', 'records', 'job']

home_lst = ['unk', 'rent', 'owner', 'private', 'ignore', 'parents', 'other']
marital_lst = ['unk', 'single', 'married', 'widow', 'separated', 'divorced']
records_lst = ['no', 'yes', 'unk']
job_lst = ['unk', 'fixed', 'partime', 'freelance', 'others']
cat_lst = [home_lst, marital_lst, records_lst, job_lst]

for col, cat in zip(cols, cat_lst):
  mapping_categorical(df, col, cat)

In [7]:
# fix missing values
def fix_missing_values(df, val_to_rep, rep, *f_lst):
  for f in f_lst:
    df[f] = df[f].replace(val_to_rep, rep)

fix_missing_values(df, 99999999.0, np.nan, ['income', 'assets', 'debt'])

In [8]:
# don't nees unk in status
df = df[df.status != 'unk']

In [9]:
# data preparation
data, target = df.drop(columns=['status']), df['status'].map({'ok':0, 'default':1})

def tweaking(data, target):
  numerical = selector(dtype_include=np.number)(data)
  categorical = selector(dtype_include=object)(data)

  num_imputer = SimpleImputer(missing_values=np.NaN, strategy='constant', fill_value=0)
  cat_imputer = SimpleImputer(strategy='most_frequent', fill_value='unk')
  cat_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
  
  X_full_train, X_test, y_full_train, y_test = model_selection.train_test_split(
      data,
      target,
      test_size=.2,
      random_state=11,
    )
  X_train, X_dev, y_train, y_dev = model_selection.train_test_split(
          X_full_train,
          y_full_train,
          test_size=.25,
          random_state=11,
        )


  X_train.loc[:, numerical] = num_imputer.fit_transform(X_train[numerical])
  X_train.loc[:, categorical] = cat_imputer.fit_transform(X_train[categorical])
  X_train.loc[:, categorical] = cat_encoder.fit_transform(X_train[categorical])

  X_dev.loc[:, numerical] = num_imputer.transform(X_dev[numerical])
  X_dev.loc[:, categorical] = cat_imputer.transform(X_dev[categorical])
  X_dev.loc[:, categorical] = cat_encoder.transform(X_dev[categorical])

  X_test.loc[:, numerical] = num_imputer.transform(X_test[numerical])
  X_test.loc[:, categorical] = cat_imputer.transform(X_test[categorical])
  X_test.loc[:, categorical] = cat_encoder.transform(X_test[categorical])

  return X_train, y_train, X_dev, y_dev, X_test, y_test

In [10]:
X_train, y_train, X_dev, y_dev, X_test, y_test = tweaking(data, target)

In [11]:
# wrap data into DMatrix — a special
# data structure for finding splits efficiently.
dtrain = xgb.DMatrix(
    X_train.values, 
    label=y_train.values, 
    feature_names=X_train.columns
)

# for validation
dval = xgb.DMatrix(
    X_dev.values,
    label=y_dev.values,
    feature_names=X_dev.columns
)

# specifying the parameters for training
xgb_params = {
    'eta':.3,
    'max_depth':6,
    'min_child_weight': 1,
    'objective': 'binary:logistic',
    'nthread': -1,
    'seed': 1,
    'silent':1
}

# For training an XGBoost model, we use the train function
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=10
)

y_pred = model.predict(dval)
metrics.roc_auc_score(y_dev, y_pred)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




0.8121897023564457

In [12]:
watchlist = [(dtrain, 'train'), (dval, 'dev')]

xgb_params = {
    'eta':.05,
    'max_depth':3,
    'min_child_weight': 30,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': -1,
    'seed': 1,
    'silent':1
}

model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=500,
    evals=watchlist,
    verbose_eval=10
)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-auc:0.76600	dev-auc:0.73459
[10]	train-auc:0.82062	dev-auc:0.76979
[20]	train-auc:0.83677	dev-auc:0.78571
[30]	train-auc:0.84640	dev-auc:0.79379
[40]	train-auc:0.85774	dev-auc:0.80618
[50]	train-auc:0.86499	dev-auc:0.81203
[60]	train-auc:0.86914	dev-auc:0.81505
[70]	train-auc:0.87350	dev-auc:0.81941
[80]	train-auc:0.87660	dev-auc:0.82213
[90]	train-auc:0.87878	dev-auc:0.82233
[100]	train-auc:0.88088	dev-auc:0.82450
[110]	train-auc:0.88243	dev-auc:0.82651
[120]	train-auc:0.88379	dev-auc:0.82784
[130]	train-auc:0.88528	dev-auc:0.82841
[140]	train-auc:0.88664	dev-auc:0.82930
[150]	train-auc:0.88783	dev-auc:0.82986
[160]	train-auc:0.88901	dev-auc:0.83080
[170]	train-auc:0.

In [13]:
dtest = xgb.DMatrix(
    X_test.values,
    label=y_test.values, 
    feature_names=X_test.columns
)

y_pred_dev = model.predict(dval)
y_pred_test = model.predict(dtest)

print(f"AUC-dev: {metrics.roc_auc_score(y_dev, y_pred_dev): .3f}")
print(f"AUC-test: {metrics.roc_auc_score(y_test, y_pred_test): .3f}")

AUC-dev:  0.833
AUC-test:  0.820


In [14]:
X_full_train = pd.concat([X_train, X_dev])
y_full_train = pd.concat([y_train, y_dev])

dfulltrain = xgb.DMatrix(
    X_full_train.values, 
    label=y_full_train.values, 
    feature_names=X_full_train.columns
)

xgb_params = {
    'eta': 0.1, 
    'max_depth': 3,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dfulltrain, num_boost_round=175)

y_pred = model.predict(dtest)
metrics.roc_auc_score(y_test, y_pred)

0.8329749279116367

In [18]:
import bentoml

bentoml.xgboost.save_model("credit_risk_model", model)

ModuleNotFoundError: No module named 'bentoml'