# Load Dependencies

In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.4.1-py3-none-manylinux2010_x86_64.whl (166.7 MB)
[K     |████████████████████████████████| 166.7 MB 91 kB/s s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.4.1


In [4]:
import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb

In [5]:
print('Pandas version', pd.__version__)
print('Numpy version', np.__version__)
print('Scikit Learn version', sklearn.__version__)
print('XGBoost version', xgb.__version__)

Pandas version 1.1.3
Numpy version 1.19.2
Scikit Learn version 0.23.2
XGBoost version 1.4.1


# Load Dataset

In [187]:
df = pd.read_csv('../datasets/census.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [188]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


We can pre-load the dataset with specific values (e.g., ?) depicting missing values (NaNs)

In [344]:
df = pd.read_csv('../datasets/census.csv', na_values='?')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  31978 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [345]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


We remove the `fnlwgt` column to keep things simple

In [346]:
final_columns = set(df.columns) - set(['fnlwgt'])
final_columns = list(final_columns)
final_columns

['relationship',
 'marital.status',
 'workclass',
 'occupation',
 'race',
 'income',
 'age',
 'hours.per.week',
 'native.country',
 'sex',
 'education.num',
 'education',
 'capital.loss',
 'capital.gain']

In [347]:
df = df[final_columns]
df.head()

Unnamed: 0,relationship,marital.status,workclass,occupation,race,income,age,hours.per.week,native.country,sex,education.num,education,capital.loss,capital.gain
0,Not-in-family,Widowed,,,White,<=50K,90,40,United-States,Female,9,HS-grad,4356,0
1,Not-in-family,Widowed,Private,Exec-managerial,White,<=50K,82,18,United-States,Female,9,HS-grad,4356,0
2,Unmarried,Widowed,,,Black,<=50K,66,40,United-States,Female,10,Some-college,4356,0
3,Unmarried,Divorced,Private,Machine-op-inspct,White,<=50K,54,40,United-States,Female,4,7th-8th,3900,0
4,Own-child,Separated,Private,Prof-specialty,White,<=50K,41,40,United-States,Female,10,Some-college,3900,0


# Split Dataset into train and test Datsets

We split the dataset into a standard 70:30 train-test split using stratified sampling to keep the distributions of classes similar in train and test datasets

In [348]:
X = df.drop(columns=['income'])
y = df['income']

In [349]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train.shape, X_test.shape

((22792, 13), (9769, 13))

In [350]:
y_train.value_counts()

<=50K    17303
>50K      5489
Name: income, dtype: int64

In [351]:
y_test.value_counts()

<=50K    7417
>50K     2352
Name: income, dtype: int64

# Missing Values Imputation and Feature Encoding

Here we will perform the following strategy for missing value imputation and feature encoding

- Separate features into numeric and categorical from the training data
- Fill missing values in categorical data with 'Not Available' given they can't be guessed
- Perform one-hot encoding of the categorical data to get dummy-encoded features
- Fill missing values in numeric data using a K-nearest neighbors model
- Combine the one-hot encoded categorical features and the numeric features to form the final featureset


- Apply similar transformations on the test data

In [352]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22792 entries, 24582 to 7468
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   relationship    22792 non-null  object
 1   marital.status  22792 non-null  object
 2   workclass       21497 non-null  object
 3   occupation      21493 non-null  object
 4   race            22792 non-null  object
 5   age             22792 non-null  int64 
 6   hours.per.week  22792 non-null  int64 
 7   native.country  22388 non-null  object
 8   sex             22792 non-null  object
 9   education.num   22792 non-null  int64 
 10  education       22792 non-null  object
 11  capital.loss    22792 non-null  int64 
 12  capital.gain    22792 non-null  int64 
dtypes: int64(5), object(8)
memory usage: 2.4+ MB


In [353]:
X_train_cat = X_train.select_dtypes(include='object')
X_train_num = X_train.select_dtypes(exclude='object')

In [354]:
X_train_cat.head()

Unnamed: 0,relationship,marital.status,workclass,occupation,race,native.country,sex,education
24582,Own-child,Never-married,Private,Sales,White,United-States,Male,HS-grad
10298,Own-child,Never-married,Self-emp-not-inc,Farming-fishing,White,United-States,Male,7th-8th
16626,Own-child,Never-married,Private,Handlers-cleaners,White,United-States,Male,Some-college
11039,Not-in-family,Widowed,Private,Exec-managerial,White,United-States,Female,Some-college
2535,Husband,Married-civ-spouse,Self-emp-inc,Exec-managerial,White,United-States,Male,Some-college


In [355]:
X_train_num.head()

Unnamed: 0,age,hours.per.week,education.num,capital.loss,capital.gain
24582,28,48,9,0,0
10298,24,50,4,0,0
16626,20,35,10,0,0
11039,70,12,10,0,0
2535,44,45,10,0,7688


## Impute missing data in Categorical Features

In [356]:
X_train_cat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22792 entries, 24582 to 7468
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   relationship    22792 non-null  object
 1   marital.status  22792 non-null  object
 2   workclass       21497 non-null  object
 3   occupation      21493 non-null  object
 4   race            22792 non-null  object
 5   native.country  22388 non-null  object
 6   sex             22792 non-null  object
 7   education       22792 non-null  object
dtypes: object(8)
memory usage: 1.6+ MB


In [357]:
from sklearn.impute import SimpleImputer

cat_cols = X_train_cat.columns
cat_imputer = SimpleImputer(strategy='constant', fill_value='Not Available', verbose=1)
X_train_cat = pd.DataFrame(cat_imputer.fit_transform(X_train_cat), columns=cat_cols)

## One-hot encode Categorical Features

In [358]:
from sklearn.preprocessing import OneHotEncoder

cat_dummy_encoder = OneHotEncoder(handle_unknown='ignore')
X_train_cat_ohe = cat_dummy_encoder.fit_transform(X_train_cat).toarray()
X_train_cat_ohe, cat_dummy_encoder.get_feature_names(cat_cols)

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 1.],
        ...,
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 1.]]),
 array(['relationship_Husband', 'relationship_Not-in-family',
        'relationship_Other-relative', 'relationship_Own-child',
        'relationship_Unmarried', 'relationship_Wife',
        'marital.status_Divorced', 'marital.status_Married-AF-spouse',
        'marital.status_Married-civ-spouse',
        'marital.status_Married-spouse-absent',
        'marital.status_Never-married', 'marital.status_Separated',
        'marital.status_Widowed', 'workclass_Federal-gov',
        'workclass_Local-gov', 'workclass_Never-worked',
        'workclass_Not Available', 'workclass_Private',
        'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
        'workclass_State-gov', 'workclass_Without-pay',
        'occupation_Adm-clerical', 'occupatio

In [359]:
cat_ohe_cols = cat_dummy_encoder.get_feature_names(cat_cols)
X_train_cat_ohe = pd.DataFrame(X_train_cat_ohe, 
                               columns=cat_ohe_cols)

## Impute missing data in Numeric Features

In [360]:
from sklearn.impute import KNNImputer

num_cols = X_train_num.columns
knn_imp = KNNImputer(n_neighbors=5)
X_train_num = pd.DataFrame(knn_imp.fit_transform(X_train_num), columns=num_cols)

## Combine Numeric and encoded Categorical Features

In [361]:
X_train = pd.concat([X_train_num, X_train_cat_ohe], axis=1)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22792 entries, 0 to 22791
Columns: 106 entries, age to education_Some-college
dtypes: float64(106)
memory usage: 18.4 MB


In [362]:
X_train.head(3)

Unnamed: 0,age,hours.per.week,education.num,capital.loss,capital.gain,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,...,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college
0,28.0,48.0,9.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,24.0,50.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20.0,35.0,10.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [363]:
final_dataset_columns = X_train.columns.values

In [364]:
final_dataset_columns

array(['age', 'hours.per.week', 'education.num', 'capital.loss',
       'capital.gain', 'relationship_Husband',
       'relationship_Not-in-family', 'relationship_Other-relative',
       'relationship_Own-child', 'relationship_Unmarried',
       'relationship_Wife', 'marital.status_Divorced',
       'marital.status_Married-AF-spouse',
       'marital.status_Married-civ-spouse',
       'marital.status_Married-spouse-absent',
       'marital.status_Never-married', 'marital.status_Separated',
       'marital.status_Widowed', 'workclass_Federal-gov',
       'workclass_Local-gov', 'workclass_Never-worked',
       'workclass_Not Available', 'workclass_Private',
       'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
       'workclass_State-gov', 'workclass_Without-pay',
       'occupation_Adm-clerical', 'occupation_Armed-Forces',
       'occupation_Craft-repair', 'occupation_Exec-managerial',
       'occupation_Farming-fishing', 'occupation_Handlers-cleaners',
       'occupation_Machi

## Apply imputation and feature encoding transformations on Test data

In [365]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9769 entries, 2168 to 4293
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   relationship    9769 non-null   object
 1   marital.status  9769 non-null   object
 2   workclass       9228 non-null   object
 3   occupation      9225 non-null   object
 4   race            9769 non-null   object
 5   age             9769 non-null   int64 
 6   hours.per.week  9769 non-null   int64 
 7   native.country  9590 non-null   object
 8   sex             9769 non-null   object
 9   education.num   9769 non-null   int64 
 10  education       9769 non-null   object
 11  capital.loss    9769 non-null   int64 
 12  capital.gain    9769 non-null   int64 
dtypes: int64(5), object(8)
memory usage: 1.0+ MB


In [366]:
X_test_cat = X_test[cat_cols]
X_test_num = X_test[num_cols]

In [367]:
X_test_cat = pd.DataFrame(cat_imputer.transform(X_test_cat), columns=cat_cols)
X_test_cat_ohe = cat_dummy_encoder.transform(X_test_cat).toarray()
X_test_cat_ohe = pd.DataFrame(X_test_cat_ohe, 
                               columns=cat_ohe_cols)

X_test_num = pd.DataFrame(knn_imp.transform(X_test_num), columns=num_cols)

X_test = pd.concat([X_test_num, X_test_cat_ohe], axis=1)
X_test = X_test[final_dataset_columns]
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9769 entries, 0 to 9768
Columns: 106 entries, age to education_Some-college
dtypes: float64(106)
memory usage: 7.9 MB


In [368]:
X_test.head(3)

Unnamed: 0,age,hours.per.week,education.num,capital.loss,capital.gain,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,...,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college
0,51.0,50.0,13.0,0.0,14084.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,48.0,20.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,65.0,30.0,9.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Train and Tune ML Model

Here we stick to just using a state-of-the-art XGBoost Classifier though you are welcome to try more models. Do note we do NOT involve the test dataset at all during the training or tuning process

## Simple 5-fold Cross Validation

In [60]:
from sklearn.model_selection import cross_val_score

xgc = xgb.XGBClassifier()
cross_val_score(xgc, X_train, y_train, cv=5)









array([0.87343716, 0.87255977, 0.87209302, 0.87823607, 0.85870996])

## Train and Tune XGBoost Classifier with Randomized Search + K-fold

In [79]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200, 1500],
    'learning_rate': stats.uniform(0.01, 0.3).rvs(15),
    'max_depth': [3, 5, 7, 9, 15, None]             
}

def f1_scorer(clf, X, y):
    y_pred = clf.predict(X)
    return f1_score(y, y_pred, pos_label='>50K')
    
xgc = xgb.XGBClassifier()

clf = RandomizedSearchCV(estimator=xgc, 
                         param_distributions=param_grid,
                         cv= 5,  
                         n_iter=15, 
                         scoring = f1_scorer, 
                         verbose = 3, 
                         n_jobs = -1)

search = clf.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed: 39.2min finished




In [85]:
pd.set_option('max_colwidth', None)
results = pd.DataFrame(search.cv_results_)
results = results[['params', 'rank_test_score', 'mean_test_score', 'std_test_score', 
                   'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score']].sort_values(by=['rank_test_score'])
results

Unnamed: 0,params,rank_test_score,mean_test_score,std_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
13,"{'n_estimators': 600, 'max_depth': 3, 'learning_rate': 0.12188470309348745}",1,0.714561,0.012794,0.725394,0.713217,0.709904,0.730392,0.693898
4,"{'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.16902596088334138}",2,0.713772,0.016112,0.725767,0.714004,0.711814,0.732064,0.685212
11,"{'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.15395564899591388}",3,0.713389,0.016381,0.724878,0.710957,0.71633,0.731088,0.683694
9,"{'n_estimators': 1200, 'max_depth': 5, 'learning_rate': 0.01136783306512155}",4,0.712899,0.009393,0.72167,0.708144,0.709904,0.725275,0.699502
0,"{'n_estimators': 400, 'max_depth': None, 'learning_rate': 0.1510725187028566}",5,0.712244,0.016468,0.72128,0.713163,0.714567,0.730527,0.681685
2,"{'n_estimators': 1000, 'max_depth': 3, 'learning_rate': 0.12188470309348745}",6,0.71203,0.013671,0.719882,0.709806,0.714499,0.728341,0.687623
12,"{'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.21729298730485747}",7,0.707518,0.018718,0.723754,0.703795,0.701928,0.730695,0.677419
6,"{'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.2944068816754488}",8,0.70652,0.016961,0.721629,0.709804,0.704958,0.721075,0.675134
1,"{'n_estimators': 400, 'max_depth': 9, 'learning_rate': 0.12188470309348745}",9,0.704709,0.015993,0.723754,0.704102,0.701172,0.717162,0.677357
10,"{'n_estimators': 1500, 'max_depth': None, 'learning_rate': 0.15395564899591388}",10,0.699198,0.018552,0.717726,0.695946,0.697203,0.717803,0.66731


In [86]:
search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.12188470309348745, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=600, n_jobs=2, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Train and futher fine-tune XGBoost Classifier with Grid Search + K-fold

In [90]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [550, 600, 650],
    'learning_rate': [0.10, 0.12, 0.13],
    'max_depth': [3, 5, 7]             
}

    
xgc = xgb.XGBClassifier()

clf = GridSearchCV(estimator=xgc, 
                   param_grid=param_grid,
                   cv= 5,  
                   scoring = f1_scorer, 
                   verbose = 3, 
                   n_jobs = -1)

search2 = clf.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed: 42.8min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed: 48.0min finished




While running the above tuning processes, it is definitely memory and compute intensive and you can see the debug outputs on your terminal. Sample output is depicted in the screengrab below.

![](https://i.imgur.com/dhE0Zmj.png)

In [91]:
results = pd.DataFrame(search2.cv_results_)
results = results[['params', 'rank_test_score', 'mean_test_score', 'std_test_score', 
                   'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score']].sort_values(by=['rank_test_score'])
results

Unnamed: 0,params,rank_test_score,mean_test_score,std_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
0,"{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 550}",1,0.716646,0.013763,0.728713,0.711644,0.715285,0.733268,0.694321
2,"{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 650}",2,0.71635,0.013634,0.729423,0.712288,0.713147,0.732353,0.69454
19,"{'learning_rate': 0.13, 'max_depth': 3, 'n_estimators': 600}",3,0.71616,0.014316,0.729146,0.711577,0.712425,0.733954,0.693698
18,"{'learning_rate': 0.13, 'max_depth': 3, 'n_estimators': 550}",4,0.715689,0.014987,0.73075,0.712425,0.710777,0.732712,0.691781
9,"{'learning_rate': 0.12, 'max_depth': 3, 'n_estimators': 550}",5,0.715433,0.012113,0.72772,0.713643,0.708963,0.72969,0.697148
1,"{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 600}",6,0.715136,0.013155,0.726733,0.710934,0.713573,0.730845,0.693596
11,"{'learning_rate': 0.12, 'max_depth': 3, 'n_estimators': 650}",7,0.714854,0.011437,0.724595,0.714571,0.711111,0.728256,0.695737
20,"{'learning_rate': 0.13, 'max_depth': 3, 'n_estimators': 650}",8,0.714509,0.013378,0.725672,0.712575,0.712425,0.730128,0.691744
10,"{'learning_rate': 0.12, 'max_depth': 3, 'n_estimators': 600}",9,0.714305,0.012803,0.726115,0.714286,0.707071,0.729597,0.694458
3,"{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 550}",10,0.712661,0.016513,0.724004,0.708105,0.711814,0.73414,0.685239


In [92]:
search2.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=550, n_jobs=2, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Evaluate Best Models performance on Test Data

In [369]:
from sklearn.metrics import classification_report, confusion_matrix

model1 = search.best_estimator_
class_labels = model1.classes_

model1.fit(X_train, y_train)

predictions = model1.predict(X_test)
print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions), columns=class_labels, index=class_labels)







              precision    recall  f1-score   support

       <=50K       0.90      0.94      0.92      7417
        >50K       0.77      0.66      0.71      2352

    accuracy                           0.87      9769
   macro avg       0.84      0.80      0.82      9769
weighted avg       0.87      0.87      0.87      9769



Unnamed: 0,<=50K,>50K
<=50K,6960,457
>50K,792,1560


In [293]:
model2 = search2.best_estimator_
class_labels = model2.classes_

model2.fit(X_train, y_train)

predictions = model2.predict(X_test)
print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions), columns=class_labels, index=class_labels)







              precision    recall  f1-score   support

       <=50K       0.90      0.94      0.92      7417
        >50K       0.77      0.66      0.71      2352

    accuracy                           0.87      9769
   macro avg       0.83      0.80      0.81      9769
weighted avg       0.87      0.87      0.87      9769



Unnamed: 0,<=50K,>50K
<=50K,6958,459
>50K,811,1541


# Finalize Best Model and ML Model Artifacts

To build a ML inference pipeline we need to starting thinking beyond notebooks. Hence we need to persist our best trained models along with all other artifacts like feature names, encoders, imputers, vectorizers and so on.

In [294]:
final_model = model1

In [370]:
model_artifacts = {
    'dummy_encoder': cat_dummy_encoder,
    'cat_init_features': cat_cols.values,
    'num_init_features': num_cols.values,
    'cat_ohe_features': cat_ohe_cols,
    'cat_imputer': cat_imputer,
    'num_imputer': knn_imp,
    'xgb_model': final_model,
    'column_names_order': final_dataset_columns
}

In [371]:
model_artifacts

{'dummy_encoder': OneHotEncoder(handle_unknown='ignore'),
 'cat_init_features': array(['relationship', 'marital.status', 'workclass', 'occupation',
        'race', 'native.country', 'sex', 'education'], dtype=object),
 'num_init_features': array(['age', 'hours.per.week', 'education.num', 'capital.loss',
        'capital.gain'], dtype=object),
 'cat_ohe_features': array(['relationship_Husband', 'relationship_Not-in-family',
        'relationship_Other-relative', 'relationship_Own-child',
        'relationship_Unmarried', 'relationship_Wife',
        'marital.status_Divorced', 'marital.status_Married-AF-spouse',
        'marital.status_Married-civ-spouse',
        'marital.status_Married-spouse-absent',
        'marital.status_Never-married', 'marital.status_Separated',
        'marital.status_Widowed', 'workclass_Federal-gov',
        'workclass_Local-gov', 'workclass_Never-worked',
        'workclass_Not Available', 'workclass_Private',
        'workclass_Self-emp-inc', 'workclass_Self

# Save ML Model Artifacts

In [103]:
!pip install dill

Collecting dill
  Downloading dill-0.3.3-py2.py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 9.4 MB/s  eta 0:00:01
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.3


In [372]:
import dill

In [105]:
!pwd

/home/ec2-user/ml_model_deployment_example/notebooks


In [106]:
!mkdir ../ml_app/saved_models

In [373]:
with open("../ml_app/saved_models/census_xgb_artifacts.pkl", "wb") as dill_file:
    dill.dump(model_artifacts, dill_file)

In [374]:
!tree ../

[01;34m../[00m
├── [01;34mdatasets[00m
│   └── census.csv
├── LICENSE
├── [01;34mml_app[00m
│   └── [01;34msaved_models[00m
│       └── census_xgb_artifacts.pkl
├── [01;34mnotebooks[00m
│   └── ML\ Prototyping\ and\ Exploration.ipynb
└── README.md

4 directories, 5 files


# Try out a test inference pipeline with our test dataset

## Load Data

In [375]:
df = pd.read_csv('../datasets/census.csv', na_values='?')
final_columns = set(df.columns) - set(['fnlwgt'])
final_columns = list(final_columns)
df = df[final_columns]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [376]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9769 entries, 2168 to 4293
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   relationship    9769 non-null   object
 1   marital.status  9769 non-null   object
 2   workclass       9228 non-null   object
 3   occupation      9225 non-null   object
 4   race            9769 non-null   object
 5   age             9769 non-null   int64 
 6   hours.per.week  9769 non-null   int64 
 7   native.country  9590 non-null   object
 8   sex             9769 non-null   object
 9   education.num   9769 non-null   int64 
 10  education       9769 non-null   object
 11  capital.loss    9769 non-null   int64 
 12  capital.gain    9769 non-null   int64 
dtypes: int64(5), object(8)
memory usage: 1.0+ MB


## Load Saved Model Artifacts

In [377]:
with open("../ml_app/saved_models/census_xgb_artifacts.pkl", "rb") as dill_infile:
    xgb_model_artifacts = dill.load(dill_infile)

In [378]:
xgb_model_artifacts

{'dummy_encoder': OneHotEncoder(handle_unknown='ignore'),
 'cat_init_features': array(['relationship', 'marital.status', 'workclass', 'occupation',
        'race', 'native.country', 'sex', 'education'], dtype=object),
 'num_init_features': array(['age', 'hours.per.week', 'education.num', 'capital.loss',
        'capital.gain'], dtype=object),
 'cat_ohe_features': array(['relationship_Husband', 'relationship_Not-in-family',
        'relationship_Other-relative', 'relationship_Own-child',
        'relationship_Unmarried', 'relationship_Wife',
        'marital.status_Divorced', 'marital.status_Married-AF-spouse',
        'marital.status_Married-civ-spouse',
        'marital.status_Married-spouse-absent',
        'marital.status_Never-married', 'marital.status_Separated',
        'marital.status_Widowed', 'workclass_Federal-gov',
        'workclass_Local-gov', 'workclass_Never-worked',
        'workclass_Not Available', 'workclass_Private',
        'workclass_Self-emp-inc', 'workclass_Self

In [379]:
categorical_features_init = xgb_model_artifacts['cat_init_features']
categorical_features_ohe = xgb_model_artifacts['cat_ohe_features']
numeric_features_init = xgb_model_artifacts['num_init_features']
column_names = xgb_model_artifacts['column_names_order']

categorical_imputer = xgb_model_artifacts['cat_imputer']
categorical_ohe = xgb_model_artifacts['dummy_encoder']

numeric_imputer = xgb_model_artifacts['num_imputer']

ml_model = xgb_model_artifacts['xgb_model']

## Pre-processing and Feature Encoding

In [380]:
X_test_cat = X_test[categorical_features_init]
X_test_num = X_test[numeric_features_init]

X_test_cat = pd.DataFrame(categorical_imputer.transform(X_test_cat), 
                          columns=categorical_features_init)
X_test_cat_ohe = categorical_ohe.transform(X_test_cat).toarray()
X_test_cat_ohe = pd.DataFrame(X_test_cat_ohe, 
                               columns=categorical_features_ohe)

X_test_num = pd.DataFrame(numeric_imputer.transform(X_test_num), columns=numeric_features_init)

X_test = pd.concat([X_test_num, X_test_cat_ohe], axis=1)
X_test = X_test[column_names]
X_test.head(3)

Unnamed: 0,age,hours.per.week,education.num,capital.loss,capital.gain,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,...,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college
0,51.0,50.0,13.0,0.0,14084.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,48.0,20.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,65.0,30.0,9.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## ML Model Inference

In [381]:
predictions = ml_model.predict(X_test)
predictions



array(['>50K', '<=50K', '<=50K', ..., '<=50K', '>50K', '<=50K'],
      dtype=object)

In [382]:
labels = ml_model.classes_

print(classification_report(y_test, predictions))
pd.DataFrame(confusion_matrix(y_test, predictions), columns=labels, index=labels)

              precision    recall  f1-score   support

       <=50K       0.90      0.94      0.92      7417
        >50K       0.77      0.66      0.71      2352

    accuracy                           0.87      9769
   macro avg       0.84      0.80      0.82      9769
weighted avg       0.87      0.87      0.87      9769



Unnamed: 0,<=50K,>50K
<=50K,6960,457
>50K,792,1560
