In [0]:
# IMPORT GENERIC PACKAGES
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

sns.set(rc={'figure.figsize':(20,10)})
#from sklearn import datasets
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score 
from sklearn import metrics
import pandas_profiling as pp

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [0]:
# Load the census table from the comma-separated file next to the notebook.
data = pd.read_csv("census.csv", sep=",")

In [0]:
#report = pp.ProfileReport(data)
#report.to_file('data_assessment.html')
#report

In [0]:
# Plot the education label against its numeric encoding.
sns.scatterplot(data=data, x="education_level", y="education-num")

<matplotlib.axes._subplots.AxesSubplot at 0x7f1fb6c03e10>

In [0]:
# Drop capital-gain / capital-loss (almost 90% zeros) and education_level,
# then replace the numeric age column with a 10-year age bracket.
bins = [10, 20, 30, 40, 50, 60, 70, 80, 90]
age_group = ['10-20','20-30','30-40','40-50','50-60', '60-70','70-80','80-90']

# Guard so the cell can be re-run in the same kernel: `data` is reassigned
# below, so once the columns are gone a second execution would raise KeyError.
if 'age' in data.columns:
  data = data.drop(columns=['capital-gain', 'capital-loss','education_level'])
  # pd.cut uses right-closed intervals by default, e.g. (10, 20] -> '10-20'.
  age_categories = pd.cut(data['age'], bins, labels=age_group)
  data['age_category'] = age_categories
  data = data.drop(columns='age')

In [0]:
# Display the dtype of every column currently in `data`.
data.dtypes

workclass           object
education-num      float64
marital-status      object
occupation          object
relationship        object
race                object
sex                 object
hours-per-week     float64
native-country      object
income              object
age_category      category
dtype: object

In [0]:
# Build the model inputs: one-hot encode the categorical columns, keep the
# numeric columns unchanged, and derive a boolean label from the target column.
target = "income"
cat_names = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'age_category'] 
cont_names = ['education-num', 'hours-per-week']

# astype(str) turns every categorical column (including the category-dtype
# age_category) into plain strings before they are dummy-encoded.
df_cat = pd.get_dummies(data[cat_names].astype(str))
df_cont = data[cont_names]
X = pd.concat([df_cat, df_cont], axis = 1)

# Use the declared target name instead of repeating the column literal.
# True marks rows whose income is '>50K'.
y = data[target] == '>50K'
X.shape

(45222, 92)

In [0]:
def feat_func(fitted_selector=None, feature_names=None):
  """Return the names of the features a fitted selector ranked 1 (kept).

  Parameters
  ----------
  fitted_selector : object with a ``ranking_`` attribute, optional
      A fitted selector (e.g. RFE). Defaults to the notebook-level
      ``selector`` so existing ``feat_func()`` calls keep working.
  feature_names : iterable of str, optional
      Names aligned with ``ranking_``. Defaults to ``X.columns``.

  Returns
  -------
  list
      Feature names whose rank equals 1, in their original order.

  Raises
  ------
  ValueError
      If the number of names differs from the number of ranks.
  """
  # Fall back to the notebook globals only when nothing is passed explicitly,
  # so the hidden dependency on `selector` / `X` becomes optional.
  if fitted_selector is None:
    fitted_selector = selector
  if feature_names is None:
    feature_names = X.columns

  names = list(feature_names)
  ranks = list(fitted_selector.ranking_)
  if len(names) != len(ranks):
    raise ValueError(
        "got %d feature names but %d ranks" % (len(names), len(ranks)))

  # Rank 1 marks the features the selector retained.
  return [name for name, rank in zip(names, ranks) if rank == 1]

In [0]:
# XGBoost with RFE
# Recursive feature elimination: remove one feature per iteration (step=1)
# until 80 remain, using a default XGBoost classifier as the estimator.
estimator = xgb.XGBClassifier()
# n_features_to_select is passed by keyword: the positional form is rejected
# by current scikit-learn releases, where the argument is keyword-only.
selector = RFE(estimator, n_features_to_select=80, step=1)
selector = selector.fit(X, y)
selector.ranking_

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       10,  1,  1,  1,  1,  1,  1,  3,  5,  7,  1, 13, 12,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
        1,  6,  1,  9, 11,  1,  1,  8,  4,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1])

In [0]:
# Keep only the columns retained by the XGBoost-based selector.
feat = feat_func()
X_Xgboost = X.loc[:, feat]

In [0]:
# Hold out 40% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_Xgboost,
    y,
    test_size=0.4,
    random_state=42,
)

In [0]:
# Fit a default XGBoost classifier on the training split and predict the test split.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions against the held-out labels.
accuracy = metrics.accuracy_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)

# Report each metric as a percentage.
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))

Accuracy: 83.69%
Recall: 56.75%
Precision: 71.14%


In [0]:
# Random Forest with RFE
# Recursive feature elimination: remove one feature per iteration (step=1)
# until 75 remain, using a random forest as the estimator.
# The forest is seeded so the resulting ranking is the same on every run.
estimator = RandomForestClassifier(random_state=0)
# n_features_to_select is passed by keyword: the positional form is rejected
# by current scikit-learn releases, where the argument is keyword-only.
selector = RFE(estimator, n_features_to_select=75, step=1)
selector = selector.fit(X, y)
selector.ranking_

array([ 1,  1,  1,  1,  1,  1,  7,  1,  1,  1,  1,  1,  1,  1,  1, 15,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  3,  1,  1,  1,
        1,  1,  8,  4, 18, 17, 11,  6,  1,  1,  1,  1,  1,  1, 14,  1,  9,
       16,  5,  1,  1,  1,  1, 13,  1,  1, 12, 10,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1])

In [0]:
# Keep only the columns retained by the random-forest-based selector.
feat = feat_func()
X_RF = X.loc[:, feat]

In [0]:
# Hold out 40% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_RF,
    y,
    test_size=0.4,
    random_state=42,
)

In [0]:
# Fit a seeded random forest on the training split and predict the test split.
model = RandomForestClassifier(max_depth=100, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions against the held-out labels.
accuracy = metrics.accuracy_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)

# Report each metric as a percentage.
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))

Accuracy: 81.30%
Recall: 56.12%
Precision: 63.64%


In [0]:
# XGBoost with HPO
# Split the full feature matrix (no RFE subset): 40% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [0]:
# Candidate values sampled by the randomized hyper-parameter search.
params={
    "learning_rate"     :[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth"         :[3,4,5,6,8,10,12,15],
    "min_child_weight"  :[1,3,5,7],
    "gamma"             :[0.0,0.1,0.2,0.3,0.4],
    "colsample_bytree"  :[0.3,0.4,0.5,0.7]
}

model = xgb.XGBClassifier()
# Try 5 sampled configurations with 5-fold CV, scored by ROC AUC.
# random_state fixes which configurations are sampled, so the selected
# best_params_ are the same on every run of the notebook.
random_search = RandomizedSearchCV(
    model,
    param_distributions=params,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=42,
)
random_search.fit(X_train,y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  2.9min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=0...
                                           seed=None, silent=None, subsample=1,
                                           verbosity=1),
                   iid='warn', n_iter=5, n_jobs=-1,
                   param_distribu

In [0]:
# Display the best estimator selected by the randomized search.
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, gamma=0.4,
              learning_rate=0.15, max_delta_step=0, max_depth=12,
              min_child_weight=7, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
# Display the hyper-parameter values of the best candidate.
random_search.best_params_

{'colsample_bytree': 0.4,
 'gamma': 0.4,
 'learning_rate': 0.15,
 'max_depth': 12,
 'min_child_weight': 7}

In [0]:
# RandomizedSearchCV refits the best configuration on all of X_train by
# default (refit=True), so best_estimator_ is already trained; reuse it
# instead of fitting the same model a second time.
model = random_search.best_estimator_
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, gamma=0.4,
              learning_rate=0.15, max_delta_step=0, max_depth=12,
              min_child_weight=7, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
# Predict the held-out rows with the tuned model.
y_pred = model.predict(X_test)

# Score the predictions against the held-out labels.
accuracy = metrics.accuracy_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)

# Report each metric as a percentage.
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))

Accuracy: 83.80%
Recall: 59.35%
Precision: 70.24%


In [0]:
# 10-fold cross-validation of the tuned model on the training split.
score = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)

In [0]:
# Show the per-fold scores followed by their average.
print(score)
print(np.mean(score))

[0.82645542 0.83271923 0.84450995 0.82498158 0.84156227 0.83265757
 0.82860302 0.83744932 0.83849558 0.83702065]
0.8344454584511276


In [0]:
# XGBoost parameter tuning reference
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
# https://www.youtube.com/watch?v=9HomdnM12o4