In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV,train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost.sklearn import XGBClassifier

In [2]:
file=r'E:/Python_data Science/data/census_income.csv'

In [3]:
train=pd.read_csv(file)

In [4]:
train.shape

(32561, 15)

In [5]:
train.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
Y                 object
dtype: object

In [6]:
train['Y'].value_counts()

 <=50K    24720
 >50K      7841
Name: Y, dtype: int64

In [7]:
train['Y']=(train['Y']==' >50K').astype(int)

In [8]:
train.drop('education',axis=1,inplace=True)

In [9]:
cat_cols=train.select_dtypes(['object']).columns

In [10]:
cat_cols

Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object')

In [11]:
for col in cat_cols:
    k=train[col].value_counts()
    cats=k.index[k>300][:-1]
    for cat in cats:
        name=col+"_"+str(cat)
        train[name]=(train[col]==cat).astype(int)
    del train[col]

In [12]:
train1,test=train_test_split(train,test_size=0.2,random_state=2)

In [13]:
train1.reset_index(drop=True,inplace=True)

In [14]:
test.reset_index(drop=True,inplace=True)

In [15]:
x_train=train1.drop('Y',axis=1)
y_train=train1['Y']

x_test=test.drop('Y',axis=1)
y_test=test['Y']

In [18]:
gbm_params={'n_estimators':[50,100,200,500],           
            'learning_rate': [0.01,.05,0.1,0.4,0.8,1],            
            'max_depth':[1,2,3,4,5,6],            
            'subsample':[0.5,0.8,1],            
            'max_features':[0.1,0.3,0.5,0.8,1]}


In [21]:
from sklearn.ensemble import GradientBoostingClassifier 

In [22]:
model = GradientBoostingClassifier()

In [23]:
random_search=RandomizedSearchCV(model,scoring='roc_auc',param_distributions=gbm_params,cv=10,n_iter=10,n_jobs=-1,verbose=False)
# n_jobs=-1,  indicates that the code can use all the available CPU cores present in the system, helping us take advantage of multicare processing


In [24]:
random_search.fit(x_train,y_train)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_estimators': [50, 100, 200, 500], 'learning_rate': [0.01, 0.05, 0.1, 0.4, 0.8, 1], 'max_depth': [1, 2, 3, 4, 5, 6], 'subsample': [0.5, 0.8, 1], 'max_features': [0.1, 0.3, 0.5, 0.8, 1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=False)

In [25]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.5f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [26]:
report(random_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.921 (std: 0.00438)
Parameters: {'subsample': 0.5, 'n_estimators': 50, 'max_features': 0.3, 'max_depth': 5, 'learning_rate': 0.1}

Model with rank: 2
Mean validation score: 0.920 (std: 0.00515)
Parameters: {'subsample': 1, 'n_estimators': 200, 'max_features': 0.1, 'max_depth': 1, 'learning_rate': 0.8}

Model with rank: 3
Mean validation score: 0.920 (std: 0.00398)
Parameters: {'subsample': 0.5, 'n_estimators': 500, 'max_features': 1, 'max_depth': 5, 'learning_rate': 0.05}



# Xgboost implementation 

In [18]:

from xgboost.sklearn import XGBClassifier


In [20]:
param_dist = {
              "max_depth": [2,3],
              "learning_rate":[0.01,0.05,0.1,0.3,0.5],
    "min_child_weight":[4,5,6],
              "subsample":[i/10.0 for i in range(6,10)],
 "colsample_bytree":[i/10.0 for i in range(6,10)],
               "reg_alpha":[1e-5, 1e-2, 0.1, 1, 100],
              "gamma":[i/10.0 for i in range(0,5)],
    "n_estimators":[100,150],
    'scale_pos_weight':[2,3,4,5,6,7,8,9]}
    

In [23]:
clf=XGBClassifier(objective='binary:logistic')

In [24]:
n_iter=5

random_search=RandomizedSearchCV(clf,n_jobs=-1,verbose=2,cv=10,n_iter=n_iter,scoring='roc_auc',
                                 param_distributions=param_dist)

In [25]:
random_search.fit(x_train,y_train)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.9min finished


RandomizedSearchCV(cv=10, error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
          fit_params=None, iid='warn', n_iter=5, n_jobs=-1,
          param_distributions={'max_depth': [2, 3], 'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5], 'min_child_weight': [4, 5, 6], 'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9], 'reg_alpha': [1e-05, 0.01, 0.1, 1, 100], 'gamma': [0.0, 0.1, 0.2, 0.3, 0.4], 'n_estimators': [100, 150], 'scale_pos_weight': [2, 3, 4, 5, 6, 7, 8, 9]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
        

In [None]:
xgb_best=XGBClassifier(subsample=0.8,scale_pos_weight=3,reg_alpha=1e-05,n_estimators=500,min_child_weight=4,
                       max_depth=4,learning_rate=0.05,gamma=0.3,colsample_bytree=0.8
                      )

In [None]:
xgb_best.fit(x_train,y_train)

In [None]:
p=xgb_best.predict_proba(x_test)[:,1]

In [None]:
roc_auc_score(y_test,p)