In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

In [27]:
# data prep from previous module
# The path has no directory part, so the CSV is looked up in the current working directory.
file=r'census_income.csv'

# Read the whole file into the DataFrame that the following cells transform.
ci=pd.read_csv(file)


In [28]:
# Preview the first rows of the freshly loaded data.
ci.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [29]:
# There is perfect correspondence between education and education.num, so education is dropped.
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
ci.drop(columns='education', errors='ignore', inplace=True)

# Convert target Y to 1,0 (1 = income above 50K).
# Convert only while Y still holds the raw text labels: repeating the comparison on the
# already encoded 0/1 column would silently turn every row into 0.
# Labels are stripped first because the raw values carry a leading space (' >50K').
if not pd.api.types.is_numeric_dtype(ci['Y']):
    ci['Y'] = (ci['Y'].str.strip() == '>50K').astype(int)

In [30]:
# Check the result: education should be gone and Y should now be 0/1.
ci.head()

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Y
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [31]:
# Names of the object-dtype (text) columns, i.e. the categorical features to encode.
cat_cols = ci.select_dtypes(include=['object']).columns

In [32]:
# Show which columns were picked up as categorical.
cat_cols

Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object')

In [33]:
# Turn every categorical column into 0/1 indicator columns.
# Only levels seen more than 100 times get an indicator, and the least frequent of
# those is left out so that it acts as part of the reference group.
for col in cat_cols:
    freqs = ci[col].value_counts()
    # value_counts() is ordered from most to least frequent, so [:-1] drops the rarest kept level
    selected_cats = freqs[freqs > 100].index[:-1]

    print(col)
    for cat in selected_cats:
        name = '_'.join((col, cat))
        ci[name] = ci[col].eq(cat).astype(int)

    # the original text column is no longer needed once its indicators exist
    ci.drop(columns=col, inplace=True)
    

workclass
marital.status
occupation
relationship
race
sex
native.country


In [34]:
# selected_cats and freqs are left over from the final pass of the loop above,
# so they describe only the last column processed (native.country in the recorded output).
print(selected_cats)
print(freqs.head())
# Preview the frame now that the categorical columns are replaced by indicator columns.
ci.head()

Index([' United-States', ' Mexico', ' ?', ' Philippines', ' Germany',
       ' Canada', ' Puerto-Rico'],
      dtype='object')
 United-States    29170
 Mexico             643
 ?                  583
 Philippines        198
 Germany            137
Name: native.country, dtype: int64


Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,Y,workclass_ Private,workclass_ Self-emp-not-inc,workclass_ Local-gov,...,race_ Asian-Pac-Islander,race_ Amer-Indian-Eskimo,sex_ Male,native.country_ United-States,native.country_ Mexico,native.country_ ?,native.country_ Philippines,native.country_ Germany,native.country_ Canada,native.country_ Puerto-Rico
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
1,50,83311,13,0,0,13,0,0,1,0,...,0,0,1,1,0,0,0,0,0,0
2,38,215646,9,0,0,40,0,1,0,0,...,0,0,1,1,0,0,0,0,0,0
3,53,234721,7,0,0,40,0,1,0,0,...,0,0,1,1,0,0,0,0,0,0
4,28,338409,13,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# (rows, columns) of the frame after the indicator encoding.
ci.shape

(32561, 48)

In [37]:
# Separate the binary target from the predictor columns.
y_train = ci['Y']
x_train = ci.drop(columns='Y')


In [38]:
# Candidate values the randomized search samples from for each GBM hyper-parameter.
# min_samples_split and min_samples_leaf ([2,5,10,20] each) are currently disabled.
gbm_params = dict(
    n_estimators=[50, 100, 200, 500, 700],
    learning_rate=[0.01, .05, 0.1, 0.4, 0.8, 1],
    max_depth=[1, 2, 3, 4, 5, 6],
    subsample=[0.5, 0.8, 1],
    max_features=[5, 10, 15, 20, 30, 45],
)

In [39]:
# Base estimator with library defaults; it is handed to the randomized search as the model to tune.
gbm=GradientBoostingClassifier()

In [40]:
# Randomized search over gbm_params, scored by ROC AUC with 5-fold cross-validation.
# n_iter=1 samples a single parameter combination; n_jobs=-1 uses all available cores.
random_search = RandomizedSearchCV(
    estimator=gbm,
    param_distributions=gbm_params,
    n_iter=1,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

In [41]:
# Run the randomized hyper-parameter search on the training data.
random_search.fit(x_train,y_train)

# This might take up to 30-45 mins to finish if you try cv=10 and a larger number for n_iter
# (don't be impatient :) ). Also don't be alarmed if it finishes early :)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                     

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.4, loss='deviance', max_depth=3,
              max_features=20, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=0.8, verbose=0, warm_start=False)
              
Use the above result in class; it is the result from a previous run and can definitely be different on a rerun. Using it saves time in class, so that you don't have to wait for the randomised search to finish.

In [42]:
def report(results, n_top=3):
    """Print a short summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', each indexable by candidate position (for example the
        ``cv_results_`` attribute of a fitted search object).
    n_top : int, default 3
        Highest rank to print. Ranks 1..n_top are walked in order; a rank with
        no candidate prints nothing and tied candidates are all printed.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        # positions of every candidate that holds this rank
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.5f})".format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [43]:
# Ask for the top 5 ranks; report() prints only ranks that exist in cv_results_,
# so a search run with n_iter=1 shows a single model.
report(random_search.cv_results_,5)

Model with rank: 1
Mean validation score: 0.900 (std: 0.00702)
Parameters: {'subsample': 0.8, 'n_estimators': 500, 'max_features': 10, 'max_depth': 6, 'learning_rate': 0.4}



Top 5 classifiers from the previous run were as follows:

Model with rank: 1

Mean validation score: 0.925 (std: 0.00188)

Parameters: {'max_features': 20, 'max_depth': 3, 'subsample': 0.8, 'learning_rate': 0.4, 'n_estimators': 100}

~~~~~~~~~~

Model with rank: 2

Mean validation score: 0.924 (std: 0.00121)

Parameters: {'max_features': 15, 'max_depth': 4, 'subsample': 1, 'learning_rate': 0.4, 'n_estimators': 100}

~~~~~~~~~~

Model with rank: 3

Mean validation score: 0.923 (std: 0.00250)

Parameters: {'max_features': 5, 'max_depth': 4, 'subsample': 0.5, 'learning_rate': 0.05, 'n_estimators': 500}

~~~~~~~~~~

Model with rank: 4

Mean validation score: 0.914 (std: 0.00290)

Parameters: {'max_features': 10, 'max_depth': 5, 'subsample': 1, 'learning_rate': 0.05, 'n_estimators': 50}

~~~~~~~~~~

Model with rank: 5

Mean validation score: 0.913 (std: 0.00174)

Parameters: {'max_features': 30, 'max_depth': 5, 'subsample': 0.8, 'learning_rate': 0.4, 'n_estimators': 200}

Tentative performance: 0.925 for the best classifier

**Note: you can use the random search object's `predict` and `predict_proba` functions to make predictions, as `RandomizedSearchCV` automatically refits the best candidate on the complete data. If you want to look into `feature_importances_` etc., then fit the best estimator separately.**

In [44]:
# Considering the best estimator: rebuild the rank-1 model reported for the previous run.
# Only the tuned hyper-parameters are passed; every other setting stays at the library default.
# The earlier call spelled out the full repr, including min_impurity_split, presort and
# loss='deviance'. Later scikit-learn releases deprecated and then removed these, so
# passing them makes the cell fail on a current install.
gbm = GradientBoostingClassifier(
    learning_rate=0.4,
    max_depth=3,
    max_features=20,
    n_estimators=100,
    subsample=0.8,
)

In [45]:
# Fit the hand-configured estimator on all of the training data.
gbm.fit(x_train, y_train)









GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.4, loss='deviance', max_depth=3,
                           max_features=20, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=1e-07,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [46]:
from sklearn.model_selection import cross_val_score

In [47]:
# 10-fold cross-validated ROC AUC for the hand-configured model.
# The resulting array is only displayed here, not stored in a variable.
cross_val_score(
    gbm,
    x_train,
    y_train,
    scoring='roc_auc',
    cv=10,
    n_jobs=-1,
    verbose=10,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   18.9s remaining:   18.9s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   19.3s remaining:    8.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   25.5s finished


array([0.92393224, 0.92496367, 0.92648402, 0.91732764, 0.92457411,
       0.92675465, 0.92620074, 0.92957863, 0.92839574, 0.92721723])

In [48]:
# Fold-wise ROC AUC values typed in by hand.
# NOTE: they differ from the cross_val_score output displayed in the previous cell,
# so the mean/std computed from them do not describe that displayed run.
scores = [
    0.92611697,
    0.92263246,
    0.92613728,
    0.91891016,
    0.92345649,
    0.92834801,
    0.92543864,
    0.92872648,
    0.92744581,
    0.92505475,
]

In [49]:
import numpy as np

In [50]:
# Average ROC AUC over the hard-coded fold scores.
np.mean(scores)

0.9252267049999998

In [51]:
# Spread of the fold scores (np.std uses ddof=0 unless told otherwise).
np.std(scores)

0.0027995376319144213