In [1]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator,TransformerMixin

from mypipes import *

In [3]:
train_file=r'C:\Users\chirag\Desktop\ML IITK\Untitled Folder\DATA\rg_train.csv'
test_file=r'C:\Users\chirag\Desktop\ML IITK\Untitled Folder\DATA\rg_test.csv'

bd_train=pd.read_csv(train_file)
bd_test=pd.read_csv(test_file)

In [4]:
num_vars=list(bd_train.select_dtypes(exclude=['object']).columns)
num_vars=[_ for _ in num_vars if _ not in ['REF_NO','Revenue.Grid']]

In [5]:
cat_vars=list(bd_train.select_dtypes(include=['object']).columns)

In [6]:
cat_vars=[_ for _ in cat_vars if _ not in 
         ['children','age_band','post_code','post_area','family_income']]

In [7]:
p1=pdPipeline([
    ('var_select',VarSelector(num_vars)),
    ('missing_trt',DataFrameImputer())
])

p2=pdPipeline([
    ('var_select',VarSelector(cat_vars)),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(70))
])

p3=pdPipeline([
    ('var_select',VarSelector(['age_band'])),
    ('custom_fico',custom_age_band()),
    ('missing_trt',DataFrameImputer())
])

p4=pdPipeline([
    ('var_select',VarSelector(['family_income'])),
    ('custom_fico',custom_family_income()),
    ('missing_trt',DataFrameImputer())
])

p5=pdPipeline([
    ('var_select',VarSelector(['children'])),
    ('string_clean1',string_clean(replace_it='Zero',replace_with='0')),
    ('string_clean2',string_clean(replace_it='4+',replace_with='4')),
    ('convert_to_numeric',convert_to_numeric()),
    ('missing_trt',DataFrameImputer())
])

In [8]:
data_pipe=FeatureUnion([
    ('num',p1),
    ('obj_to_dum',p2),
    ('age_band',p3),
    ('family_income',p4),
    ('children',p5)
])

In [9]:
x_train=pd.DataFrame(data=data_pipe.fit_transform(bd_train),
                    columns=data_pipe.get_feature_names())

In [10]:
x_test=pd.DataFrame(data=data_pipe.transform(bd_test),
                   columns=data_pipe.get_feature_names())

In [11]:
y_train=(bd_train['Revenue.Grid']==1).astype(int)

In [12]:
x_train.shape

(8124, 71)

In [13]:
gbm_params={
    'n_estimators':[50,100,200,500,700],
    'learning_rate':[0.01,0.05,0.1,0.4,0.8,1],
    'max_depth':[1,2,3,4,5,6],
    'subsample':[0.5,0.8,1],
    'max_features':[5,10,15,20,30,45,55,65]
}

In [14]:
gbm=GradientBoostingClassifier()

In [16]:
random_search=RandomizedSearchCV(gbm,
                                scoring='roc_auc',
                                param_distributions=gbm_params,
                                cv=10,
                                n_iter=10,
                                n_jobs=-1,
                                verbose=20)

In [17]:
random_search.fit(x_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


RandomizedSearchCV(cv=10, estimator=GradientBoostingClassifier(), n_jobs=-1,
                   param_distributions={'learning_rate': [0.01, 0.05, 0.1, 0.4,
                                                          0.8, 1],
                                        'max_depth': [1, 2, 3, 4, 5, 6],
                                        'max_features': [5, 10, 15, 20, 30, 45,
                                                         55, 65],
                                        'n_estimators': [50, 100, 200, 500,
                                                         700],
                                        'subsample': [0.5, 0.8, 1]},
                   scoring='roc_auc', verbose=20)

In [18]:
def report(results,n_top=3):
    for i in range(1,n_top+1):
        candidates=np.flatnonzero(results['rank_test_score']==i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean Validation Score: {0:.6f} (std: {1:.6f})".format(
                    results['mean_test_score'][candidate],
                    results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [19]:
report(random_search.cv_results_,5)

Model with rank: 1
Mean Validation Score: 0.988692 (std: 0.004558)
Parameters: {'subsample': 0.5, 'n_estimators': 700, 'max_features': 65, 'max_depth': 5, 'learning_rate': 0.1}

Model with rank: 2
Mean Validation Score: 0.986120 (std: 0.003092)
Parameters: {'subsample': 0.8, 'n_estimators': 100, 'max_features': 15, 'max_depth': 6, 'learning_rate': 0.1}

Model with rank: 3
Mean Validation Score: 0.974753 (std: 0.004576)
Parameters: {'subsample': 0.8, 'n_estimators': 100, 'max_features': 30, 'max_depth': 5, 'learning_rate': 0.01}

Model with rank: 4
Mean Validation Score: 0.945003 (std: 0.038040)
Parameters: {'subsample': 0.8, 'n_estimators': 100, 'max_features': 55, 'max_depth': 2, 'learning_rate': 0.8}

Model with rank: 5
Mean Validation Score: 0.944126 (std: 0.108185)
Parameters: {'subsample': 1, 'n_estimators': 200, 'max_features': 10, 'max_depth': 6, 'learning_rate': 0.8}

