### Hyperparameters Fine_tuning

In [None]:
import sys
import numpy as np
import pandas as pd
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [85]:
print('Python       :', sys.version.split('\n')[0])
print('Numpy        :', np.__version__)
print('Pandas       :', pd.__version__)
print('Sklearn      :', sklearn.__version__)

Python       : 3.7.10 | packaged by conda-forge | (default, Feb 19 2021, 16:07:37) 
Numpy        : 1.19.5
Pandas       : 1.2.3
Sklearn      : 0.24.1


In [86]:
np.random.seed(0)

In [87]:
!pip freeze > requirements.txt

In [88]:
class AutoRFClassifier:
    
    def __init__(self, 
        scoring_function = 'accuracy', 
        n_iter = 50):
        self.scoring_function = scoring_function
        self.n_iter = n_iter
        
    def fit(self,X,y):
        X_train = X
        y_train = y

        categorical_values = []

        cat_subset = X_train.select_dtypes(include = ['object','category','bool'])

        for i in range(cat_subset.shape[1]):
            categorical_values.append(list(cat_subset.iloc[:,i].dropna().unique()))
        
        num_pipeline = Pipeline([
            ('cleaner',SimpleImputer()),
            ('scaler',StandardScaler())
            ])

        cat_pipeline = Pipeline([
            ('cleaner',SimpleImputer(strategy = 'most_frequent')),
            ('encoder',OneHotEncoder(sparse = False, categories=categorical_values))
            ])


        preprocessor = ColumnTransformer([
            ('numerical', num_pipeline, make_column_selector(dtype_exclude=['object','category','bool'])),
            ('categorical', cat_pipeline, make_column_selector(dtype_include=['object','category','bool']))
            ])
            
        model_pipeline_steps = []
        model_pipeline_steps.append(('preprocessor',preprocessor))
        model_pipeline_steps.append(('feature_selector',SelectKBest(f_classif,k='all')))
        model_pipeline_steps.append(('reduce_dim',PCA()))
        model_pipeline_steps.append(('estimator',RandomForestClassifier()))
        model_pipeline = Pipeline(model_pipeline_steps)

        total_features = preprocessor.fit_transform(X_train).shape[1]

        optimization_grid = [{
            'preprocessor__numerical__scaler' : [None],
            'preprocessor__numerical__cleaner__strategy':['mean','median'],
            'feature_selector__k': list(np.arange(1,total_features,5)) + ['all'],
            'reduce_dim' : ['passtrough', PCA(10), PCA(15), PCA(20), PCA(25), PCA(30)],
            'estimator': [RandomForestClassifier(random_state=0, 
                                                 n_jobs = -1
                                                )],
            'estimator__n_estimators':np.arange(200,400,1),
            'estimator__criterion':['gini','entropy'],
            'estimator__max_features' : ['auto', 'sqrt', 'log2'],
            'estimator__class_weight' : ['balanced', 'balanced_subsample', None],
            'estimator__warm_start' : [False, True],
            'estimator__verbose' : np.arange(0, 5, 1)
        }]
        
        search = RandomizedSearchCV(
                          model_pipeline,
                          optimization_grid,
                          n_iter=self.n_iter,
                          scoring = self.scoring_function, 
                          n_jobs = -1, 
                          random_state = 0, 
                          verbose = 3,
                          cv = 5
                            )

        search.fit(X_train, y_train)
        
        self.best_estimator_ = search.best_estimator_
        self.best_pipeline = search.best_params_
        self.best_score = search.best_score_
        self.cv_results = search.cv_results_
        
    def predict(self,X,y = None):
        return self.best_estimator_.predict(X)

    def predict_proba(self,X,y = None):
        return self.best_estimator_.predict_proba(X)

In [89]:
DATA_DIR = '../input/findata-creditscoring/'
train = pd.read_csv(DATA_DIR + 'credit_train.csv')
test = pd.read_csv(DATA_DIR + 'credit_test.csv')
sample_submission = pd.read_csv(DATA_DIR + 'credit__sample.csv')
train.drop_duplicates(keep='first', inplace = True)
train['Loan Status'] = train['Loan Status'].map({"Fully Paid": 1, "Charged Off": 0})
train.drop(['Loan ID','Customer ID',], axis = 1, inplace=True)
test.drop(['Loan ID','Customer ID',], axis = 1, inplace=True)

In [90]:
y = train['Loan Status'].values     
X = train.drop(['Loan Status'], axis=1)

In [91]:
model = AutoRFClassifier()
model.fit(X, y)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


 0.79670324 0.79723785        nan 0.79830707 0.79715988 0.77506265
        nan        nan 0.79772791 0.79717102        nan 0.79775018
        nan 0.79712647 0.79779473 0.79920922        nan        nan
 0.79749401        nan 0.79830707        nan        nan        nan
 0.79790611 0.77449463        nan        nan 0.79815114 0.79809545
        nan        nan 0.79785042        nan        nan 0.79790611
 0.7971933         nan        nan 0.79749401        nan 0.79583449
        nan        nan]
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 382
building tree 2 of 382
building tree 3 of 382
building tree 4 of 382
building tree 5 of 382
building tree 6 of 382
building tree 7 of 382
building tree 8 of 382
building tree 9 of 382
building tree 10 of 382
building tree 11 of 382
building tree 12 of 382
building tree 13 of 382
building tree 14 of 382
building tree 15 of 382
building tree 16 of 382
building tree 17 of 382
building tree 18 of 382
building tree 19 of 382
building tree 20 of 382
building tree 21 of 382
building tree 22 of 382
building tree 23 of 382
building tree 24 of 382
building tree 25 of 382
building tree 26 of 382
building tree 27 of 382
building tree 28 of 382


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    7.5s


building tree 29 of 382
building tree 30 of 382
building tree 31 of 382
building tree 32 of 382
building tree 33 of 382
building tree 34 of 382
building tree 35 of 382
building tree 36 of 382
building tree 37 of 382
building tree 38 of 382
building tree 39 of 382
building tree 40 of 382
building tree 41 of 382
building tree 42 of 382
building tree 43 of 382
building tree 44 of 382
building tree 45 of 382
building tree 46 of 382
building tree 47 of 382
building tree 48 of 382
building tree 49 of 382
building tree 50 of 382
building tree 51 of 382
building tree 52 of 382
building tree 53 of 382
building tree 54 of 382
building tree 55 of 382
building tree 56 of 382
building tree 57 of 382
building tree 58 of 382
building tree 59 of 382
building tree 60 of 382
building tree 61 of 382
building tree 62 of 382
building tree 63 of 382
building tree 64 of 382
building tree 65 of 382
building tree 66 of 382
building tree 67 of 382
building tree 68 of 382
building tree 69 of 382
building tree 70

[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   37.0s


building tree 124 of 382
building tree 125 of 382
building tree 126 of 382
building tree 127 of 382
building tree 128 of 382
building tree 129 of 382
building tree 130 of 382
building tree 131 of 382
building tree 132 of 382
building tree 133 of 382
building tree 134 of 382
building tree 135 of 382
building tree 136 of 382
building tree 137 of 382
building tree 138 of 382
building tree 139 of 382
building tree 140 of 382
building tree 141 of 382
building tree 142 of 382
building tree 143 of 382
building tree 144 of 382
building tree 145 of 382
building tree 146 of 382
building tree 147 of 382
building tree 148 of 382
building tree 149 of 382
building tree 150 of 382
building tree 151 of 382
building tree 152 of 382
building tree 153 of 382
building tree 154 of 382
building tree 155 of 382
building tree 156 of 382
building tree 157 of 382
building tree 158 of 382
building tree 159 of 382
building tree 160 of 382
building tree 161 of 382
building tree 162 of 382
building tree 163 of 382


[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.4min


building tree 285 of 382
building tree 286 of 382
building tree 287 of 382
building tree 288 of 382
building tree 289 of 382
building tree 290 of 382
building tree 291 of 382
building tree 292 of 382
building tree 293 of 382
building tree 294 of 382
building tree 295 of 382
building tree 296 of 382
building tree 297 of 382
building tree 298 of 382
building tree 299 of 382
building tree 300 of 382
building tree 301 of 382
building tree 302 of 382
building tree 303 of 382
building tree 304 of 382
building tree 305 of 382
building tree 306 of 382
building tree 307 of 382
building tree 308 of 382
building tree 309 of 382
building tree 310 of 382
building tree 311 of 382
building tree 312 of 382
building tree 313 of 382
building tree 314 of 382
building tree 315 of 382
building tree 316 of 382
building tree 317 of 382
building tree 318 of 382
building tree 319 of 382
building tree 320 of 382
building tree 321 of 382
building tree 322 of 382
building tree 323 of 382
building tree 324 of 382


[Parallel(n_jobs=-1)]: Done 382 out of 382 | elapsed:  2.0min finished


In [92]:
model.best_pipeline

{'reduce_dim': PCA(n_components=15),
 'preprocessor__numerical__scaler': None,
 'preprocessor__numerical__cleaner__strategy': 'median',
 'feature_selector__k': 41,
 'estimator__warm_start': False,
 'estimator__verbose': 3,
 'estimator__n_estimators': 382,
 'estimator__max_features': 'log2',
 'estimator__criterion': 'entropy',
 'estimator__class_weight': 'balanced',
 'estimator': RandomForestClassifier(class_weight='balanced', criterion='entropy',
                        max_features='log2', n_estimators=382, n_jobs=-1,
                        random_state=0, verbose=3)}

In [93]:
model.best_score

0.7992092220304059