In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from Amex_Metric import amex_metric

## Reading data-file 
data = pd.read_csv('Delinquency_Features.csv')
data.head()

Unnamed: 0,customer_ID,target,D_39_mean,D_39_median,D_39_min,D_39_max,D_39_range,D_39_IQR,D_39_values_above_mean
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0,0.010704,0.002483,0.001082,0.091492,0.090393,0.003593,1.0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0,0.215088,0.21167,0.002224,0.567383,0.564941,0.293152,5.0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0,0.004181,0.00423,0.000802,0.009705,0.008904,0.003944,7.0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0,0.048859,0.007423,0.00066,0.268555,0.267822,0.032093,3.0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0,0.004642,0.004147,3e-05,0.008682,0.008652,0.00474,6.0


In [2]:
## Defining input and target variables
X = data.drop(columns = ['customer_ID', 'target'], axis = 1)
Y = data['target']

## Spliting the data into train, validation, and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Defining the customized scoring function 
amex_function = make_scorer(amex_metric, greater_is_better = True, needs_proba = True)

## Defining list to store results
features_to_select = list()

for i in range(0, 2):

    ## Running RFE with Random forest
    RF_auto_feature = RFECV(estimator = RandomForestClassifier(n_estimators = 50, max_depth = 5), step = 1, scoring = amex_function, min_features_to_select = 5, cv = 3).fit(X_train, Y_train)
    
    ## Appending results 
    features_to_select.append(X_train.columns[RF_auto_feature.support_])
    
print(features_to_select)

[Index(['D_39_mean', 'D_39_median', 'D_39_max', 'D_39_range', 'D_39_IQR',
       'D_39_values_above_mean'],
      dtype='object'), Index(['D_39_mean', 'D_39_median', 'D_39_max', 'D_39_range', 'D_39_IQR',
       'D_39_values_above_mean'],
      dtype='object')]


In [5]:
pd.DataFrame(features_to_select)

Unnamed: 0,0,1,2,3,4,5
0,D_39_mean,D_39_median,D_39_max,D_39_range,D_39_IQR,D_39_values_above_mean
1,D_39_mean,D_39_median,D_39_max,D_39_range,D_39_IQR,D_39_values_above_mean


In [6]:
features_to_select.append(['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7'])

In [8]:
pd.DataFrame(features_to_select)

Unnamed: 0,0,1,2,3,4,5,6
0,D_39_mean,D_39_median,D_39_max,D_39_range,D_39_IQR,D_39_values_above_mean,
1,D_39_mean,D_39_median,D_39_max,D_39_range,D_39_IQR,D_39_values_above_mean,
2,X_1,X_2,X_3,X_4,X_5,X_6,X_7


In [12]:
RF_auto_feature.ranking_

array([1, 1, 2, 1, 1, 1, 1])

In [13]:
RF_auto_feature.support_

array([ True,  True, False,  True,  True,  True,  True])

In [14]:
RF_auto_feature

RFECV(cv=3, estimator=RandomForestClassifier(max_depth=5, n_estimators=50),
      min_features_to_select=5,
      scoring=make_scorer(amex_metric, needs_proba=True))

In [8]:
## Identifying important features 
print(X_train.columns[RF_auto_feature.support_])

Index(['D_39_mean', 'D_39_median', 'D_39_min', 'D_39_max', 'D_39_range',
       'D_39_IQR', 'D_39_values_above_mean'],
      dtype='object')


In [15]:
## Identifying important features 
print(X_train.columns[RF_auto_feature.support_])

Index(['D_39_mean', 'D_39_median', 'D_39_max', 'D_39_range', 'D_39_IQR',
       'D_39_values_above_mean'],
      dtype='object')


In [10]:
X_train.shape

(367130, 7)

In [None]:
## Defining input and target variables
X = data.drop(columns = ['customer_ID', 'target'], axis = 1)
Y = data['target']

## Spliting the data into train, validation, and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Defining hyper-paramerters for RF
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15],
                 'min_samples_leaf': [5, 7],
                 'max_depth' : [3, 5, 7]}

## Defining the customized scoring function 
amex_function = make_scorer(amex_metric, greater_is_better = True, needs_proba = True)

## Performing grid search
RF_grid_search = GridSearchCV(RandomForestClassifier(), RF_param_grid, cv = 3, scoring = amex_function, n_jobs = -1).fit(X_train, Y_train)
