In [81]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import RFECV

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

import pickle

In [62]:
# Read the imputed training set that already carries the engineered / polynomial columns
train_data = pd.read_csv(
    filepath_or_buffer='../../data/processed/engineered_features_data/train_imputed_engineered_poly.csv'
)

In [63]:
# show the column labels of the loaded training frame
train_data.columns

Index(['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents', 'RevolvingUtilizationOfUnsecuredLines^2',
       'RevolvingUtilizationOfUnsecuredLines age',
       'RevolvingUtilizationOfUnsecuredLines NumberOfTime30-59DaysPastDueNotWorse',
       'RevolvingUtilizationOfUnsecuredLines DebtRatio',
       'RevolvingUtilizationOfUnsecuredLines MonthlyIncome',
       'RevolvingUtilizationOfUnsecuredLines NumberOfOpenCreditLinesAndLoans',
       'RevolvingUtilizationOfUnsecuredLines NumberOfTimes90DaysLate',
       'RevolvingUtilizationOfUnsecuredLines NumberRealEstateLoansOrLines',
       'RevolvingUtilizationOfUnsecuredLines NumberOfTime60-89DaysPastDueNotWorse',
       'RevolvingUtilizationOfUnsecuredLines NumberOfDependents', 'age^2',
       'age 

In [64]:
# predictors: every column except the target
X = train_data.drop('SeriousDlqin2yrs', axis='columns')

# target: the 'SeriousDlqin2yrs' column, cast to pandas' categorical dtype
y = train_data['SeriousDlqin2yrs'].astype('category')

In [65]:
# implement RFECV feature selection with XGBoost (XGBClassifier)

# create the model; the seed is pinned explicitly so the run does not depend on library defaults
# NOTE(review): both the estimator and RFECV request all cores (n_jobs=-1);
# consider n_jobs=1 on the estimator if CPU oversubscription is observed.
model = XGBClassifier(n_jobs=-1, n_estimators=120, random_state=42)

# create the RFECV object: remove one feature per step and score each
# feature-subset size with 5-fold cross-validated ROC AUC
rfecv_xgb = RFECV(estimator=model, step=1, cv=5, scoring='roc_auc', n_jobs=-1, verbose=2)

# fit the RFECV object
rfecv_xgb.fit(X, y)

# cross-validation scores collected by RFECV (mean and spread across the folds)
print('Mean test score: ', rfecv_xgb.cv_results_['mean_test_score'])
print('Standard deviation: ', rfecv_xgb.cv_results_['std_test_score'])

# print the results
print("Optimal number of features : %d" % rfecv_xgb.n_features_)
print("Best features : ", X.columns[rfecv_xgb.support_])

Fitting estimator with 67 features.
Fitting estimator with 67 features.
Fitting estimator with 67 features.
Fitting estimator with 67 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 66 features.
Fitting estimator with 66 features.
Fitting estimator with 66 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 65 features.
Fitting estimator with 65 features.
Fitting estimator with 65 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 64 features.
Fitting estimator with 64 features.
Fitting estimator with 64 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 63 features.
Fitting estimator with 63 features.
Fitting estimator with 63 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 62 features.
Fitting estimator with 62 fe

In [66]:
# implement RFECV feature selection with RandomForestClassifier

# create the model; random_state is pinned because an unseeded forest yields
# different importances (and thus possibly a different feature subset) on every run
model = RandomForestClassifier(n_estimators=50, random_state=42)

# create the RFECV object: remove one feature per step and score each
# feature-subset size with 3-fold cross-validated ROC AUC
rfecv_rf = RFECV(estimator=model, step=1, cv=3, scoring='roc_auc', n_jobs=-1, verbose=2)

# fit the RFECV object
rfecv_rf.fit(X, y)

# cross-validation scores collected by RFECV (mean and spread across the folds)
print('Mean score: ', rfecv_rf.cv_results_['mean_test_score'])
print('Standard deviation: ', rfecv_rf.cv_results_['std_test_score'])

# print the results
print("Optimal number of features : %d" % rfecv_rf.n_features_)
print("Best features : ", X.columns[rfecv_rf.support_])

Fitting estimator with 67 features.
Fitting estimator with 67 features.
Fitting estimator with 67 features.
Fitting estimator with 66 features.
Fitting estimator with 66 features.
Fitting estimator with 66 features.
Fitting estimator with 65 features.
Fitting estimator with 65 features.
Fitting estimator with 65 features.
Fitting estimator with 64 features.
Fitting estimator with 64 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 63 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 62 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 61 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 60 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 59 features.
Fitting estimator with 59 features.
Fitting estimator with 58 fe

In [83]:
# persist the fitted XGBoost-based RFECV selector with pickle
# NOTE(review): the file name spells 'rfcev' (not 'rfecv'); left unchanged here
# in case other code loads this exact path.
with open('../../models/feature_selection/rfcev_xgboost.pkl', mode='wb') as pkl_out:
    pickle.dump(rfecv_xgb, pkl_out)

In [84]:
# persist the fitted RandomForest-based RFECV selector with pickle
# NOTE(review): the file name spells 'rfcev' (not 'rfecv'); left unchanged here
# in case other code loads this exact path.
with open('../../models/feature_selection/rfcev_randomforest.pkl', mode='wb') as pkl_out:
    pickle.dump(rfecv_rf, pkl_out)

In [78]:
# one row per candidate feature, with the keep/drop mask of each RFECV run
# ('GradientBoosting' holds the mask of the XGBoost-based selector)
results = pd.DataFrame(dict(
    Feature=X.columns,
    GradientBoosting=rfecv_xgb.support_,
    RandomForest=rfecv_rf.support_,
))

# despite its name, 'Sum' is a boolean: True only when both selectors kept the feature
results['Sum'] = results[['GradientBoosting', 'RandomForest']].all(axis=1)

# list the features kept by both selectors first
results = results.sort_values('Sum', ascending=False)

In [79]:
# lift pandas' row-display cap so the whole table is printed
pd.options.display.max_rows = None
print(results.loc[:, ['Feature', 'Sum']])

                                              Feature    Sum
0                RevolvingUtilizationOfUnsecuredLines   True
13     RevolvingUtilizationOfUnsecuredLines DebtRatio   True
46              MonthlyIncome NumberOfTimes90DaysLate   True
42     DebtRatio NumberOfTime60-89DaysPastDueNotWorse   True
41             DebtRatio NumberRealEstateLoansOrLines   True
39          DebtRatio NumberOfOpenCreditLinesAndLoans   True
38                            DebtRatio MonthlyIncome   True
1                                                 age   True
26                   age NumberRealEstateLoansOrLines   True
17  RevolvingUtilizationOfUnsecuredLines NumberRea...   True
16  RevolvingUtilizationOfUnsecuredLines NumberOfT...   True
15  RevolvingUtilizationOfUnsecuredLines NumberOfO...   True
66                                   DelinquencyRatio   True
12  RevolvingUtilizationOfUnsecuredLines NumberOfT...   True
3                                           DebtRatio   True
53  NumberOfOpenCreditLi

In [80]:
# chosen features: those kept by both RFECV runs ('Sum' is the boolean AND of the two masks)
chosen_features = results.loc[results['Sum'], 'Feature'].values

# original features
original_features = ['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents']

# original features not selected on their own
chosen_feature_names = set(chosen_features)
original_features_not_selected = [feature for feature in original_features if feature not in chosen_feature_names]

# base features referenced by the chosen columns.
# Interaction / power columns are named like 'a b' and 'a^2', so each chosen name is
# split on spaces and any '^<power>' suffix is removed. Exact term matching is used
# instead of a substring test so that a short name such as 'age' cannot match inside
# an unrelated column name.
base_features_in_chosen = {
    term.split('^')[0]
    for chosen_feature in chosen_features
    for term in chosen_feature.split(' ')
}

# original features not considered: not selected and not part of any chosen interaction
original_features_not_considered = [
    feature for feature in original_features_not_selected
    if feature not in base_features_in_chosen
]


print(f'Original features not selected: {original_features_not_selected}')
print(f'Original features not considered: {original_features_not_considered}')

Original features not selected: ['NumberOfTime30-59DaysPastDueNotWorse', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']
Original features not considered: ['NumberOfDependents']


In [82]:
# final column list: features chosen by both selectors, the original features that
# appear in none of them, and the target column
features_to_keep = [*chosen_features, *original_features_not_considered, 'SeriousDlqin2yrs']

print(f'{len(features_to_keep)} features to keep: \n{features_to_keep}')

# persist the list with pickle
with open('../../data/processed/selected_features/rfecv_features_to_keep.pkl', mode='wb') as out_file:
    pickle.dump(features_to_keep, out_file)

17 features to keep: 
['RevolvingUtilizationOfUnsecuredLines', 'RevolvingUtilizationOfUnsecuredLines DebtRatio', 'MonthlyIncome NumberOfTimes90DaysLate', 'DebtRatio NumberOfTime60-89DaysPastDueNotWorse', 'DebtRatio NumberRealEstateLoansOrLines', 'DebtRatio NumberOfOpenCreditLinesAndLoans', 'DebtRatio MonthlyIncome', 'age', 'age NumberRealEstateLoansOrLines', 'RevolvingUtilizationOfUnsecuredLines NumberRealEstateLoansOrLines', 'RevolvingUtilizationOfUnsecuredLines NumberOfTimes90DaysLate', 'RevolvingUtilizationOfUnsecuredLines NumberOfOpenCreditLinesAndLoans', 'DelinquencyRatio', 'RevolvingUtilizationOfUnsecuredLines NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'NumberOfDependents', 'SeriousDlqin2yrs']
