In [251]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


#'----------------------------------------------------'
def my_feature_selector(mytype,X,y=[],threshold=None,k=None):  
    
    #reduces # of features in given x dataset by one of 3 ways:
    #1) selectkbest  2) rfecv (recursive feature elim)  3) variancethreshold

    mytype = str.lower(mytype)
    
    #Retain original column names
    orig_cols = X.columns
    
    #Evaluate type and instatiate object
    if mytype == 'selectkbest':
        if k == None:
            print('Try Again: To run selectKBest you must pass a k for number of features to select')
            return None  
        if len(y) == 0:
            print('Try Again: To run selectKBest you must pass a y vector reflecting outcomes')
            return None             
        
        selector = SelectKBest(chi2, k=k)
        df_new = selector.fit_transform(X,y)
        keep_list = selector.get_support()
        scores = [orig_cols, selector.pvalues_, keep_list]
        score_type = 'PValue'
        
    if mytype == 'rfecv':
        if len(y) == 0:
            print('Try Again: To run rfecv you must pass a y vector reflecting outcomes')
            return None 
       
        estimator = SVR(kernel="linear")
        selector = RFECV(estimator, n_jobs=-1)
        df_new = selector.fit_transform(X,y)
        keep_list = selector.get_support()
        scores = [orig_cols, selector.ranking_, keep_list]
        score_type = 'Ranking'
            
    if mytype == 'variancethreshold':
        if threshold == None:
            print('Try Again: To run VarianceThreshold you must pass a threshold value from 0 to 1. (0 returns all)')
            return None     
    
        selector = VarianceThreshold(threshold=threshold)
        df_new = selector.fit_transform(X)
        keep_list = selector.get_support()   
        scores = [orig_cols, selector.variances_, keep_list]
        score_type = 'Variance'
        
    if (mytype != 'selectkbest') & (mytype != 'rfecv') & (mytype != 'variancethreshold'):  
        print('Try Again: Type must be passed as selectkbest, rfecv, or variancethreshold')
        return None  
    
    
    #Print Scores
    print('---- Running Feature Selection Method: ' + mytype + '-------\n')
    scores_df = pd.DataFrame(scores)
    scores_df = scores_df.transpose()
    scores_df.columns= ['Feature','PValue','Keep']
    print(scores_df.sort_values(by=['Keep','PValue'], ascending=[False,True]))

  
    #List features that were removed vs Kept
    i = 0
    keep = []

    for item in keep_list:
        col_name = orig_cols[i]
        if item==True:
            keep.append(col_name)
        i = i + 1

    #Place resultset of kept features into DataFrame with correct column headers
    df_keep = pd.DataFrame(df_new, columns=keep)
    
    #Print Stats
    print('\nShape before: ' + str(X.shape))
    print('Shape after: ' + str(df_keep.shape))
    print('\n')
    return df_keep

#'----------------------------------------------------'
def my_remove_highly_correlated(X,threshold):

    cols = X.columns

    #run correlation matrix
    df = X.corr()

    #put df into array
    a = df.values

    #label top half with -99999 
    #we want to ignore top half of matrix
    iu1 = np.triu_indices(len(df))
    a[iu1] = -99999

    #put data back into daraframe
    df = pd.DataFrame(a, columns=cols)
    df['var'] = cols

    #unstack to get a list of var1, var2, correlation
    df = pd.melt(df, id_vars='var')

    #remove those flagged with -99999
    df = df[df.value != -99999].sort_values(by='var', ascending=True)

    #flag remove vs keep based on corr threshold
    df_remove = df[df.value > threshold]
    keep_list = df.loc[df.value <= threshold,'var'].unique()
 
    print('{} out of {} vars removed due to corr greater than {}'.
          format(df_remove.shape[0],X.shape[1],threshold))
    
    print(df_remove)
    print('\nShape before: ' + str(X.shape))
    print('Shape after: ' + str(X[keep_list].shape))
    print('\n')
    return X[keep_list]

#----------------------------------------------------------

def my_drop_na_columns(X,NANthreshold):

    df = X
    colcount = df.shape[1]
    #Get count of NA in each column
    series_cols = df.isnull().sum(axis = 0).sort_values(ascending=False)

    #filter the list to include only those with counts above threshold
    series_cols_remove = series_cols[series_cols.values >= NANthreshold]
    series_cols_keep = series_cols[(series_cols.values < NANthreshold) & (series_cols.values > 0) ]
    
    #put the to-remove column names in a list
    list_colstodrop = series_cols_remove.index.tolist()

    #drop the columns
    df = df.drop(labels = list_colstodrop, axis=1)

    #print the results
    print('Dropped {} of {} Columns - containing more than {} NANs\n'.format(
          series_cols_remove.shape[0],colcount,NANthreshold))
    print(series_cols_remove)
    
    print('\n{} Columns remain with NANs\n'.format(series_cols_keep.shape[0]))
    print(series_cols_keep)
    print('\nShape before: ' + str(X.shape))
    print('Shape after: ' + str(df.shape))
    print('\n')    
    return df

In [256]:
# Replace the path with the correct path for your data.
y2015 = pd.read_csv(
    'LoanStats3d.csv',
    skipinitialspace=True,
    header=1
)

#y2015.drop(columns='id', axis=1, inplace=True)
# Note the warning about dtypes.


y2015.shape

  interactivity=interactivity, compiler=compiler, result=result)


(421097, 145)

In [264]:
#data cleaning
y2015_cleaned = y2015
y2015_cleaned = y2015_cleaned[:-2]
y2015_cleaned = my_drop_na_columns(y2015,3000)
# Interest Rate to numeric.
y2015_cleaned['int_rate'] = pd.to_numeric(y2015['int_rate'].str.strip('%'), errors='coerce')

# Drop other columns with many unique variables
y2015_cleaned.drop(['zip_code', 'earliest_cr_line', 'revol_util', 'last_credit_pull_d', 'last_pymnt_d',
                    'title','issue_d','purpose',
            'sub_grade', 'addr_state'], 1, inplace=True)

# Drop columns related to Payment Amount or Outstanding Principal
y2015_cleaned.drop(['last_pymnt_amnt','out_prncp','out_prncp_inv','total_pymnt',
                   'total_pymnt_inv',], 1, inplace=True)

y2015_cleaned = y2015_cleaned[y2015_cleaned.total_rec_late_fee >= 0]
y2015_cleaned.shape

Dropped 67 of 145 Columns - containing more than 3000 NANs

sec_app_earliest_cr_line                      421097
revol_bal_joint                               421097
member_id                                     421097
url                                           421097
sec_app_inq_last_6mths                        421097
sec_app_mort_acc                              421097
sec_app_open_acc                              421097
sec_app_revol_util                            421097
sec_app_open_act_il                           421097
sec_app_num_rev_accts                         421097
sec_app_chargeoff_within_12_mths              421097
sec_app_collections_12_mths_ex_med            421097
sec_app_mths_since_last_major_derog           421097
id                                            421095
desc                                          421052
dti_joint                                     420588
annual_inc_joint                              420586
verification_status_joint              

(421088, 63)

In [246]:
categorical = y2015_cleaned.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i)
    print(column.nunique())

term
2
grade
7
home_ownership
4
verification_status
3
loan_status
7
pymnt_plan
2
initial_list_status
2
application_type
2
hardship_flag
2
disbursement_method
1
debt_settlement_flag
2


In [265]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

sample = y2015_cleaned.sample(n=100000)

rfc = ensemble.RandomForestClassifier()
X = sample.drop('loan_status', 1)
X = pd.get_dummies(X, drop_first=True)
Y = sample['loan_status']
X = X.dropna(axis=1)

cross_val_score(rfc, X, Y, cv=10)



array([0.92013195, 0.91293483, 0.92490751, 0.90990901, 0.91460854,
       0.90958192, 0.91268254, 0.91558312, 0.91788358, 0.92537761])

In [272]:
#----------- Feature Selection ---------------
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFECV

X_Train = X

#Remove Highly correlated features
X_Train = my_remove_highly_correlated(X_Train,threshold=.80)

#Remove Features that contribute a low variance
#X_Train = my_feature_selector(mytype='variancethreshold', X=X_Train, y=Y, threshold=.80 )

#Take remaining 10 best
X_Train = my_feature_selector(mytype='selectkbest', X=X_Train, y=Y,  k=19 )

23 out of 69 vars removed due to corr greater than 0.8
                             var           variable     value
1543                 avg_cur_bal        tot_cur_bal  0.837989
1121     collection_recovery_fee         recoveries  0.995823
1                    funded_amnt          loan_amnt  1.000000
2                funded_amnt_inv          loan_amnt  0.999995
71               funded_amnt_inv        funded_amnt  0.999995
4483             hardship_flag_Y       pymnt_plan_y  0.912848
4                    installment          loan_amnt  0.940956
142                  installment    funded_amnt_inv  0.940935
73                   installment        funded_amnt  0.940956
2311             num_actv_rev_tl     num_actv_bc_tl  0.807111
2312                 num_bc_sats     num_actv_bc_tl  0.841027
659                num_op_rev_tl           open_acc  0.828869
2384               num_op_rev_tl    num_actv_rev_tl  0.806029
2523               num_rev_accts          num_bc_tl  0.839610
2317         nu

In [273]:
cross_val_score(rfc, X_Train, Y, cv=10)



array([0.91113555, 0.90793683, 0.9070093 , 0.90490951, 0.90380962,
       0.90928186, 0.9034807 , 0.89998   , 0.9009802 , 0.9116735 ])