In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import os
import re
from collections import Counter
# import statsmodels.api as sm
from xgboost import XGBClassifier
#import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc, precision_score, recall_score, f1_score, confusion_matrix
from datetime import datetime 
from sklearn.model_selection import RandomizedSearchCV
import time
# from shapely import wkt
import warnings
warnings.filterwarnings('ignore')

# Not importing lists bc run lists script below
#import lists# Long-ass lists of vars etc. (keep in same folder)
import utils# Define util functions

%matplotlib inline
%run lists.py

pd.options.display.max_rows = 1000

Loading lists of vars and printing out some list sizes...
75
45


# Data Import

In [2]:
# Directory that holds the raw input and receives the derived outputs.
# The default is the original local path; set the LENDING_CLUB_DIR environment
# variable to run the notebook elsewhere without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, which matters because
# the rest of the notebook builds paths by plain concatenation (FPATH + FNAME).
FPATH = os.path.join(
    os.environ.get("LENDING_CLUB_DIR", "/Users/david.duong/dev/lending-club/"), "")
FNAME = "accepted_2007_2018.csv"  # raw input file
OFNAME = "post_fe_dat.csv"        # output: data set after feature engineering

QAFNAME = "qavals.pkl"            # output: pickled QA summary of numeric variables

# loan_status values counted as a default (label 1) ...
DEFAULT_CATS = ['Default', 'Charged Off', 'Does not meet the credit policy. Status:Charged Off']
# ... and values counted as fully repaid (label 0). Any other status is dropped.
FULLY_PAID_CATS = ['Does not meet the credit policy. Status:Fully Paid','Fully Paid']

# Loan terms to keep. The leading space is part of the raw value.
FILTER_TERMS = [" 36 months"]# Remove the 60 months

# List of cat vars to use for feature engineering
CAT_VARS = ['term', 'grade', 'emp_length', 'home_ownership',
            'verification_status', 'purpose', 'initial_list_status',
            'application_type', 'disbursement_method']



In [3]:
# Read the raw CSV at FPATH + FNAME into `dat`.
# The recorded output below shows 2,260,701 rows x 151 columns.
dat = pd.read_csv(FPATH + FNAME)
print(dat.shape)

(2260701, 151)


# Data Preprocessing


### DP1a. Round-1 filters (NAs, terms, determinate labels, etc.)

In [5]:
def apply_filters1(dat, terms=None, statuses=None):
    """Apply the first round of row filters and return the surviving rows.

    Steps, in order (the shape is printed after each of the three stages):
      1. drop rows with a missing ``loan_amnt``, then rows with a missing
         ``zip_code``;
      2. keep rows whose ``term`` is in ``terms``;
      3. keep rows whose ``loan_status`` is in ``statuses``, i.e. loans with a
         resolved outcome.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain ``loan_amnt``, ``zip_code``, ``term`` and ``loan_status``.
    terms : list, optional
        Allowed ``term`` values. Defaults to the notebook-level FILTER_TERMS.
    statuses : list, optional
        Allowed ``loan_status`` values. Defaults to
        DEFAULT_CATS + FULLY_PAID_CATS.

    Returns
    -------
    pandas.DataFrame
        The filtered rows. The input frame is not modified.
    """
    # Fall back to the notebook globals only when the caller gave no override,
    # so the function can also be used (and tested) without them.
    if terms is None:
        terms = FILTER_TERMS
    if statuses is None:
        statuses = DEFAULT_CATS + FULLY_PAID_CATS

    # Drop NAs:
    dat = dat.loc[dat['loan_amnt'].notnull()]
    print ('Drop the NA value...')
    dat = dat.dropna(subset = ['zip_code'])
    print(dat.shape)

    # Keep only our filters:
    dat = dat[dat['term'].isin(terms)]
    print(dat.shape)

    # Keep only determinate labels (e.g. clear defaults or clear paid)
    dat = dat[dat['loan_status'].isin(statuses)]
    print(dat.shape)
    return dat

# Rebinds `dat` to the filtered frame: the raw load is no longer reachable
# under this name, and re-running the cell filters already-filtered data.
# NOTE(review): the recorded output shows the same shape after all three
# stages, which suggests the cell was last run on an already-filtered `dat`
# -- confirm with Restart Kernel -> Run All.
dat = apply_filters1(dat)

Drop the NA value...
(1023205, 151)
(1023205, 151)
(1023205, 151)


### DP1b. Create labels & remove ambiguous labels

In [6]:
# Check default loans:
def create_labels(dat, default_cats=None, paid_cats=None):
    """Add the binary label columns ``Default_flag`` and ``Paid_flag`` to ``dat``.

    ``Default_flag`` is 1 where ``loan_status`` is one of ``default_cats`` and
    ``Paid_flag`` is 1 where it is one of ``paid_cats``; both are 0 otherwise.
    A one-line summary of each flag is printed.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain ``loan_status``. Modified in place (two columns added).
    default_cats : list, optional
        Statuses counted as default. Defaults to the notebook-level DEFAULT_CATS.
    paid_cats : list, optional
        Statuses counted as fully paid. Defaults to FULLY_PAID_CATS.

    Returns
    -------
    pandas.DataFrame
        The same ``dat`` object, for convenient reassignment.
    """
    if default_cats is None:
        default_cats = DEFAULT_CATS
    if paid_cats is None:
        paid_cats = FULLY_PAID_CATS

    # Vectorized membership test instead of a Python lambda per row.
    dat['Default_flag'] = dat['loan_status'].isin(default_cats).astype(int)

    n_default = int(dat['Default_flag'].sum())
    n_rows = dat.shape[0]
    # Guard the percentage so an empty frame reports 0.00 instead of raising
    # ZeroDivisionError.
    default_pct = n_default / n_rows * 100 if n_rows else 0.0
    print('Default loans among all samples: {} at {:.2f} percent'.format(
        n_default,
        default_pct))

    # Get paid flag
    dat['Paid_flag'] = dat['loan_status'].isin(paid_cats).astype(int)
    print ('Number of paid loans:', int(dat['Paid_flag'].sum()))
    return dat
    
# Adds Default_flag and Paid_flag to `dat`; the printed shape should show two
# more columns than before (151 -> 153 in the recorded output).
dat = create_labels(dat)

print(dat.shape)

Default loans among all samples: 163926 at 16.02 percent
Number of paid loans: 859279
(1023205, 153)


In [7]:
# MAIN TODOs
# NOTE(review): the text below is a bare string expression, not a comment, so
# the notebook echoes its repr as this cell's output. A markdown cell would
# present these open questions more readably.
"""

1. Review and code the labels like I should (e.g. should paricular cats go into 1,
and others go into 0?)

x. Later: Consider whether to include initial_list_status = w

"""


'\n\n1. Review and code the labels like I should (e.g. should paricular cats go into 1,\nand others go into 0?)\n\nx. Later: Consider whether to include initial_list_status = w\n\n'

### DP1c. Issue dates & months (used for (i) data review and (ii) splitting the sample later)
TODO: Use issue years/months to redo the sampling (work with a smaller data set)

In [8]:
# Create convenient issue year-month vars (for sample splitting etc.)
def create_yyyymm_util(y,m):
    """Concatenate a year and a month number into a 'YYYYMM'-style string.

    Months below 10 are prefixed with a single '0', so a four-digit year
    combined with a month from 1 to 12 always yields six characters.
    """
    zero_pad = "0" if m < 10 else ""
    return str(y) + zero_pad + str(m)

def create_timing_cols(dat):
    """Parse ``issue_d`` and derive issue-year and issue-year-month columns.

    Columns written on ``dat`` (in place):
      * ``issue_d``    -- parsed to datetime from the 'Mon-YYYY' text format
                          (e.g. 'Dec-2015');
      * ``issue_year`` -- calendar year of ``issue_d``;
      * ``issue_ym``   -- 'YYYYMM' string, e.g. '201512'.

    The derived columns use the vectorized ``.dt`` accessors rather than a
    Python callback per row. A missing issue date stays missing in both
    derived columns rather than becoming a string such as 'nannan'.

    Returns the same ``dat`` object.
    """
    issue_dates = pd.to_datetime(dat['issue_d'], format = '%b-%Y')
    dat['issue_d'] = issue_dates
    dat['issue_year'] = issue_dates.dt.year
    # %Y%m zero-pads the month, giving a six-character key that sorts
    # chronologically.
    dat['issue_ym'] = issue_dates.dt.strftime('%Y%m')
    return dat

# Parses issue_d and adds issue_year / issue_ym to `dat`.
# issue_year is used by the QA summary filter and issue_ym is written to the output file.
dat = create_timing_cols(dat)

In [9]:
#dat.groupby('issue_ym')['Default_flag'].mean().sort_index().head()
# NOTE(review): the text below is a bare string expression, so its repr is
# echoed as the cell output. It records hand-written observations about the
# default rate by issue month; presumably they came from the commented-out
# groupby above -- confirm before relying on the figures.
"""Description: 
200706 and 200707: Pre-recession
200708-200807: Recession 20% - 30%
200808-200901: ~ 15%
200902+: Not bad
201603 - 201709: ~ 20% again wtf
"""
# Comment: Maybe the distribution of loan types has changed!!

'Description: \n200706 and 200707: Pre-recession\n200708-200807: Recession 20% - 30%\n200808-200901: ~ 15%\n200902+: Not bad\n201603 - 201709: ~ 20% again wtf\n'

### QA1. Output the check list for the vars' values

In [10]:
def get_cont_distribution_dict(dat, varlist=None, filter_ys=None):
    """Summarise numeric columns with their min, max and 95th percentile.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain every column in ``varlist`` and, when ``filter_ys`` is
        given, an ``issue_year`` column.
    varlist : list of str, optional
        Columns to summarise. ``None`` (the default) means no columns.
    filter_ys : list-like of years, optional
        When given, only rows whose ``issue_year`` is in it are used.

    Returns
    -------
    dict
        ``{column: {'min': ..., 'max': ..., '95th': ...}}``. NaNs are ignored;
        a column with no non-missing values after filtering gets NaN for all
        three statistics.

    The number of summarised columns is printed as a quick sanity check.
    """
    if varlist is None:
        varlist = []

    # `is not None` rather than `!= None`: the latter compares elementwise on a
    # numpy array / Series and then fails the truth test.
    if filter_ys is not None:# If there's a filter
        dat = dat[dat['issue_year'].isin(filter_ys)]

    tempdict = {}
    for col in varlist:
        values = dat[col].values
        if pd.isnull(values).all():
            # Covers a zero-row selection (np.nanmin would raise ValueError)
            # and an all-NaN column (numpy would warn and return NaN anyway).
            tempdict[col] = {'min': np.nan, 'max': np.nan, '95th': np.nan}
        else:
            tempdict[col] = {
                'min': np.nanmin(values),
                'max': np.nanmax(values),
                '95th': np.nanpercentile(values, 95),
            }
        # TODO: Add missingness % as well
    print(len(tempdict))
    return tempdict

#get_cont_distribution_dict = utils.get_cont_distribution_dict# TODO: Uncomment this later

# QA reference values: distribution summary of the model's numeric columns,
# computed on loans issued in 2015 only. The 'cat' slot is left empty here.
qavals_dict = {'continuous':{}, 'cat':{}}
qavals_dict['continuous'] = get_cont_distribution_dict(dat, varlist = numeric_cols_4_model, filter_ys = [2015])

# Now output
# The context manager closes (and flushes) the file even if pickling fails;
# the bare open(...) used before left the handle to the garbage collector.
with open(FPATH + QAFNAME, "wb") as qa_file:
    pickle.dump(qavals_dict, qa_file)

75


#### Get 3-digit zip codes with high default risk (used for dummy creation later)
TODO (01/09/20): Consider grouping some zips together to create more than one variable,
or creating a single variable with several risk levels.

In [11]:
def find_highrisk_zips(dat, threshold=0.3, min_loans=1):
    """Return the 3-digit zip prefixes whose observed default rate exceeds ``threshold``.

    Side effect: adds (or overwrites) a string column ``zip3digit`` on ``dat``
    holding the first three characters of ``zip_code``.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain ``zip_code`` and a 0/1 ``Default_flag`` column.
    threshold : float, default 0.3
        A prefix is returned when its default rate is strictly greater than
        this value.
    min_loans : int, default 1
        Minimum number of loans a prefix needs before it can be flagged.
        The default keeps every prefix; raise it to stop prefixes with only a
        handful of loans from being flagged on noise.

    Returns
    -------
    list of str
        Qualifying prefixes, highest default rate first.

    NOTE(review): the rates are computed from every row passed in. If the
    output is later split into train/test by issue month, this flag will have
    been built using labels from the evaluation rows -- confirm that is
    acceptable.
    """
    # Rows with a missing zip are left out of the rates instead of crashing on
    # slicing a float NaN.
    has_zip = dat['zip_code'].notnull()
    zip3 = dat['zip_code'].astype(str).str[:3]
    dat['zip3digit'] = zip3

    # Default rate per prefix = defaults / loans.
    by_zip = dat.loc[has_zip, 'Default_flag'].groupby(zip3[has_zip])
    n_default = by_zip.sum()
    n_loans = by_zip.size()
    default_ratio = (n_default / n_loans)[n_loans >= min_loans]
    default_ratio = default_ratio.sort_values(ascending=False)

    # Keep the prefixes above the threshold, highest rate first.
    return default_ratio[default_ratio > threshold].index.tolist()

# List of 3-digit zip prefixes with a default rate above the function's cut-off;
# passed to process_cat_vars below to build the highRiskZip flag.
highrisk_zips = find_highrisk_zips(dat)
print(highrisk_zips)

['938', '643', '682', '502', '692', '345', '503', '522', '528', '555', '709', '742', '901', '007', '835', '204', '569', '969', '203', '738']


Some variables need special handling: for `mths_since_last_major_derog`, `mths_since_recent_bc_dlq` and `mths_since_recent_revol_delinq`, replacing NA with 0 would be wrong (0 months would read as a very recent event rather than a missing one), so we add dummy variables that flag those NAs.

In [12]:
def one_hot_encode_pandas(df, cat_colname):
    """Build indicator (one-hot) columns for one categorical column.

    Side effect: ``df[cat_colname]`` is converted to a pandas Categorical
    in place before encoding.

    Args:
        df: DataFrame that contains ``cat_colname``.
        cat_colname: name of the column to encode.

    Returns:
        A DataFrame indexed like ``df`` with one column per category, each
        named ``<cat_colname>_<category>``.
    """
    as_categorical = pd.Categorical(df[cat_colname])
    df[cat_colname] = as_categorical

    # Encode the column (not the bare Categorical) so the result keeps df's index.
    indicator_frame = pd.get_dummies(df[cat_colname], prefix=cat_colname)
    return indicator_frame
 
def process_cat_vars(df, cat_vars, highRiskZip):
    """Build the categorical feature frame.

    Steps:
      1. derive ``zip3digit`` and a 0/1 ``highRiskZip`` flag;
      2. add 0/1 flags marking missing values in three "months since ..."
         columns;
      3. normalise the text of several categorical columns (lower-casing etc.)
         so the values are convenient to match against new data;
      4. one-hot encode every column in ``cat_vars``.

    Side effects: steps 1-3 write to the caller's ``df`` in place, and the
    encoder converts each ``cat_vars`` column to a Categorical in place. The
    one-hot columns themselves are attached only to a local copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id``, ``zip_code``, the three "months since" columns,
        the cleaned categorical columns and every column in ``cat_vars``.
    cat_vars : list of str
        Columns to one-hot encode.
    highRiskZip : list of str
        3-digit zip prefixes to flag as high risk.

    Returns
    -------
    (pandas.DataFrame, list of str)
        ``df[['id'] + CAT_NAMES_TO_KEEP]`` and the names of all categorical
        features created here.

    NOTE(review): CAT_NAMES_TO_KEEP is not defined in this notebook;
    presumably it comes from lists.py (run via %run) -- confirm.
    """
    # Return all cat_names created here
    cat_names_list = []

    # Get 3 digit zipcode (vectorized; a missing zip simply never matches).
    df['zip3digit'] = df['zip_code'].astype(str).str[:3]
    # Create a column to flag people in high risk location. isin does one
    # hashed lookup for the whole column instead of scanning the list per row.
    df['highRiskZip'] = df['zip3digit'].isin(list(highRiskZip)).astype(int)

    # add name of new column to created_cat_names
    cat_names_list.append('highRiskZip')

    # Create dummy variables to flag missing month_since_$ values. These are
    # computed from df as passed in, so they are only meaningful if the
    # columns have not been NA-filled beforehand.
    age_vars = ['mths_since_last_major_derog','mths_since_recent_bc_dlq','mths_since_recent_revol_delinq']
    dummy_vars = ['major_derog_NA','bc_dlq_NA','revol_delinq_NA']

    for age_col, flag_col in zip(age_vars, dummy_vars):
        df[flag_col] = df[age_col].isnull().astype(int)
        cat_names_list.append(flag_col)

    # Clean up some cat vars
    # Anything that does not mention '36' (including a missing term) becomes '60'.
    df['term'] = df['term'].apply(lambda x: '36' if '36' in str(x) else '60')
    # str() turns a missing value into the literal 'nan', which then becomes
    # its own category when one-hot encoded.
    for col in ('grade', 'emp_length', 'home_ownership',
                'disbursement_method', 'application_type', 'verification_status'):
        df[col] = df[col].apply(lambda x: str(x).lower())
    # purpose: lower-case, then collapse every run of non-letters to one space.
    df['purpose'] =df['purpose'].apply(lambda x: re.sub('[^a-z]+',' ', str(x).lower()) )

    # One-hot-encode categorical variables. Collect the pieces and concatenate
    # once, rather than growing a frame inside the loop.
    dummy_frames = []
    for col in cat_vars:
        dfDummies = one_hot_encode_pandas(df, col)

        assert dfDummies.shape[0] == df.shape[0]

        dummy_frames.append(dfDummies)

    if dummy_frames:
        dfCats = pd.concat(dummy_frames, axis=1)
    else:
        # pd.concat([]) raises; an empty cat_vars just contributes no columns.
        dfCats = pd.DataFrame(index=df.index)

    # Rename one column so XGBoost can fit the model
    dfCats = dfCats.rename(columns = {'emp_length_< 1 year': 'emp_length_less 1 year'})

    # Add new column names to list
    cat_names_list.extend(dfCats.columns)
    df = pd.concat([df, dfCats], axis = 1)

    return df[['id'] + CAT_NAMES_TO_KEEP], cat_names_list

def feature_engineer_numeric_vars(df, numeric_vars):
    """Select 'id' plus the numeric model columns and replace missing values with 0.

    Returns a new frame; ``df`` itself is left untouched. The fill is applied
    to every selected column, 'id' included (it is expected to have no missing
    values, so it is unaffected in practice).
    """
    wanted_columns = ['id'] + numeric_vars
    numeric_subset = df[wanted_columns]
    return numeric_subset.fillna(0)


In [13]:
# Numeric features: 'id' plus numeric_cols_4_model, with missing values set to 0.
# This returns a new frame, so `dat` still holds the original NaNs that the
# missing-value flags in process_cat_vars are computed from.
numeric_postfe_df = feature_engineer_numeric_vars(dat, numeric_cols_4_model)
print(numeric_postfe_df.shape)

# Categorical features: 'id' plus the kept dummy/flag columns.
# Note that process_cat_vars also rewrites several categorical columns of
# `dat` in place (e.g. 'term' becomes '36'/'60').
cat_postfe_df, all_cat_vars_created = process_cat_vars(dat, CAT_VARS, highrisk_zips)
print(cat_postfe_df.shape)

# Label and issue month, carried with 'id' so the three frames can be joined.
other_vars_postfe_df= dat[['id', 'Default_flag', 'issue_ym']]
print(other_vars_postfe_df.shape)


(1023205, 76)
(1023205, 46)
(1023205, 3)


## OUTPUT:

In [14]:
outfile = FPATH + OFNAME

# Join label/timing columns, categorical features and numeric features on
# 'id' (default inner join), then write the result without the index.
(
    other_vars_postfe_df
    .merge(cat_postfe_df, on = ['id'])
    .merge(numeric_postfe_df, on = ['id'])
    .to_csv(outfile, index=False)
)