In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import pickle
import seaborn as sns
import os
import pickle
import re
from collections import Counter
# import statsmodels.api as sm
from xgboost import XGBClassifier
#import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc, precision_score, recall_score, f1_score, confusion_matrix
from datetime import datetime 
from sklearn.model_selection import RandomizedSearchCV
import time
# from shapely import wkt
import warnings
warnings.filterwarnings('ignore')

import lists# Long-ass lists of vars etc. (keep in same folder)

%matplotlib inline
%run lists.py

pd.options.display.max_rows = 1000

Loading lists of vars and printing out some list sizes...
75
45


In [2]:
# Directory that holds the input files. It can be overridden with the
# LENDING_CLUB_DIR environment variable so the notebook is not tied to one
# machine; the default is the original hard-coded location.
# os.path.join(..., "") guarantees a trailing separator, because the cells below
# build paths with plain string concatenation (FPATH + FNAME).
FPATH = os.path.join(os.environ.get("LENDING_CLUB_DIR", "/Users/david.duong/dev/lending-club/"), "")
# CSV of loans to be scored.
FNAME= "loans_to_score_2020a.csv"
# Pickle of reference QA statistics that the new data is compared against.
QAFNAME = "qavals.pkl"

In [5]:
# Read the CSV located at FPATH + FNAME into a DataFrame.
dat = pd.read_csv(FPATH + FNAME)
# Print (rows, columns) as a quick sanity check on the load.
print(dat.shape)

(53, 122)


## 1. Data QA and clean up 
(No feature engineering or missing-value imputation yet.)

### 1a. Numeric Vars

In [8]:
def check_var_is_there(dat, varlist):
    """Report whether every name in ``varlist`` is a column of ``dat``.

    Stops at the first missing name, prints it, and returns "FAIL";
    returns "PASS" when nothing is missing.
    """
    print("Check that all vars is there...")
    available = dat.columns
    for name in varlist:
        if name in available:
            continue
        # First absent column is enough to fail the check.
        print("Found some column not in train, e.g. ", name)
        return "FAIL"
    return "PASS"
        
# Verify that every column listed in numeric_cols_4_model exists in dat;
# the returned "PASS"/"FAIL" string is displayed as the cell output.
check_var_is_there(dat, numeric_cols_4_model)


Check that all vars is there...


'PASS'

#### Main Processing: convert string columns to numeric

In [10]:
def find_non_numeric_vars(dat, varlist):
    """Return the columns of ``varlist`` that are not stored with a numeric dtype.

    Args:
        dat: DataFrame containing every column named in ``varlist``.
        varlist: Column names that are expected to hold numeric values.

    Returns:
        list: Names from ``varlist`` (in their original order) whose dtype is
        not numeric, i.e. the columns that still need string-to-float conversion.

    Raises:
        ValueError: If a non-numeric column is neither object- nor string-typed
            (e.g. datetime or bool); the second exception argument lists the
            offending columns.
    """
    dtypes_dict = dat[varlist].dtypes

    def _is_numeric(dtype):
        # Accept every integer/float width (int32, float32, ...), not only
        # int64/float64. Bool is excluded so it is never silently treated as a number.
        return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    def _is_string_like(dtype):
        # Object columns are how strings are usually stored; dedicated string
        # dtypes are accepted as well.
        return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)

    non_numeric_list = [col for col in varlist if not _is_numeric(dtypes_dict[col])]

    # Check each column explicitly instead of counting distinct dtypes, so a
    # single unexpected type (e.g. one datetime column) is also caught.
    unrecognized = [col for col in non_numeric_list if not _is_string_like(dtypes_dict[col])]
    if unrecognized:
        raise ValueError("There are potentially some unrecognized types (e.g. non-object (non-string) types)!!", unrecognized)
    return non_numeric_list

def util_str_to_numeric(instr):
    """Convert a single raw value to ``float``, mapping blank/"null" to NaN.

    Args:
        instr: Value to convert; normally a string, but numbers and ``None``
            are accepted too.

    Returns:
        float: ``float(instr)`` when that succeeds, otherwise ``np.nan`` for
        ``None``, for whitespace-only strings, and for the literal "null".

    Raises:
        ValueError: For any other value that cannot be converted.
    """
    try:
        return float(instr)# Note: This can take out the spaces e.g. " .95  " can be converted no problem
    except (TypeError, ValueError):
        # Only conversion failures are handled here; anything else propagates.
        pass

    if instr is None:
        return np.nan
    if not isinstance(instr, (str, bytes)):
        # Non-text objects have no .strip(); report them with the same error as bad strings.
        raise ValueError("Unrecognize Non-numeric string values, don't know how to convert: ", instr)

    stripped = instr.strip()
    if len(stripped) == 0 or stripped == "null":# i.e. instr is an empty string of any length
        return np.nan# Note that np.nan is not = np.nan, but np.isnan(np.nan) is True
        # Also note that later we will impute the missings for these vars so we shouldn't be doing that now
    raise ValueError("Unrecognize Non-numeric string values, don't know how to convert: ", instr)
        
def apply_convert_non_numeric(dat, nn_varlist):
    """Return a copy of ``dat`` with each column in ``nn_varlist`` converted.

    Every value of the listed columns is passed through ``util_str_to_numeric``;
    the input frame itself is left unmodified.
    """
    converted = dat.copy()
    columns_to_convert = dat[nn_varlist].columns
    for column_name in columns_to_convert:
        raw_values = converted[column_name]
        converted[column_name] = raw_values.apply(util_str_to_numeric)
    return converted
        
# Columns of numeric_cols_4_model that find_non_numeric_vars flags as needing conversion.
non_numeric_vars = find_non_numeric_vars(dat, numeric_cols_4_model)
# Rebind dat to the converted frame (apply_convert_non_numeric works on a copy).
dat = apply_convert_non_numeric(dat, non_numeric_vars)

# Per-column count of missing values after the conversion.
print("Check missingness after converting to float: ")
print(dat[numeric_cols_4_model].isnull().sum())

Check missingness after converting to float: 
loan_amnt                          0
int_rate                           0
installment                        0
annual_inc                         0
acc_now_delinq                     0
acc_open_past_24mths               0
bc_open_to_buy                     1
percent_bc_gt_75                   1
bc_util                            1
dti                                0
delinq_2yrs                        0
delinq_amnt                        0
fico_range_low                     0
fico_range_high                    0
inq_last_6mths                     0
mths_since_last_delinq            31
mths_since_last_record            39
mths_since_recent_inq              6
mths_since_recent_revol_delinq    37
mths_since_recent_bc               1
mort_acc                           0
open_acc                           0
pub_rec                            0
total_bal_ex_mort                  0
revol_bal                          0
revol_util                   

#### Main QA: Review distribution and compare to old (train) data

In [112]:
# TODO: Move these definitions to utils.py and remove them from this notebook.
# TODO 01/11/20: Write a function that converts a var to continuous first.
# NOTE: np.nanpercentile below will NOT work if any var still holds string values.

def get_cont_distribution_dict(dat, varlist=(), filter_ys=None):
    """Collect min, max and 95th percentile (NaN-ignoring) for each column.

    Args:
        dat: DataFrame holding the columns in ``varlist``; it must also have an
            ``issue_year`` column when ``filter_ys`` is given.
        varlist: Column names to summarise.
        filter_ys: ``None`` for no filtering, or a list/array of years; only
            rows whose ``issue_year`` is in it are used.

    Returns:
        dict: ``{col: {'min': ..., 'max': ..., '95th': ...}}`` for every column
        in ``varlist``.

    Side effects:
        Prints the number of summarised columns.
    """
    # Identity check, not `!= None`: the latter is evaluated element-wise for
    # arrays/Series and then fails when used as a condition.
    if filter_ys is not None:
        dat = dat[dat['issue_year'].isin(filter_ys)]

    tempdict = {}
    for col in varlist:
        values = dat[col].values
        tempdict[col] = {
            'min': np.nanmin(values),
            'max': np.nanmax(values),
            '95th': np.nanpercentile(values, 95),
        }
    print(len(tempdict))
    return tempdict

def check_vars_within_min_max(dat, varlist, dict0, dict1):
    """Print whether each variable's new min/max stays inside the reference range.

    Args:
        dat: Unused; kept so existing callers keep working.
        varlist: Variable names to compare.
        dict0: Reference statistics, read as ``dict0['continuous'][col]['min'|'max']``.
        dict1: New-data statistics with the same layout.

    Raises:
        ValueError: After all variables were processed, if any of them could
            not be compared; the second exception argument lists them.
    """
    def _stats_or_none(stats_dict, col):
        # Lookup for diagnostics only: must never raise inside the error handler.
        try:
            return stats_dict['continuous'][col]
        except (KeyError, TypeError):
            return None

    list_not_working = []
    for col in varlist:
        try:
            new_stats = dict1['continuous'][col]
            old_stats = dict0['continuous'][col]
            below_min = bool(new_stats['min'] < old_stats['min'])
            above_max = bool(new_stats['max'] > old_stats['max'])
        except (KeyError, TypeError, ValueError):
            list_not_working.append(col)
            print("Cannot compare for col ", col)
            print("Distrb (new vs. existing): ", _stats_or_none(dict1, col), _stats_or_none(dict0, col))
            continue

        # Report both bounds independently so a max violation is not hidden
        # by a min violation on the same variable.
        if below_min:
            print("Val lower than min: ", col, new_stats['min'], old_stats['min'])
        if above_max:
            print("Val greater than max: ", col, new_stats['max'], "vs. ", old_stats['max'])
        if not (below_min or above_max):
            print(col, "[ok]")
    if len(list_not_working) > 0:
        raise ValueError("Some vars still not comparable between new and old data: ", list_not_working)

def check_vars_with_abnormally_high_values(dat, varlist, dict0, dict1):
    """Print variables whose new maximum exceeds the reference 95th percentile.

    Args:
        dat: Unused; kept so existing callers keep working.
        varlist: Variable names to compare.
        dict0: Reference statistics, read as ``dict0['continuous'][col]['95th']``.
        dict1: New-data statistics, read as ``dict1['continuous'][col]['max']``.

    Raises:
        ValueError: On the first variable that cannot be compared; the second
            exception argument is the variable name and the original error is
            attached as ``__cause__``.
    """
    for col in varlist:
        try:
            m1 = dict1['continuous'][col]['max']
            h0 = dict0['continuous'][col]['95th']
            exceeds_p95 = bool(m1 > h0)
            # Only evaluated when the first comparison already holds.
            more_than_double = exceeds_p95 and bool(m1 > 2 * h0)
        except Exception as err:
            # Keep the failing column and the root cause instead of discarding them.
            raise ValueError("Hmm, still cannot compare", col) from err

        if exceeds_p95:
            print("Max Val greater than 95th: Max ", col, m1, "vs. ", h0)
            if more_than_double:
                print("!!!! HEAVY WARNING: More than double!!!\n")
        else:
            print(col, "[ok]")


75


In [122]:
# i. Get distribution
#get_distribution_dict = utils.get_distribution_dict# Restore this later
distribution_dict = {'continuous':{}, 'cat':{}}
distribution_dict['continuous'] = get_cont_distribution_dict(dat, varlist = numeric_cols_4_model)
# Load the reference QA statistics. The context manager closes the file handle
# (the previous bare open() left it open).
# NOTE(review): pickle.load can execute arbitrary code; only load a file you trust.
with open(FPATH + QAFNAME, "rb") as qa_file:
    qavals_dict = pickle.load(qa_file)

# ii. Compare min-max to old data:
check_vars_within_min_max(dat, numeric_cols_4_model, qavals_dict, distribution_dict)    

# iii. Compare max to high percentile e.g. 95th in old data:
check_vars_with_abnormally_high_values(dat, numeric_cols_4_model, qavals_dict, distribution_dict)

Val greater than max:  loan_amnt 40000.0 vs.  35000.0
int_rate [ok]
installment [ok]
annual_inc [ok]
acc_now_delinq [ok]
acc_open_past_24mths [ok]
bc_open_to_buy [ok]
percent_bc_gt_75 [ok]
bc_util [ok]
dti [ok]
delinq_2yrs [ok]
delinq_amnt [ok]
fico_range_low [ok]
fico_range_high [ok]
inq_last_6mths [ok]
mths_since_last_delinq [ok]
mths_since_last_record [ok]
mths_since_recent_inq [ok]
mths_since_recent_revol_delinq [ok]
mths_since_recent_bc [ok]
mort_acc [ok]
open_acc [ok]
pub_rec [ok]
total_bal_ex_mort [ok]
revol_bal [ok]
revol_util [ok]
total_bc_limit [ok]
total_acc [ok]
total_il_high_credit_limit [ok]
num_rev_accts [ok]
mths_since_recent_bc_dlq [ok]
pub_rec_bankruptcies [ok]
num_accts_ever_120_pd [ok]
chargeoff_within_12_mths [ok]
collections_12_mths_ex_med [ok]
tax_liens [ok]
mths_since_last_major_derog [ok]
num_sats [ok]
num_tl_op_past_12m [ok]
mo_sin_rcnt_tl [ok]
tot_hi_cred_lim [ok]
tot_cur_bal [ok]
avg_cur_bal [ok]
num_bc_tl [ok]
num_actv_bc_tl [ok]
num_bc_sats [ok]
pct_tl_nvr

Max Val greater than 95th: Max  loan_amnt 40000.0 vs.  30000.0
Max Val greater than 95th: Max  int_rate 25.65 vs.  17.86
Max Val greater than 95th: Max  installment 1336.23 vs.  982.02
Max Val greater than 95th: Max  annual_inc 300000.0 vs.  155000.0
acc_now_delinq [ok]
Max Val greater than 95th: Max  acc_open_past_24mths 15 vs.  10.0
Max Val greater than 95th: Max  bc_open_to_buy 91372.0 vs.  37627.399999999965

percent_bc_gt_75 [ok]
Max Val greater than 95th: Max  bc_util 99.7 vs.  98.2
Max Val greater than 95th: Max  dti 48.49 vs.  33.71
Max Val greater than 95th: Max  delinq_2yrs 4 vs.  2.0
delinq_amnt [ok]
fico_range_low [ok]
fico_range_high [ok]
inq_last_6mths [ok]
Max Val greater than 95th: Max  mths_since_last_delinq 76.0 vs.  74.0
Max Val greater than 95th: Max  mths_since_last_record 118.0 vs.  113.0
Max Val greater than 95th: Max  mths_since_recent_inq 23.0 vs.  19.0
Max Val greater than 95th: Max  mths_since_recent_revol_delinq 82.0 vs.  75.0
Max Val greater than 95th: Max 

### 2. Apply feature engineering

In [128]:
def feature_engineer_numeric_vars(df, numeric_vars):
    """Select ``id`` plus the numeric feature columns and fill missing values with 0.

    Args:
        df: DataFrame containing an ``id`` column and every column in ``numeric_vars``.
        numeric_vars: Iterable of numeric column names (list, tuple, Index, ...).

    Returns:
        DataFrame: A new frame with columns ``['id'] + numeric_vars`` in which
        every missing value has been replaced by 0.
    """
    # list(...) makes the concatenation safe for any iterable: a tuple would raise,
    # and a pandas Index would be string-concatenated element-wise with 'id'.
    selected_cols = ['id'] + list(numeric_vars)
    return df[selected_cols].fillna(0)# Note btw that id doesn't have missing so it won't be filled w any 0s

# Build the model-input frame: id plus the numeric columns, with missing values filled by 0.
numeric_post_fe_df = feature_engineer_numeric_vars(dat, numeric_cols_4_model)
# Shape check: expect one more column than numeric_cols_4_model (the extra one is id).
print(numeric_post_fe_df.shape)

(53, 76)
