## EDA and data preparation code development

### Northwestern Banking

#### Loan prediction project
##### Updated 10-10-2020 Lescher

In [1]:
# Import libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import datetime
import os


In [2]:
# Import CSV
# NOTE(review): datapath is bound to the os.path module itself and is not used
# by any visible cell -- confirm whether a real data directory was intended
datapath = os.path
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-dtype columns (and the
# DtypeWarning pandas emits for them)
df = pd.read_csv("smallerdata.csv", sep=',', low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Declare all Functions here

In [3]:
def dateconvert(datestring, century_cutoff=21):
    """Convert a month/year string into a datetime on the 1st of that month.

    Recognised layouts:
        missing value (None or any float, e.g. NaN) -> 1900-01-01 sentinel
        "3-Jan"     single-digit year, month         -> 2003-01-01
        "19-Mar"    two-digit year, month            -> 2019-03-01
        "Feb-2000"  month, four-digit year           -> 2000-02-01
        "Feb-01"    month, two-digit year            -> 2001-02-01

    Parameters:
        datestring: the raw cell value (str, float NaN or None).
        century_cutoff: two-digit years strictly below this value are placed
            in the 2000s, all others in the 1900s. Defaults to 21.

    Returns:
        datetime.datetime for the first day of the parsed month.

    Raises:
        ValueError: if the string matches none of the layouts above, or the
            month/year pieces cannot be parsed by strptime.
    """
    #empty cells: pandas hands missing values over as float NaN
    if datestring is None or isinstance(datestring, float):
        year = "1900"
        month = "Jan"
    #3-Jan for 01/01/2003
    elif len(datestring) == 5:
        year = "200" + datestring[0:1]
        month = datestring[2:5]
    #19-Mar for 03/01/2019
    elif datestring[0:2].isnumeric():
        year = datestring[0:2]
        month = datestring[3:6]
    #Feb-2000 for 02/01/2000 and Feb-01 for 02/01/2001
    elif datestring[4:8].isnumeric() and len(datestring[4:8]) in (2, 4):
        month = datestring[0:3]
        year = datestring[4:8]
    else:
        #fail with a clear message instead of an UnboundLocalError further down
        raise ValueError("unrecognised date format: %r" % (datestring,))

    #two-digit years are expanded by hand because strptime's %y would pick
    #the century itself (e.g. "65" -> 2065)
    if len(year) == 2:
        century = "20" if int(year) < century_cutoff else "19"
        year = century + year

    #always parse a four-digit year so the century chosen above is respected
    date_time_str = month + ' 01 ' + year
    return datetime.datetime.strptime(date_time_str, '%b %d %Y')

###  Data Cleaning Code

#### id

In [4]:
#removes the summary rows, which all contain the word "amount" in the id column
#the id is turned into text so it can be searched, then cast back to integer afterwards
df['id'] = df['id'].astype(str)
#regex=False: "amount" is a literal substring, not a pattern
#reset_index returns a new frame with a gap-free 0..n-1 index, so the cast below
#writes to an independent frame rather than a slice (no SettingWithCopyWarning)
df = df[~df['id'].str.contains("amount", regex=False)].reset_index(drop=True)
df['id'] = df['id'].astype(np.int32)

#### Date related columns

In [5]:
#add the column headers you need to this date converting list
datecol_list = ['issue_d','earliest_cr_line','last_pymnt_d','last_credit_pull_d',
                'hardship_start_date','hardship_end_date','payment_plan_start_date',
                'debt_settlement_flag_date','settlement_date']

#runs every date column through dateconvert and stores the result in a new column
#named after the original with a "2" at the end
#Series.map converts one column value at a time; the previous df.apply(axis=1)
#built a full row object for every row, once per date column
for datecol in datecol_list:
    df[datecol + "2"] = pd.to_datetime(df[datecol].map(dateconvert))

#### int_rate

In [6]:
# cast 'int_rate' to text, drop any trailing % sign, parse the remainder as a
# float and rescale from percent to a fraction
df['int_rate'] = (
    df['int_rate']
    .astype(str)
    .str.rstrip('%')
    .astype('float')
    .div(100.0)
)

## Feature Generation

#### len_credit

In [7]:
#length of credit history in years: whole days between the earliest credit line
#and the issue date, divided by 365
df['len_credit'] = pd.to_numeric(df['issue_d2'].sub(df['earliest_cr_line2']).dt.days).div(365)

#### max_fico_high

In [8]:
#row-wise maximum of the two "high" FICO columns (max skips missing values by default)
df['max_fico_high'] = df.loc[:, ["fico_range_high", "sec_app_fico_range_high"]].max(axis="columns")

#### max_fico_low

In [9]:
#row-wise maximum of the two "low" FICO columns (max skips missing values by default)
df['max_fico_low'] = df.loc[:, ["fico_range_low", "sec_app_fico_range_low"]].max(axis="columns")

#### delinq_amt_pct 

In [10]:
#delinquent amount as a share of outstanding principal
#a zero denominator is swapped for NaN first, so the ratio is missing instead of +/-inf
df['delinq_amt_pct'] = df['delinq_amnt'] / df['out_prncp'].replace(0, np.nan)

#### sats_pct 

In [11]:
#num_sats as a share of open_acc
#a zero denominator is swapped for NaN first, so the ratio is missing instead of +/-inf
df['sats_pct'] = df['num_sats'] / df['open_acc'].replace(0, np.nan)

#### emp_length2

In [12]:
#converts employment length to a whole number of years (imputes 0 years for null)
#start from the first character of the label, then patch the special cases
df['emp_length2'] = df['emp_length'].str[0:1]
#"< 1 year" counts as 0 years
df.loc[df['emp_length'].eq("< 1 year"), 'emp_length2'] = "0"
#"10+ years", and any label whose first character is ">", counts as 10 years
df.loc[df['emp_length'].eq("10+ years") | df['emp_length2'].eq(">"), 'emp_length2'] = "10"
#missing employment length is imputed as 0 years
df.loc[df['emp_length'].isna(), 'emp_length2'] = "0"
df['emp_length2'] = df['emp_length2'].astype(int)

#### initial_list_status (create dummy)

In [13]:
#dummy for the listing status: "w" -> 0, "f" -> 1 (rows matching neither are not assigned)
df.loc[df['initial_list_status'].eq("w"), 'initial_list_status2'] = 0
df.loc[df['initial_list_status'].eq("f"), 'initial_list_status2'] = 1

#### hardship_flag

In [14]:
#dummy for the hardship flag: "N" -> 0, "Y" -> 1 (rows matching neither are not assigned)
df.loc[df['hardship_flag'].eq("N"), 'hardship_flag2'] = 0
df.loc[df['hardship_flag'].eq("Y"), 'hardship_flag2'] = 1

# UPDATE
#### home_ownership

In [15]:
#placeholder: home_ownership2 is an unchanged copy of home_ownership
#NOTE(review): no encoding is applied here yet -- confirm what transformation is intended
df['home_ownership2'] = df['home_ownership'].copy()

# UPDATE
#### desc

In [16]:
#placeholder: desc2 is an unchanged copy of desc
#NOTE(review): no text processing is applied here yet -- confirm what transformation is intended
df['desc2'] = df['desc'].copy()

#### verification_status

In [17]:
#dummy for income verification: "Not Verified" -> 0, either verified status -> 1
#(rows matching none of the three labels are not assigned)
df.loc[df['verification_status'].eq("Not Verified"), 'verification_status2'] = 0
df.loc[df['verification_status'].isin(["Verified", "Source Verified"]), 'verification_status2'] = 1

#### pymnt_plan

In [18]:
#dummy for the payment plan flag: "n" -> 0, "y" -> 1 (rows matching neither are not assigned)
df.loc[df['pymnt_plan'].eq("n"), 'pymnt_plan2'] = 0
df.loc[df['pymnt_plan'].eq("y"), 'pymnt_plan2'] = 1

#### default_ind

In [19]:
#binary indicator for defaulted (1) or not (0)
#every row starts at 2 as an error check: a 2 that survives means the loan_status
#value appears in neither list below
df['default_ind'] = 2
#list of all good status loans
df.loc[df['loan_status'].isin([
    'Fully Paid',
    'Current',
    'Does not meet the credit policy. Status:Fully Paid',
    'In Grace Period',
    'Late (16-30 days)',
    'Late (31-120 days)',
]), 'default_ind'] = 0
#list of all defaulted loans
df.loc[df['loan_status'].isin([
    'Charged Off',
    'Default',
    'Does not meet the credit policy. Status:Charged Off',
]), 'default_ind'] = 1

## Row/bad data deletion

#### Bad dates

In [20]:
#removes rows with empty dates; dateconvert sets empty dates to the 1900-01-01 sentinel
#a row is kept only when BOTH dates are real ("&"): len_credit is computed from the two
#of them, so a single sentinel date already makes the row unusable
df = df[(df['issue_d2'] != "1900-01-01") & (df['earliest_cr_line2'] != "1900-01-01")]

## Modeling data set creation

In [21]:
#modeling data set: only the columns listed here are kept, in this order
modeldf = df.loc[:, [
    'id', 'default_ind', 'loan_amnt', 'funded_amnt', 'term', 'int_rate',
    'installment', 'grade', 'sub_grade', 'emp_length2', 'home_ownership2',
    'annual_inc', 'verification_status2', 'pymnt_plan2', 'desc2', 'purpose',
    'dti', 'delinq_2yrs', 'earliest_cr_line2', 'mths_since_last_delinq',
    'open_acc', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status2',
    'collections_12_mths_ex_med', 'mths_since_last_major_derog',
    'application_type', 'annual_inc_joint', 'dti_joint',
    'verification_status_joint', 'acc_now_delinq', 'tot_coll_amt',
    'tot_cur_bal', 'all_util', 'inq_last_12m', 'bc_util',
    'chargeoff_within_12_mths', 'delinq_amnt', 'num_accts_ever_120_pd',
    'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',
    'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75',
    'pub_rec_bankruptcies', 'tax_liens', 'tot_hi_cred_lim',
    'total_bal_ex_mort', 'hardship_flag2', 'delinq_amt_pct', 'sats_pct',
    'max_fico_high', 'max_fico_low', 'len_credit',
]]

In [22]:
#split into training and testing
#TODO: the split still needs to be coded in -- for now modeldftrain is only a second
#name for modeldf (the same object, not a copy), so it holds every row
modeldftrain=modeldf

In [23]:
#write the training frame to disk; with to_csv's defaults the DataFrame index is
#written as the first column
modeldftrain.to_csv(path_or_buf='modelingdftrain.csv')