# Data Preparation

## Import Libraries

In [1]:
import numpy as np
import pandas as pd

## Import Data

In [2]:
loan_data = pd.read_csv('pt_1_loan_data_train.csv', index_col = 0)

In [3]:
loan_data_targets = pd.DataFrame(loan_data['good_bad'])

## Explore Data

## General Preprocessing

### Preprocessing few discrete variables

In [4]:
# We create dummy variables from all 8 original independent variables, and save them into a list.
# Note that we are using a particular naming convention for all variables: original variable name, colon, category name.
loan_data_dummies = [pd.get_dummies(loan_data['grade'], prefix = 'grade', prefix_sep = ':'),
                     pd.get_dummies(loan_data['home_ownership'], prefix = 'home_ownership', prefix_sep = ':'),
                     pd.get_dummies(loan_data['purpose'], prefix = 'purpose', prefix_sep = ':'),
                     pd.get_dummies(loan_data['verification_status'], prefix = 'verification_status', prefix_sep = ':')]
# We concatenate the dummy variables and this turns them into a dataframe.
loan_data_dummies = pd.concat(loan_data_dummies, axis = 1)
# Here we concatenate the dataframe with original data with the dataframe with dummy variables, along the columns. 
loan_data = pd.concat([loan_data, loan_data_dummies], axis = 1)

# PD model

## Data preparation

In [5]:
df_inputs_prepr = loan_data
df_targets_prepr = loan_data_targets

### Preprocessing Discrete Variables: Automating Calculaions

In [6]:
# WoE function for discrete unordered variables
def woe_discrete(df, discrete_variabe_name, good_bad_variable_df):
    df = pd.concat([df[discrete_variabe_name], good_bad_variable_df], axis = 1)
    df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),
                    df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)
    df = df.iloc[:, [0, 1, 3]]
    df.columns = [df.columns.values[0], 'n_obs', 'prop_good']
    df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()
    df['n_good'] = df['prop_good'] * df['n_obs']
    df['n_bad'] = (1 - df['prop_good']) * df['n_obs']
    df['prop_n_good'] = df['n_good'] / df['n_good'].sum()
    df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()
    df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])
    df = df.sort_values(['WoE'])
    df = df.reset_index(drop = True)
    df['diff_prop_good'] = df['prop_good'].diff().abs()
    df['diff_WoE'] = df['WoE'].diff().abs()
    df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']
    df['IV'] = df['IV'].sum()
    return df
# Here we combine all of the operations above in a function.
# The function takes 3 arguments: a dataframe, a string, and a dataframe. The function returns a dataframe as a result.

### Preprocessing Discrete Variables: Visualizing Results

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
# Imports the libraries we need.
sns.set()
# We set the default style of the graphs to the seaborn style. 

In [8]:
# Below we define a function that takes 2 arguments: a dataframe and a number.
# The number parameter has a default value of 0.
# This means that if we call the function and omit the number parameter, it will be executed with it having a value of 0.
# The function displays a graph.
def plot_by_woe(df_WoE, rotation_of_x_axis_labels = 0):
    x = np.array(df_WoE.iloc[:, 0].apply(str))
    # Turns the values of the column with index 0 to strings, makes an array from these strings, and passes it to variable x.
    y = df_WoE['WoE']
    # Selects a column with label 'WoE' and passes it to variable y.
    plt.figure(figsize=(18, 6))
    # Sets the graph size to width 18 x height 6.
    plt.plot(x, y, marker = 'o', linestyle = '--', color = 'k')
    # Plots the datapoints with coordiantes variable x on the x-axis and variable y on the y-axis.
    # Sets the marker for each datapoint to a circle, the style line between the points to dashed, and the color to black.
    plt.xlabel(df_WoE.columns[0])
    # Names the x-axis with the name of the column with index 0.
    plt.ylabel('Weight of Evidence')
    # Names the y-axis 'Weight of Evidence'.
    plt.title(str('Weight of Evidence by ' + df_WoE.columns[0]))
    # Names the grapth 'Weight of Evidence by ' the name of the column with index 0.
    plt.xticks(rotation = rotation_of_x_axis_labels)
    # Rotates the labels of the x-axis a predefined number of degrees.

### Preprocessing Discrete Variables

In [9]:
# Add your code here

### Preprocessing Continuous Variables: Automating Calculations and Visualizing Results

In [10]:
# WoE function for ordered discrete and continuous variables
def woe_ordered_continuous(df, discrete_variabe_name, good_bad_variable_df):
    df = pd.concat([df[discrete_variabe_name], good_bad_variable_df], axis = 1)
    df = pd.concat([df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].count(),
                    df.groupby(df.columns.values[0], as_index = False)[df.columns.values[1]].mean()], axis = 1)
    df = df.iloc[:, [0, 1, 3]]
    df.columns = [df.columns.values[0], 'n_obs', 'prop_good']
    df['prop_n_obs'] = df['n_obs'] / df['n_obs'].sum()
    df['n_good'] = df['prop_good'] * df['n_obs']
    df['n_bad'] = (1 - df['prop_good']) * df['n_obs']
    df['prop_n_good'] = df['n_good'] / df['n_good'].sum()
    df['prop_n_bad'] = df['n_bad'] / df['n_bad'].sum()
    df['WoE'] = np.log(df['prop_n_good'] / df['prop_n_bad'])
    #df = df.sort_values(['WoE'])
    #df = df.reset_index(drop = True)
    df['diff_prop_good'] = df['prop_good'].diff().abs()
    df['diff_WoE'] = df['WoE'].diff().abs()
    df['IV'] = (df['prop_n_good'] - df['prop_n_bad']) * df['WoE']
    df['IV'] = df['IV'].sum()
    return df
# Here we define a function similar to the one above, ...
# ... with one slight difference: we order the results by the values of a different column.
# The function takes 3 arguments: a dataframe, a string, and a dataframe. The function returns a dataframe as a result.

In [11]:
loan_data['term'].unique()

array(['36 months', '60 months'], dtype=object)

In [12]:
loan_data['term:numberic'] = pd.to_numeric(loan_data['term'].str.replace(' months',''))
# loan_data['term:numberic'].info

In [13]:
loan_data['term:numberic'].describe()

count    24000.000000
mean        42.618000
std         10.725617
min         36.000000
25%         36.000000
50%         36.000000
75%         60.000000
max         60.000000
Name: term:numberic, dtype: float64

In [14]:
loan_data['acc_now_delinq'].unique()

array([ 0., nan,  2.,  1.,  3.])

In [15]:
df_temp = woe_discrete(loan_data,'acc_now_delinq',loan_data_targets)
df_temp

Unnamed: 0,acc_now_delinq,n_obs,prop_good,prop_n_obs,n_good,n_bad,prop_n_good,prop_n_bad,WoE,diff_prop_good,diff_WoE,IV
0,2.0,11,0.727273,0.000491,8.0,3.0,0.000417,0.000931,-0.803451,,,inf
1,1.0,77,0.844156,0.003437,65.0,12.0,0.003389,0.003726,-0.0948,0.116883,0.708651,inf
2,0.0,22314,0.856323,0.996027,19108.0,3206.0,0.996142,0.995343,0.000803,0.012168,0.095602,inf
3,3.0,1,1.0,4.5e-05,1.0,0.0,5.2e-05,0.0,inf,0.143677,inf,inf


In [16]:
loan_data['acc_now_delinq'].count()

22403

In [17]:
loan_data['acc_now_delinq'].isnull().value_counts()

acc_now_delinq
False    22403
True      1597
Name: count, dtype: int64

In [18]:
print(22314+22403)

44717


In [19]:
loan_data['acc_now_delinq'].describe()

count    22403.000000
mean         0.004553
std          0.076042
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          3.000000
Name: acc_now_delinq, dtype: float64

In [20]:
loan_data['annual_inc'].isnull().value_counts()

annual_inc
False    24000
Name: count, dtype: int64

In [21]:
loan_data['annual_inc'].fillna(loan_data['annual_inc'].mean(),inplace=True)
loan_data['annual_inc'].isnull().value_counts()

annual_inc
False    24000
Name: count, dtype: int64