In [1]:
# imports
import sqlalchemy
import psycopg2
from sqlalchemy import create_engine
from sql_config import protocol, username, password, host, port, database_name
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# create db connection and engine
rds_connection_string = f'{protocol}://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

In [2]:
# Read in the postgres table into a dataframe
df = pd.read_sql_query('select * from proj4_sch.vw_target_with_ext', con=engine)

In [3]:
df.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,,,,,,
4,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
df.describe(include = 'all').round(3)

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,307511.0,307511,307511,307511,307511,307511.0,307511.0,307511.0,307499.0,307233.0,...,307511.0,307511.0,307511.0,307511.0,265992.0,265992.0,265992.0,265992.0,265992.0,265992.0
unique,,2,3,2,2,,,,,,...,,,,,,,,,,
top,,Cash loans,F,N,Y,,,,,,...,,,,,,,,,,
freq,,278232,202448,202924,213312,,,,,,...,,,,,,,,,,
mean,0.081,,,,,0.417,168797.9,599026.0,27108.574,538396.207,...,0.008,0.001,0.001,0.0,0.006,0.007,0.034,0.267,0.265,1.9
std,0.272,,,,,0.722,237123.1,402490.777,14493.737,369446.461,...,0.09,0.024,0.023,0.018,0.084,0.111,0.205,0.916,0.794,1.869
min,0.0,,,,,0.0,25650.0,45000.0,1615.5,40500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,,,,,0.0,112500.0,270000.0,16524.0,238500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,,,,,0.0,147150.0,513531.0,24903.0,450000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,,,,,1.0,202500.0,808650.0,34596.0,679500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0


In [5]:
# clean NaN
df = df.fillna(0)

In [6]:
# build a list of all the int64 variable
string_cols = df.select_dtypes(include=['object']).columns.tolist()
print(string_cols)

['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']


In [7]:
for col in string_cols:
      print(df[col].value_counts())

Cash loans         278232
Revolving loans     29279
Name: NAME_CONTRACT_TYPE, dtype: int64
F      202448
M      105059
XNA         4
Name: CODE_GENDER, dtype: int64
N    202924
Y    104587
Name: FLAG_OWN_CAR, dtype: int64
Y    213312
N     94199
Name: FLAG_OWN_REALTY, dtype: int64
Unaccompanied      248526
Family              40149
Spouse, partner     11370
Children             3267
Other_B              1770
0                    1292
Other_A               866
Group of people       271
Name: NAME_TYPE_SUITE, dtype: int64
Working                 158774
Commercial associate     71617
Pensioner                55362
State servant            21703
Unemployed                  22
Student                     18
Businessman                 10
Maternity leave              5
Name: NAME_INCOME_TYPE, dtype: int64
Secondary / secondary special    218391
Higher education                  74863
Incomplete higher                 10277
Lower secondary                    3816
Academic degree              

---
# Transform - binning


In [8]:
df.shape

(307511, 92)

#### Gender

In [9]:
df['CODE_GENDER'].value_counts()

F      202448
M      105059
XNA         4
Name: CODE_GENDER, dtype: int64

In [24]:
# remove genders other than M and F
df_clean = df[df['CODE_GENDER'] != 'XNA']
df_clean.shape

(307507, 92)

#### CNT_CHILDREN

In [25]:
df_clean['CNT_CHILDREN'].value_counts()

0     215369
1      61118
2      26748
3       3717
4        429
5         84
6         21
7          7
14         3
8          2
9          2
12         2
10         2
19         2
11         1
Name: CNT_CHILDREN, dtype: int64

In [26]:
# Bin a column

col_to_bin = 'CNT_CHILDREN'         # name of the column
Cutoff = 500                        # cut off value for binning    
bin_number = 4                      # number/name of the collection bin

a = df_clean[col_to_bin].value_counts()
application_types_to_replace = a[a < Cutoff].index.tolist()

# Replace in dataframe
for app in application_types_to_replace:
    df_clean[col_to_bin] = df_clean[col_to_bin].replace(app,bin_number)

# Check to make sure binning was successful
df_clean[col_to_bin].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col_to_bin] = df_clean[col_to_bin].replace(app,bin_number)


0    215369
1     61118
2     26748
3      3717
4       555
Name: CNT_CHILDREN, dtype: int64

#### NAME_INCOME_TYPE

In [27]:
df_clean['NAME_INCOME_TYPE'].value_counts()

Working                 158771
Commercial associate     71616
Pensioner                55362
State servant            21703
Unemployed                  22
Student                     18
Businessman                 10
Maternity leave              5
Name: NAME_INCOME_TYPE, dtype: int64

In [28]:
# Bin a column

col_to_bin = 'NAME_INCOME_TYPE'         # name of the column
Cutoff = 100                        # cut off value for binning    
bin_number = 'Other'                      # number/name of the collection bin

a = df_clean[col_to_bin].value_counts()
application_types_to_replace = a[a < Cutoff].index.tolist()

# Replace in dataframe
for app in application_types_to_replace:
    df_clean[col_to_bin] = df_clean[col_to_bin].replace(app,bin_number)

# Check to make sure binning was successful
df_clean[col_to_bin].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[col_to_bin] = df_clean[col_to_bin].replace(app,bin_number)


Working                 158771
Commercial associate     71616
Pensioner                55362
State servant            21703
Other                       55
Name: NAME_INCOME_TYPE, dtype: int64

#### NAME_EDUCATION_TYPE

In [29]:
df_clean['NAME_EDUCATION_TYPE'].value_counts()

Secondary / secondary special    218389
Higher education                  74862
Incomplete higher                 10276
Lower secondary                    3816
Academic degree                     164
Name: NAME_EDUCATION_TYPE, dtype: int64

#### NAME_FAMILY_STATUS

In [30]:
df_clean['NAME_FAMILY_STATUS'].value_counts()

Married                 196429
Single / not married     45444
Civil marriage           29774
Separated                19770
Widow                    16088
Unknown                      2
Name: NAME_FAMILY_STATUS, dtype: int64

In [31]:
# drop NAME_FAMILY_STATUS greater than 5 
df_clean = df_clean[df_clean['NAME_FAMILY_STATUS'] != 'Unknown']
df_clean['NAME_FAMILY_STATUS'].value_counts()

Married                 196429
Single / not married     45444
Civil marriage           29774
Separated                19770
Widow                    16088
Name: NAME_FAMILY_STATUS, dtype: int64

#### FLAG_MOBIL

In [32]:
df_clean['FLAG_MOBIL'].value_counts()

1    307504
0         1
Name: FLAG_MOBIL, dtype: int64

In [33]:
# drop FLAG_MOBIL since there is no variablility 
df_clean = df_clean.drop(columns = ['FLAG_MOBIL'])
df_clean.shape

(307505, 91)

#### Recode day of week to weekday and weekend


In [35]:
# creat at new variable Weekend = true if WEEKDAY_APPR_PROCESS_START = Saturday or Sunday
df_clean['WEEKDAY_APPR_PROCESS_START'] = df_clean['WEEKDAY_APPR_PROCESS_START'].replace({'MONDAY': 1, 'TUESDAY': 1, 'WEDNESDAY': 1, 'THURSDAY':1, 'FRIDAY':1, 'SATURDAY':0, 'SUNDAY':0})


#### Consolidate Documents to a single variable that is a count of documents

In [37]:
# Combine documents
df_clean['DOC_COUNT']  = df_clean[['FLAG_DOCUMENT_2',
            'FLAG_DOCUMENT_3', 
            'FLAG_DOCUMENT_4', 
            'FLAG_DOCUMENT_5', 
            'FLAG_DOCUMENT_6', 
            'FLAG_DOCUMENT_7', 
            'FLAG_DOCUMENT_8', 
            'FLAG_DOCUMENT_9', 
            'FLAG_DOCUMENT_10', 
            'FLAG_DOCUMENT_11', 
            'FLAG_DOCUMENT_12', 
            'FLAG_DOCUMENT_13', 
            'FLAG_DOCUMENT_14', 
            'FLAG_DOCUMENT_15', 
            'FLAG_DOCUMENT_16', 
            'FLAG_DOCUMENT_17', 
            'FLAG_DOCUMENT_18', 
            'FLAG_DOCUMENT_19', 
            'FLAG_DOCUMENT_20', 
            'FLAG_DOCUMENT_21' ]].sum(axis=1)


In [38]:
df_clean = df_clean.drop(columns = ['FLAG_DOCUMENT_2',
            'FLAG_DOCUMENT_3', 
            'FLAG_DOCUMENT_4', 
            'FLAG_DOCUMENT_5', 
            'FLAG_DOCUMENT_6', 
            'FLAG_DOCUMENT_7', 
            'FLAG_DOCUMENT_8', 
            'FLAG_DOCUMENT_9', 
            'FLAG_DOCUMENT_10', 
            'FLAG_DOCUMENT_11', 
            'FLAG_DOCUMENT_12', 
            'FLAG_DOCUMENT_13', 
            'FLAG_DOCUMENT_14', 
            'FLAG_DOCUMENT_15', 
            'FLAG_DOCUMENT_16', 
            'FLAG_DOCUMENT_17', 
            'FLAG_DOCUMENT_18', 
            'FLAG_DOCUMENT_19', 
            'FLAG_DOCUMENT_20', 
            'FLAG_DOCUMENT_21'])


In [39]:
df_clean['DOC_COUNT'].value_counts()

1    270054
0     29545
2      7742
3       163
4         1
Name: DOC_COUNT, dtype: int64

In [40]:
df_clean['ORGANIZATION_TYPE'].value_counts()

Business Entity Type 3    67992
XNA                       55374
Self-employed             38412
Other                     16683
Medicine                  11192
Business Entity Type 2    10552
Government                10404
School                     8893
Trade: type 7              7831
Kindergarten               6879
Construction               6721
Business Entity Type 1     5983
Transport: type 4          5398
Trade: type 3              3492
Industry: type 9           3368
Industry: type 3           3277
Security                   3247
Housing                    2958
Industry: type 11          2704
Military                   2634
Bank                       2507
Agriculture                2454
Police                     2341
Transport: type 2          2204
Postal                     2157
Security Ministries        1974
Trade: type 2              1900
Restaurant                 1811
Services                   1575
University                 1327
Industry: type 7           1307
Transpor

In [41]:
# organization type.  It has 58 categories and no good way to bucket them.
df_clean = df_clean.drop(columns = ['ORGANIZATION_TYPE'])

# finished binning
---

# Load Postgress with clean code

In [42]:
df_clean.to_sql('app_data_clean',con=engine,schema='proj4_sch',if_exists='replace',index=False)

505

---
# Prepare for machine learning

In [None]:
X = df_clean

In [None]:
# looking to see if there are any variable that are highly correlated that may be preventing a more accurate model 
plt.figure(figsize=(10,7))
c= X.corr()
sns.heatmap(c)

# The correlation heat map below indicates the variables are not well separated.  

In [None]:
# Convert categorical data using dummies.
# Convert categorical data to numeric with `pd.get_dummies`
#  YOUR CODE GOES HERE
X_dummies = pd.get_dummies(X)
print(X_dummies.columns)
X_dummies

In [None]:
stop here

---
# Start model building

In [None]:
# Seperate out the dependent and independen variables
# Split our preprocessed data into our features and target arrays

# this is the target
y = X["TARGET"].values

# Drop y out of the dataframe to get the independent variables
# this is the feature list
X = X.drop("TARGET", axis=1)
independent_variables = X.columns
X.head()

### Split

In [None]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Scale

In [None]:
# there was only one value that needed Scaling ASK_AMT
# Therefore I scaled that variable earlier in the code so the whole data set could be exported for testing in other code.

# I Copied the dataframes to the the "standard names" so that "cut and Paste code would work"
X_train_scaled = X_train
X_test_scaled = X_test

In [None]:
stop here