In [1]:
# imports
import warnings
warnings.filterwarnings('ignore')
import sqlalchemy
import psycopg2
from sqlalchemy import create_engine
from sql_config import protocol, username, password, host, port, database_name
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# create db connection and engine
rds_connection_string = f'{protocol}://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

# Load Data from PostgreSQL with clean code

In [2]:
df_clean = pd.read_sql_query('select * from proj4_sch.app_data_clean', con=engine)

In [3]:
df_clean.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOC_COUNT
0,0,Cash loans,F,N,Y,2,202500.0,598486.5,25290.0,454500.0,...,0.0,0.0,-2700.0,0.0,0.0,0.0,0.0,1.0,1.0,1
1,0,Cash loans,F,N,Y,0,157500.0,942300.0,27679.5,675000.0,...,0.0,0.0,-849.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0,Cash loans,F,N,Y,0,144000.0,446931.0,16978.5,369000.0,...,1.0,0.0,-1649.0,0.0,0.0,0.0,0.0,0.0,5.0,1
3,1,Cash loans,M,N,Y,1,171000.0,1009566.0,36391.5,904500.0,...,0.0,0.0,-1144.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0,Cash loans,M,Y,N,2,247500.0,521280.0,27423.0,450000.0,...,0.0,0.0,-1754.0,0.0,0.0,0.0,0.0,0.0,1.0,1


---
# Prepare for machine learning

In [4]:
X = df_clean

In [5]:
X.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOC_COUNT
0,0,Cash loans,F,N,Y,2,202500.0,598486.5,25290.0,454500.0,...,0.0,0.0,-2700.0,0.0,0.0,0.0,0.0,1.0,1.0,1
1,0,Cash loans,F,N,Y,0,157500.0,942300.0,27679.5,675000.0,...,0.0,0.0,-849.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0,Cash loans,F,N,Y,0,144000.0,446931.0,16978.5,369000.0,...,1.0,0.0,-1649.0,0.0,0.0,0.0,0.0,0.0,5.0,1
3,1,Cash loans,M,N,Y,1,171000.0,1009566.0,36391.5,904500.0,...,0.0,0.0,-1144.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0,Cash loans,M,Y,N,2,247500.0,521280.0,27423.0,450000.0,...,0.0,0.0,-1754.0,0.0,0.0,0.0,0.0,0.0,1.0,1


---
# Start model building

In [6]:
# Seperate out the dependent and independent variables
# Split our preprocessed data into our features and target arrays

# this is the target
y = X["TARGET"].values

# Drop y out of the dataframe to get the independent variables
# this is the feature list
X = X.drop("TARGET", axis=1)
independent_variables = X.columns
X.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOC_COUNT
0,Cash loans,F,N,Y,2,202500.0,598486.5,25290.0,454500.0,Unaccompanied,...,0.0,0.0,-2700.0,0.0,0.0,0.0,0.0,1.0,1.0,1
1,Cash loans,F,N,Y,0,157500.0,942300.0,27679.5,675000.0,Unaccompanied,...,0.0,0.0,-849.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,Cash loans,F,N,Y,0,144000.0,446931.0,16978.5,369000.0,Unaccompanied,...,1.0,0.0,-1649.0,0.0,0.0,0.0,0.0,0.0,5.0,1
3,Cash loans,M,N,Y,1,171000.0,1009566.0,36391.5,904500.0,Unaccompanied,...,0.0,0.0,-1144.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,Cash loans,M,Y,N,2,247500.0,521280.0,27423.0,450000.0,Unaccompanied,...,0.0,0.0,-1754.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [7]:
y

array([0, 0, 0, ..., 1, 0, 0])

In [8]:
X_dummies = pd.get_dummies(X)
print(X_dummies.columns)
X_dummies

Index(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
       ...
       'WALLSMATERIAL_MODE_Block', 'WALLSMATERIAL_MODE_Mixed',
       'WALLSMATERIAL_MODE_Monolithic', 'WALLSMATERIAL_MODE_Others',
       'WALLSMATERIAL_MODE_Panel', 'WALLSMATERIAL_MODE_Stone, brick',
       'WALLSMATERIAL_MODE_Wooden', 'EMERGENCYSTATE_MODE_0',
       'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_Yes'],
      dtype='object', length=132)


Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_0,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,2,202500.0,598486.5,25290.0,454500.0,0.008230,-11693,-3158,-2746.0,-3310,...,0,0,0,0,0,0,0,1,0,0
1,0,157500.0,942300.0,27679.5,675000.0,0.030755,-16832,-5544,-8718.0,-379,...,0,0,0,0,0,1,0,0,1,0
2,0,144000.0,446931.0,16978.5,369000.0,0.007020,-19501,-10988,-8368.0,-3013,...,0,0,0,0,0,0,0,1,0,0
3,1,171000.0,1009566.0,36391.5,904500.0,0.025164,-18767,-2046,-6555.0,-2317,...,0,0,0,0,0,0,0,1,0,0
4,2,247500.0,521280.0,27423.0,450000.0,0.008866,-12191,-3298,-1371.0,-3948,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307500,0,112500.0,405000.0,20250.0,405000.0,0.035792,-19345,-2418,-13244.0,-2891,...,0,0,0,0,1,0,0,0,1,0
307501,1,67500.0,817560.0,30951.0,675000.0,0.035792,-21401,365243,-3940.0,-4642,...,0,0,0,0,0,0,0,1,0,0
307502,0,81000.0,312840.0,22891.5,247500.0,0.018850,-8038,-609,-8038.0,-727,...,0,0,0,0,0,0,0,1,0,0
307503,0,40500.0,127350.0,7438.5,112500.0,0.007330,-23903,365243,-12295.0,-4332,...,0,0,0,0,0,1,0,0,1,0


In [9]:
target = y
target_names = ["no_risk", "risk"]

In [10]:
feature_names = X_dummies.columns
X_dummies.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_0,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,2,202500.0,598486.5,25290.0,454500.0,0.00823,-11693,-3158,-2746.0,-3310,...,0,0,0,0,0,0,0,1,0,0
1,0,157500.0,942300.0,27679.5,675000.0,0.030755,-16832,-5544,-8718.0,-379,...,0,0,0,0,0,1,0,0,1,0
2,0,144000.0,446931.0,16978.5,369000.0,0.00702,-19501,-10988,-8368.0,-3013,...,0,0,0,0,0,0,0,1,0,0
3,1,171000.0,1009566.0,36391.5,904500.0,0.025164,-18767,-2046,-6555.0,-2317,...,0,0,0,0,0,0,0,1,0,0
4,2,247500.0,521280.0,27423.0,450000.0,0.008866,-12191,-3298,-1371.0,-3948,...,0,0,0,0,0,0,1,0,1,0


### Split

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_dummies, target, test_size=.30, random_state=109)

### Fit

In [16]:
# Support vector machine linear classifier
from sklearn.svm import LinearSVC,SVC
model = LinearSVC(random_state=0,tol=1e-5)
model.fit(X_train,y_train.ravel())

LinearSVC(random_state=0, tol=1e-05)

In [17]:
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.920


In [18]:
# Calculate the classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

     no_risk       0.92      1.00      0.96     84840
        risk       0.00      0.00      0.00      7412

    accuracy                           0.92     92252
   macro avg       0.46      0.50      0.48     92252
weighted avg       0.85      0.92      0.88     92252



In [18]:
engine.dispose()