In [1]:
# imports
import warnings
warnings.filterwarnings('ignore')
import sqlalchemy
import psycopg2
from sqlalchemy import create_engine
from sql_config import protocol, username, password, host, port, database_name
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# create db connection and engine
rds_connection_string = f'{protocol}://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

# Load Data from PostgreSQL with clean code

In [2]:
df_clean = pd.read_sql_query('select * from proj4_sch.app_data_clean', con=engine)

In [3]:
df_clean.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOC_COUNT
0,0,Cash loans,M,Y,N,2,157500.0,720000.0,28552.5,720000.0,...,0.0,0.0,-1450.0,0.0,0.0,0.0,0.0,0.0,2.0,1
1,0,Revolving loans,F,N,N,2,112500.0,270000.0,13500.0,270000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0,Cash loans,M,N,N,0,292500.0,1024740.0,55719.0,900000.0,...,0.0,0.0,-964.0,0.0,0.0,0.0,0.0,1.0,1.0,1
3,0,Cash loans,F,N,N,1,121500.0,1102500.0,32364.0,1102500.0,...,0.0,0.0,-317.0,0.0,0.0,0.0,1.0,0.0,4.0,1
4,0,Cash loans,F,N,Y,0,157500.0,544491.0,16047.0,454500.0,...,0.0,0.0,-122.0,0.0,0.0,0.0,0.0,0.0,2.0,1


---
# Prepare for machine learning

In [4]:
X = df_clean

In [5]:
X.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOC_COUNT
0,0,Cash loans,M,Y,N,2,157500.0,720000.0,28552.5,720000.0,...,0.0,0.0,-1450.0,0.0,0.0,0.0,0.0,0.0,2.0,1
1,0,Revolving loans,F,N,N,2,112500.0,270000.0,13500.0,270000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0,Cash loans,M,N,N,0,292500.0,1024740.0,55719.0,900000.0,...,0.0,0.0,-964.0,0.0,0.0,0.0,0.0,1.0,1.0,1
3,0,Cash loans,F,N,N,1,121500.0,1102500.0,32364.0,1102500.0,...,0.0,0.0,-317.0,0.0,0.0,0.0,1.0,0.0,4.0,1
4,0,Cash loans,F,N,Y,0,157500.0,544491.0,16047.0,454500.0,...,0.0,0.0,-122.0,0.0,0.0,0.0,0.0,0.0,2.0,1


---
# Start model building

In [6]:
# Seperate out the dependent and independent variables
# Split our preprocessed data into our features and target arrays

# this is the target
y = X["TARGET"].values

# Drop y out of the dataframe to get the independent variables
# this is the feature list
X = X.drop("TARGET", axis=1)
independent_variables = X.columns
X.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOC_COUNT
0,Cash loans,M,Y,N,2,157500.0,720000.0,28552.5,720000.0,Unaccompanied,...,0.0,0.0,-1450.0,0.0,0.0,0.0,0.0,0.0,2.0,1
1,Revolving loans,F,N,N,2,112500.0,270000.0,13500.0,270000.0,Unaccompanied,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,Cash loans,M,N,N,0,292500.0,1024740.0,55719.0,900000.0,Unaccompanied,...,0.0,0.0,-964.0,0.0,0.0,0.0,0.0,1.0,1.0,1
3,Cash loans,F,N,N,1,121500.0,1102500.0,32364.0,1102500.0,Unaccompanied,...,0.0,0.0,-317.0,0.0,0.0,0.0,1.0,0.0,4.0,1
4,Cash loans,F,N,Y,0,157500.0,544491.0,16047.0,454500.0,Family,...,0.0,0.0,-122.0,0.0,0.0,0.0,0.0,0.0,2.0,1


In [7]:
y

array([0, 0, 0, ..., 0, 0, 0])

In [8]:
X_dummies = pd.get_dummies(X)
print(X_dummies.columns)
X_dummies

Index(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
       ...
       'WALLSMATERIAL_MODE_Block', 'WALLSMATERIAL_MODE_Mixed',
       'WALLSMATERIAL_MODE_Monolithic', 'WALLSMATERIAL_MODE_Others',
       'WALLSMATERIAL_MODE_Panel', 'WALLSMATERIAL_MODE_Stone, brick',
       'WALLSMATERIAL_MODE_Wooden', 'EMERGENCYSTATE_MODE_0',
       'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_Yes'],
      dtype='object', length=132)


Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_0,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,2,157500.0,720000.0,28552.5,720000.0,0.030755,-14144,-4516,-1516.0,-4616,...,0,0,0,0,0,0,0,1,0,0
1,2,112500.0,270000.0,13500.0,270000.0,0.009657,-13584,-680,-350.0,-2097,...,0,0,0,0,1,0,0,0,1,0
2,0,292500.0,1024740.0,55719.0,900000.0,0.006207,-10010,-240,-4629.0,-2680,...,0,0,0,0,0,0,0,1,0,0
3,1,121500.0,1102500.0,32364.0,1102500.0,0.009657,-13957,-410,-5038.0,-4630,...,0,0,0,0,0,1,0,0,1,0
4,0,157500.0,544491.0,16047.0,454500.0,0.010147,-17947,-5025,-11812.0,-1505,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307500,0,337500.0,876078.0,49050.0,765000.0,0.072508,-13210,-225,-7318.0,-4397,...,0,0,0,0,0,0,0,0,1,0
307501,1,315000.0,2250000.0,83515.5,2250000.0,0.032561,-11177,-1514,-64.0,-2793,...,0,0,1,0,0,0,0,0,1,0
307502,0,225000.0,1040985.0,30568.5,909000.0,0.003818,-14823,-185,-1147.0,-4138,...,0,0,0,0,1,0,0,0,1,0
307503,0,225000.0,684054.0,77494.5,630000.0,0.006629,-23656,-11883,-7125.0,-4681,...,1,0,0,0,0,0,0,0,1,0


### Split

In [9]:
X_dummies.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_0,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,2,157500.0,720000.0,28552.5,720000.0,0.030755,-14144,-4516,-1516.0,-4616,...,0,0,0,0,0,0,0,1,0,0
1,2,112500.0,270000.0,13500.0,270000.0,0.009657,-13584,-680,-350.0,-2097,...,0,0,0,0,1,0,0,0,1,0
2,0,292500.0,1024740.0,55719.0,900000.0,0.006207,-10010,-240,-4629.0,-2680,...,0,0,0,0,0,0,0,1,0,0
3,1,121500.0,1102500.0,32364.0,1102500.0,0.009657,-13957,-410,-5038.0,-4630,...,0,0,0,0,0,1,0,0,1,0
4,0,157500.0,544491.0,16047.0,454500.0,0.010147,-17947,-5025,-11812.0,-1505,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, random_state=42)

### LogisticRegression Classifier

In [11]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

### Fit

In [12]:
classifier.fit(X_train, y_train)

LogisticRegression()

### Score

In [13]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9190557954801672
Testing Data Score: 0.9197809487883242


### Predictions

In [14]:
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
76872,0,0
76873,0,0
76874,0,0
76875,0,1


### Confusion Matrix and Classification Report

In [15]:
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test
y_pred = classifier.predict(X_test)
confusion_matrix(y_true, y_pred)

array([[70710,     3],
       [ 6164,     0]])

In [16]:
confusion_matrix(y, classifier.predict(X_dummies))

array([[282670,     10],
       [ 24825,      0]])

In [17]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     70713
           1       0.00      0.00      0.00      6164

    accuracy                           0.92     76877
   macro avg       0.46      0.50      0.48     76877
weighted avg       0.85      0.92      0.88     76877



In [18]:
engine.dispose()