In [1]:
# imports
import warnings
warnings.filterwarnings('ignore')
import sqlalchemy
import psycopg2
from sqlalchemy import create_engine
from sql_config import protocol, username, password, host, port, database_name
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss

# create db connection and engine
rds_connection_string = f'{protocol}://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

# Load Data from PostgreSQL with clean code

In [2]:
df_clean = pd.read_sql_query('select * from si_info.app_data_one_hot_target', con=engine)

In [3]:
df_clean.head()

Unnamed: 0,target,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_birth,days_employed,days_registration,...,wallsmaterial_mode_Block,wallsmaterial_mode_Mixed,wallsmaterial_mode_Monolithic,wallsmaterial_mode_Others,wallsmaterial_mode_Panel,"wallsmaterial_mode_Stone, brick",wallsmaterial_mode_Wooden,emergencystate_mode_0,emergencystate_mode_No,emergencystate_mode_Yes
0,0,2,202500.0,598486.5,25290.0,454500.0,0.00823,-11693,-3158,-2746.0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,157500.0,942300.0,27679.5,675000.0,0.030755,-16832,-5544,-8718.0,...,0,0,0,0,0,1,0,0,1,0
2,0,0,144000.0,446931.0,16978.5,369000.0,0.00702,-19501,-10988,-8368.0,...,0,0,0,0,0,0,0,1,0,0
3,1,1,171000.0,1009566.0,36391.5,904500.0,0.025164,-18767,-2046,-6555.0,...,0,0,0,0,0,0,0,1,0,0
4,0,2,247500.0,521280.0,27423.0,450000.0,0.008866,-12191,-3298,-1371.0,...,0,0,0,0,0,0,1,0,1,0


---
# Prepare for machine learning

In [4]:
X = df_clean

In [5]:
X.head()

Unnamed: 0,target,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_birth,days_employed,days_registration,...,wallsmaterial_mode_Block,wallsmaterial_mode_Mixed,wallsmaterial_mode_Monolithic,wallsmaterial_mode_Others,wallsmaterial_mode_Panel,"wallsmaterial_mode_Stone, brick",wallsmaterial_mode_Wooden,emergencystate_mode_0,emergencystate_mode_No,emergencystate_mode_Yes
0,0,2,202500.0,598486.5,25290.0,454500.0,0.00823,-11693,-3158,-2746.0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,157500.0,942300.0,27679.5,675000.0,0.030755,-16832,-5544,-8718.0,...,0,0,0,0,0,1,0,0,1,0
2,0,0,144000.0,446931.0,16978.5,369000.0,0.00702,-19501,-10988,-8368.0,...,0,0,0,0,0,0,0,1,0,0
3,1,1,171000.0,1009566.0,36391.5,904500.0,0.025164,-18767,-2046,-6555.0,...,0,0,0,0,0,0,0,1,0,0
4,0,2,247500.0,521280.0,27423.0,450000.0,0.008866,-12191,-3298,-1371.0,...,0,0,0,0,0,0,1,0,1,0


---
# Start model building

In [6]:
# Seperate out the dependent and independent variables
# Split our preprocessed data into our features and target arrays

# this is the target
y = X["target"].values

# Drop y out of the dataframe to get the independent variables
# this is the feature list
X = X.drop("target", axis=1)
independent_variables = X.columns
X.head()

Unnamed: 0,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,region_population_relative,days_birth,days_employed,days_registration,days_id_publish,...,wallsmaterial_mode_Block,wallsmaterial_mode_Mixed,wallsmaterial_mode_Monolithic,wallsmaterial_mode_Others,wallsmaterial_mode_Panel,"wallsmaterial_mode_Stone, brick",wallsmaterial_mode_Wooden,emergencystate_mode_0,emergencystate_mode_No,emergencystate_mode_Yes
0,2,202500.0,598486.5,25290.0,454500.0,0.00823,-11693,-3158,-2746.0,-3310,...,0,0,0,0,0,0,0,1,0,0
1,0,157500.0,942300.0,27679.5,675000.0,0.030755,-16832,-5544,-8718.0,-379,...,0,0,0,0,0,1,0,0,1,0
2,0,144000.0,446931.0,16978.5,369000.0,0.00702,-19501,-10988,-8368.0,-3013,...,0,0,0,0,0,0,0,1,0,0
3,1,171000.0,1009566.0,36391.5,904500.0,0.025164,-18767,-2046,-6555.0,-2317,...,0,0,0,0,0,0,0,1,0,0
4,2,247500.0,521280.0,27423.0,450000.0,0.008866,-12191,-3298,-1371.0,-3948,...,0,0,0,0,0,0,1,0,1,0


In [7]:
y

array([0, 0, 0, ..., 1, 0, 0])

## Random OverSample
### Split -> Fit -> Score

In [8]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,stratify=y, random_state=0)

ros = RandomOverSampler(random_state=42)
X_train_ros,y_train_ros = ros.fit_resample(X_train,y_train)
classifier = LogisticRegression()
classifier
classifier.fit(X_train_ros,y_train_ros)
# print(f"Training Data Score: {classifier.score(X_train, y_train)}")
# print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

### Predictions

In [9]:
# predictions = classifier.predict(X_test)
# # test_pred_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
# test_pred_df = pd.DataFrame({"Actual": y_test,"Prediction": predictions})
# test_pred_df.to_sql('test_scores',con=engine,schema='si_info',if_exists='replace',index=False)

### Confusion Matrix and Classification Report

In [10]:
# y_true = y_test
# y_pred = classifier.predict(X_test)
# confusion_matrix(y_true, y_pred)

# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = classifier.predict(X_test)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))

True positives (TP): 6610
True negatives (TN): 86862
False positives (FP): 54478
False negatives (FN): 5803
precision =  0.10820455735987428
accuracy =  0.6079361053117662
sensitivity =  0.532506243454443
F1 = 0.17986149848301383
              precision    recall  f1-score   support

           0       0.94      0.61      0.74    141340
           1       0.11      0.53      0.18     12413

    accuracy                           0.61    153753
   macro avg       0.52      0.57      0.46    153753
weighted avg       0.87      0.61      0.70    153753



## SMOT OverSample
### Split -> Fit -> Score

In [11]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,stratify=y, random_state=0)

smote = SMOTE(random_state=42)
X_train_smote,y_train_smote = ros.fit_resample(X_train,y_train)
classifier = LogisticRegression()
classifier
classifier.fit(X_train_smote,y_train_smote)
# print(f"Training Data Score: {classifier.score(X_train, y_train)}")
# print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

### Predictions
### Confusion Matrix and Classification Report

In [12]:
# y_true = y_test
# y_pred = classifier.predict(X_test)
# confusion_matrix(y_true, y_pred)

# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = classifier.predict(X_test)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))

True positives (TP): 6610
True negatives (TN): 86862
False positives (FP): 54478
False negatives (FN): 5803
precision =  0.10820455735987428
accuracy =  0.6079361053117662
sensitivity =  0.532506243454443
F1 = 0.17986149848301383
              precision    recall  f1-score   support

           0       0.94      0.61      0.74    141340
           1       0.11      0.53      0.18     12413

    accuracy                           0.61    153753
   macro avg       0.52      0.57      0.46    153753
weighted avg       0.87      0.61      0.70    153753



## Random UnderSample
### Split -> Fit -> Score

In [13]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,stratify=y, random_state=0)

rus = RandomUnderSampler(random_state=42)
X_train_rus,y_train_rus = rus.fit_resample(X_train,y_train)
classifier = LogisticRegression()
classifier
classifier.fit(X_train_rus,y_train_rus)
# print(f"Training Data Score: {classifier.score(X_train, y_train)}")
# print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

### Predictions
### Confusion Matrix and Classification Report

In [14]:
# y_true = y_test
# y_pred = classifier.predict(X_test)
# confusion_matrix(y_true, y_pred)

# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = classifier.predict(X_test)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))

True positives (TP): 6642
True negatives (TN): 87120
False positives (FP): 54220
False negatives (FN): 5771
precision =  0.10913213499392067
accuracy =  0.609822247370783
sensitivity =  0.5350841859341013
F1 = 0.18128966223132037
              precision    recall  f1-score   support

           0       0.94      0.62      0.74    141340
           1       0.11      0.54      0.18     12413

    accuracy                           0.61    153753
   macro avg       0.52      0.58      0.46    153753
weighted avg       0.87      0.61      0.70    153753



## NearMiss UnderSample
### Split -> Fit -> Score

In [15]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,stratify=y, random_state=0)

nm = NearMiss(version=3)
X_train_nm,y_train_nm = nm.fit_resample(X_train,y_train)
classifier = LogisticRegression()
classifier
classifier.fit(X_train_rus,y_train_rus)
# print(f"Training Data Score: {classifier.score(X_train, y_train)}")
# print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

### Predictions
### Confusion Matrix and Classification Report

In [16]:
# y_true = y_test
# y_pred = classifier.predict(X_test)
# confusion_matrix(y_true, y_pred)

# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = classifier.predict(X_test)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))

True positives (TP): 6642
True negatives (TN): 87120
False positives (FP): 54220
False negatives (FN): 5771
precision =  0.10913213499392067
accuracy =  0.609822247370783
sensitivity =  0.5350841859341013
F1 = 0.18128966223132037
              precision    recall  f1-score   support

           0       0.94      0.62      0.74    141340
           1       0.11      0.54      0.18     12413

    accuracy                           0.61    153753
   macro avg       0.52      0.58      0.46    153753
weighted avg       0.87      0.61      0.70    153753



In [17]:
engine.dispose()