# Data Science Bootcamp - Project 4
Team Members:
* Ben Calderaio
* Conrad Urffer
* Clara Bucar
* Tammy Lacher
* Jeff Pinegar

Due Date: March 22, 2023

---
# KNN - K Nearest Neighbor


### Imports

In [1]:
# imports
# Data loading
import sqlalchemy
import psycopg2
from sqlalchemy import create_engine
from sql_config import protocol, username, password, host, port, database_name
import csv
import os

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
import tensorflow as tf

# Common laibraries 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import datetime
import numpy as np

# Oversampling and under sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss

---
# Load Clean Data from Postgres

In [2]:
# create db connection and engine
rds_connection_string = f'{protocol}://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

# Read in the postgres cleaned data in table "app_data_clean" into a dataframe
df_clean = pd.read_sql_query('select * from proj4_sch.app_data_clean', con=engine)

In [3]:
# Check the dataframe
df_clean.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOC_COUNT
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,2.0,2.0,-1134.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,1.0,0.0,-828.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0.0,0.0,-815.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,2.0,0.0,-617.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0.0,0.0,-1106.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [4]:
# Check the dataframe continued
df_clean.describe(include = 'all').round(3)

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOC_COUNT
count,307505.0,307505,307505,307505,307505,307505.0,307505.0,307505.0,307505.0,307505.0,...,307505.0,307505.0,307505.0,307505.0,307505.0,307505.0,307505.0,307505.0,307505.0,307505.0
unique,,2,2,2,2,,,,,,...,,,,,,,,,,
top,,Cash loans,F,N,Y,,,,,,...,,,,,,,,,,
freq,,278232,202447,202920,213306,,,,,,...,,,,,,,,,,
mean,0.081,,,,,0.416,168796.7,599028.395,27107.58,537914.488,...,1.401,0.1,-962.859,0.006,0.006,0.03,0.231,0.23,1.643,0.93
std,0.272,,,,,0.715,237124.8,402493.887,14494.547,369633.198,...,2.377,0.362,826.814,0.078,0.103,0.191,0.857,0.744,1.856,0.344
min,0.0,,,,,0.0,25650.0,45000.0,0.0,0.0,...,0.0,0.0,-4292.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,,,,,0.0,112500.0,270000.0,16524.0,238500.0,...,0.0,0.0,-1570.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,,,,,0.0,147150.0,513531.0,24903.0,450000.0,...,0.0,0.0,-757.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
75%,0.0,,,,,1.0,202500.0,808650.0,34596.0,679500.0,...,2.0,0.0,-274.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0


In [5]:
print(df_clean.dtypes)

TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
FLAG_OWN_REALTY                object
                               ...   
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
DOC_COUNT                       int64
Length: 71, dtype: object


In [6]:
# Check the dataframe continued
print (df_clean.shape)
print (df_clean.columns)

(307505, 71)
Index(['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_EMP_PHONE',
       'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
       'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT',
       'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START',
       'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI',
       'YEARS_BEGINEXPLUATATION_MEDI', 'YEA

---
# Prepare for machine learning

In [7]:
# Coppy dataframe to "X" so that most of my existing code will work.
X = df_clean

#### Encoding of Categorical variables using Pandas Dummies

In [8]:
# Convert categorical data using dummies.
# Convert categorical data to numeric with `pd.get_dummies`
#  YOUR CODE GOES HERE
X_dummies = pd.get_dummies(X)
print(X_dummies.columns)
X_dummies

Index(['TARGET', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE',
       'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
       ...
       'WALLSMATERIAL_MODE_Block', 'WALLSMATERIAL_MODE_Mixed',
       'WALLSMATERIAL_MODE_Monolithic', 'WALLSMATERIAL_MODE_Others',
       'WALLSMATERIAL_MODE_Panel', 'WALLSMATERIAL_MODE_Stone, brick',
       'WALLSMATERIAL_MODE_Wooden', 'EMERGENCYSTATE_MODE_0',
       'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_Yes'],
      dtype='object', length=133)


Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_0,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648.0,...,0,0,0,0,0,1,0,0,1,0
1,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186.0,...,1,0,0,0,0,0,0,0,1,0
2,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,-4260.0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,-9833.0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,-4311.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307500,0,0,157500.0,254700.0,27558.0,225000.0,0.032561,-9327,-236,-8456.0,...,0,0,0,0,0,1,0,0,1,0
307501,0,0,72000.0,269550.0,12001.5,225000.0,0.025164,-20775,365243,-4388.0,...,0,0,0,0,0,1,0,0,1,0
307502,0,0,153000.0,677664.0,29979.0,585000.0,0.005002,-14966,-7921,-6737.0,...,0,0,0,0,1,0,0,0,1,0
307503,1,0,171000.0,370107.0,20205.0,319500.0,0.005313,-11961,-4786,-2562.0,...,0,0,0,0,0,1,0,0,1,0


---
# Start Model Building

In [9]:
# Seperate out the dependent and independen variables
# Split our preprocessed data into our features and target arrays

# this is the target
y = X_dummies["TARGET"].values

# Drop y out of the dataframe to get the independent variables
# this is the feature list
X = X_dummies.drop("TARGET", axis=1)
independent_variables = X_dummies.columns
X.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_0,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648.0,-2120,...,0,0,0,0,0,1,0,0,1,0
1,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186.0,-291,...,1,0,0,0,0,0,0,0,1,0
2,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,-4260.0,-2531,...,0,0,0,0,0,0,0,1,0,0
3,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,-9833.0,-2437,...,0,0,0,0,0,0,0,1,0,0
4,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,-4311.0,-3458,...,0,0,0,0,0,0,0,1,0,0


### Split

In [10]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, stratify=y, random_state=0)

In [11]:
# Check the number of occurrences per target value in the testing data
unique_elements_test, counts_elements_test = np.unique(y_test, return_counts=True)
print("Frequency of unique values of the `y_test` array:")
print(np.asarray((unique_elements_test, counts_elements_test)))

Frequency of unique values of the `y_test` array:
[[     0      1]
 [197876  17378]]


In [12]:
# Check the number of occurrences per target value in the training data
unique_elements_train, counts_elements_train = np.unique(y_train, return_counts=True)
print("Frequency of unique values of the `y_train` array:")
print(np.asarray((unique_elements_train, counts_elements_train)))

Frequency of unique values of the `y_train` array:
[[    0     1]
 [84804  7447]]


### Scale

In [13]:
# Create a StandardScaler() model and fit it to the training data

X_scaler = StandardScaler().fit(X_train)

# Transform the training and testing data by using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled


array([[-0.57716455,  0.35178816,  1.01618709, ..., -0.95035847,
         0.96452096, -0.08623766],
       [-0.57716455, -0.83872395, -0.87244427, ..., -0.95035847,
         0.96452096, -0.08623766],
       [-0.57716455, -0.15189004, -0.13438938, ..., -0.95035847,
         0.96452096, -0.08623766],
       ...,
       [ 0.8309228 ,  0.80967743,  0.18935038, ...,  1.05223453,
        -1.0367841 , -0.08623766],
       [-0.57716455, -0.79293502, -0.89463973, ..., -0.95035847,
         0.96452096, -0.08623766],
       [-0.57716455,  0.35178816, -0.89463973, ..., -0.95035847,
         0.96452096, -0.08623766]])

---
# Start K Nearest Neighbor Model

In [14]:
# # Loop through different k values to find which has the highest accuracy.
# # Note: We use only odd numbers because we don't want any ties.
# train_scores = []
# test_scores = []

# # run the model for k=3, 5, 7, ... 15)
# for k in range(3, 16, 2):
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn.fit(X_train_scaled, y_train)
#     train_score = knn.score(X_train_scaled, y_train)
#     test_score = knn.score(X_test_scaled, y_test)
#     train_scores.append(train_score)
#     test_scores.append(test_score)
#     print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
    
    
# plt.plot(range(3, 16, 2), train_scores, marker='o')
# plt.plot(range(3, 16, 2), test_scores, marker="x")
# plt.xlabel("k neighbors")
# plt.ylabel("Testing accuracy Score")
# plt.show()

In [15]:
train_scores = []
test_scores = []
# Run the model with k = 9 

k = 9 # look at 3 nearest neighbors
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train_scaled, y_train)
train_score = knn.score(X_train_scaled, y_train)
test_score = knn.score(X_test_scaled, y_test)
train_scores.append(train_score)
test_scores.append(test_score)
#print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
print(f"k: {k}, Train Score: {train_score:.3f}  Test Score: {test_score:.3f}")

k: 9, Train Score: 0.920  Test Score: 0.918


In [16]:
# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = knn.predict(X_test_scaled)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))

True positives (TP): 62
True negatives (TN): 197630
False positives (FP): 246
False negatives (FN): 17316
precision =  0.2012987012987013
accuracy =  0.9184126659667183
sensitivity =  0.0035677293129243873
F1 = 0.007011195295714124
              precision    recall  f1-score   support

           0       0.92      1.00      0.96    197876
           1       0.20      0.00      0.01     17378

    accuracy                           0.92    215254
   macro avg       0.56      0.50      0.48    215254
weighted avg       0.86      0.92      0.88    215254



In [17]:
# Check the number of occurrences per target value in the training data
unique_elements_test, counts_elements_test = np.unique(y_test, return_counts=True)
print("Frequency of unique values of the `y_train` array:")
print(np.asarray((unique_elements_test, counts_elements_test)))

Frequency of unique values of the `y_train` array:
[[     0      1]
 [197876  17378]]


---
# Random OverSample

In [18]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, stratify=y, random_state=0)

X_scaler = StandardScaler().fit(X_train)

# Transform the training and testing data by using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Randomly over sample the minority class
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros= ros.fit_resample(X_train_scaled, y_train)

In [19]:
# Check the number of occurrences per target value in the training data
unique_elements_train, counts_elements_train = np.unique(y_train_ros, return_counts=True)
print("Frequency of unique values of the `y_train` array:")
print(np.asarray((unique_elements_train, counts_elements_train)))

Frequency of unique values of the `y_train` array:
[[    0     1]
 [84804 84804]]


In [20]:
train_scores = []
test_scores = []

k = 9 # Number of nearest Neighbors
X_train = X_train_ros
y_train = y_train_ros
X_test = X_test_scaled
y_test = y_test


knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
train_score = knn.score(X_train, y_train)
test_score = knn.score(X_test, y_test)
train_scores.append(train_score)
test_scores.append(test_score)
#print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
print(f"k: {k}, Train Score: {train_score:.3f}  Test Score: {test_score:.3f}")

k: 9, Train Score: 0.864  Test Score: 0.656


In [21]:
# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = knn.predict(X_test)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))

True positives (TP): 7511
True negatives (TN): 133730
False positives (FP): 64146
False negatives (FN): 9867
precision =  0.10481878951114336
accuracy =  0.656159699703606
sensitivity =  0.43221314305443664
F1 = 0.16872016622676475
              precision    recall  f1-score   support

           0       0.93      0.68      0.78    197876
           1       0.10      0.43      0.17     17378

    accuracy                           0.66    215254
   macro avg       0.52      0.55      0.48    215254
weighted avg       0.86      0.66      0.73    215254



In [None]:
stop

---
# SMOT Oversampling

In [None]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, stratify=y, random_state=0)

X_scaler = StandardScaler().fit(X_train)

# Transform the training and testing data by using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Randomly over sample the minority class
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote= smote.fit_resample(X_train_scaled, y_train)

In [None]:

test_scores = []

k = 9 # Number of nearest Neighbors
X_train = X_train_smote
y_train = y_train_smote
X_test = X_test_scaled
y_test = y_test


knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
train_score = knn.score(X_train, y_train)
test_score = knn.score(X_test, y_test)
train_scores.append(train_score)
test_scores.append(test_score)
#print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
print(f"k: {k}, Train Score: {train_score:.3f}  Test Score: {test_score:.3f}")

# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = knn.predict(X_test)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))

k: 9, Train Score: 0.796  Test Score: 0.526
True positives (TP): 10633
True negatives (TN): 102657
False positives (FP): 95219
False negatives (FN): 6745
precision =  0.10045157389562785
accuracy =  0.526308454198296
sensitivity =  0.6118655771665324
F1 = 0.17257161405501906
              precision    recall  f1-score   support

           0       0.94      0.52      0.67    197876
           1       0.10      0.61      0.17     17378

    accuracy                           0.53    215254
   macro avg       0.52      0.57      0.42    215254
weighted avg       0.87      0.53      0.63    215254



In [None]:
Stop

---
# Random Under-Sample

In [22]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, stratify=y, random_state=0)

X_scaler = StandardScaler().fit(X_train)

# Transform the training and testing data by using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Randomly under sample the majority class
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus= rus.fit_resample(X_train, y_train)

In [23]:
test_scores = []

k = 9 # Number of nearest Neighbors
X_train = X_train_rus
y_train = y_train_rus
X_test = X_test_scaled
y_test = y_test


knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
train_score = knn.score(X_train, y_train)
test_score = knn.score(X_test, y_test)
train_scores.append(train_score)
test_scores.append(test_score)
#print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
print(f"k: {k}, Train Score: {train_score:.3f}  Test Score: {test_score:.3f}")

# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = knn.predict(X_test)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))



k: 9, Train Score: 0.666  Test Score: 0.081




True positives (TP): 17378
True negatives (TN): 0
False positives (FP): 197876
False negatives (FN): 0
precision =  0.08073252994137159
accuracy =  0.08073252994137159
sensitivity =  1.0
F1 = 0.14940334949620002


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00    197876
           1       0.08      1.00      0.15     17378

    accuracy                           0.08    215254
   macro avg       0.04      0.50      0.07    215254
weighted avg       0.01      0.08      0.01    215254



  _warn_prf(average, modifier, msg_start, len(result))


---
# Under-Sampling Using NearMiss

In [24]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, stratify=y, random_state=0)

X_scaler = StandardScaler().fit(X_train)

# Transform the training and testing data by using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Under sample the majority class
nearmiss = NearMiss(version=3)
X_train_nearmiss, y_train_nearmiss= nearmiss.fit_resample(X_train, y_train)

In [25]:
test_scores = []

k = 9 # Number of nearest Neighbors
X_train = X_train_nearmiss
y_train = y_train_nearmiss
X_test = X_test_scaled
y_test = y_test


knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
train_score = knn.score(X_train, y_train)
test_score = knn.score(X_test, y_test)
train_scores.append(train_score)
test_scores.append(test_score)
#print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
print(f"k: {k}, Train Score: {train_score:.3f}  Test Score: {test_score:.3f}")

# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = knn.predict(X_test)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))



k: 9, Train Score: 0.752  Test Score: 0.081




True positives (TP): 17378
True negatives (TN): 0
False positives (FP): 197876
False negatives (FN): 0
precision =  0.08073252994137159
accuracy =  0.08073252994137159
sensitivity =  1.0
F1 = 0.14940334949620002
              precision    recall  f1-score   support

           0       0.00      0.00      0.00    197876
           1       0.08      1.00      0.15     17378

    accuracy                           0.08    215254
   macro avg       0.04      0.50      0.07    215254
weighted avg       0.01      0.08      0.01    215254



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
