# Data Science Bootcamp - Project 4
Team Members:
* Ben Calderaio
* Conrad Urffer
* Clara Bucar
* Tammy Lacher
* Jeff Pinegar

Due Date: March 22, 2023

---
# Random Forest
This included:
* Random Forest
* Feature Selection
* Logistic regression with the selected features 
* Confusion Matrix

### Imports

In [1]:
# imports
import sqlalchemy
import psycopg2
from sqlalchemy import create_engine
from sql_config import protocol, username, password, host, port, database_name
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import tensorflow as tf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import datetime
import numpy as np
import csv
import os

# Oversampling and under sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss

---
# Load Clean Data from Postgres

In [2]:
# create db connection and engine
rds_connection_string = f'{protocol}://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

# Read in the postgres cleaned data in table "app_data_clean" into a dataframe
df_clean = pd.read_sql_query('select * from proj4_sch.app_data_clean', con=engine)

In [3]:
# Check the dataframe
df_clean.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOC_COUNT
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,2.0,2.0,-1134.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,1.0,0.0,-828.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0.0,0.0,-815.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,2.0,0.0,-617.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0.0,0.0,-1106.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [4]:
# Check the dataframe continued
df_clean.describe(include = 'all').round(3)

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DOC_COUNT
count,307505.0,307505,307505,307505,307505,307505.0,307505.0,307505.0,307505.0,307505.0,...,307505.0,307505.0,307505.0,307505.0,307505.0,307505.0,307505.0,307505.0,307505.0,307505.0
unique,,2,2,2,2,,,,,,...,,,,,,,,,,
top,,Cash loans,F,N,Y,,,,,,...,,,,,,,,,,
freq,,278232,202447,202920,213306,,,,,,...,,,,,,,,,,
mean,0.081,,,,,0.416,168796.7,599028.395,27107.58,537914.488,...,1.401,0.1,-962.859,0.006,0.006,0.03,0.231,0.23,1.643,0.93
std,0.272,,,,,0.715,237124.8,402493.887,14494.547,369633.198,...,2.377,0.362,826.814,0.078,0.103,0.191,0.857,0.744,1.856,0.344
min,0.0,,,,,0.0,25650.0,45000.0,0.0,0.0,...,0.0,0.0,-4292.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,,,,,0.0,112500.0,270000.0,16524.0,238500.0,...,0.0,0.0,-1570.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,,,,,0.0,147150.0,513531.0,24903.0,450000.0,...,0.0,0.0,-757.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
75%,0.0,,,,,1.0,202500.0,808650.0,34596.0,679500.0,...,2.0,0.0,-274.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0


In [5]:
print(df_clean.dtypes)

TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
FLAG_OWN_REALTY                object
                               ...   
AMT_REQ_CREDIT_BUREAU_WEEK    float64
AMT_REQ_CREDIT_BUREAU_MON     float64
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
DOC_COUNT                       int64
Length: 71, dtype: object


In [6]:
# Check the dataframe continued
print (df_clean.shape)
print (df_clean.columns)

(307505, 71)
Index(['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'FLAG_EMP_PHONE',
       'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
       'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT',
       'REGION_RATING_CLIENT_W_CITY', 'WEEKDAY_APPR_PROCESS_START',
       'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI',
       'YEARS_BEGINEXPLUATATION_MEDI', 'YEA

---
# Prepare for machine learning

In [7]:
# Coppy dataframe to "X" so that most of my existing code will work.
X = df_clean

#### Encoding of Categorical variables using Pandas Dummies

In [8]:
# Convert categorical data using dummies.
# Convert categorical data to numeric with `pd.get_dummies`
#  YOUR CODE GOES HERE
X_dummies = pd.get_dummies(X)
print(X_dummies.columns)
X_dummies

Index(['TARGET', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE',
       'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
       ...
       'WALLSMATERIAL_MODE_Block', 'WALLSMATERIAL_MODE_Mixed',
       'WALLSMATERIAL_MODE_Monolithic', 'WALLSMATERIAL_MODE_Others',
       'WALLSMATERIAL_MODE_Panel', 'WALLSMATERIAL_MODE_Stone, brick',
       'WALLSMATERIAL_MODE_Wooden', 'EMERGENCYSTATE_MODE_0',
       'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_Yes'],
      dtype='object', length=133)


Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_0,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,1,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648.0,...,0,0,0,0,0,1,0,0,1,0
1,0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186.0,...,1,0,0,0,0,0,0,0,1,0
2,0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,-4260.0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,-9833.0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,-4311.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307500,0,0,157500.0,254700.0,27558.0,225000.0,0.032561,-9327,-236,-8456.0,...,0,0,0,0,0,1,0,0,1,0
307501,0,0,72000.0,269550.0,12001.5,225000.0,0.025164,-20775,365243,-4388.0,...,0,0,0,0,0,1,0,0,1,0
307502,0,0,153000.0,677664.0,29979.0,585000.0,0.005002,-14966,-7921,-6737.0,...,0,0,0,0,1,0,0,0,1,0
307503,1,0,171000.0,370107.0,20205.0,319500.0,0.005313,-11961,-4786,-2562.0,...,0,0,0,0,0,1,0,0,1,0


---
# Start Model Building

In [9]:
# Seperate out the dependent and independen variables
# Split our preprocessed data into our features and target arrays

# this is the target
y = X_dummies["TARGET"].values

# Drop y out of the dataframe to get the independent variables
# this is the feature list
X = X_dummies.drop("TARGET", axis=1)
independent_variables = X.columns
variables = X.columns.tolist()
X.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_0,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes
0,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648.0,-2120,...,0,0,0,0,0,1,0,0,1,0
1,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186.0,-291,...,1,0,0,0,0,0,0,0,1,0
2,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,-4260.0,-2531,...,0,0,0,0,0,0,0,1,0,0
3,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,-9833.0,-2437,...,0,0,0,0,0,0,0,1,0,0
4,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,-4311.0,-3458,...,0,0,0,0,0,0,0,1,0,0


### Split and Scale

In [10]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, stratify=y, random_state=42)

# Not scaling y since it is only 0 or 1
# Create a StandardScaler() model and fit it to the training data
X_scaler = StandardScaler().fit(X_train)

# Transform the training and testing data by using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

---
# Start Random Forest

In [11]:
# Compile the model
X_for_train = X_train_scaled
y_for_train = y_train
X_for_test = X_test_scaled
y_for_test = y_test


clf_rf = RandomForestClassifier(random_state=1, n_estimators=1000).fit(X_for_train, y_train)
print(f'Training Score: {clf_rf.score(X_for_train, y_for_train)}')
print(f'Testing Score: {clf_rf.score(X_for_test, y_for_test)}')

Training Score: 1.0
Testing Score: 0.919272115733041


In [12]:
features = clf_rf.feature_importances_
# print(features)

# Create a bar chart to help with setting the threshold
fig = px.bar(x=list(range(len(features))), y=features, range_y=(0, 0.05) )
fig.show()


In [13]:
# Stop here.  After looking at the barchart determine a threshold for including variable into the model
# stop here so you can set the threshold in the next code block

In [14]:
# sel is the data the model feels are important.
sel = SelectFromModel(clf_rf, threshold = 0.015)
sel.fit(X_for_train, y_for_train)
sel.get_support()

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False,  True, False,  True, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [15]:
SelectTable = np.array(sel.get_support().tolist())
Select_df = pd.DataFrame.from_dict({'Variables': variables, 'Select':SelectTable})

# build a list of only the selected variables
Select_df = Select_df[Select_df['Select'] != False]
print(Select_df.shape)
Select_df.head()

(16, 2)


Unnamed: 0,Variables,Select
1,AMT_INCOME_TOTAL,True
2,AMT_CREDIT,True
3,AMT_ANNUITY,True
4,AMT_GOODS_PRICE,True
5,REGION_POPULATION_RELATIVE,True


In [16]:
# use the selected independent variables (sel)
X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y, test_size=0.5, stratify=y, random_state=0)


scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)


X has feature names, but SelectFromModel was fitted without feature names



## Random forest with Selected Variables

In [17]:
# Compile the model
X_for_train = X_selected_train_scaled
y_for_train = y_train
X_for_test = X_selected_test_scaled
y_for_test = y_test


# this is using the full set of data
clf_rf = RandomForestClassifier(random_state=1, n_estimators=1000).fit(X_for_train, y_train)
print(f'Training Score: {clf_rf.score(X_for_train, y_for_train)}')
print(f'Testing Score: {clf_rf.score(X_for_test, y_for_test)}')

Training Score: 1.0
Testing Score: 0.9193641750079673


In [18]:
# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = clf_rf.predict(X_for_test)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))

True positives (TP): 54
True negatives (TN): 141301
False positives (FP): 39
False negatives (FN): 12359
precision =  0.5806451612903226
accuracy =  0.9193641750079673
sensitivity =  0.0043502779344235885
F1 = 0.008635854789700944
              precision    recall  f1-score   support

           0       0.92      1.00      0.96    141340
           1       0.58      0.00      0.01     12413

    accuracy                           0.92    153753
   macro avg       0.75      0.50      0.48    153753
weighted avg       0.89      0.92      0.88    153753



## Logistic Regression with Random Forest Selections

In [19]:
# Compile the model
X_for_train = X_selected_train_scaled
y_for_train = y_train
X_for_test = X_selected_test_scaled
y_for_test = y_test

# Now repeat the process using the selected variables (features)
clf_lr = LogisticRegression()
clf_lr.fit(X_for_train, y_train)
print(f'Training Score: {clf_lr.score(X_for_train, y_train)}')
print(f'Testing Score: {clf_lr.score(X_for_test, y_test)}')

Training Score: 0.9192530828867267
Testing Score: 0.9192210883690074


In [20]:
# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = clf_lr.predict(X_for_test)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))

True positives (TP): 0
True negatives (TN): 141333
False positives (FP): 7
False negatives (FN): 12413
precision =  0.0
accuracy =  0.9192210883690074
sensitivity =  0.0
F1 = 0.0
              precision    recall  f1-score   support

           0       0.92      1.00      0.96    141340
           1       0.00      0.00      0.00     12413

    accuracy                           0.92    153753
   macro avg       0.46      0.50      0.48    153753
weighted avg       0.85      0.92      0.88    153753



---
# Random Oversampling

In [21]:
# Split the preprocessed data into a training and testing dataset
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, stratify=y, random_state=0)

X_scaler = StandardScaler().fit(X_train)

# Transform the training and testing data by using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Randomly over sample the minority class
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros= ros.fit_resample(X_train_scaled, y_train)

In [22]:
# Compile the model
X_for_train = X_train_ros
y_for_train = y_train_ros
X_for_test = X_test_scaled
y_for_test = y_test


# this is using the full set of data
clf_rf = RandomForestClassifier(random_state=1, n_estimators=100).fit(X_for_train, y_for_train)
print(f'Training Score: {clf_rf.score(X_for_train, y_for_train)}')
print(f'Testing Score: {clf_rf.score(X_for_test, y_for_test)}')

Training Score: 1.0
Testing Score: 0.9190119579659379


In [23]:
features = clf_rf.feature_importances_
# print(features)

# Create a bar chart to help with setting the threshold
fig = px.bar(x=list(range(len(features))), y=features, range_y=(0, 0.05) )
fig.show()

In [None]:
# stop

In [24]:
# sel is the data the model feels are important.
sel = SelectFromModel(clf_rf, threshold = 0.002)
sel.fit(X_for_train, y_for_train)
sel.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False,  True, False, False, False, False,  True,
        True, False,  True,  True,  True, False,  True, False, False,
        True,  True,  True,  True,  True, False, False,  True, False,
       False, False,  True,  True, False, False, False,  True,  True,
       False, False, False,  True, False,  True, False, False, False,
        True, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
        True,  True,

In [25]:
SelectTable = np.array(sel.get_support().tolist())
Select_df = pd.DataFrame.from_dict({'Variables': variables, 'Select':SelectTable})

# build a list of only the selected variables
Select_df = Select_df[Select_df['Select'] != False]
print(Select_df.shape)
Select_df.head()

(80, 2)


Unnamed: 0,Variables,Select
0,CNT_CHILDREN,True
1,AMT_INCOME_TOTAL,True
2,AMT_CREDIT,True
3,AMT_ANNUITY,True
4,AMT_GOODS_PRICE,True


In [26]:
# stop

In [27]:
# use the selected independent variables (sel)
X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y, test_size=0.5, stratify=y, random_state=0)


scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)


X has feature names, but SelectFromModel was fitted without feature names



### Continue with Random forest Selecte variable from over sampling

In [28]:
# Compile the model
X_for_train = X_selected_train_scaled
y_for_train = y_train
X_for_test = X_selected_test_scaled
y_for_test = y_test


# this is using the full set of data
clf_rf = RandomForestClassifier(random_state=1, n_estimators=100).fit(X_for_train, y_train)
print(f'Training Score: {clf_rf.score(X_for_train, y_for_train)}')
print(f'Testing Score: {clf_rf.score(X_for_test, y_for_test)}')

Training Score: 0.9999804880586919
Testing Score: 0.919312143502891


In [29]:
# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = clf_rf.predict(X_for_test)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))

True positives (TP): 14
True negatives (TN): 141333
False positives (FP): 7
False negatives (FN): 12399
precision =  0.6666666666666666
accuracy =  0.919312143502891
sensitivity =  0.00112784983485056
F1 = 0.0022518899790895932
              precision    recall  f1-score   support

           0       0.92      1.00      0.96    141340
           1       0.67      0.00      0.00     12413

    accuracy                           0.92    153753
   macro avg       0.79      0.50      0.48    153753
weighted avg       0.90      0.92      0.88    153753



## Logistic Regression

In [30]:
# Compile the model
X_for_train = X_selected_train_scaled
y_for_train = y_train
X_for_test = X_selected_test_scaled
y_for_test = y_test

# Now repeat the process using the selected variables (features)
clf_lr = LogisticRegression()
clf_lr.fit(X_for_train, y_train)
print(f'Training Score: {clf_lr.score(X_for_train, y_train)}')
print(f'Testing Score: {clf_lr.score(X_for_test, y_test)}')

Training Score: 0.9192400749258546
Testing Score: 0.9192080804927384



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [31]:
# continue the evaluation of the model
y_true = y_test

# Predict  - using the model calculate results for the text data
y_pred = clf_lr.predict(X_for_test)

# create and evaluate the confusion matrix
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True positives (TP): {tp}")
print(f"True negatives (TN): {tn}")
print(f"False positives (FP): {fp}")
print(f"False negatives (FN): {fn}")
precision = tp / (tp + fp)
print(f'precision =  {precision}')
accuracy = (tp + tn)/(tp+tn+fp+fn)
print(f'accuracy =  {accuracy}')
sensitivity = tp / (tp + fn)
print(f'sensitivity =  {sensitivity}')
F1_j = 2*tp/(2*tp+fn+fp)
print(f'F1 = {F1_j}')
print(classification_report(y_true, y_pred))

True positives (TP): 27
True negatives (TN): 141304
False positives (FP): 36
False negatives (FN): 12386
precision =  0.42857142857142855
accuracy =  0.9192080804927384
sensitivity =  0.0021751389672117942
F1 = 0.004328310355883296
              precision    recall  f1-score   support

           0       0.92      1.00      0.96    141340
           1       0.43      0.00      0.00     12413

    accuracy                           0.92    153753
   macro avg       0.67      0.50      0.48    153753
weighted avg       0.88      0.92      0.88    153753

