In [33]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [34]:
# Load the data
data = Path('./Data/fraudTest.csv')

df_fraud = pd.read_csv(data, parse_dates=True, index_col=0)
df_fraud.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [35]:
# Remove the trans_date_trans_time column
df_fraud = df_fraud.drop(columns=['trans_date_trans_time'])
df_fraud.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [36]:
# Remove the merchant column
df_fraud = df_fraud.drop(columns=['merchant'])
df_fraud.head()

Unnamed: 0,cc_num,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [37]:
# Remove the category column
df_fraud = df_fraud.drop(columns=['category'])
df_fraud.head()

Unnamed: 0,cc_num,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [38]:
# Remove the first name column
df_fraud = df_fraud.drop(columns=['first'])
df_fraud.head()

Unnamed: 0,cc_num,amt,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [39]:
# Remove the last name column
df_fraud = df_fraud.drop(columns=['last'])
df_fraud.head()

Unnamed: 0,cc_num,amt,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [40]:
# Remove the gender column
df_fraud = df_fraud.drop(columns=['gender'])
df_fraud.head()

Unnamed: 0,cc_num,amt,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [41]:
# Remove the street column
df_fraud = df_fraud.drop(columns=['street'])
df_fraud.head()

Unnamed: 0,cc_num,amt,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [42]:
# Remove the city column
df_fraud = df_fraud.drop(columns=['city'])
df_fraud.head()

Unnamed: 0,cc_num,amt,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [43]:
# Remove the state column
df_fraud = df_fraud.drop(columns=['state'])
df_fraud.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [44]:
# Remove the job column
df_fraud = df_fraud.drop(columns=['job'])
df_fraud.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,29209,33.9659,-80.9355,333497,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,84002,40.3207,-110.436,302,1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,11710,40.6729,-73.5365,34496,1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,32780,28.5697,-80.8191,54767,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,49632,44.2529,-85.017,1126,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [45]:
# Remove the date of birth column
df_fraud = df_fraud.drop(columns=['dob'])
df_fraud.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,29209,33.9659,-80.9355,333497,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,84002,40.3207,-110.436,302,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,11710,40.6729,-73.5365,34496,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,32780,28.5697,-80.8191,54767,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,49632,44.2529,-85.017,1126,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [46]:
# Remove the trans_num column
df_fraud = df_fraud.drop(columns=['trans_num'])
df_fraud.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,29209,33.9659,-80.9355,333497,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,84002,40.3207,-110.436,302,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,11710,40.6729,-73.5365,34496,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,32780,28.5697,-80.8191,54767,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,49632,44.2529,-85.017,1126,1371816917,44.959148,-85.884734,0


In [47]:
df_fraud.describe()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,4.178387e+17,69.39281,48842.628015,38.543253,-90.231325,88221.89,1380679000.0,38.542798,-90.23138,0.00386
std,1.309837e+18,156.745941,26855.283328,5.061336,13.72178,300390.9,5201104.0,5.095829,13.733071,0.062008
min,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1371817000.0,19.027422,-166.671575,0.0
25%,180042900000000.0,9.63,26292.0,34.6689,-96.798,741.0,1376029000.0,34.755302,-96.905129,0.0
50%,3521417000000000.0,47.29,48174.0,39.3716,-87.4769,2408.0,1380762000.0,39.376593,-87.445204,0.0
75%,4635331000000000.0,83.01,72011.0,41.8948,-80.1752,19685.0,1385867000.0,41.954163,-80.264637,0.0
max,4.992346e+18,22768.11,99921.0,65.6899,-67.9503,2906700.0,1388534000.0,66.679297,-66.952026,1.0


In [48]:
# Fitting and encoding columns with LabelEncoder
le = LabelEncoder()

# Encoding
le.fit(df_fraud["cc_num"])
df_fraud["cc_num"] = le.transform(df_fraud["cc_num"])

df_fraud.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,409,2.86,29209,33.9659,-80.9355,333497,1371816865,33.986391,-81.200714,0
1,552,29.84,84002,40.3207,-110.436,302,1371816873,39.450498,-109.960431,0
2,596,41.28,11710,40.6729,-73.5365,34496,1371816893,40.49581,-74.196111,0
3,583,60.05,32780,28.5697,-80.8191,54767,1371816915,28.812398,-80.883061,0
4,470,3.19,49632,44.2529,-85.017,1126,1371816917,44.959148,-85.884734,0


In [49]:
# Create features
X = df_fraud.drop(columns="is_fraud")

# Create target
y = df_fraud["is_fraud"]
X

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long
0,409,2.86,29209,33.9659,-80.9355,333497,1371816865,33.986391,-81.200714
1,552,29.84,84002,40.3207,-110.4360,302,1371816873,39.450498,-109.960431
2,596,41.28,11710,40.6729,-73.5365,34496,1371816893,40.495810,-74.196111
3,583,60.05,32780,28.5697,-80.8191,54767,1371816915,28.812398,-80.883061
4,470,3.19,49632,44.2529,-85.0170,1126,1371816917,44.959148,-85.884734
...,...,...,...,...,...,...,...,...,...
555714,199,43.77,63453,40.4931,-91.8912,519,1388534347,39.946837,-91.333331
555715,518,111.84,77566,29.0393,-95.4401,28739,1388534349,29.661049,-96.186633
555716,798,86.88,99323,46.1966,-118.9017,3684,1388534355,46.658340,-119.715054
555717,77,7.99,83643,44.6255,-116.4493,129,1388534364,44.470525,-117.080888


In [50]:
X.describe()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,463.783434,69.39281,48842.628015,38.543253,-90.231325,88221.89,1380679000.0,38.542798,-90.23138
std,265.537414,156.745941,26855.283328,5.061336,13.72178,300390.9,5201104.0,5.095829,13.733071
min,0.0,1.0,1257.0,20.0271,-165.6723,23.0,1371817000.0,19.027422,-166.671575
25%,233.0,9.63,26292.0,34.6689,-96.798,741.0,1376029000.0,34.755302,-96.905129
50%,463.0,47.29,48174.0,39.3716,-87.4769,2408.0,1380762000.0,39.376593,-87.445204
75%,693.0,83.01,72011.0,41.8948,-80.1752,19685.0,1385867000.0,41.954163,-80.264637
max,923.0,22768.11,99921.0,65.6899,-67.9503,2906700.0,1388534000.0,66.679297,-66.952026


In [51]:
# Check the balance of target values
y.value_counts()

0    553574
1      2145
Name: is_fraud, dtype: int64

In [52]:
# Create X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   random_state=1,
                                                   stratify=y)

X_train.shape

(416789, 9)

In [53]:
# Create scaler instance
scaler = StandardScaler()

# Fit scaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [54]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train_scaled, y_train)

LogisticRegression(random_state=1)

In [55]:
# Calculate balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test_scaled)
bas = balanced_accuracy_score(y_test, y_pred)
print(bas)

0.4998590979377719


In [56]:
# Display confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"]
)
cm_df

Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,138355,39
Actual Low Risk,536,0


In [57]:
# Print imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      0.00      1.00      0.00      0.00    138394
          1       0.00      0.00      1.00      0.00      0.00      0.00       536

avg / total       0.99      1.00      0.00      0.99      0.00      0.00    138930



In [58]:
# Resample training data w/ RandomOverSampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train_scaled, y_train)

In [59]:
# Train Logistic Regression model w/ resampled data
from sklearn.linear_model import LogisticRegression 

model_ros = LogisticRegression(solver='lbfgs', random_state=1)
model_ros.fit(X_resampled_ros, y_resampled_ros)

LogisticRegression(random_state=1)

In [60]:
# Calculate balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred_ros = model_ros.predict(X_test_scaled)

bas_ros = balanced_accuracy_score(y_test, y_pred_ros)
print(bas_ros)

0.8498869170628784


In [None]:
# Display confusion matrix
cm_ros = confusion_matrix(y_test, y_pred_ros)
cm_df_ros = pd.DataFrame(
    cm_ros, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"]
)
cm_df_ros

In [None]:
# Print imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred_ros))

In [None]:
# Resample data using ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=1)
X_resampled_cc, y_resampled_cc = cc.fit_resample(X_train_scaled, y_train)

from collections import Counter

Count(y_resampled_cc)

In [None]:
# Train logistic Regression model using resampled data
from sklearn.linear_model import LogisticRegression
model_cc = LogisticRegression(solver='lbfgs', random_state=1)
model_cc.fit(X_resampled_cc, y_resampled_cc)

In [None]:
# Calculate balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred_cc = mode_cc.predict(X_test_scaled)
bas_cc = balanced_accuracy_score(y_test, y_pred_cc)
print(bas_cc)

In [None]:
# Display confusion matrix
from sklearn.metrics import confusion_matrix
cm_cc = confusion_matrix(y_test, y_pred_cc)
cm_df_cc = pd.DataFrame(
    cm_cc, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"]
)
cm_df_cc

In [None]:
# Print imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred_cc))

In [None]:
# Resample training data w/ SMOTE 
from imblearn.over_sampling import SMOTE

X_resampled_smote, y_resampled_smote = SMOTE(random_state=1, sampling_strategy=1.0).fit_resample(
    X_train_scaled, y_train
)
from collections import Counter

Counter(y_resampled_smote)

In [None]:
# Train logistic regression model w/ resampled data
model_smote = LogisticRegression(solver='lbfgs', random_state=1)
model_smote.fit(X_resampled_smote, y_resampled_smote)

In [None]:
# Calculate balanced accuracy score
y_pred_smote = model_smote.predict(X_test_scaled)
bas_smote= balanced_accuracy_score(y_test, y_pred_smote)
print(bas_smote)

In [None]:
# Display confusion matrix
cm_eec = confusion_matrix(y_test, y_pred_eec)
cm_df_eec = pd.DataFrame(
    cm_eec, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"]
)
cm_df_eec

In [None]:
# Print the imbalanced classification report
print(classifcation_report_imbalanced(y_test, y_pred_eec))