In [122]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from pathlib import Path
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [123]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix 
from imblearn.metrics import classification_report_imbalanced

In [124]:
data = Path('./Data/fraudTest.csv')

df_fraud = pd.read_csv(data, parse_dates=True, index_col=0)
df_fraud.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [125]:
# Remove the trans_date_trans_time column
df_fraud = df_fraud.drop(columns=['trans_date_trans_time'])
df_fraud.head()

Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,state,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [126]:
# Remove the merchant column
df_fraud = df_fraud.drop(columns=['merchant'])
df_fraud.head()

Unnamed: 0,cc_num,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [127]:
# Remove the category column
df_fraud = df_fraud.drop(columns=['category'])
df_fraud.head()

Unnamed: 0,cc_num,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [128]:
# Remove the first name column
df_fraud = df_fraud.drop(columns=['first'])
df_fraud.head()

Unnamed: 0,cc_num,amt,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [129]:
# Remove the last name column
df_fraud = df_fraud.drop(columns=['last'])
df_fraud.head()

Unnamed: 0,cc_num,amt,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [130]:
# Remove the gender column
df_fraud = df_fraud.drop(columns=['gender'])
df_fraud.head()

Unnamed: 0,cc_num,amt,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [131]:
# Remove the street column
df_fraud = df_fraud.drop(columns=['street'])
df_fraud.head()

Unnamed: 0,cc_num,amt,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [132]:
# Remove the city column
df_fraud = df_fraud.drop(columns=['city'])
df_fraud.head()

Unnamed: 0,cc_num,amt,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [133]:
# Remove the state column
df_fraud = df_fraud.drop(columns=['state'])
df_fraud.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [134]:
# Remove the job column
df_fraud = df_fraud.drop(columns=['job'])
df_fraud.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,29209,33.9659,-80.9355,333497,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,84002,40.3207,-110.436,302,1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,11710,40.6729,-73.5365,34496,1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,32780,28.5697,-80.8191,54767,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,49632,44.2529,-85.017,1126,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [135]:
# Remove the date of birth column
df_fraud = df_fraud.drop(columns=['dob'])
df_fraud.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,29209,33.9659,-80.9355,333497,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,84002,40.3207,-110.436,302,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,11710,40.6729,-73.5365,34496,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,32780,28.5697,-80.8191,54767,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,49632,44.2529,-85.017,1126,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [136]:
# Remove the trans_num column
df_fraud = df_fraud.drop(columns=['trans_num'])
df_fraud.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,2291163933867244,2.86,29209,33.9659,-80.9355,333497,1371816865,33.986391,-81.200714,0
1,3573030041201292,29.84,84002,40.3207,-110.436,302,1371816873,39.450498,-109.960431,0
2,3598215285024754,41.28,11710,40.6729,-73.5365,34496,1371816893,40.49581,-74.196111,0
3,3591919803438423,60.05,32780,28.5697,-80.8191,54767,1371816915,28.812398,-80.883061,0
4,3526826139003047,3.19,49632,44.2529,-85.017,1126,1371816917,44.959148,-85.884734,0


In [137]:
df_fraud.describe()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,4.178387e+17,69.39281,48842.628015,38.543253,-90.231325,88221.89,1380679000.0,38.542798,-90.23138,0.00386
std,1.309837e+18,156.745941,26855.283328,5.061336,13.72178,300390.9,5201104.0,5.095829,13.733071,0.062008
min,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1371817000.0,19.027422,-166.671575,0.0
25%,180042900000000.0,9.63,26292.0,34.6689,-96.798,741.0,1376029000.0,34.755302,-96.905129,0.0
50%,3521417000000000.0,47.29,48174.0,39.3716,-87.4769,2408.0,1380762000.0,39.376593,-87.445204,0.0
75%,4635331000000000.0,83.01,72011.0,41.8948,-80.1752,19685.0,1385867000.0,41.954163,-80.264637,0.0
max,4.992346e+18,22768.11,99921.0,65.6899,-67.9503,2906700.0,1388534000.0,66.679297,-66.952026,1.0


In [138]:
# Fitting and encoding columns with LabelEncoder
le = LabelEncoder()

# Encoding
le.fit(df_fraud["cc_num"])
df_fraud["cc_num"] = le.transform(df_fraud["cc_num"])

df_fraud.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
0,409,2.86,29209,33.9659,-80.9355,333497,1371816865,33.986391,-81.200714,0
1,552,29.84,84002,40.3207,-110.436,302,1371816873,39.450498,-109.960431,0
2,596,41.28,11710,40.6729,-73.5365,34496,1371816893,40.49581,-74.196111,0
3,583,60.05,32780,28.5697,-80.8191,54767,1371816915,28.812398,-80.883061,0
4,470,3.19,49632,44.2529,-85.017,1126,1371816917,44.959148,-85.884734,0


In [139]:
# Create features
X = df_fraud.drop(columns="is_fraud")

# Create target
y = df_fraud["is_fraud"]
X

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long
0,409,2.86,29209,33.9659,-80.9355,333497,1371816865,33.986391,-81.200714
1,552,29.84,84002,40.3207,-110.4360,302,1371816873,39.450498,-109.960431
2,596,41.28,11710,40.6729,-73.5365,34496,1371816893,40.495810,-74.196111
3,583,60.05,32780,28.5697,-80.8191,54767,1371816915,28.812398,-80.883061
4,470,3.19,49632,44.2529,-85.0170,1126,1371816917,44.959148,-85.884734
...,...,...,...,...,...,...,...,...,...
555714,199,43.77,63453,40.4931,-91.8912,519,1388534347,39.946837,-91.333331
555715,518,111.84,77566,29.0393,-95.4401,28739,1388534349,29.661049,-96.186633
555716,798,86.88,99323,46.1966,-118.9017,3684,1388534355,46.658340,-119.715054
555717,77,7.99,83643,44.6255,-116.4493,129,1388534364,44.470525,-117.080888


In [140]:
X.describe()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,463.783434,69.39281,48842.628015,38.543253,-90.231325,88221.89,1380679000.0,38.542798,-90.23138
std,265.537414,156.745941,26855.283328,5.061336,13.72178,300390.9,5201104.0,5.095829,13.733071
min,0.0,1.0,1257.0,20.0271,-165.6723,23.0,1371817000.0,19.027422,-166.671575
25%,233.0,9.63,26292.0,34.6689,-96.798,741.0,1376029000.0,34.755302,-96.905129
50%,463.0,47.29,48174.0,39.3716,-87.4769,2408.0,1380762000.0,39.376593,-87.445204
75%,693.0,83.01,72011.0,41.8948,-80.1752,19685.0,1385867000.0,41.954163,-80.264637
max,923.0,22768.11,99921.0,65.6899,-67.9503,2906700.0,1388534000.0,66.679297,-66.952026


In [141]:
new_df = df_fraud[(df_fraud.is_fraud == 1)]

In [142]:
new_df.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
1685,529,24.84,79759,31.8599,-102.7413,23,1371852399,32.575873,-102.60429,1
1767,829,780.52,53803,42.5545,-90.3508,1306,1371853942,42.461127,-91.147148,1
1781,829,620.33,53803,42.5545,-90.3508,1306,1371854247,42.771834,-90.158365,1
1784,603,1077.69,70726,30.459,-90.9027,71335,1371854335,31.204974,-90.261595,1
1857,529,842.65,79759,31.8599,-102.7413,23,1371855736,31.315782,-102.73639,1


In [143]:
new_df.describe()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,2145.0,2145.0,2145.0,2145.0,2145.0,2145.0,2145.0,2145.0,2145.0,2145.0
mean,435.035431,528.356494,47862.900233,39.019971,-90.445556,64529.32,1379587000.0,39.018618,-90.465222,1.0
std,266.215066,392.747594,26706.283776,5.076671,14.147521,219207.5,4546697.0,5.093782,14.159252,0.0
min,4.0,1.78,1257.0,20.0827,-155.488,23.0,1371852000.0,19.161782,-156.480766,1.0
25%,199.0,214.51,23937.0,34.9572,-96.743,1131.0,1375756000.0,35.01748,-96.6971,1.0
50%,416.0,371.94,47987.0,39.8936,-88.0935,2870.0,1379548000.0,39.744929,-88.040722,1.0
75%,648.0,907.77,68031.0,42.1808,-79.7856,12335.0,1383350000.0,42.208725,-79.909134,1.0
max,922.0,1320.92,99921.0,55.4732,-67.9503,1577385.0,1387754000.0,56.214113,-66.960745,1.0


In [144]:
# Check balance of target values
y.value_counts(200)

0    0.99614
1    0.00386
Name: is_fraud, dtype: float64

In [145]:
# Split x and y into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   random_state =1,
                                                   stratify=y)

X_train.shape

(416789, 9)

In [146]:
scaler = StandardScaler()

# Fitting the scaler
X_scaler = scaler.fit(X_train)

# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [148]:
# Resample the training data with the BalancedRandomClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=1000, random_state=1)
brf.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(n_estimators=1000, random_state=1)

In [149]:
# Calculate the balanced accuracy score
y_pred_brf = brf.predict(X_test_scaled)
bas_brf = balanced_accuracy_score(y_test, y_pred_brf)
print(bas_brf)

0.9276598000862344


In [150]:
# Display the confusion matrix
cm_brf = confusion_matrix(y_test, y_pred_brf)
cm_df_brf = pd.DataFrame(
    cm_brf, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"]
)
cm_df_brf

Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,129990,8404
Actual Low Risk,45,491


In [151]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_brf))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.94      0.92      0.97      0.93      0.86    138394
          1       0.06      0.92      0.94      0.10      0.93      0.86       536

avg / total       1.00      0.94      0.92      0.97      0.93      0.86    138930



In [152]:
# List the features sorted in descending order by importance
importance = brf.feature_importances_
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.6501918432149896, 'amt'),
 (0.07733038244114203, 'unix_time'),
 (0.04849604289489086, 'city_pop'),
 (0.04283550024331607, 'cc_num'),
 (0.03950623853804815, 'lat'),
 (0.03732312337604167, 'merch_lat'),
 (0.03691558089854136, 'merch_long'),
 (0.034807250645780656, 'long'),
 (0.03259403774724961, 'zip')]

In [153]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=1000, random_state=1)
eec.fit(X_train_scaled, y_train)

EasyEnsembleClassifier(n_estimators=1000, random_state=1)

In [None]:
# Calculate the balanced accuracy score
y_pred_eec = eec.predict(X_test_scaled)
bas_eec=balanced_accuracy_score(y_test, y_pred_brf)
print(bas_eec)

In [None]:
# Display confusion matrix
cm_eec = confusion(y_test, y_pred_eec)
cm_df_eec = pd.DataFrame(
    cm_eec, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"]
)
cm_df_eec

In [156]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_eec))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.89      0.88      0.94      0.89      0.79    138394
          1       0.03      0.88      0.89      0.06      0.89      0.79       536

avg / total       1.00      0.89      0.88      0.94      0.89      0.79    138930

