# Fraud Detection

## Data Analysis

In [1]:
import glob
import pandas as pd
import numpy as np

input_filenames = sorted(glob.glob('fraud_detection/*.Inputs'))
target_filenames = sorted(glob.glob('fraud_detection/*.Targets'))

df = pd.DataFrame()
for input_file, target_file in zip(input_filenames, target_filenames):
    df = df.append(pd.read_csv(input_file))
    df['fraud'] = pd.read_csv(target_file)

df.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,amount,domain1,field1,field2,field3,field4,field5,flag1,flag2,flag3,flag4,flag5,fraud,hour1,hour2,indicator1,indicator2,state1,total,zip1
0,25.9,BELLSOUTH.NET,3,1,3878,8,0,1,0,1,0,1,0.0,0,0,0,0,FL,25.9,331
1,38.85,COMCAST.NET,2,1,-6330,21,1,0,1,1,0,1,0.0,0,0,0,0,TX,38.85,750
2,38.85,HOTMAIL.COM,2,0,5183,19,1,0,0,0,0,1,0.0,1,1,0,0,VA,38.85,222
3,24.95,GMAIL.COM,0,0,3822,16,0,0,0,0,0,1,0.0,1,1,0,0,CA,24.95,946
4,20.72,LEVEL3.COM,3,0,3536,8,1,1,1,1,0,1,0.0,1,1,0,0,CO,20.72,805


### Dataset Profiling Report

In [2]:
# import pandas_profiling

# profile = pandas_profiling.ProfileReport(df)
# profile.to_file(output_file="ProfilingResult.html")

In [3]:
# profile

### Rows containing null values

In [4]:
df[pd.isnull(df).any(axis=1)]

Unnamed: 0,amount,domain1,field1,field2,field3,field4,field5,flag1,flag2,flag3,flag4,flag5,fraud,hour1,hour2,indicator1,indicator2,state1,total,zip1
63375,38.85,,3,0,-2459,38,24,0,1,1,0,0,0.0,16,16,0,0,NY,38.85,105
6991,38.85,,3,0,-2459,38,24,0,1,1,0,0,0.0,16,16,0,0,NY,38.85,105
85213,38.85,AOL.COM,2,0,-676,8,2,0,1,1,0,4,,19,19,0,0,AZ,38.85,857
63292,38.85,,3,0,-2459,38,24,0,1,1,0,0,0.0,16,16,0,0,NY,38.85,105
63317,38.85,,3,0,-2459,38,24,0,1,1,0,0,0.0,16,16,0,0,NY,38.85,105
85213,38.85,AOL.COM,2,0,-676,8,2,0,1,1,0,4,,19,19,0,0,AZ,38.85,857
63358,38.85,,3,0,-2459,38,24,0,1,1,0,0,0.0,16,16,0,0,NY,38.85,105
85213,38.85,AOL.COM,2,0,-676,8,2,0,1,1,0,4,,19,19,0,0,AZ,38.85,857
63302,38.85,,3,0,-2459,38,24,0,1,1,0,0,0.0,16,16,0,0,NY,38.85,105
85213,38.85,AOL.COM,2,0,-676,8,2,0,1,1,0,4,,19,19,0,0,AZ,38.85,857


## Data Preprocessing

Drop highly correlated columns

In [5]:
clean_df = df.drop(columns=['hour2', 'domain1', 'total'])
clean_df.head()

Unnamed: 0,amount,field1,field2,field3,field4,field5,flag1,flag2,flag3,flag4,flag5,fraud,hour1,indicator1,indicator2,state1,zip1
0,25.9,3,1,3878,8,0,1,0,1,0,1,0.0,0,0,0,FL,331
1,38.85,2,1,-6330,21,1,0,1,1,0,1,0.0,0,0,0,TX,750
2,38.85,2,0,5183,19,1,0,0,0,0,1,0.0,1,0,0,VA,222
3,24.95,0,0,3822,16,0,0,0,0,0,1,0.0,1,0,0,CA,946
4,20.72,3,0,3536,8,1,1,1,1,0,1,0.0,1,0,0,CO,805


In [6]:
clean_df = clean_df.dropna()
clean_df = clean_df.drop_duplicates()

In [7]:
clean_df['state1'] = clean_df['state1'].astype('category')
clean_df['state1'] = clean_df['state1'].cat.codes

clean_df.head()

Unnamed: 0,amount,field1,field2,field3,field4,field5,flag1,flag2,flag3,flag4,flag5,fraud,hour1,indicator1,indicator2,state1,zip1
0,25.9,3,1,3878,8,0,1,0,1,0,1,0.0,0,0,0,11,331
1,38.85,2,1,-6330,21,1,0,1,1,0,1,0.0,0,0,0,45,750
2,38.85,2,0,5183,19,1,0,0,0,0,1,0.0,1,0,0,47,222
3,24.95,0,0,3822,16,0,0,0,0,0,1,0.0,1,0,0,6,946
4,20.72,3,0,3536,8,1,1,1,1,0,1,0.0,1,0,0,7,805


In [8]:
clean_df.shape

(99305, 17)

## Split dataset

In [9]:
from sklearn.model_selection import train_test_split

X = clean_df.drop(columns=['fraud'])
y = clean_df['fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [10]:
X_train

Unnamed: 0,amount,field1,field2,field3,field4,field5,flag1,flag2,flag3,flag4,flag5,hour1,indicator1,indicator2,state1,zip1
29863,12.95,0,1,46,9,0,1,0,0,0,1,16,1,0,6,910
78849,38.85,3,0,3118,8,0,0,0,0,0,1,15,0,0,40,190
7070,38.85,3,1,-9540,19,0,1,0,0,0,1,16,0,0,21,19
5935,11.01,3,0,1097,6,4,1,0,0,0,1,22,0,0,36,112
59956,12.95,3,1,1177,22,0,0,1,1,0,1,21,0,0,18,667
61589,12.95,3,1,1478,24,4,0,1,1,0,1,20,0,0,6,953
23511,25.90,0,0,-440,23,0,1,1,1,0,1,0,1,0,6,900
13546,49.95,3,1,2636,22,0,0,1,0,0,1,16,0,0,45,750
63837,12.95,3,0,30,20,1,1,1,1,0,1,6,0,0,37,445
4948,38.85,0,0,4170,18,2,0,1,1,0,1,18,0,0,45,752


In [11]:
# import pandas_profiling

# profile = pandas_profiling.ProfileReport(df_train)
# profile.to_file(output_file="PreprocessedProfilingResult.html")

## Oversampling & Undersampling

### Oversampling Approach

In [12]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(ratio='minority')
X_train_oversampling = X_train
y_train_oversampling = y_train
X_res, y_res = smote.fit_resample(X_train_oversampling, y_train_oversampling)

dataset_res = []
for i in range(len(y_res)):
    dataset_res.append(np.append(X_res[i], y_res[i]))

dataset_res = np.asarray(dataset_res)
dataset_res.shape
dataset_res

Using TensorFlow backend.


array([[1.29500000e+01, 0.00000000e+00, 1.00000000e+00, ...,
        6.00000000e+00, 9.10000000e+02, 0.00000000e+00],
       [3.88500000e+01, 3.00000000e+00, 0.00000000e+00, ...,
        4.00000000e+01, 1.90000000e+02, 0.00000000e+00],
       [3.88500000e+01, 3.00000000e+00, 1.00000000e+00, ...,
        2.10000000e+01, 1.90000000e+01, 0.00000000e+00],
       ...,
       [2.29786152e+01, 3.22558956e+00, 0.00000000e+00, ...,
        2.12794781e+00, 9.87299649e+02, 1.00000000e+00],
       [2.34539614e+01, 3.40555835e+00, 5.94441646e-01, ...,
        5.00000000e+00, 8.52594442e+02, 1.00000000e+00],
       [1.89536814e+01, 2.23180237e+00, 2.31802369e-01, ...,
        6.00000000e+00, 9.34026862e+02, 1.00000000e+00]])

In [13]:
dataset_res[0]

array([ 12.95,   0.  ,   1.  ,  46.  ,   9.  ,   0.  ,   1.  ,   0.  ,
         0.  ,   0.  ,   1.  ,  16.  ,   1.  ,   0.  ,   6.  , 910.  ,
         0.  ])

In [14]:
columns = X.columns.tolist()
columns.append('fraud')

oversampled_df = pd.DataFrame(dataset_res, columns=columns)
oversampled_df

Unnamed: 0,amount,field1,field2,field3,field4,field5,flag1,flag2,flag3,flag4,flag5,hour1,indicator1,indicator2,state1,zip1,fraud
0,12.950000,0.000000,1.000000,46.000000,9.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,16.000000,1.000000,0.0,6.000000,910.000000,0.0
1,38.850000,3.000000,0.000000,3118.000000,8.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,15.000000,0.000000,0.0,40.000000,190.000000,0.0
2,38.850000,3.000000,1.000000,-9540.000000,19.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,16.000000,0.000000,0.0,21.000000,19.000000,0.0
3,11.010000,3.000000,0.000000,1097.000000,6.000000,4.000000,1.000000,0.000000,0.000000,0.000000,1.000000,22.000000,0.000000,0.0,36.000000,112.000000,0.0
4,12.950000,3.000000,1.000000,1177.000000,22.000000,0.000000,0.000000,1.000000,1.000000,0.000000,1.000000,21.000000,0.000000,0.0,18.000000,667.000000,0.0
5,12.950000,3.000000,1.000000,1478.000000,24.000000,4.000000,0.000000,1.000000,1.000000,0.000000,1.000000,20.000000,0.000000,0.0,6.000000,953.000000,0.0
6,25.900000,0.000000,0.000000,-440.000000,23.000000,0.000000,1.000000,1.000000,1.000000,0.000000,1.000000,0.000000,1.000000,0.0,6.000000,900.000000,0.0
7,49.950000,3.000000,1.000000,2636.000000,22.000000,0.000000,0.000000,1.000000,0.000000,0.000000,1.000000,16.000000,0.000000,0.0,45.000000,750.000000,0.0
8,12.950000,3.000000,0.000000,30.000000,20.000000,1.000000,1.000000,1.000000,1.000000,0.000000,1.000000,6.000000,0.000000,0.0,37.000000,445.000000,0.0
9,38.850000,0.000000,0.000000,4170.000000,18.000000,2.000000,0.000000,1.000000,1.000000,0.000000,1.000000,18.000000,0.000000,0.0,45.000000,752.000000,0.0


In [15]:
# import pandas_profiling

# profile = pandas_profiling.ProfileReport(oversampled_df)
# profile.to_file(output_file="OversampledProfilingResult.html")

In [16]:
oversampled_df['fraud'].value_counts()

1.0    65495
0.0    65495
Name: fraud, dtype: int64

### Undersampling Approach

In [17]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from sklearn.datasets import make_classification
from collections import Counter

X_train_undersampling = X_train
y_train_undersampling = y_train
# print (y_train_undersampling)
print (X_train_undersampling.shape)

renn = RepeatedEditedNearestNeighbours(sampling_strategy='majority')
X_res, y_res = renn.fit_resample(X_train_undersampling, y_train_undersampling)
# print (y)
print (X_res.shape)

(74478, 16)
(40751, 16)


In [18]:
dataset_res = []
for i in range(len(y_res)):
    dataset_res.append(np.append(X_res[i], y_res[i]))

dataset_res = np.asarray(dataset_res)

undersampled_df = pd.DataFrame(dataset_res, columns=columns)
undersampled_df

Unnamed: 0,amount,field1,field2,field3,field4,field5,flag1,flag2,flag3,flag4,flag5,hour1,indicator1,indicator2,state1,zip1,fraud
0,38.85,3.0,0.0,3118.0,8.0,0.0,0.0,0.0,0.0,0.0,1.0,15.0,0.0,0.0,40.0,190.0,0.0
1,38.85,3.0,1.0,-9540.0,19.0,0.0,1.0,0.0,0.0,0.0,1.0,16.0,0.0,0.0,21.0,19.0,0.0
2,11.01,3.0,0.0,1097.0,6.0,4.0,1.0,0.0,0.0,0.0,1.0,22.0,0.0,0.0,36.0,112.0,0.0
3,25.90,0.0,0.0,-440.0,23.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,6.0,900.0,0.0
4,49.95,3.0,1.0,2636.0,22.0,0.0,0.0,1.0,0.0,0.0,1.0,16.0,0.0,0.0,45.0,750.0,0.0
5,38.85,0.0,0.0,4170.0,18.0,2.0,0.0,1.0,1.0,0.0,1.0,18.0,0.0,0.0,45.0,752.0,0.0
6,12.95,3.0,0.0,-2980.0,17.0,0.0,1.0,1.0,1.0,0.0,1.0,20.0,0.0,0.0,37.0,452.0,0.0
7,10.36,3.0,0.0,3174.0,9.0,9.0,1.0,0.0,0.0,0.0,1.0,22.0,1.0,0.0,37.0,452.0,0.0
8,12.95,0.0,1.0,-4664.0,6.0,1.0,0.0,1.0,1.0,0.0,1.0,23.0,0.0,0.0,13.0,967.0,0.0
9,38.85,1.0,1.0,-1282.0,16.0,1.0,0.0,0.0,0.0,0.0,1.0,14.0,0.0,0.0,11.0,339.0,0.0


In [19]:
undersampled_df['fraud'].value_counts()

0.0    31768
1.0     8983
Name: fraud, dtype: int64

In [20]:
y_train.value_counts()

0.0    65495
1.0     8983
Name: fraud, dtype: int64

### Hybrid Approach

In [21]:
from imblearn.combine import SMOTEENN

X_train_combine = X_train
y_train_combine = y_train

print (X_train_combine.shape)

sme = SMOTEENN(sampling_strategy='minority')
X_res, y_res = sme.fit_resample(X_train_combine, y_train_combine)

print (X_res.shape)

(74478, 16)
(67699, 16)


In [22]:
dataset_res = []
for i in range(len(y_res)):
    dataset_res.append(np.append(X_res[i], y_res[i]))

dataset_res = np.asarray(dataset_res)

combined_df = pd.DataFrame(dataset_res, columns=columns)
combined_df

Unnamed: 0,amount,field1,field2,field3,field4,field5,flag1,flag2,flag3,flag4,flag5,hour1,indicator1,indicator2,state1,zip1,fraud
0,38.850000,3.000000,0.000000,3118.000000,8.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,15.000000,0.000000,0.000000,40.000000,190.000000,0.0
1,38.850000,3.000000,1.000000,-9540.000000,19.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000,16.000000,0.000000,0.000000,21.000000,19.000000,0.0
2,11.010000,3.000000,0.000000,1097.000000,6.000000,4.000000,1.000000,0.000000,0.000000,0.000000,1.000000,22.000000,0.000000,0.000000,36.000000,112.000000,0.0
3,12.950000,3.000000,1.000000,1177.000000,22.000000,0.000000,0.000000,1.000000,1.000000,0.000000,1.000000,21.000000,0.000000,0.000000,18.000000,667.000000,0.0
4,49.950000,3.000000,1.000000,2636.000000,22.000000,0.000000,0.000000,1.000000,0.000000,0.000000,1.000000,16.000000,0.000000,0.000000,45.000000,750.000000,0.0
5,12.950000,3.000000,0.000000,-2980.000000,17.000000,0.000000,1.000000,1.000000,1.000000,0.000000,1.000000,20.000000,0.000000,0.000000,37.000000,452.000000,0.0
6,10.360000,3.000000,0.000000,3174.000000,9.000000,9.000000,1.000000,0.000000,0.000000,0.000000,1.000000,22.000000,1.000000,0.000000,37.000000,452.000000,0.0
7,12.950000,0.000000,1.000000,-4664.000000,6.000000,1.000000,0.000000,1.000000,1.000000,0.000000,1.000000,23.000000,0.000000,0.000000,13.000000,967.000000,0.0
8,38.850000,1.000000,1.000000,-1282.000000,16.000000,1.000000,0.000000,0.000000,0.000000,0.000000,1.000000,14.000000,0.000000,0.000000,11.000000,339.000000,0.0
9,38.850000,3.000000,1.000000,-2067.000000,7.000000,0.000000,1.000000,1.000000,1.000000,0.000000,2.000000,19.000000,0.000000,0.000000,21.000000,16.000000,0.0


In [27]:
combined_df['fraud'].value_counts()

1.0    35645
0.0    32054
Name: fraud, dtype: int64

## Model and Evaluation

In [23]:
# from sklearn.model_selection import StratifiedKFold
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import accuracy_score

# X = oversampled_df.drop(columns=['fraud']).to_numpy()
# y = oversampled_df['fraud']

# model = MultinomialNB(alpha=1e-10)

# kf = StratifiedKFold(n_splits=10)
# for train_index, test_index in kf.split(X, y):
#     X_train_model, X_validation_model = X[train_index], X[test_index]
#     y_train_model, y_validation_model = y[train_index], y[test_index]
#     model.fit(X_train_model, y_train_model)
#     predict = model.predict(X_validation_model)
#     print ("Accuracy\t", accuracy_score(y_validation_model, predict))

In [25]:
from sklearn.model_selection import StratifiedKFold
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score

X = combined_df.drop(columns=['fraud']).to_numpy()
y = combined_df['fraud']

model = Sequential()
model.add(Dense(50, input_dim=16, init='uniform', activation='relu'))
model.add(Dense(25, init='uniform', activation='relu'))
model.add(Dense(10, init='uniform', activation='relu'))
model.add(Dense(1, init='uniform', activation='relu'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

kf = StratifiedKFold(n_splits=2)
for train_index, test_index in kf.split(X, y):
    X_train_model, X_validation_model = X[train_index], X[test_index]
    y_train_model, y_validation_model = y[train_index], y[test_index]
    history = model.fit(X_train_model, y_train_model, nb_epoch=50, batch_size=50,  verbose=1)
    predict = model.predict_classes(X_validation_model)
    print ("Accuracy\t", accuracy_score(y_validation_model, predict))

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
  del sys.path[0]


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy	 0.6492171344165436
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50


Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy	 0.6428845756152324


In [26]:
from sklearn.metrics import classification_report

predict_test = model.predict_classes(X_test)
print ("Accuracy\t", accuracy_score(y_test, predict_test))
print(classification_report(y_test, predict_test, target_names=['0', '1']))

Accuracy	 0.5153260563096629
              precision    recall  f1-score   support

           0       0.91      0.50      0.64     21833
           1       0.15      0.65      0.24      2994

    accuracy                           0.52     24827
   macro avg       0.53      0.57      0.44     24827
weighted avg       0.82      0.52      0.60     24827

