In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the data
train_df = pd.read_csv('../Data/fraudTrain.csv')
test_df = pd.read_csv('../Data/fraudTest.csv')

train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

In [3]:
# Drop the columns that are not needed
columns_to_drop = ['first', 'last', 'street', 'city', 'state', 'job', 'trans_num', 'dob', 'city_pop', 'unix_time']

print('Columns dropped: ', (columns_to_drop))

train_df = train_df.drop(columns_to_drop, axis=1)
test_df = test_df.drop(columns_to_drop, axis=1)

Columns dropped:  ['first', 'last', 'street', 'city', 'state', 'job', 'trans_num', 'dob', 'city_pop', 'unix_time']


In [4]:
# Fix the date column

# Convert the date-time column to a datetime object
train_df['date'] = pd.to_datetime(train_df['trans_date_trans_time'])

# Extract the parts of the date
train_df['year'] = train_df['date'].dt.year
train_df['month'] = train_df['date'].dt.month
train_df['day'] = train_df['date'].dt.day
train_df['hour'] = train_df['date'].dt.hour
train_df['minute'] = train_df['date'].dt.minute
train_df['second'] = train_df['date'].dt.second

# Now you can drop the original date-time column
train_df = train_df.drop(['date', 'trans_date_trans_time'], axis=1)

# Convert the date-time column to a datetime object
test_df['date'] = pd.to_datetime(test_df['trans_date_trans_time'])

# Extract the parts of the date
test_df['year'] = test_df['date'].dt.year
test_df['month'] = test_df['date'].dt.month
test_df['day'] = test_df['date'].dt.day
test_df['hour'] = test_df['date'].dt.hour
test_df['minute'] = test_df['date'].dt.minute
test_df['second'] = test_df['date'].dt.second

# Now you can drop the original date-time column
test_df = test_df.drop(['date', 'trans_date_trans_time'], axis=1)

In [5]:
# Convert the categorical columns to numerical values

train_df['merchant'] = train_df['merchant'].astype('category')
train_df['merchant'] = train_df['merchant'].cat.codes

test_df['merchant'] = test_df['merchant'].astype('category')
test_df['merchant'] = test_df['merchant'].cat.codes

train_df['category'] = train_df['category'].astype('category')
train_df['category'] = train_df['category'].cat.codes

test_df['category'] = test_df['category'].astype('category')
test_df['category'] = test_df['category'].cat.codes


train_df['gender'].replace(['F', 'M'], [0, 1], inplace=True)
test_df['gender'].replace(['F', 'M'], [0, 1], inplace=True)

# train_df.head().to_csv('../Data/cleanedTrain.csv', index=False)
# test_df.head().to_csv('../Data/cleanedTest.csv', index=False)

In [6]:
# split the data into X and y

X_train = train_df.drop(['is_fraud'], axis=1)
y_train = train_df['is_fraud']
X_test = test_df.drop(['is_fraud'], axis=1)
y_test = test_df['is_fraud']

In [7]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,cc_num,merchant,category,amt,gender,zip,lat,long,merch_lat,merch_long,year,month,day,hour,minute,second
0,0,2703186189652095,514,8,4.97,0,28654,36.0788,-81.1781,36.011293,-82.048315,2019,1,1,0,0,18
1,1,630423337322,241,4,107.23,0,99160,48.8878,-118.2105,49.159047,-118.186462,2019,1,1,0,0,44
2,2,38859492057661,390,0,220.11,1,83252,42.1808,-112.262,43.150704,-112.154481,2019,1,1,0,0,51
3,3,3534093764340240,360,2,45.0,1,59632,46.2306,-112.1138,47.034331,-112.561071,2019,1,1,0,1,16
4,4,375534208663984,297,9,41.96,1,24433,38.4207,-79.4629,38.674999,-78.632459,2019,1,1,0,3,6


In [8]:
y_train

0          0
1          0
2          0
3          0
4          0
          ..
1296670    0
1296671    0
1296672    0
1296673    0
1296674    0
Name: is_fraud, Length: 1296675, dtype: int64

In [9]:
X_test.head()

Unnamed: 0.1,Unnamed: 0,cc_num,merchant,category,amt,gender,zip,lat,long,merch_lat,merch_long,year,month,day,hour,minute,second
0,0,2291163933867244,319,10,2.86,1,29209,33.9659,-80.9355,33.986391,-81.200714,2020,6,21,12,14,25
1,1,3573030041201292,591,10,29.84,0,84002,40.3207,-110.436,39.450498,-109.960431,2020,6,21,12,14,33
2,2,3598215285024754,611,5,41.28,0,11710,40.6729,-73.5365,40.49581,-74.196111,2020,6,21,12,14,53
3,3,3591919803438423,222,9,60.05,1,32780,28.5697,-80.8191,28.812398,-80.883061,2020,6,21,12,15,15
4,4,3526826139003047,292,13,3.19,1,49632,44.2529,-85.017,44.959148,-85.884734,2020,6,21,12,15,17


In [10]:
y_test

0         0
1         0
2         0
3         0
4         0
         ..
555714    0
555715    0
555716    0
555717    0
555718    0
Name: is_fraud, Length: 555719, dtype: int64

In [11]:
# Create the model
model = MLPClassifier(hidden_layer_sizes=(16, 8, 2), alpha=0.0001, random_state=1)

model.fit(X_train, y_train)

In [12]:
# Make predictions
y_pred = model.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

df

Unnamed: 0,Actual,Predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
555714,0,0
555715,0,0
555716,0,0
555717,0,0


In [17]:
# Metrics
print('Decision Tree Classifier')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Mean Absolute Percentage Error:', mean_absolute_percentage_error(y_test, y_pred))
print('R2 Score:', r2_score(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

Decision Tree Classifier
Accuracy: 0.9961401355721147
Mean Absolute Error: 0.0038598644278853163
Mean Squared Error: 0.0038598644278853163
Root Mean Squared Error: 0.062127807203258965
Mean Absolute Percentage Error: 0.0038598644278853163
R2 Score: -0.003874820710510818
Confusion Matrix:
 [[553574      0]
 [  2145      0]]
