In [141]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from geopy import distance
import joblib

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE 

# Simple Neural Networks

In [142]:
# Load some data
df = pd.read_csv("https://jrssbcrsefilesnait.blob.core.windows.net/3950data1/fraudTrain.csv.zip")
df.drop(columns={"Unnamed: 0"}, inplace=True)
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [143]:
df.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
trans_date_trans_time,1296675.0,1274791.0,2019-04-22 16:02:01,4.0,,,,,,,
cc_num,1296675.0,,,,4.1719204207972666e+17,1.3088064470007892e+18,60416207185.0,180042946491150.0,3521417320836166.0,4642255475285942.0,4.992346398065154e+18
merchant,1296675.0,693.0,fraud_Kilback LLC,4403.0,,,,,,,
category,1296675.0,14.0,gas_transport,131659.0,,,,,,,
amt,1296675.0,,,,70.351035,160.316039,1.0,9.65,47.52,83.14,28948.9
first,1296675.0,352.0,Christopher,26669.0,,,,,,,
last,1296675.0,481.0,Smith,28794.0,,,,,,,
gender,1296675.0,2.0,F,709863.0,,,,,,,
street,1296675.0,983.0,0069 Robin Brooks Apt. 695,3123.0,,,,,,,
city,1296675.0,894.0,Birmingham,5617.0,,,,,,,


In [144]:
print(df.isnull().sum())

trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


### Deal with Lat/Lon

We can utilize lat/lon of the home and merchant in a useful way?

Note: I left the section headers in from when I did it. You can remove them if you want. 

In [145]:
#calculating the distance between customer and merchants
def calculate_distance(row):
    return distance.distance((row['lat'], row['long']), (row['merch_lat'], row['merch_long'])).miles

df['distance'] = df.apply(calculate_distance, axis=1)

### Deal with Time

Can we make date/time and the date of birth into something useful?

In [146]:
#converting to datetime and extracting the hour and day of the weel 
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['trans_hour'] = df['trans_date_trans_time'].dt.hour
df['trans_day'] = df['trans_date_trans_time'].dt.dayofweek

# calculating the age
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = np.floor((pd.to_datetime('now') - df['dob']).dt.days / 365.25)

### Check Target Balance

In [147]:
# checking the target balance
fraud_count = df['is_fraud'].sum()
non_fraud_count = len(df) - fraud_count
balance_ratio = fraud_count / non_fraud_count
print(f"Fraud to Non-Fraud Ratio: {balance_ratio}")

Fraud to Non-Fraud Ratio: 0.005822355331224998


### Prepare Data


In [148]:
print(df.columns.tolist)

<bound method IndexOpsMixin.tolist of Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud', 'distance', 'trans_hour', 'trans_day', 'age'],
      dtype='object')>


In [149]:
#dropping unnecessary data 
df = df.drop(columns=['trans_date_trans_time', 'cc_num', 'first', 'last', 'street', 'zip', 'trans_num', 'unix_time', 'dob'])

In [150]:
# label encoding categorical columns
label_encoder_city = LabelEncoder()
label_encoder_job = LabelEncoder()
label_encoder_merchant = LabelEncoder()
label_encoder_gender = LabelEncoder()
label_encoder_state = LabelEncoder()
label_encoder_category = LabelEncoder()

In [151]:
# fitting label encoders
df['city'] = label_encoder_city.fit_transform(df['city'])
df['job'] = label_encoder_job.fit_transform(df['job'])
df['merchant'] = label_encoder_merchant.fit_transform(df['merchant'])
df['gender'] = label_encoder_gender.fit_transform(df['gender'])
df['state'] = label_encoder_state.fit_transform(df['state'])
df['category'] = label_encoder_category.fit_transform(df['category'])

In [152]:
#numerical columns that need to be scaled
num_cols = ['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long', 'distance', 'trans_hour', 'trans_day']
scaler = StandardScaler()

In [153]:
# saving label encoders and scaler
joblib.dump(label_encoder_city, 'label_encoder_city.joblib')
joblib.dump(label_encoder_job, 'label_encoder_job.joblib')
joblib.dump(label_encoder_merchant, 'label_encoder_merchant.joblib')
joblib.dump(label_encoder_gender, 'label_encoder_gender.joblib')
joblib.dump(label_encoder_state, 'label_encoder_state.joblib')
joblib.dump(label_encoder_category, 'label_encoder_category.joblib')
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']

### Split Data

In [154]:
#sliptting the data
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [155]:
# fitting the scaler on training data + tranforming the training and testing data
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [156]:
#using smote to oversample 
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [157]:
# giving importance to minority class
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = dict(enumerate(class_weights))
print(class_weights_dict)

{0: 0.5029020103669545, 1: 86.64717674574005}


In [158]:
n_features = X_train_smote.shape[1]

### Model

In [159]:
# defining the model
model = keras.Sequential([
    Input(shape=(n_features,)),  
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])



# compiling and trying different optimizers
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)  # Try different learning rates
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# fitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5)  # Increased patience

history = model.fit(
    X_train_smote, y_train_smote,
    epochs=10,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    class_weight=class_weights_dict
)

# evaluating
scores = model.evaluate(X_test, y_test)
print(f"Accuracy: {scores[1]*100}")

# saving the model
model.save('my_model.keras')

Epoch 1/10
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 1ms/step - accuracy: 0.4995 - loss: 63.8102 - val_accuracy: 0.0059 - val_loss: 4.5824
Epoch 2/10
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1ms/step - accuracy: 0.5004 - loss: 1.2957 - val_accuracy: 0.0059 - val_loss: 3.6580
Epoch 3/10
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 1ms/step - accuracy: 0.4998 - loss: 1.1461 - val_accuracy: 0.0061 - val_loss: 3.6450
Epoch 4/10
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1ms/step - accuracy: 0.5025 - loss: 1.1017 - val_accuracy: 0.0125 - val_loss: 3.5852
Epoch 5/10
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 992us/step - accuracy: 0.5171 - loss: 1.0651 - val_accuracy: 0.0395 - val_loss: 3.2142
Epoch 6/10
[1m32230/32230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 1ms/step - accuracy: 0.5337 - loss: 1.0386 - val_accuracy: 0.0758 - val_loss: 3.

### Evaluation

In [169]:
#predict
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.9).astype(int)  

# classification report, confusion matrix, and roc auc
print(classification_report(y_test, y_pred_binary))
print(confusion_matrix(y_test, y_pred_binary))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred)}")

[1m8105/8105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 554us/step
              precision    recall  f1-score   support

           0       1.00      0.36      0.53    257815
           1       0.01      0.98      0.02      1520

    accuracy                           0.37    259335
   macro avg       0.50      0.67      0.28    259335
weighted avg       0.99      0.37      0.53    259335

[[ 93854 163961]
 [    26   1494]]
ROC AUC Score: 0.9257311916337398


### Explanation

This machine learning model was made to detect fraud transactions. I began by calculating the geographical distance between customers and merchants using their latitude and longitude coordinates. Time variables were converted into transaction hour and the day of the week to get better insights on spending behaviour. I also changed the customer's date of birth to an age feature. Label encoding were use on categorical columns and standard scaling was applied on numerical columns in order to normalize the data. Features were selected based on their relevance to the problem, and unnecessary features were dropped to simplify the model and reduce noise.

The imbalanced data was handled by using the SMOTE technique to oversample the minority class - balancing the dataset and improving the model's ability to detect fraud. Class weights were used to give inceased importance to the minority class and help reduce the model's bias towards the majority class when training. The model has dense layers, including dropout layers for regularization to prevent overfitting. Also used an Adam optimizer with a low learning rate and early stopping based on validation loss to optimize the model. 

Although the model shows a great ability to detect fraud, it also provided alot of false positives :C