# DS 4400 Final Project : Credit Card Fraud Detection

#### Emily Chen, Glen Damian Lim, Tara Sawhney

#### Dataset : https://www.kaggle.com/datasets/kartik2112/fraud-detection

#### ML models: Logistic Regression, Decision Trees, Feedforward Neural Networks, Recurrent Neural Networks

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# ML libraries
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
# Neural Networks libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from keras.optimizers import SGD, Adam
from keras.losses import BinaryCrossentropy
from keras.models import Sequential
from keras.layers import Dense

2023-04-08 21:32:02.912393: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df_train = pd.read_csv('data/fraudTrain.csv')
df_test = pd.read_csv('data/fraudTest.csv')

## Data pre-processing

In [3]:
def datetime_column(df, col_name: str, hour: bool =False, new_col_prefix: str =''):
    df[col_name] = pd.to_datetime(df[col_name])

    if hour:
        new_col = new_col_prefix + '_hour'
        df[new_col] = df[col_name].dt.hour
    df[new_col_prefix + '_weekday'] = df[col_name].dt.weekday
    df[new_col_prefix + '_month'] = df[col_name].dt.strftime("%m")
    df[new_col_prefix + '_year'] = df[col_name].dt.year

# deriving additonal columns from 'trans_date_trans_time' and 'dob' columns
datetime_column(df_train, 'trans_date_trans_time', True, 'trans')
datetime_column(df_test, 'trans_date_trans_time', True, 'trans')
datetime_column(df_train, 'dob', new_col_prefix='dob')
datetime_column(df_test, 'dob', new_col_prefix='dob')


# dropping irrelevant columns
df_train.drop(['Unnamed: 0','merchant', 'first', 'last','street','zip', 'dob', 'trans_num', 'trans_date_trans_time'], axis=1, inplace=True)
df_test.drop(['Unnamed: 0','merchant', 'first', 'last','street','zip', 'dob', 'trans_num', 'trans_date_trans_time'], axis=1, inplace=True)

# Convert categorical columns
categorical_column_names = ['gender', 'city', 'state', 'job', 'category']

for cat_name in categorical_column_names:
    df_train[cat_name] = pd.factorize(df_train[cat_name])[0]
    df_test[cat_name] = pd.factorize(df_test[cat_name])[0]

#### Resampling methods 

In [4]:
class_count_0, class_count_1 = df_train['is_fraud'].value_counts()

class_0 = df_train[df_train['is_fraud'] == 0]
class_1 = df_train[df_train['is_fraud'] == 1]

class_0_under = class_0.sample(class_count_1)
df_undersampling = pd.concat([class_0_under, class_1], axis=0)
# undersampled_trainX = test_under.drop('is_fraud', axis =1)
# undersampled_trainy = test_under['is_fraud']

class_1_over = class_1.sample(class_count_0, replace=True)
df_oversampling = pd.concat([class_1_over, class_0], axis=0)
# oversampled_trainX = test_over.drop('is_fraud', axis =1)
# oversampled_trainy = test_over['is_fraud']

#### Feature Selection and Data Scaling

In [5]:
def select_scale_features(model, n_features, df_train, df_test):
    X_train = df_train.drop('is_fraud', axis=1)
    y_train = df_train['is_fraud']
    X_test = df_test.drop('is_fraud', axis=1)
    y_test = df_test['is_fraud']
    
    selector = RFE(model, n_features_to_select=n_features, step=1)
    selector.fit(X_train, y_train)

    X_train = X_train[X_train.columns[selector.support_]]
    X_test = X_test[X_test.columns[selector.support_]]

    # Scale training data
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, y_train, X_test, y_test

In [44]:
X_train, y_train, X_test, y_test = select_scale_features(LogisticRegression(), 10, df_undersampling, df_test)

## Models

#### Logistic Regression

In [7]:
def logistic_regression(X_train, y_train, X_test, y_test, random_state: int = 3000):
    clf = LogisticRegression(random_state=random_state, penalty="l2").fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('accuracy:', metrics.accuracy_score(y_test, y_pred))
    print(metrics.classification_report(y_test,
                                    y_pred))
    return y_pred

In [8]:
y_pred = logistic_regression(X_train, y_train, X_test, y_test)

accuracy: 0.973087117769952
              precision    recall  f1-score   support

           0       1.00      0.97      0.99    553574
           1       0.09      0.69      0.16      2145

    accuracy                           0.97    555719
   macro avg       0.55      0.83      0.58    555719
weighted avg       1.00      0.97      0.98    555719



In [9]:
print(Counter(y_pred))
print(Counter(y_test))

Counter({0: 539962, 1: 15757})
Counter({0: 553574, 1: 2145})


#### Decision Trees

In [10]:
def decision_tree(X_train, y_train, X_test, y_test, criterion, max_depth, min_samples_split, random_state: int = 3000):
    clf = DecisionTreeClassifier(random_state=random_state, criterion = criterion, max_depth = max_depth,\
                                 min_samples_split = min_samples_split)
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    print('accuracy:', metrics.accuracy_score(y_test, y_pred))
    print(metrics.classification_report(y_test,
                                    y_pred))
    
    return y_pred

In [11]:
y_pred = decision_tree(X_train, y_train, X_test, y_test, "gini", 20, 10)

accuracy: 0.9105411188028482
              precision    recall  f1-score   support

           0       1.00      0.91      0.95    553574
           1       0.01      0.13      0.01      2145

    accuracy                           0.91    555719
   macro avg       0.50      0.52      0.48    555719
weighted avg       0.99      0.91      0.95    555719



In [12]:
print(Counter(y_pred))
print(Counter(y_test))

Counter({0: 507578, 1: 48141})
Counter({0: 553574, 1: 2145})


#### Feedforward Neural Network

In [53]:
def ffnn(X_train, y_train, X_test,epochs, batch_size, optimizer, loss, metrics):
    model = Sequential()
    # input/output dimensions
    # hidden layer -- same number of hidden units as above
    model.add(Dense(1024, activation='relu', input_shape = (X_train.shape[1],)))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(128, activation='relu')) 
    # output layer
    model.add(Dense(1, activation='sigmoid'))
              
    # configure the learning process
    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=metrics)

    print(model.output)
    model.fit(X_train, y_train, 
              epochs= epochs, batch_size = batch_size, verbose=1, validation_split = 0.2)
    print(model.output)
    y_pred = model.predict(X_test)
    print(y_pred)
    return y_pred

In [62]:
preds = ffnn(X_train, y_train, X_test, 25, 128, 'adam', 'binary_crossentropy', ['accuracy'])

KerasTensor(type_spec=TensorSpec(shape=(None, 1), dtype=tf.float32, name=None), name='dense_31/Sigmoid:0', description="created by layer 'dense_31'")
Epoch 1/25


2023-04-08 22:16:18.523559: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-04-08 22:16:23.575778: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
KerasTensor(type_spec=TensorSpec(shape=(None, 1), dtype=tf.float32, name=None), name='dense_31/Sigmoid:0', description="created by layer 'dense_31'")
   30/17367 [..............................] - ETA: 1:00

2023-04-08 22:17:57.007606: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [37]:
a = metrics.classification_report(y_test, t)
print(a)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.00      0.00      0.00      2145

    accuracy                           1.00    555719
   macro avg       0.50      0.50      0.50    555719
weighted avg       0.99      1.00      0.99    555719



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
t = [1 for pred in preds if pred > 0.5]
t

#### Recurrent Neural Network