# DS 4400 Final Project : Credit Card Fraud Detection

#### Emily Chen, Glen Damian Lim, Tara Sawhney

#### Dataset : https://www.kaggle.com/datasets/kartik2112/fraud-detection

#### ML models: Logistic Regression, Decision Trees, Feedforward Neural Networks, Recurrent Neural Networks

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# ML libraries
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
# Neural Networks libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

2023-04-05 21:25:08.914445: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train_df = pd.read_csv('data/fraudTrain.csv')
test_df = pd.read_csv('data/fraudTest.csv')

## Data pre-processing

In [3]:
def datetime_column(df, col_name: str, hour: bool =False, new_col_prefix: str =''):
    df[col_name] = pd.to_datetime(df[col_name])

    if hour:
        new_col = new_col_prefix + '_hour'
        df[new_col] = df[col_name].dt.hour
    df[new_col_prefix + '_weekday'] = df[col_name].dt.weekday
    df[new_col_prefix + '_month'] = df[col_name].dt.strftime("%m")
    df[new_col_prefix + '_year'] = df[col_name].dt.year

# deriving additonal columns from 'trans_date_trans_time' and 'dob' columns
datetime_column(train_df, 'trans_date_trans_time', True, 'trans')
datetime_column(test_df, 'trans_date_trans_time', True, 'trans')
datetime_column(train_df, 'dob', new_col_prefix='dob')
datetime_column(test_df, 'dob', new_col_prefix='dob')


# dropping irrelevant columns
train_df.drop(['Unnamed: 0','merchant', 'first', 'last','street','zip', 'dob', 'trans_num', 'trans_date_trans_time'], axis=1, inplace=True)
test_df.drop(['Unnamed: 0','merchant', 'first', 'last','street','zip', 'dob', 'trans_num', 'trans_date_trans_time'], axis=1, inplace=True)

# Convert categorical columns
categorical_column_names = ['gender', 'city', 'state', 'job', 'category']

for cat_name in categorical_column_names:
    train_df[cat_name] = pd.factorize(train_df[cat_name])[0]
    test_df[cat_name] = pd.factorize(test_df[cat_name])[0]

In [11]:
non_fraud = train_df[train_df['is_fraud'] == 0].sample(n = len(train_df[train_df['is_fraud'] == 1]))
fraud = train_df[train_df['is_fraud'] == 1]
df_train_merged = pd.concat([non_fraud,fraud], ignore_index=True, sort=False)

In [12]:
non_fraud = test_df[test_df['is_fraud'] == 0].sample(n = len(test_df[test_df['is_fraud'] == 1]))
fraud = test_df[test_df['is_fraud'] == 1]
df_test_merged = pd.concat([non_fraud,fraud], ignore_index=True, sort=False)

#### Feature Selection

In [13]:
# Features and labels
X_train = df_train_merged.drop('is_fraud', axis=1)
y_train = df_train_merged['is_fraud']

X_test = df_test_merged.drop('is_fraud', axis=1)
y_test = df_test_merged['is_fraud']

selector = RFE(LogisticRegression(), n_features_to_select=10, step=1)
selector.fit(X_train, y_train)

X_train = X_train[X_train.columns[selector.support_]]
X_test = X_test[X_test.columns[selector.support_]]

# Scale training data
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


## Models

#### Logistic Regression

In [14]:
# Create a model object
clf = LogisticRegression()

# Train the model
clf = clf.fit(X=X_train, y = y_train)

# Predict the classes
y_pred = clf.predict(X_test)

# Print model accuracy
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

Accuracy:  0.8375291375291375


In [20]:
Counter(y_pred)

Counter({0: 3901, 1: 389})

In [21]:
Counter(y_test)

Counter({0: 2145, 1: 2145})

#### Decision Trees

In [16]:
# Features and labels
X_train = df_train_merged.drop('is_fraud', axis=1)
y_train = df_train_merged['is_fraud']

X_test = df_test_merged.drop('is_fraud', axis=1)
y_test = df_test_merged['is_fraud']

selector = RFE(DecisionTreeClassifier(), n_features_to_select=10, step=1)
selector.fit(X_train, y_train)

X_train = X_train[X_train.columns[selector.support_]]
X_test = X_test[X_test.columns[selector.support_]]

In [17]:
# Create a model object
clf = DecisionTreeClassifier()

# Train the model
clf = clf.fit(X=X_train, y = y_train)

# Predict the classes
y_pred = clf.predict(X_test)

# Print model accuracy
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

Accuracy:  0.5184149184149184


In [18]:
Counter(y_pred)

Counter({0: 3901, 1: 389})

In [19]:
Counter(y_test)

Counter({0: 2145, 1: 2145})

#### Feedforward Neural Network

#### Recurrent Neural Network