#### Detecting Bank Account Fraud with Synthetic Account Data
##### SEIS 763-01 - Prof. Lai
---
Charles Rehder | Krishna Chaitanya Bellamkonda | Mengyuan Cui | Soad Ahmed | Yang Yang

##### Introduction to the dataset & description of the problem.

Placeholder for something later

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
class DummyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns_ = None

    def fit(self, X, y=None):
        X_dummies = pd.get_dummies(X, dtype=int)
        self.columns_ = X_dummies.columns
        return self

    def transform(self, X):
        X_dummies = pd.get_dummies(X, dtype=int)
        # reindex to ensure consistent columns
        return X_dummies.reindex(columns=self.columns_, fill_value=0)

In [3]:
# build custom preprocessor function - wrapper around column transformer
class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, cat_features, num_features):
        self.cat_features = cat_features
        self.num_features = num_features
        self.dummy_encoder = DummyEncoder()
        self.scaler = StandardScaler()
    
    def fit(self, X, y=None):
        self.dummy_encoder.fit(X[self.cat_features])
        self.scaler.fit(X[self.num_features])
        return self
    
    def transform(self, X):
        X_cat = self.dummy_encoder.transform(X[self.cat_features])
        X_num = pd.DataFrame(self.scaler.transform(X[self.num_features]),
                             columns=self.num_features, index=X.index)
        X_combined = pd.concat([X_cat, X_num], axis=1)
        return X_combined

##### Loading and splitting the data
---

In [4]:
df = pd.read_csv("data/fraud_detect.csv")
df.shape

(1000000, 32)

In [5]:
y = df["fraud_bool"]
X = df.drop(["fraud_bool"], axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train.shape

(750000, 31)

##### Preprocessing the training data
---

In [7]:
# define get numeric cols
numeric_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

In [8]:
# define and fit pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', Preprocessor(categorical_features, numeric_features))
])

In [9]:
X_train_processed = pipeline.fit_transform(X_train)

##### Training the model
---

In [10]:
regr = LogisticRegression()
regr.fit(X_train_processed, y_train)

##### Prepros

In [11]:
X_test_processed = pipeline.fit_transform(X_test)
y_preds = regr.predict(X_test_processed)

print(classification_report(y_preds, y_test))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    249941
           1       0.01      0.54      0.02        59

    accuracy                           0.99    250000
   macro avg       0.51      0.77      0.51    250000
weighted avg       1.00      0.99      0.99    250000

