##### Data Preprocessing
---

In [1]:
# imports
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

In [2]:
# custom preprocessor: one-hot encodes categorical columns and standardizes numeric ones
class Preprocessor(BaseEstimator, TransformerMixin):
    """One-hot encode categorical columns and standardize numeric columns.

    The transformer returns a DataFrame (not an array) so that the generated
    column names and the input row index are preserved.

    Parameters
    ----------
    cat_features : list of str
        Names of the columns to one-hot encode.
    num_features : list of str
        Names of the columns to standardize.

    Attributes (available after ``fit``)
    ------------------------------------
    cat_features_, num_features_ : list of str
        Copies of the column lists that were used for fitting.
    encoder_ : OneHotEncoder
        Fitted encoder for the categorical columns.
    scaler_ : StandardScaler
        Fitted scaler for the numeric columns.
    """

    def __init__(self, cat_features, num_features):
        # Constructor arguments are stored unmodified under their own names:
        # BaseEstimator.get_params() / sklearn.base.clone() look parameters up
        # by the __init__ argument name, so any other attribute name breaks
        # cloning, repr and hyper-parameter search.
        self.cat_features = cat_features
        self.num_features = num_features

    def fit(self, X, y=None):
        """Fit the encoder and the scaler on the configured columns of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # Trailing-underscore attributes are learned state, so they are
        # (re)created here rather than in __init__.
        self.cat_features_ = list(self.cat_features)
        self.num_features_ = list(self.num_features)
        # handle_unknown='ignore': categories not seen during fit do not raise
        # at transform time.
        self.encoder_ = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.scaler_ = StandardScaler()
        self.encoder_.fit(X[self.cat_features_])
        self.scaler_.fit(X[self.num_features_])
        return self

    def transform(self, X):
        """Return the encoded categorical columns followed by the scaled numeric columns.

        The result is a DataFrame that shares ``X``'s index.
        """
        encoded = pd.DataFrame(
            self.encoder_.transform(X[self.cat_features_]),
            columns=self.encoder_.get_feature_names_out(self.cat_features_),
            index=X.index,
        )
        scaled = pd.DataFrame(
            self.scaler_.transform(X[self.num_features_]),
            columns=self.num_features_,
            index=X.index,
        )
        return pd.concat([encoded, scaled], axis=1)

In [3]:
# Load the dataset, then separate the feature columns from the target column.
df = pd.read_csv("data/fraud_detect.csv")

X = df.drop(columns="fraud_bool")
y = df["fraud_bool"]

In [4]:
# Hold out a quarter of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [5]:
# Split the feature column names by dtype: numeric vs. object/category.
numeric_features = list(X.select_dtypes(include="number").columns)
categorical_features = list(X.select_dtypes(include=["object", "category"]).columns)

In [6]:
# Single-step pipeline around the custom preprocessor.
pipeline = Pipeline(
    steps=[("preprocessor", Preprocessor(categorical_features, numeric_features))]
)

In [7]:
# Fit the preprocessing on the training split only, then transform that same split.
X_train_processed = pipeline.fit(X_train).transform(X_train)

In [8]:
# Transform the test split with the already-fitted pipeline (transform only, no refit).
X_test_processed = pipeline.transform(X_test)

In [9]:
# Shape of the processed training set as (rows, columns).
X_train_processed.shape

(750000, 52)

In [10]:
# Shape of the processed test set as (rows, columns).
X_test_processed.shape

(250000, 52)

In [11]:
# write processed dataframes to output (currently disabled; uncomment the lines below to save the splits as CSV)
#X_train_processed.to_csv("data/X_train.csv")
#y_train.to_csv("data/y_train.csv")
#X_test_processed.to_csv("data/X_test.csv")
#y_test.to_csv("data/y_test.csv")

In [12]:
# Baseline classifier: logistic regression with scikit-learn's default settings,
# trained on the preprocessed training split.
# (LogisticRegression and classification_report are imported in the notebook's
# import cell.)
regr = LogisticRegression()
regr.fit(X_train_processed, y_train)

# Predict on the held-out split and print per-class precision / recall / F1.
# The per-class rows matter more than the accuracy line here, because the
# support column shows how unevenly the two classes are represented.
y_hat = regr.predict(X_test_processed)

print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    247320
           1       0.54      0.01      0.02      2680

    accuracy                           0.99    250000
   macro avg       0.77      0.51      0.51    250000
weighted avg       0.98      0.99      0.98    250000



In [13]:
# RBF-kernel support vector classifier evaluated on the same preprocessed split.
# (SVC and classification_report are imported in the notebook's import cell.)
# NOTE(review): kernel SVC fit time grows faster than linearly with the number
# of training rows; confirm the runtime on the full training split is acceptable.
clf = SVC(C=1, kernel="rbf")
clf.fit(X_train_processed, y_train)
y_hat = clf.predict(X_test_processed)

# Precision is undefined for a class that is never predicted. zero_division=0
# reports it as 0 explicitly instead of emitting UndefinedMetricWarning.
print(classification_report(y_test, y_hat, zero_division=0))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    247320
           1       0.00      0.00      0.00      2680

    accuracy                           0.99    250000
   macro avg       0.49      0.50      0.50    250000
weighted avg       0.98      0.99      0.98    250000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
