##### Data Preprocessing
---

In [25]:
# imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

In [9]:
# build custom preprocessor function - wrapper around column transformer
class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, cat_features, num_features):
        self.cat_features_ = cat_features
        self.num_features_ = num_features
        self.encoder_ = OneHotEncoder(sparse_output=False, drop='first')
        self.scaler_ = StandardScaler()
        self.imputer_ = SimpleImputer(strategy="mean")
    
    def fit(self, X, y=None):
        self.encoder_.fit(X[self.cat_features_])
        self.scaler_.fit(X[self.num_features_])
        self.imputer_.fit(X[self.num_features_])
        return self
    
    def transform(self, X):
        X_encoded = pd.DataFrame(self.encoder_.transform(X[self.cat_features_]),
                            columns=self.encoder_.get_feature_names_out(self.cat_features_), index=X.index)
        X_imputed = pd.DataFrame(self.imputer_.transform(X[self.num_features_]),
                                 columns=self.num_features_, index=X.index)
        X_scaled = pd.DataFrame(self.scaler_.transform(X_imputed[self.num_features_]),
                             columns=self.num_features_, index=X_imputed.index)
        X_combined = pd.concat([X_encoded, X_scaled], axis=1)
        return X_combined

In [10]:
df = pd.read_csv("data/fraud_detect.csv")

y = df["fraud_bool"]
X = df.drop(["fraud_bool"], axis=1)
X = X.replace(-1, np.nan)

X.head()

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,velocity_6h,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0.3,0.986506,,25.0,40,0.006735,102.453711,AA,1059,13096.035018,...,0,1500.0,0,INTERNET,16.224843,linux,1,1.0,0,0
1,0.8,0.617426,,89.0,20,0.010095,-0.849551,AD,1658,9223.283431,...,0,1500.0,0,INTERNET,3.363854,other,1,1.0,0,0
2,0.8,0.996707,9.0,14.0,40,0.012316,-1.490386,AB,1095,4471.472149,...,0,200.0,0,INTERNET,22.730559,windows,0,1.0,0,0
3,0.6,0.4751,11.0,14.0,30,0.006991,-1.863101,AB,3483,14431.993621,...,0,200.0,0,INTERNET,15.215816,linux,1,1.0,0,0
4,0.9,0.842307,,29.0,40,5.742626,47.152498,AA,2339,7601.511579,...,0,200.0,0,INTERNET,3.743048,other,0,1.0,0,0


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [12]:
# define get numeric cols
#numeric_features = X.select_dtypes(include=['number']).columns.tolist()
numeric_features = ["income", "name_email_similarity", "prev_address_months_count",
                    "current_address_months_count", "customer_age", "days_since_request",
                    "intended_balcon_amount", "zip_count_4w", "velocity_6h", "velocity_24h",
                    "velocity_4w", "bank_branch_count_8w", "date_of_birth_distinct_emails_4w",
                    "credit_risk_score", "bank_months_count", "proposed_credit_limit",
                    "session_length_in_minutes", "device_distinct_emails_8w", "device_fraud_count", 
                    "month"]
#categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_features = ["payment_type", "employment_status", "housing_status", "source", "device_os",
                       "email_is_free", "phone_home_valid", "phone_mobile_valid", "has_other_cards",
                       "foreign_request", "keep_alive_session"]

In [13]:
pipeline = Pipeline(steps=[
    ('preprocessor', Preprocessor(categorical_features, numeric_features)),
])

In [14]:
X_train_processed = pipeline.fit_transform(X_train)
X_train_processed.head()

Unnamed: 0,payment_type_AB,payment_type_AC,payment_type_AD,payment_type_AE,employment_status_CB,employment_status_CC,employment_status_CD,employment_status_CE,employment_status_CF,employment_status_CG,...,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,credit_risk_score,bank_months_count,proposed_credit_limit,session_length_in_minutes,device_distinct_emails_8w,device_fraud_count,month
570606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.181324,-0.399231,-0.894539,-1.494541,0.0,2.016399,0.423707,-0.107872,0.0,0.321879
756283,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.456177,-0.388355,1.092164,-1.150035,-0.509289,-0.648666,0.174233,-0.107872,0.0,0.774403
738227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.374982,3.259512,-1.29188,0.055734,0.878568,-0.033651,-0.104132,-0.107872,0.0,0.774403
554038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.374719,-0.397056,0.694824,-0.31748,1.138791,-0.033651,-0.204781,-0.107872,0.0,0.321879
712266,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.765597,0.116299,-0.894539,1.189732,-1.203218,0.991374,-0.316836,-0.107872,0.0,0.774403


In [15]:
X_test_processed = pipeline.transform(X_test)
X_test_processed.head()

Unnamed: 0,payment_type_AB,payment_type_AC,payment_type_AD,payment_type_AE,employment_status_CB,employment_status_CC,employment_status_CD,employment_status_CE,employment_status_CF,employment_status_CG,...,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,credit_risk_score,bank_months_count,proposed_credit_limit,session_length_in_minutes,device_distinct_emails_8w,device_fraud_count,month
987231,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-1.95239,-0.381829,-1.49055,0.586847,1.138791,-0.033651,1.00129,-0.107872,0.0,1.679451
79954,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.104457,1.134308,-0.099858,-0.044747,0.618345,-0.648666,-0.205522,-0.107872,0.0,-1.488217
567130,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.124291,-0.370953,1.290835,-1.781629,0.878568,-0.648666,-0.352212,-0.107872,0.0,0.321879
500891,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.026844,-0.399231,0.694824,-0.504087,0.0,-0.648666,0.080462,-0.107872,0.0,-0.130645
55399,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.652033,-0.373128,-0.497199,-0.661986,-1.203218,-0.648666,-0.302617,-0.107872,0.0,-1.488217


In [16]:
X_train_processed.shape

(750000, 47)

In [17]:
X_test_processed.shape

(250000, 47)

In [26]:
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_processed, y_train)
X_train_resampled.shape

(16698, 47)

In [39]:
# write processed dataframes to output
#X_train_processed.to_csv("data/X_train.csv")
#y_train.to_csv("data/y_train.csv")
#X_test_processed.to_csv("data/X_test.csv")
#y_test.to_csv("data/y_test.csv")

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


regr = LogisticRegression()
regr.fit(X_train_resampled, y_train_resampled)

y_hat = regr.predict(X_test_processed)

print(classification_report(y_test, y_hat))

              precision    recall  f1-score   support

           0       1.00      0.80      0.89    247320
           1       0.04      0.78      0.08      2680

    accuracy                           0.80    250000
   macro avg       0.52      0.79      0.48    250000
weighted avg       0.99      0.80      0.88    250000



In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

clf = SVC(kernel="rbf")
clf.fit(X_train_resampled, y_train_resampled)
y_hat = clf.predict(X_test_processed)

print(classification_report(y_test, y_hat))