In [10]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression

import pandas as pd

# COLUMN-TYPE-SPECIFIC TRANSFORMATIONS
# --------------------

def _infer_object_dtypes(frame):
    """Call `infer_objects()` on `frame` and return the result.

    This is a named function rather than a lambda so that the
    `FunctionTransformer` wrapping it (and any pipeline containing it) is not
    blocked from being pickled: lambdas cannot be serialized by the standard
    `pickle` module.
    """
    return frame.infer_objects()


# Stand-alone most-frequent imputer for categorical columns. It has the same
# configuration as the 'imputer' step of `cat_transformer` below and is not
# referenced by the other objects defined in this cell.
pre_cat_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=pd.NA, strategy='most_frequent'))
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode into a dense array, dropping the first category of each feature.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category unseen
# during fit is presumably encoded as all zeros, i.e. the same row as the
# dropped first category -- confirm this ambiguity is acceptable.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

# Numerical columns: fill missing values with the median, then min-max scale.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=pd.NA, strategy='median')),
    ('scaler', MinMaxScaler())
])

# Unified transformer: columns are routed by dtype. Object-dtype columns go to
# the categorical branch; every other dtype goes to the numerical branch.
transformer = ColumnTransformer(
    transformers=[
        ('num', num_transformer, make_column_selector(dtype_exclude='object')),
        ('cat', cat_transformer, make_column_selector(dtype_include='object'))],
    verbose_feature_names_out=False
)

# Full preprocessing pipeline: dtype inference, per-dtype transformation, then
# a variance threshold with default settings.
pipeline_preprocess = Pipeline(steps=[
    ('cast', FunctionTransformer(_infer_object_dtypes, validate=False)),
    ('transformer', transformer),
    ('variance_threshold', VarianceThreshold())
])

In [11]:
# Metrics used for the report printed at the end of this cell.
from sklearn.metrics import classification_report, confusion_matrix

# --- Model --------------------------------------------------------------
# Preprocessing steps followed by a class-weighted logistic regression.
# NOTE(review): the step objects are taken from `pipeline_preprocess` without
# copying, so fitting `pipe` presumably fits those shared estimators in place --
# confirm this is intended (sklearn.base.clone would give independent copies).
pipe = Pipeline(steps=list(pipeline_preprocess.steps) + [
    ("model", LogisticRegression(class_weight='balanced', max_iter=100000)),
])

# --- Data ---------------------------------------------------------------
# Each CSV is read with its first column as the index, then reduced to a
# 1% random sample with a fixed seed so the subset is the same on every run.
df_data = pd.read_csv('../data/fraudTrain.csv', index_col=0)
df_data_reduced = df_data.sample(random_state=42, frac=0.01)
df_data_test = pd.read_csv('../data/fraudTest.csv', index_col=0)
df_data_test_reduced = df_data_test.sample(random_state=42, frac=0.01)

# `is_fraud` is the target; it and `trans_date_trans_time` are excluded from
# the feature matrix.
y = df_data_reduced['is_fraud']
X = df_data_reduced.drop(['is_fraud', 'trans_date_trans_time'], axis=1)
y_test = df_data_test_reduced['is_fraud']
X_test = df_data_test_reduced.drop(['is_fraud', 'trans_date_trans_time'], axis=1)

# --- Fit and evaluate ---------------------------------------------------
# Fit on the training sample and predict on the test sample in one chain
# (Pipeline.fit returns the pipeline itself).
y_pred = pipe.fit(X, y).predict(X_test)

# Per-class precision / recall / F1 on the sampled test set.
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5544
           1       0.00      0.00      0.00        13

    accuracy                           0.99      5557
   macro avg       0.50      0.50      0.50      5557
weighted avg       1.00      0.99      0.99      5557

