In [14]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

import numpy as np
import pandas as pd

# TRANSFORMATIONS SPECIFIC TO EACH COLUMN TYPE
# --------------------

# Stand-alone imputer for categorical columns, meant to run before the main
# preprocessing: entries equal to the pd.NA marker are replaced by each
# column's most frequent value.
pre_cat_imputer = Pipeline([
    (
        'imputer',
        SimpleImputer(strategy='most_frequent', missing_values=pd.NA),
    ),
])

# Categorical branch: fill missing entries with the column mode, then one-hot
# encode into a dense array.
# NOTE(review): with drop='first' together with handle_unknown='ignore', a
# category unseen during fit is encoded as all zeros, i.e. the same encoding as
# the dropped first category (scikit-learn emits a warning when this happens).
cat_transformer = Pipeline([
    (
        'imputer',
        SimpleImputer(strategy='most_frequent', missing_values=pd.NA),
    ),
    (
        'onehot',
        OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False),
    ),
])

# Numerical branch: fill missing entries with the column median, then rescale
# each feature with MinMaxScaler (default settings).
num_transformer = Pipeline([
    (
        'imputer',
        SimpleImputer(strategy='median', missing_values=pd.NA),
    ),
    (
        'scaler',
        MinMaxScaler(),
    ),
])

# Combined transformer: columns are routed by dtype. The two selectors are
# complementary (object vs. everything else), so every column goes through
# exactly one branch.
# NOTE(review): "everything else" means any non-object dtype (e.g. bool,
# datetime, category) is sent to the numeric branch - confirm the input frame
# only carries numeric and object columns.
# verbose_feature_names_out=False keeps output feature names free of the
# 'num__' / 'cat__' prefixes.
transformer = ColumnTransformer(
    [
        ('num', num_transformer, make_column_selector(dtype_exclude='object')),
        ('cat', cat_transformer, make_column_selector(dtype_include='object')),
    ],
    verbose_feature_names_out=False,
)

def _infer_object_dtypes(frame):
    """Return ``frame`` with its object columns soft-converted to better dtypes.

    Thin wrapper around ``DataFrame.infer_objects``. It is a named, module-level
    function (rather than a lambda) so that a ``FunctionTransformer`` wrapping it
    - and therefore the whole pipeline - can be serialized with the standard
    ``pickle`` module / ``joblib.dump``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input features; any object exposing ``infer_objects()`` works.

    Returns
    -------
    The result of ``frame.infer_objects()``.
    """
    return frame.infer_objects()


# Full preprocessing chain:
#   1. 'cast'               - let pandas infer tighter dtypes for object columns,
#                             so the dtype-based column selectors see them.
#   2. 'transformer'        - per-dtype imputation / scaling / one-hot encoding.
#   3. 'variance_threshold' - VarianceThreshold with default settings, applied
#                             to the transformed features.
pipeline_preprocess = Pipeline(steps=[
    ('cast', FunctionTransformer(_infer_object_dtypes, validate=False)),
    ('transformer', transformer),
    ('variance_threshold', VarianceThreshold())
])

In [20]:
# Estimator under evaluation: the preprocessing steps followed by a
# class-weighted logistic regression. The step objects are the same instances
# held by `pipeline_preprocess` (the step list is extended, not cloned).
pipe = Pipeline(
    steps=list(pipeline_preprocess.steps)
    + [("model", LogisticRegression(class_weight='balanced', max_iter=100000))]
)

# Load the training and hold-out CSVs (paths are relative to the notebook).
df_data = pd.read_csv('../data/fraudTrain_i2.csv')
df_data_test = pd.read_csv('../data/fraudTest.csv', index_col=0)
# Optional 1% subsamples for quick experiments (disabled):
# df_data_reduced = df_data.sample(frac=0.01, random_state=42)
# df_data_test_reduced = df_data_test.sample(frac=0.01, random_state=42)

# Feature matrix: everything except the target and the columns listed below.
X = df_data.drop(columns=[
    'is_fraud',
    'Ciudad_rg',
    'cc_num_hashed',
    'trans_num_hashed',
    'job',
    'city',
    'merchant',
])
# Column names that were left commented out next to the drop list:
# 'trans_date_trans_time', 'cc_num', 'merchant', 'first', 'last', 'street',
# 'city', 'zip', 'job', 'dob', 'trans_num'

# Target vector.
y = df_data['is_fraud']

# Disabled hold-out evaluation, kept for reference.
# NOTE(review): the hold-out drop list names different columns than the
# training drop list above - confirm the test CSV schema before re-enabling.
#
# X_test = df_data_test.drop(columns=['is_fraud', 'trans_date_trans_time', 'cc_num', 'merchant', 'first', 'last', 'street',
#                                   'city', 'zip', 'job', 'dob', 'trans_num'])
# y_test = df_data_test['is_fraud']
#
# pipe.fit(X, y)
# y_pred = pipe.predict(X_test)
#
# from sklearn.metrics import classification_report, confusion_matrix
# print(classification_report(y_test, y_pred))

# 5-fold cross-validation of the full pipeline; train-fold scores are kept
# alongside the test-fold scores for each requested metric.
cv_score = cross_validate(
    pipe,
    X,
    y,
    cv=5,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    return_train_score=True,
    n_jobs=-1,
)


In [21]:
# Report the cross-validation results held in `cv_score`: mean accuracy, then
# the mean, standard deviation and per-fold values of F1, each for the train
# and test folds. Only the accuracy and F1 entries are printed here.
print(
    f'Accuracy Train: {cv_score["train_accuracy"].mean():.4f} // Test: {cv_score["test_accuracy"].mean():.4f}',
    f'F1 score Train: {cv_score["train_f1"].mean():.4f} // Test: {cv_score["test_f1"].mean():.4f}',
    f'F1 score Train std: {cv_score["train_f1"].std():.4f} // Test std: {cv_score["test_f1"].std():.4f}',
    f'List of Train F1: {cv_score["train_f1"]}',
    f'List of Test F1: {cv_score["test_f1"]}',
    sep='\n',
)

Accuracy Train: 0.8802 // Test: 0.8734
F1 score Train: 0.0684 // Test: 0.0711
F1 score Train std: 0.0022 // Test std: 0.0217
List of Train F1: [0.07083327 0.07118201 0.06578766 0.06660657 0.06746913]
List of Test F1: [0.03937291 0.06702102 0.10574602 0.07956269 0.06399089]


In [19]:
# Preview the first ten rows of the training frame.
# NOTE(review): this cell's execution count (19) precedes that of the cell
# that loads `df_data` (20), and the saved output shows `merchant` as the
# index - the output looks stale; re-run after Restart & Run All.
df_data.head(n=10)

merchant,category,amt,gender,city,state,city_pop,job,is_fraud,dob_day,dob_month,dob_year,trans_date_trans_time_time_in_seconds,trans_date_trans_time_date_day,trans_date_trans_time_date_month,trans_date_trans_time_date_year,Ciudad_rg,Estado_rg,cc_num_hashed,trans_num_hashed
"Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,3495,"Psychologist, counselling",0,9,3,1988,18,1,1,2019,Spruce Pine,North Carolina,80923ef01336409c8bfd3f8f5689196742e2c9871c0275...,d11f0b110a676dca672f06e1b4331293f34bad4787d3cd...
"Heller, Gutmann and Zieme",grocery_pos,107.23,F,Orient,WA,149,Special educational needs teacher,0,21,6,1978,44,1,1,2019,Grand Forks,British Columbia,f80a8e60a9f15ecf1b85ba3370ff9c2dc0242190d8bbb7...,00861bc278de220dd319500c2f0a5ecd39f6cf81e4232f...
Lind-Buckridge,entertainment,220.11,M,Malad City,ID,4154,Nature conservation officer,0,19,1,1962,51,1,1,2019,Blackfoot,Idaho,756a303c0348d0ebb53f6a51f114fdff35dc453af29456...,94399d0af3f2bd98e540c44fd65b53d4a5a112cb4b3274...
"Kutch, Hermiston and Farrell",gas_transport,45.0,M,Boulder,MT,1939,Patent attorney,0,12,1,1967,76,1,1,2019,Lincoln,Montana,374dcb008121abf2bd02a528ddd0c6069dea0e98d65208...,04a1d26d9fa39c02a073cd3f59053399de73da2f98a489...
Keeling-Crist,misc_pos,41.96,M,Doe Hill,VA,99,Dance movement psychotherapist,0,28,3,1986,186,1,1,2019,New Market,Virginia,7f921c03617da992089549e1df9b9d080107bfb23d446c...,3781bf2b8e8707de32ed85d8330b079de9a09f38f35c1f...
"Stroman, Hudson and Erdman",gas_transport,94.63,F,Dublin,PA,2158,Transport planner,0,19,6,1961,248,1,1,2019,Schuylkill Haven,Pennsylvania,48867fa811840e6dc33f15d3b1c77cf0b19bc77ea307af...,f9fe137ce70722fa98196772d216a1ca01c5447f1cbb30...
Rowe-Vandervort,grocery_net,44.54,F,Holcomb,KS,2691,Arboriculturist,0,16,8,1993,282,1,1,2019,Meade,Kansas,324ce63ac0754a3ffe7326e6412722319be6e255b1e524...,ee9e9a1be0d95b359a42f120b7b0f6ff5c3197532414de...
Corwin-Collins,gas_transport,71.65,M,Edinburg,VA,6018,"Designer, multimedia",0,21,8,1947,308,1,1,2019,Woodstock,Virginia,1a0982dcfad237278487ec899d2e1dd73cbf473712ab23...,705b686a64c739c960df6883f9cadf7dbdd69ca5bae5a5...
Herzog Ltd,misc_pos,4.27,F,Manor,PA,1472,Public affairs consultant,0,7,3,1941,318,1,1,2019,Pleasant Hills,Pennsylvania,a5154695c5538f4d38372b939031876c060652167987eb...,ecbd237eb99342e409605733db9ebbbd3bc36eed0c6fe5...
"Schoen, Kuphal and Nitzsche",grocery_pos,198.39,F,Clarksville,TN,151785,Pathologist,0,28,3,1974,361,1,1,2019,Nortonville,Kentucky,7953f70ed2959131197313cdd555124d8a08dec5f16579...,6ce5284c05375b8c3b5475f884067885dc5c97bcdf7f03...
