In [None]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif
from imblearn.over_sampling import SMOTE

import paths
import constants as const


# ------------------- LIMITI (infiniti) DIMENSIONE DELLE PRINT --------------------
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)


# ------------------- AUXILIARY ARRAYS -------------------
print("Defining auxiliary arrays of features...")
all_categorical_features = [
    "ProductCD",
    "card1", "card2", "card3", "card4", "card5", "card6",
    "addr1", "addr2",
    "P_emaildomain", "R_emaildomain",
    "M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8", "M9",
    "DeviceType", "DeviceInfo",
    "id_12", "id_13", "id_14", "id_15", "id_16", "id_17", "id_18", "id_19", "id_20", "id_21", "id_22", "id_23", "id_24", "id_25", "id_26", "id_27", "id_28", "id_29","id_30", "id_31", "id_32", "id_33", "id_34", "id_35", "id_36", "id_37", "id_38"
]

all_numerical_features = [
    "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14",
    "D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "D10", "D11", "D12", "D13", "D14", "D15",
    "TransactionAmt",
    "V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", "V12", "V13", "V14", "V15", "V16", "V17", "V18", "V19",
    "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V28", "V29", "V30", "V31", "V32", "V33", "V34", "V35", "V36", "V37", "V38", "V39",
    "V40", "V41", "V42", "V43", "V44", "V45", "V46", "V47", "V48", "V49", "V50", "V51", "V52", "V53", "V54", "V55", "V56", "V57", "V58", "V59",
    "V60", "V61", "V62", "V63", "V64", "V65", "V66", "V67", "V68", "V69", "V70", "V71", "V72", "V73", "V74", "V75", "V76", "V77", "V78", "V79",
    "V80", "V81", "V82", "V83", "V84", "V85", "V86", "V87", "V88", "V89", "V90", "V91", "V92", "V93", "V94", "V95", "V96", "V97", "V98", "V99",
    "V100", "V101", "V102", "V103", "V104", "V105", "V106", "V107", "V108", "V109", "V110", "V111", "V112", "V113", "V114", "V115", "V116", "V117", "V118", "V119",
    "V120", "V121", "V122", "V123", "V124", "V125", "V126", "V127", "V128", "V129", "V130", "V131", "V132", "V133", "V134", "V135", "V136", "V137", "V138", "V139",
    "V140", "V141", "V142", "V143", "V144", "V145", "V146", "V147", "V148", "V149", "V150", "V151", "V152", "V153", "V154", "V155", "V156", "V157", "V158", "V159",
    "V160", "V161", "V162", "V163", "V164", "V165", "V166", "V167", "V168", "V169", "V170", "V171", "V172", "V173", "V174", "V175", "V176", "V177", "V178", "V179",
    "V180", "V181", "V182", "V183", "V184", "V185", "V186", "V187", "V188", "V189", "V190", "V191", "V192", "V193", "V194", "V195", "V196", "V197", "V198", "V199",
    "V200", "V201", "V202", "V203", "V204", "V205", "V206", "V207", "V208", "V209", "V210", "V211", "V212", "V213", "V214", "V215", "V216", "V217", "V218", "V219",
    "V220", "V221", "V222", "V223", "V224", "V225", "V226", "V227", "V228", "V229", "V230", "V231", "V232", "V233", "V234", "V235", "V236", "V237", "V238", "V239",
    "V240", "V241", "V242", "V243", "V244", "V245", "V246", "V247", "V248", "V249", "V250", "V251", "V252", "V253", "V254", "V255", "V256", "V257", "V258", "V259",
    "V260", "V261", "V262", "V263", "V264", "V265", "V266", "V267", "V268", "V269", "V270", "V271", "V272", "V273", "V274", "V275", "V276", "V277", "V278", "V279",
    "V280", "V281", "V282", "V283", "V284", "V285", "V286", "V287", "V288", "V289", "V290", "V291", "V292", "V293", "V294", "V295", "V296", "V297", "V298", "V299",
    "V300", "V301", "V302", "V303", "V304", "V305", "V306", "V307", "V308", "V309", "V310", "V311", "V312", "V313", "V314", "V315", "V316", "V317", "V318", "V319",
    "V320", "V321", "V322", "V323", "V324", "V325", "V326", "V327", "V328", "V329", "V330", "V331", "V332", "V333", "V334", "V335", "V336", "V337", "V338", "V339",
    "dist1", "dist2",
    "id_01", "id_02", "id_03", "id_04", "id_05", "id_06", "id_07", "id_08", "id_09", "id_10", "id_11",
    "TransactionDT_days", "hour_sin", "hour_cos", "dayofweek_sin", "dayofweek_cos"  # nuove features temporali
]


# ------------------ FUNZIONI AUSILIARIE ----------------------
print("Defining auxiliary functions...")
# # Dato un DataFrame, un array di feature categoriche e un array di feature numeriche, la funzione ritorna due array:
# #   - uno con le features presenti nel DataFrame che NON sono presenti nè in "categorical_features" nè in "numerical_features";
# #   - uno con le features presenti in "categorical_features" oppure in "numerical_features" che NON sono presenti nel DataFrame.
# def find_different_features(df, categorical_features, numerical_features):
#     print("Finding differences among features...")
#     dataframe_features = np.array(df.columns)

#     arrays = np.array(categorical_features + numerical_features)

#     in_df_not_in_arrays = []
#     in_arrays_not_in_df = []
#     for feature in dataframe_features:
#         if feature not in arrays:
#             in_df_not_in_arrays.append(feature)
#     for feature in arrays:
#         if feature not in dataframe_features:
#             in_arrays_not_in_df.append(feature)
    
#     return in_df_not_in_arrays, in_arrays_not_in_df

# La funzione prende in input un DataFrame e due array di features: aggiorna gli array rimuovendo le features che non sono presenti nel DataFrame.
# Ritorna i due array aggiornati e un terzo array che è la concatenazione dei primi due.
def update_features_arrays(df, categorical_features, numerical_features):
    print("Updating features arrays...")
    cat_ftr = [col for col in categorical_features if col in df.columns]
    num_ftr = [col for col in numerical_features if col in df.columns]

    return cat_ftr, num_ftr, cat_ftr + num_ftr

# # La funzione stampa a video i dati del DataFrame che gli viene passato come parametro.
# def print_dataframe_stats(df, categorical_features, numerical_features):
#     print("Finding problematic features...")
#     in_df_not_in_arrays, in_arrays_not_in_df = find_different_features(df, categorical_features, numerical_features)
#     print(f"in_df_not_in_arrays: {in_df_not_in_arrays}")
#     print(f"in_arrays_not_in_df: {in_arrays_not_in_df}")

#     print("Describing dataframe...")
#     print(f"Dataframe shape: {df.shape}")
    
#     return df.describe().sort_index(axis=1, ascending=True)

# print("DONE.")

Defining auxiliary arrays of features...
Defining auxiliary functions...
DONE.


In [2]:
# -------------------- DATASET --------------------
print("Loading imbalanced dataset...")
df = pd.read_csv(paths.RAW_ALL_PATH)

# description = print_dataframe_stats(df, all_categorical_features, all_numerical_features)
# description

print("DONE.")

Loading imbalanced dataset...
DONE.


In [3]:
# -------------------- FEATURE ENGINEERING --------------------
print("\nPREPROCESSING:")

# -------------------- Feature temporali --------------------
# La colonna TransactionDT è in secondi.
# Non usiamo il valore grezzo perché sono secondi cumulativi a partire da una data e ora di riferimento sconosciute, quindi NON hanno un significato immediato.
# Trasformiamo questi valori in features in cui sia possbile catturare pattern temporali (grazie all'applicazione delle funzioni seno e coseno).
print("Creating temporal features...")
df["TransactionDT_days"] = df["TransactionDT"] / (24*60*60)

hour = (df["TransactionDT"] // 3600) % 24
dayofweek = (df["TransactionDT"] // (24*3600)) % 7

df["hour_sin"] = np.sin(2 * np.pi * hour / 24)
df["hour_cos"] = np.cos(2 * np.pi * hour / 24)
df["dayofweek_sin"] = np.sin(2 * np.pi * dayofweek / 7)
df["dayofweek_cos"] = np.cos(2 * np.pi * dayofweek / 7)

categorical_features, numerical_features, feature_names = update_features_arrays(df, all_categorical_features, all_numerical_features)

# description = print_dataframe_stats(df, categorical_features, numerical_features)
# description

print("DONE.")


PREPROCESSING:
Creating temporal features...
Updating features arrays...
DONE.


In [4]:
# -------------------- Features / Target --------------------
print("Creating feature matrix (X) and target vector (y)...")
X = df.drop(columns=["isFraud"])
y = df["isFraud"]

categorical_features, numerical_features, feature_names = update_features_arrays(X, all_categorical_features, all_numerical_features)

# description = print_dataframe_stats(X, categorical_features, numerical_features)
# description

print("DONE.")

Creating feature matrix (X) and target vector (y)...
Updating features arrays...
DONE.


In [5]:
# -------------------- 1st feature selection --------------------
print("Applying first feature selection...")
X = X.drop(columns=["TransactionID", "TransactionDT"])  # useless for prediction
X = X.loc[:, X.isnull().mean() < const.MISSING_VALUES_THRESHOLD]    # drop features with more than 90% missing values

categorical_features, numerical_features, feature_names = update_features_arrays(X, all_categorical_features, all_numerical_features)

# description = print_dataframe_stats(X, categorical_features, numerical_features)
# description

print("DONE.")

Applying first feature selection...
Updating features arrays...
DONE.


In [6]:
# -------------------- Train / Test --------------------
print("Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=const.DIM_TEST, random_state=const.RANDOM_STATE, stratify=y
)

# print_dataframe_stats(X_train, categorical_features, numerical_features)
# description = print_dataframe_stats(X_test, categorical_features, numerical_features)
# description

print("DONE.")

Splitting data into train and test sets...
DONE.


In [7]:
# -------------------- Imputation --------------------
print("Applying imputer...")
imputer = ColumnTransformer(
    transformers=[
        ("cat", SimpleImputer(strategy="most_frequent"), categorical_features),
        ("num", SimpleImputer(strategy="median"), numerical_features)
    ],
    verbose=True
)
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=feature_names, index=y_train.index) # type: ignore
X_test = pd.DataFrame(imputer.transform(X_test), columns=feature_names, index=y_test.index) # type: ignore

categorical_features, numerical_features, feature_names = update_features_arrays(X_train, all_categorical_features, all_numerical_features)

# print_dataframe_stats(X_train, categorical_features, numerical_features)
# description = print_dataframe_stats(X_test, categorical_features, numerical_features)
# description

print("DONE.")

Applying imputer...
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   3.1s
[ColumnTransformer] ........... (2 of 2) Processing num, total=  19.8s
Updating features arrays...
DONE.


In [8]:
# -------------------- Scaling --------------------
print("Applying scaler...")
scaler = ColumnTransformer(
    transformers=[
        ("cat", "passthrough", categorical_features),  # lascia intatte le categoriche
        ("num", RobustScaler(), numerical_features)
    ],
    verbose=True
)
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names, index=y_train.index) # type: ignore
X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names, index=y_test.index) # type: ignore

categorical_features, numerical_features, feature_names = update_features_arrays(X_train, all_categorical_features, all_numerical_features)

# print_dataframe_stats(X_train, categorical_features, numerical_features)
# description = print_dataframe_stats(X_test, categorical_features, numerical_features)
# description

print("DONE.")

Applying scaler...
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   0.0s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   6.8s
Updating features arrays...
DONE.


In [9]:
# -------------------- Encoding --------------------
print("Applying one-hot encoding...")
encoder = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numerical_features)  # lascia intatte le numeriche
    ],
    verbose=True
)
X_train_sparse = encoder.fit_transform(X_train)
X_test_sparse = encoder.transform(X_test)

# Salvo i nomi delle feature dopo l'encoding (per i passi successivi)
feature_names = encoder.get_feature_names_out()

print("DONE.")

Applying one-hot encoding...
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   2.2s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   0.0s
DONE.


In [10]:
print("Checkpoints...")
X_train_checkpoint = X_train_sparse
X_test_checkpoint = X_test_sparse
feature_names_checkpoint = feature_names

print("DONE.")

Checkpoints...
DONE.


In [None]:
# # Ripristino il checkpoint
# X_train_sparse = X_train_checkpoint
# X_test_sparse = X_test_checkpoint
# feature_names = feature_names_checkpoint


# -------------------- 2nd feature selection --------------------
# Seleziono le feature con varianza sopra una certa soglia
print("Selecting best features by variance threshold...")
var_thresh = VarianceThreshold(threshold=const.VARIANCE_THRESHOLD)
X_train_sparse = var_thresh.fit_transform(X_train_sparse)
X_test_sparse = var_thresh.transform(X_test_sparse)

X_train = pd.DataFrame(X_train_sparse.toarray(), columns=feature_names[var_thresh.get_support()], index=y_train.index)  # type: ignore
X_test = pd.DataFrame(X_test_sparse.toarray(), columns=feature_names[var_thresh.get_support()], index=y_test.index)     # type: ignore

# Definisco una mask per features discrete/continue
print("Creating dynamic mask for discrete features...")
remaining_cols = X_train.columns
discrete_mask = [np.issubdtype(X_train[c].to_numpy().dtype, np.integer) for c in remaining_cols]

# Definisco la score function mutual info
def mi_score(X, y):
    if hasattr(X, "toarray"):  # se è sparse
        X = X.toarray()
    return mutual_info_classif(X, y, discrete_features=discrete_mask, random_state=const.RANDOM_STATE)

# Seleziono le migliori k feature
print("Selecting best features by mutual information...")
selector = SelectKBest(score_func=mi_score, k=const.BEST_K_FEATURES)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

# Seleziono le features finali
selected_features = remaining_cols[selector.get_support()]

# Ricostruisco DataFrame denso
X_train = pd.DataFrame(X_train, columns=selected_features, index=y_train.index)
X_test = pd.DataFrame(X_test, columns=selected_features, index=y_test.index) # type: ignore

# print_dataframe_stats(X_train, categorical_features, numerical_features)
# description = print_dataframe_stats(X_test, categorical_features, numerical_features)
# description

print("DONE.")

Selecting best features by variance threshold...
Type of X_train before sparse->dense conversion: <class 'scipy.sparse._csr.csr_matrix'>
Type of X_test before sparse->dense conversion: <class 'scipy.sparse._csr.csr_matrix'>
Shape of X_train before sparse->dense conversion: (442905, 394)
Shape of X_test before sparse->dense conversion: (147635, 394)
Shape of feature_names: (394,)
Shape of X_train after sparse->dense conversion: (442905, 394)
Shape of X_test after sparse->dense conversion: (147635, 394)
Type of X_train after sparse->dense conversion: <class 'pandas.core.frame.DataFrame'>
Type of X_test after sparse->dense conversion: <class 'pandas.core.frame.DataFrame'>
Creating dynamic mask for discrete features...
Selecting best features by mutual information...
Updating features arrays...
Finding problematic features...
Finding differences among features...
in_df_not_in_arrays: ['cat__ProductCD_C', 'cat__ProductCD_W', 'cat__card3_150.0', 'cat__card4_mastercard', 'cat__card4_visa', 'c

Unnamed: 0,cat__DeviceInfo_Windows,cat__DeviceType_desktop,cat__M2_T,cat__M3_T,cat__M4_M0,cat__M5_F,cat__M6_F,cat__M6_T,cat__M7_F,cat__M8_F,cat__M9_T,cat__P_emaildomain_gmail.com,cat__ProductCD_C,cat__ProductCD_W,cat__R_emaildomain_gmail.com,cat__addr1_299.0,cat__card3_150.0,cat__card4_mastercard,cat__card4_visa,cat__card5_226.0,cat__card6_credit,cat__card6_debit,cat__id_13_52.0,cat__id_14_-300.0,cat__id_15_Found,cat__id_16_Found,cat__id_17_166.0,cat__id_19_266.0,cat__id_20_507.0,cat__id_28_Found,cat__id_29_Found,cat__id_30_Windows 10,cat__id_31_chrome 63.0,cat__id_33_1920x1080,cat__id_35_F,cat__id_35_T,cat__id_38_F,num__C10,num__C12,num__C4,num__C7,num__C8,num__D8,num__TransactionAmt,num__TransactionDT_days,num__V10,num__V11,num__V12,num__V13,num__V168,num__V170,num__V171,num__V178,num__V179,num__V186,num__V187,num__V188,num__V189,num__V190,num__V199,num__V200,num__V201,num__V202,num__V203,num__V204,num__V211,num__V212,num__V213,num__V217,num__V218,num__V219,num__V221,num__V222,num__V229,num__V230,num__V231,num__V232,num__V233,num__V242,num__V243,num__V244,num__V245,num__V246,num__V257,num__V258,num__V259,num__V263,num__V264,num__V265,num__V273,num__V274,num__V275,num__V282,num__V283,num__V29,num__V30,num__V302,num__V303,num__V304,num__V34,num__V35,num__V36,num__V40,num__V45,num__V48,num__V49,num__V51,num__V52,num__V53,num__V54,num__V69,num__V70,num__V75,num__V76,num__V79,num__V90,num__V91,num__V94,num__dayofweek_cos,num__dayofweek_sin
count,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0,147635.0
mean,0.88202,0.907088,0.941965,0.884336,0.810702,0.817611,0.670288,0.329712,0.944485,0.848945,0.93444,0.546212,0.114919,0.746774,0.865824,0.188722,0.886335,0.318949,0.65521,0.510231,0.251255,0.748698,0.884499,0.939689,0.877346,0.893846,0.898141,0.798869,0.803712,0.891205,0.88901,0.905686,0.801565,0.905375,0.106059,0.893941,0.887892,5.618715,4.41988,4.331961,3.099069,5.505436,13.419857,0.808337,0.0077,0.246798,0.254824,-0.383432,-0.348048,1.421655,0.100742,0.163132,1.607403,1.183696,0.036096,0.199045,0.0034,0.009076,0.051532,0.064321,0.027209,0.036638,109.207385,260.434908,166.876597,94.844277,185.721886,130.661355,0.242334,0.395712,0.314356,0.067477,0.087412,0.141606,0.102571,0.17781,0.234057,0.211305,0.024906,0.039042,0.02601,-0.027913,0.040519,0.054391,0.074684,-0.006055,26.987438,46.376868,35.077109,17.503827,25.356053,20.916618,-0.182159,-0.009253,0.340278,0.357043,0.24946,0.281011,0.262119,0.120019,-0.327023,-0.300728,0.125309,0.085793,0.276317,0.286633,0.120839,0.129502,-0.366011,-0.329725,0.341904,0.357436,-0.386873,-0.349856,0.114966,0.343469,0.359149,0.115007,0.17386,-0.002717
std,0.322586,0.290309,0.23381,0.319822,0.391746,0.386166,0.47011,0.47011,0.228984,0.358104,0.247513,0.497862,0.318925,0.434861,0.340842,0.391289,0.317406,0.466071,0.475301,0.499897,0.433736,0.433763,0.319627,0.238063,0.328041,0.308035,0.302464,0.400847,0.39719,0.311383,0.314121,0.292266,0.398823,0.292698,0.307914,0.307914,0.3155,101.146294,92.20441,72.30351,65.679079,101.057001,88.849093,2.84261,0.56166,0.445414,0.468773,0.499201,0.515681,26.828198,0.880096,1.246872,34.294253,25.05147,0.613466,4.789899,0.344746,0.403728,0.763914,0.845941,0.623504,0.697024,2341.840425,4509.862018,3006.349115,2271.385202,3708.808171,2715.75589,4.802721,6.893474,5.998873,1.65768,1.732921,2.04074,1.022816,4.455483,5.214553,5.057389,0.313863,0.655983,0.332991,1.112938,0.516113,0.62008,0.955208,1.272833,569.010086,1082.185146,736.121709,465.030573,632.543615,529.998768,0.918124,1.527335,0.495085,0.537785,0.478705,0.618284,0.526937,0.338261,0.482593,0.493549,0.430506,0.620485,0.464561,0.493997,0.349138,0.379078,0.498234,0.515369,0.497516,0.536237,0.501615,0.517875,0.349889,0.497956,0.540086,0.319031,0.460568,0.454563
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-38.041664,-0.836817,-0.878401,0.0,0.0,-1.0,-1.0,0.0,-1.0,-1.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,-0.445042,-0.62349
25%,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.304402,-0.52082,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,-0.445042,-0.5
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.005012,-0.003091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.684012,0.478483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.554958,0.5
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3256.0,3187.0,2252.0,2254.0,3330.0,1600.833336,73.369744,1.034185,4.0,4.0,2.0,5.0,957.0,45.0,59.0,1235.0,913.0,36.0,215.0,29.0,29.0,39.0,42.0,42.0,53.0,94053.0,139736.0,101995.0,92888.0,129006.0,97236.0,301.0,394.0,372.0,383.0,383.0,174.0,62.0,290.0,331.0,326.0,17.0,56.0,19.0,261.0,42.0,45.0,63.0,284.0,50820.0,76000.0,60000.0,29667.0,66000.0,32545.0,23.0,67.0,5.0,9.0,13.0,18.0,13.0,13.0,2.0,3.0,14.0,45.0,5.0,5.0,6.0,12.0,4.0,5.0,4.0,5.0,2.0,4.0,7.0,5.0,5.0,1.0,0.801938,0.62349


In [None]:
# -------------------- SMOTE --------------------
SAMPLING_STRATEGY = const.TARGET_MINORITY_RATIO_1_5
SMOTED_TRAIN_PATH = paths.SMOTE20_PREP_TRAIN_PATH

# SMOTE ci serve per bilanciare i dati del training set: il nostro dataset è fortemente sbilanciato (ci sono pochissime transazioni fraudolente).
print("Applying SMOTE to balance the training set...")
smote = SMOTE(random_state=const.RANDOM_STATE, sampling_strategy=SAMPLING_STRATEGY) # type: ignore
X_train, y_train = smote.fit_resample(X_train, y_train) # type: ignore[arg-type]

# Unisco X e y in un unico DataFrame
smote_prep_train = X_train.copy()
smote_prep_train["isFraud"] = y_train

print("DONE.")

Applying SMOTE to balance the training set...
Shape of NON-smote prep_train: (442905, 121)
Shape of smote_prep_train: (512889, 121)
