In [57]:
"""
Preprocessing utilities for fraud detection
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def load_data(csv_path='dataset/transactions.csv'):
    """Read a transactions CSV and drop the columns the pipeline does not use.

    Two layouts are handled:

    * ``'../dataset/resampled_data.csv'`` (matched by exact string): rows with
      any null are removed, the boolean ``target`` column is mapped to 0/1 and
      exposed as ``isFraud``, and the listed helper columns are dropped.
    * any other path: only the unused raw columns are dropped.

    Columns from the drop lists that are not present in the file are skipped
    instead of raising ``KeyError``, so a CSV written without the
    ``Unnamed: 0`` index column (or already partly cleaned) still loads.

    Args:
        csv_path: Path handed to ``pandas.read_csv``.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    df = pd.read_csv(csv_path)
    print('Loaded dataset with shape:', df.shape)

    if csv_path == '../dataset/resampled_data.csv':
        df = df.dropna()
        df['target'] = df['target'].map({False: 0, True: 1})
        df['isFraud'] = df['target']
        resampled_drop = [
            "enteredCVV", "creditLimit", "noacqCountry",
            "acqCountry_CAN", "acqCountry_MEX", "acqCountry_PR",
            "acqCountry_US", "target",
        ]
        return df.drop(columns=resampled_drop, errors='ignore')

    # Columns removed from the raw export before feature engineering
    columns_to_drop = [
        "Unnamed: 0", "enteredCVV", "creditLimit",
        "acqCountry", "customerId", "echoBuffer",
        "merchantCity", "merchantState", "merchantZip",
        "posOnPremises", "recurringAuthInd",
    ]
    # errors='ignore': tolerate files that lack some of these columns
    return df.drop(columns=columns_to_drop, errors='ignore')


def one_hot_encode_categorical(df):
    """One-hot encode the categorical columns, flagging values that were null.

    For each of ``acqCountry``, ``merchantCountryCode`` and ``transactionType``
    that is present, a 0/1 indicator column named ``no<column>`` records which
    rows were null; the nulls are then encoded as their own category whose
    dummy column is discarded (the indicator already carries that fact).
    ``merchantCategoryCode`` is encoded directly. Columns that are absent are
    skipped.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df: DataFrame that may contain the categorical columns above.

    Returns:
        A new DataFrame with the source columns replaced by their dummy
        columns, which are appended after the remaining columns.
    """
    print('Starting one-hot encoding...\n')

    columns_with_nulls = ['acqCountry', 'merchantCountryCode', 'transactionType']
    columns_without_nulls = ['merchantCategoryCode']
    all_encode_columns = columns_with_nulls + columns_without_nulls

    # Copy first: the assignments below would otherwise add and overwrite
    # columns on the caller's DataFrame.
    df = df.copy()

    # Record missingness, then give nulls an explicit category
    for col in columns_with_nulls:
        if col in df.columns:
            df[f'no{col}'] = df[col].isnull().astype(int)
            df[col] = df[col].fillna('MISSING')

    encoded_dfs = []
    for col in all_encode_columns:
        if col not in df.columns:
            continue

        one_hot = pd.get_dummies(df[col], prefix=col, drop_first=False)
        if col in columns_with_nulls:
            # Redundant with the `no<col>` indicator created above
            one_hot = one_hot.drop(columns=[f'{col}_MISSING'], errors='ignore')

        encoded_dfs.append(one_hot)
        df = df.drop(columns=[col])

    if encoded_dfs:
        df = pd.concat([df] + encoded_dfs, axis=1)

    print(f'Encoding complete! New shape: {df.shape}\n')
    return df


def convert_dates_to_numeric(df):
    """Replace date columns with whole-day offsets from the transaction time.

    ``transactionDateTime`` is parsed (unparseable values become NaT) and used
    as the reference. For each of ``currentExpDate``, ``accountOpenDate`` and
    ``dateOfLastAddressChange`` that is present, a new column holds the
    ``.dt.days`` component of ``transactionDateTime - <date>``; the sign is
    flipped for ``daysToCurrentExpDate`` so that a future expiry is positive.
    The source date columns and ``transactionDateTime`` are removed.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df: DataFrame containing ``transactionDateTime``.

    Returns:
        A new DataFrame with the numeric day-offset columns.

    Raises:
        KeyError: if ``transactionDateTime`` is missing.
    """
    print('Converting date columns to numeric features...\n')

    date_columns = {
        'currentExpDate': 'daysToCurrentExpDate',
        'accountOpenDate': 'daysSinceAccountOpen',
        'dateOfLastAddressChange': 'daysSinceLastAddressChange'
    }

    # Copy first: parsing below would otherwise overwrite the caller's
    # transactionDateTime column.
    df = df.copy()
    df['transactionDateTime'] = pd.to_datetime(df['transactionDateTime'], errors='coerce')

    for original_col, new_col in date_columns.items():
        if original_col not in df.columns:
            continue

        df[original_col] = pd.to_datetime(df[original_col], errors='coerce')
        df[new_col] = (df['transactionDateTime'] - df[original_col]).dt.days

        if new_col == "daysToCurrentExpDate":
            # Expiry lies in the future, so report "days until" as positive
            df[new_col] = -df[new_col]

        df = df.drop(columns=[original_col])

    df = df.drop(columns=['transactionDateTime'])
    print('Date conversion complete!\n')
    return df


def ordinal_encode_merchant(df):
    """Replace ``merchantName`` with a rank ordered by per-merchant fraud rate.

    Each merchant's fraud rate is ``sum(isFraud) / count(isFraud)`` over the
    rows of ``df``. Merchants are sorted by that rate in ascending order and
    numbered from 0; the number is stored in ``merchantName_ordinal``. Rows
    that receive no rank (for example a null ``merchantName``, which the
    group-by does not produce a group for) get the median rank.

    The input frame is left untouched; all work happens on a copy. If
    ``merchantName`` is absent, ``df`` is returned as is.

    NOTE: the fraud rates are computed from every row passed in, so calling
    this before a train/test split lets test-set labels influence the feature.

    Args:
        df: DataFrame with ``merchantName`` and ``isFraud`` columns.

    Returns:
        A new DataFrame with ``merchantName`` replaced by
        ``merchantName_ordinal``.
    """
    print('Applying ordinal encoding to merchantName...\n')

    if 'merchantName' not in df.columns:
        print('merchantName column not found - skipping')
        return df

    merchant_stats = df.groupby('merchantName').agg({
        'isFraud': ['sum', 'count']
    }).reset_index()

    merchant_stats.columns = ['merchantName', 'fraud_count', 'total_count']
    merchant_stats['prob_fraud'] = merchant_stats['fraud_count'] / merchant_stats['total_count']
    merchant_stats['score'] = merchant_stats['prob_fraud']
    merchant_stats = merchant_stats.sort_values('score', ascending=True).reset_index(drop=True)
    merchant_stats['ordinal_rank'] = range(len(merchant_stats))

    merchant_to_rank = dict(zip(merchant_stats['merchantName'], merchant_stats['ordinal_rank']))

    # Copy so the new column is not added to the caller's DataFrame
    df = df.copy()
    ranks = df['merchantName'].map(merchant_to_rank)

    if ranks.isnull().any():
        # Plain assignment instead of chained `fillna(..., inplace=True)`,
        # which is deprecated and has no effect under pandas Copy-on-Write.
        ranks = ranks.fillna(merchant_stats['ordinal_rank'].median())

    df['merchantName_ordinal'] = ranks
    df = df.drop(columns=['merchantName'])
    print(f'Ordinal encoding complete! Total merchants: {len(merchant_stats)}\n')
    return df


def prepare_train_test_split(df, test_size=0.2, random_state=42):
    """Split ``df`` into stratified train/test features and ``isFraud`` labels.

    Args:
        df: DataFrame holding the features plus an ``isFraud`` column.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: if ``df`` has no ``isFraud`` column.
    """
    target = 'isFraud'
    if target not in df.columns:
        raise KeyError("Column 'isFraud' not found in dataframe")

    labels = df[target]
    features = df.drop(columns=[target])

    print(f'X shape: {features.shape}')
    print(f'y shape: {labels.shape}')
    print(f'Fraud rate: {labels.mean():.4f}\n')

    # Stratify on the label so both parts keep the same fraud rate
    parts = train_test_split(
        features,
        labels,
        test_size=test_size,
        stratify=labels,
        random_state=random_state,
    )
    X_train, X_test, y_train, y_test = parts

    print(f'Train shapes -> X: {X_train.shape}, y: {y_train.shape}')
    print(f'Test shapes  -> X: {X_test.shape}, y: {y_test.shape}')
    print(f'Train fraud rate: {y_train.mean():.4f}')
    print(f'Test fraud rate: {y_test.mean():.4f}\n')

    return X_train, X_test, y_train, y_test


def preprocess_pipeline(csv_path='../dataset/transactions.csv'):
    """Run loading, feature engineering, scaling and the train/test split.

    Steps:
        1. ``load_data(csv_path)``.
        2. Unless ``csv_path`` is exactly ``'../dataset/resampled_data.csv'``:
           one-hot encoding, date conversion and merchant ordinal encoding.
        3. Drop rows containing any null.
        4. Divide every column by its maximum, then coerce to numeric.
        5. Stratified train/test split on ``isFraud``.

    NOTE: step 4 runs on the whole frame before the split, so the column
    maxima come from test rows as well, and the ``isFraud`` column is divided
    by its own maximum too.

    Args:
        csv_path: CSV file to process.

    Returns:
        ``(X_train, X_test, y_train, y_test, df)`` where ``df`` is the frame
        after step 3, i.e. before scaling.
    """
    print('='*60)
    print('STARTING PREPROCESSING PIPELINE')
    print('='*60 + '\n')

    # Load data
    df = load_data(csv_path)

    if csv_path != '../dataset/resampled_data.csv':
        # One-hot encoding
        df = one_hot_encode_categorical(df)

        # Date conversion
        df = convert_dates_to_numeric(df)

        # Merchant encoding
        df = ordinal_encode_merchant(df)

    # 2. Preprocessing (scaling)
    # scaler = StandardScaler().set_output(transform="pandas")
    # df_scaled = scaler.fit_transform(df)

    df = df.dropna()

    # Max scaling. A column whose maximum is 0 (e.g. a null-indicator column
    # with no nulls) would become NaN/inf when divided by it, after dropna()
    # has already run; divide such columns by 1 so they stay unchanged.
    col_max = df.max()
    col_max = col_max.where(col_max != 0, 1)

    df_scaled = df / col_max
    df_scaled = df_scaled.apply(pd.to_numeric, errors='coerce')

    # Train/test split
    X_train, X_test, y_train, y_test = prepare_train_test_split(df_scaled)

    print('='*60)
    print('PREPROCESSING COMPLETE')
    print('='*60 + '\n')

    return X_train, X_test, y_train, y_test, df


In [58]:
# Run the preprocessing pipeline on the raw transactions file; `df` is the
# cleaned frame the pipeline returns alongside the train/test split.
X_train, X_test, y_train, y_test, df = preprocess_pipeline('../dataset/transactions.csv')

STARTING PREPROCESSING PIPELINE

Loaded dataset with shape: (786363, 30)
Starting one-hot encoding...

Encoding complete! New shape: (786363, 44)

Converting date columns to numeric features...



  df[original_col] = pd.to_datetime(df[original_col], errors='coerce')


Date conversion complete!

Applying ordinal encoding to merchantName...

Ordinal encoding complete! Total merchants: 2490

X shape: (781903, 42)
y shape: (781903,)
Fraud rate: 0.0155

Train shapes -> X: (625522, 42), y: (625522,)
Test shapes  -> X: (156381, 42), y: (156381,)
Train fraud rate: 0.0155
Test fraud rate: 0.0155

PREPROCESSING COMPLETE



In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report, confusion_matrix

# # from imblearn.under_sampling import ClusterCentroids

# # cc = ClusterCentroids(random_state=42)
# # X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
# from imblearn.under_sampling import RandomUnderSampler

# total = y_train.shape[0]
# print(total)
# # Define the undersampler; the dict below caps class 0 at 90% of the training rows and leaves the other class untouched
# undersample = RandomUnderSampler(sampling_strategy={0: int(0.9* total)}, random_state=42)

# # Aplică pe date (X = feature-uri, y = etichetă)
# X_resampled, y_resampled = undersample.fit_resample(X_train, y_train)

# # Distribuția claselor după undersampling
# from collections import Counter
# print(Counter(y_resampled))

# # Creează și antrenează modelul
# model = LogisticRegression(max_iter=2000, random_state=42, C=0.2)
# model.fit(X_resampled, y_resampled)

# # Predicții
# y_pred = model.predict(X_test)

# # Output metrici relevante
# print(classification_report(y_test, y_pred, digits=4))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


625522
Counter({0.0: 562969, 1.0: 9702})
              precision    recall  f1-score   support

         0.0     0.9845    1.0000    0.9922    153956
         1.0     0.4000    0.0008    0.0016      2425

    accuracy                         0.9845    156381
   macro avg     0.6923    0.5004    0.4969    156381
weighted avg     0.9754    0.9845    0.9768    156381

Confusion Matrix:
 [[153953      3]
 [  2423      2]]


In [56]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# Standardise the features first - the MLP base learner requires scaled inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyper-parameters of the three base learners, kept as plain dicts.
lgbm_params = dict(
    objective='binary',
    metric='f1',
    boosting_type='gbdt',
    n_estimators=800,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.6,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=0.5,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
    is_unbalance=True,  # used in place of an explicit scale_pos_weight
)

mlp_params = dict(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    max_iter=100,
    random_state=42,
)

xgb_params = dict(
    use_label_encoder=False,
    eval_metric='aucpr',  # area under the precision-recall curve
    scale_pos_weight=1,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=1000,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_alpha=0,
    reg_lambda=1,
    random_state=42,
    n_jobs=-1,
)

# Base models
base_learners = [
    ('lgbm', lgb.LGBMClassifier(**lgbm_params)),
    ('mlp', MLPClassifier(**mlp_params)),
    ('xgb', XGBClassifier(**xgb_params)),
]

# Meta-model that combines the base learners' outputs
meta_learner = LogisticRegression(max_iter=200)

# Stacking ensemble; set passthrough=True to also feed the original features
# to the meta-model.
stack = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    passthrough=False,
    n_jobs=-1,
)

# Train
stack.fit(X_train_scaled, y_train)

# Evaluate on the held-out split
y_pred = stack.predict(X_test_scaled)
print(classification_report(y_test, y_pred, digits=4))


              precision    recall  f1-score   support

         0.0     0.9866    0.9994    0.9930    153956
         1.0     0.7770    0.1394    0.2364      2425

    accuracy                         0.9860    156381
   macro avg     0.8818    0.5694    0.6147    156381
weighted avg     0.9834    0.9860    0.9812    156381



In [60]:
# X_train, X_test, y_train, y_test, df = preprocess_pipeline('../dataset/transactions.csv')

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Standardise the features (statistics fitted on the training split only);
# scaling is required for the MLP.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build and train the MLP - hidden_layer_sizes (e.g. (64, 32)), max_iter etc.
# can be adjusted here.
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32, 16),
    activation='relu',
    solver='adam',
    max_iter=100,
    random_state=42,
).fit(X_train_scaled, y_train)

# Predictions and metrics; y_pred1 keeps a separate copy of this model's output
y_pred = mlp.predict(X_test_scaled)
y_pred1 = y_pred.copy()

print(classification_report(y_test, y_pred, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

         0.0     0.9862    0.9992    0.9927    153956
         1.0     0.6830    0.1146    0.1963      2425

    accuracy                         0.9854    156381
   macro avg     0.8346    0.5569    0.5945    156381
weighted avg     0.9815    0.9854    0.9803    156381

Confusion Matrix:
 [[153827    129]
 [  2147    278]]


In [49]:
"""
LightGBM model training and evaluation for fraud detection
"""
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    classification_report, recall_score, precision_score, 
    f1_score, confusion_matrix, make_scorer
)


def train_lightgbm(X_train, y_train, verbose=True):
    """Fit a LightGBM classifier configured for an imbalanced binary target.

    Class imbalance is handled with ``is_unbalance=True``. The negative to
    positive ratio is computed only so it can be reported; it is not passed
    to the model as ``scale_pos_weight``.

    Args:
        X_train: Training features.
        y_train: Training labels; compared against 0 and 1 to count classes.
        verbose: Print progress messages when true.

    Returns:
        The fitted ``lgb.LGBMClassifier``.
    """
    if verbose:
        print('='*60)
        print('TRAINING LIGHTGBM - OPTIMIZED FOR RECALL')
        print('='*60 + '\n')

    # Imbalance ratio, reported for information only
    neg_count = (y_train == 0).sum()
    pos_count = (y_train == 1).sum()
    imbalance_ratio = neg_count / pos_count if pos_count > 0 else 1

    if verbose:
        print(f'Class imbalance ratio (neg/pos): {imbalance_ratio:.2f}')
        # The previous message claimed scale_pos_weight was used, which the
        # constructor call below does not do.
        print('Using is_unbalance=True (scale_pos_weight is not passed) to boost recall\n')

    lgb_model = lgb.LGBMClassifier(
        objective='binary',
        # NOTE(review): 'f1' is not among the metrics listed in LightGBM's
        # parameter documentation - confirm this setting has any effect.
        metric='f1',
        boosting_type='gbdt',
        n_estimators=800,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.6,
        subsample_freq=1,
        colsample_bytree=0.8,
        reg_alpha=0.5,
        reg_lambda=0.5,
        # scale_pos_weight is intentionally left unset; is_unbalance is used
        random_state=42,
        n_jobs=-1,
        verbose=-1,
        is_unbalance=True
    )

    if verbose:
        print('Training LightGBM model...')

    lgb_model.fit(X_train, y_train)

    if verbose:
        print('Training complete!\n')

    return lgb_model


def evaluate_lightgbm(model, X_train, y_train, X_test, y_test, verbose=True):
    """Predict on the train and test sets and optionally print the metrics.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        verbose: When true, print recall, precision, F1 and the confusion
            matrix for both sets, plus a classification report for the test
            set.

    Returns:
        ``(train_predictions, test_predictions)``.
    """
    rule = '=' * 60

    def _print_scores(y_true, y_hat, split_name):
        # Metrics shared by the train and test sections
        print(f'Recall (fraud class): {recall_score(y_true, y_hat):.4f}')
        print(f'Precision (fraud class): {precision_score(y_true, y_hat):.4f}')
        print(f'F1-Score (fraud class): {f1_score(y_true, y_hat):.4f}')
        print(f'\nConfusion Matrix ({split_name}):')
        print(confusion_matrix(y_true, y_hat))

    if verbose:
        print(rule)
        print('EVALUATING LIGHTGBM MODEL')
        print(rule + '\n')

    # Train set
    y_train_pred_lgb = model.predict(X_train)
    if verbose:
        print('=== TRAIN SET PERFORMANCE ===')
        _print_scores(y_train, y_train_pred_lgb, 'Train')

    # Test set
    y_test_pred_lgb = model.predict(X_test)
    if verbose:
        print('\n=== TEST SET PERFORMANCE ===')
        _print_scores(y_test, y_test_pred_lgb, 'Test')
        print('\nClassification Report (Test):')
        print(classification_report(y_test, y_test_pred_lgb))

    return y_train_pred_lgb, y_test_pred_lgb


def get_feature_importance(model, X_train, top_n=15, verbose=True):
    """Return the model's feature importances indexed by column name.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        X_train: Training features; must have a ``columns`` attribute.
        top_n: How many of the largest importances to print.
        verbose: Print the top ``top_n`` importances when true.

    Returns:
        A ``pandas.Series`` of importances, or ``None`` when ``X_train`` has
        no ``columns`` attribute.
    """
    if not hasattr(X_train, 'columns'):
        return None

    importances = pd.Series(model.feature_importances_, index=X_train.columns)

    if verbose:
        print(f'\nTop {top_n} Most Important Features (LightGBM):')
        print(importances.nlargest(top_n))

    return importances


def train_lightgbm_with_random_search(X_train, y_train, n_iter=100, cv=5, verbose=True):
    """
    Tune a LightGBM classifier with RandomizedSearchCV, scored by F1.

    The base estimator receives ``scale_pos_weight`` equal to the ratio of
    negative to positive training labels (1 when there are no positives).

    Args:
        X_train: Training features
        y_train: Training labels
        n_iter: Number of parameter settings sampled (default: 100)
        cv: Number of cross-validation folds (default: 5)
        verbose: Whether to print progress

    Returns:
        best_model: The best estimator found by RandomizedSearchCV
        search_results: The fitted RandomizedSearchCV object
    """
    rule = '=' * 60

    if verbose:
        print(rule)
        print('LIGHTGBM RANDOMIZED SEARCH CV')
        print(rule + '\n')
        print(f'Configuration:')
        print(f'  n_iter: {n_iter} (parameter combinations)')
        print(f'  cv: {cv} (cross-validation folds)')
        print(f'  Total fits: {n_iter * cv}\n')

    # Weight for the positive class, derived from the label counts
    n_negative = (y_train == 0).sum()
    n_positive = (y_train == 1).sum()
    if n_positive > 0:
        scale_pos_weight_lgb = n_negative / n_positive
    else:
        scale_pos_weight_lgb = 1

    if verbose:
        print(f'Class imbalance ratio: {scale_pos_weight_lgb:.2f}\n')

    # Candidate values sampled by the random search
    search_space = {
        'n_estimators': [100, 150, 200, 250, 300, 350, 400],
        'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.15],
        'max_depth': [6, 8, 10, 12, 15, -1],
        'num_leaves': [31, 50, 63, 80, 100, 127],
        'min_child_samples': [10, 15, 20, 25, 30, 40, 50],
        'min_child_weight': [0.001, 0.01, 0.1, 1],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'subsample_freq': [0, 1, 2, 3],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'reg_alpha': [0, 0.01, 0.05, 0.1, 0.5, 1.0],
        'reg_lambda': [0, 0.01, 0.05, 0.1, 0.5, 1.0],
    }

    # Single-threaded estimator: the search itself runs candidates in parallel
    estimator = lgb.LGBMClassifier(
        objective='binary',
        metric='binary_logloss',
        boosting_type='gbdt',
        scale_pos_weight=scale_pos_weight_lgb,
        random_state=42,
        n_jobs=1,
        verbose=-1,
    )

    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=search_space,
        n_iter=n_iter,
        cv=cv,
        scoring=make_scorer(f1_score),
        n_jobs=-1,
        verbose=2 if verbose else 0,
        random_state=42,
        return_train_score=True,
    )

    if verbose:
        print('Starting RandomizedSearchCV...')
        print('This may take a while with parallelization across all CPUs...\n')

    search.fit(X_train, y_train)

    if verbose:
        print('\n' + rule)
        print('RANDOM SEARCH COMPLETE')
        print(rule + '\n')
        print(f'Best score (CV F1): {search.best_score_:.4f}')
        print(f'\nBest parameters:')
        for name, value in search.best_params_.items():
            print(f'  {name}: {value}')

        # Five best candidates by cross-validated rank
        ranked = pd.DataFrame(search.cv_results_).sort_values('rank_test_score')

        print(f'\nTop 5 parameter combinations:')
        for _, row in ranked.head(5).iterrows():
            print(f'\n  Rank {int(row["rank_test_score"])}:')
            print(f'    Mean CV F1: {row["mean_test_score"]:.4f} (+/- {row["std_test_score"]:.4f})')
            print(f'    Mean Fit Time: {row["mean_fit_time"]:.2f}s')

    best_model = search.best_estimator_

    if verbose:
        print('\n' + rule)
        print('Best model ready for predictions!')
        print(rule + '\n')

    return best_model, search


def compare_with_xgboost(y_test, y_test_pred_lgb, y_test_pred_xgb, verbose=True):
    """Compare LightGBM and XGBoost test predictions on recall and precision.

    Args:
        y_test: True test labels.
        y_test_pred_lgb: LightGBM predictions for the test set.
        y_test_pred_xgb: XGBoost predictions for the test set.
        verbose: Print the comparison when true.

    Returns:
        Dict with keys ``lgb_recall``, ``xgb_recall``, ``lgb_precision`` and
        ``xgb_precision``.
    """
    if verbose:
        print('\n' + '='*60)
        print('COMPARISON: LIGHTGBM VS XGBOOST BASELINE')
        print('='*60 + '\n')

    lgb_test_recall = recall_score(y_test, y_test_pred_lgb)
    xgb_test_recall = recall_score(y_test, y_test_pred_xgb)
    lgb_test_precision = precision_score(y_test, y_test_pred_lgb)
    xgb_test_precision = precision_score(y_test, y_test_pred_xgb)

    if verbose:
        print(f'Test Recall:')
        print(f'  LightGBM: {lgb_test_recall:.4f}')
        print(f'  XGBoost:  {xgb_test_recall:.4f}')
        print(f'  Difference: {(lgb_test_recall - xgb_test_recall):+.4f}')

        print(f'\nTest Precision:')
        print(f'  LightGBM: {lgb_test_precision:.4f}')
        print(f'  XGBoost:  {xgb_test_precision:.4f}')
        print(f'  Difference: {(lgb_test_precision - xgb_test_precision):+.4f}')

        if lgb_test_recall > xgb_test_recall:
            if xgb_test_recall > 0:
                improvement = ((lgb_test_recall - xgb_test_recall) / xgb_test_recall) * 100
                print(f'\n✅ LightGBM achieves {improvement:.2f}% better recall than XGBoost!')
            else:
                # A relative improvement over a recall of 0 is undefined;
                # dividing by it would fail or print inf.
                print('\n✅ LightGBM achieves better recall than XGBoost '
                      '(XGBoost recall is 0, relative improvement undefined)')
        elif lgb_test_recall < xgb_test_recall:
            # xgb_test_recall exceeds a non-negative value here, so it is > 0
            decline = ((xgb_test_recall - lgb_test_recall) / xgb_test_recall) * 100
            print(f'\n⚠️  LightGBM recall is {decline:.2f}% lower than XGBoost')
        else:
            print(f'\n➡️  LightGBM and XGBoost achieve the same recall')

    return {
        'lgb_recall': lgb_test_recall,
        'xgb_recall': xgb_test_recall,
        'lgb_precision': lgb_test_precision,
        'xgb_precision': xgb_test_precision
    }


def train_and_evaluate_lightgbm(X_train, y_train, X_test, y_test,
                                y_train_pred_xgb, y_test_pred_xgb,
                                verbose=True):
    """Train LightGBM, evaluate it, and compare it with XGBoost predictions.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        y_train_pred_xgb: XGBoost training predictions; accepted but not
            used by this function.
        y_test_pred_xgb: XGBoost test predictions used for the comparison.
        verbose: Forwarded to every step; also controls the closing banner.

    Returns:
        ``(model, train_predictions, test_predictions, feature_importance,
        comparison)``.
    """
    # 1. Fit
    lgb_model = train_lightgbm(X_train, y_train, verbose=verbose)

    # 2. Score on both splits
    train_pred, test_pred = evaluate_lightgbm(
        lgb_model, X_train, y_train, X_test, y_test, verbose=verbose
    )

    # 3. Feature importances
    importances = get_feature_importance(lgb_model, X_train, verbose=verbose)

    # 4. Comparison against the supplied XGBoost test predictions
    summary = compare_with_xgboost(
        y_test, test_pred, y_test_pred_xgb, verbose=verbose
    )

    if verbose:
        rule = '=' * 60
        print('\n' + rule)
        print('LightGBM training and evaluation complete!')
        print(rule)

    return lgb_model, train_pred, test_pred, importances, summary


In [None]:
# NOTE(review): the last two arguments fill the y_train_pred_xgb and
# y_test_pred_xgb parameters, but the true labels are passed here, so the
# "XGBoost" side of the printed comparison is computed from the labels
# themselves rather than from an XGBoost model.
_, y_pred2, y_test_pred2, _, _ = train_and_evaluate_lightgbm(X_train, y_train, X_test, y_test, y_train, y_test)

TRAINING LIGHTGBM - OPTIMIZED FOR RECALL

Class imbalance ratio (neg/pos): 63.47
Using scale_pos_weight=63.47 to boost recall

Training LightGBM model...
Training complete!

EVALUATING LIGHTGBM MODEL

=== TRAIN SET PERFORMANCE ===
Recall (fraud class): 0.9277
Precision (fraud class): 0.0793
F1-Score (fraud class): 0.1461

Confusion Matrix (Train):
[[511319 104501]
 [   701   9001]]

=== TEST SET PERFORMANCE ===
Recall (fraud class): 0.7035
Precision (fraud class): 0.0600
F1-Score (fraud class): 0.1105

Confusion Matrix (Test):
[[127218  26738]
 [   719   1706]]

Classification Report (Test):
              precision    recall  f1-score   support

         0.0       0.99      0.83      0.90    153956
         1.0       0.06      0.70      0.11      2425

    accuracy                           0.82    156381
   macro avg       0.53      0.76      0.51    156381
weighted avg       0.98      0.82      0.89    156381


Top 15 Most Important Features (LightGBM):
merchantName_ordinal          

(LGBMClassifier(colsample_bytree=0.8, is_unbalance=True, learning_rate=0.05,
                max_depth=8, metric='f1', n_estimators=800, n_jobs=-1,
                objective='binary', random_state=42, reg_alpha=0.5,
                reg_lambda=0.5, subsample=0.6, subsample_freq=1, verbose=-1),
 array([0., 0., 1., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 accountNumber                                2530
 availableMoney                               1926
 transactionAmount                            2323
 posEntryMode                                  487
 posConditionCode                              219
 cardCVV                                      2219
 cardLast4Digits                              2258
 currentBalance                               1869
 cardPresent                                   108
 expirationDateKeyInMatch                       14
 nomerchantCountryCode                          52
 notransactionType                               8
 merchantCountry