In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score



In [2]:
# Load datasets
real_data = pd.read_csv("creditcard.csv")
synthetic_data = pd.read_csv("smote_synthetic_data.csv")

# Split features and target for real data
X_real = real_data.drop(columns=["Class"])
y_real = real_data["Class"]

# Split features and target for synthetic data
X_synthetic = synthetic_data.drop(columns=["Class"])
y_synthetic = synthetic_data["Class"]


In [5]:
print(X_real)
print(y_real)
print(X_synthetic)
print(y_synthetic)

            Time         V1         V2        V3        V4        V5  \
0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
...          ...        ...        ...       ...       ...       ...   
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V20       V21  \
0       0.462388  0.239599  0.098698  0.363787  ...  0.25141

In [6]:
# Define XGBoost model
def evaluate_model(X, y, experiment_name):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metrics = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        print(f"Running {experiment_name} - Fold {fold}...")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = XGBClassifier(
            scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
            use_label_encoder=False,
            eval_metric="logloss",
            random_state=42
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        metrics.append({
            "Experiment": experiment_name,
            "Fold": fold,
            "Accuracy": accuracy,
            "F1 Score": f1,
            "Recall": recall,
            "Precision": precision,
            "ROC-AUC": roc_auc
        })

    return metrics



In [7]:
# Perform 5 experiments for real and synthetic data
all_results = []
for experiment in range(1, 6):
    print(f"Experiment {experiment} for Real Data")
    real_metrics = evaluate_model(X_real, y_real, f"Real Data - Experiment {experiment}")
    all_results.extend(real_metrics)

    print(f"Experiment {experiment} for Synthetic Data")
    synthetic_metrics = evaluate_model(X_synthetic, y_synthetic, f"Synthetic Data - Experiment {experiment}")
    all_results.extend(synthetic_metrics)


Experiment 1 for Real Data
Running Real Data - Experiment 1 - Fold 1...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 1 - Fold 2...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 1 - Fold 3...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 1 - Fold 4...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 1 - Fold 5...


Parameters: { "use_label_encoder" } are not used.



Experiment 1 for Synthetic Data
Running Synthetic Data - Experiment 1 - Fold 1...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 1 - Fold 2...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 1 - Fold 3...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 1 - Fold 4...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 1 - Fold 5...


Parameters: { "use_label_encoder" } are not used.



Experiment 2 for Real Data
Running Real Data - Experiment 2 - Fold 1...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 2 - Fold 2...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 2 - Fold 3...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 2 - Fold 4...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 2 - Fold 5...


Parameters: { "use_label_encoder" } are not used.



Experiment 2 for Synthetic Data
Running Synthetic Data - Experiment 2 - Fold 1...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 2 - Fold 2...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 2 - Fold 3...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 2 - Fold 4...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 2 - Fold 5...


Parameters: { "use_label_encoder" } are not used.



Experiment 3 for Real Data
Running Real Data - Experiment 3 - Fold 1...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 3 - Fold 2...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 3 - Fold 3...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 3 - Fold 4...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 3 - Fold 5...


Parameters: { "use_label_encoder" } are not used.



Experiment 3 for Synthetic Data
Running Synthetic Data - Experiment 3 - Fold 1...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 3 - Fold 2...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 3 - Fold 3...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 3 - Fold 4...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 3 - Fold 5...


Parameters: { "use_label_encoder" } are not used.



Experiment 4 for Real Data
Running Real Data - Experiment 4 - Fold 1...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 4 - Fold 2...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 4 - Fold 3...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 4 - Fold 4...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 4 - Fold 5...


Parameters: { "use_label_encoder" } are not used.



Experiment 4 for Synthetic Data
Running Synthetic Data - Experiment 4 - Fold 1...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 4 - Fold 2...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 4 - Fold 3...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 4 - Fold 4...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 4 - Fold 5...


Parameters: { "use_label_encoder" } are not used.



Experiment 5 for Real Data
Running Real Data - Experiment 5 - Fold 1...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 5 - Fold 2...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 5 - Fold 3...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 5 - Fold 4...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Experiment 5 - Fold 5...


Parameters: { "use_label_encoder" } are not used.



Experiment 5 for Synthetic Data
Running Synthetic Data - Experiment 5 - Fold 1...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 5 - Fold 2...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 5 - Fold 3...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 5 - Fold 4...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Experiment 5 - Fold 5...


Parameters: { "use_label_encoder" } are not used.



In [8]:
# Convert results to a DataFrame
results_df = pd.DataFrame(all_results)

# Display results
print(results_df)

# Save results to a CSV file
results_df.to_csv("xgboost_evaluation_results.csv", index=False)

                       Experiment  Fold  Accuracy  F1 Score    Recall  \
0        Real Data - Experiment 1     1  0.999491  0.846561  0.808081   
1        Real Data - Experiment 1     2  0.999719  0.913043  0.848485   
2        Real Data - Experiment 1     3  0.999579  0.872340  0.836735   
3        Real Data - Experiment 1     4  0.999526  0.850829  0.785714   
4        Real Data - Experiment 1     5  0.999438  0.833333  0.816327   
5   Synthetic Data - Experiment 1     1  0.986723  0.986318  0.983893   
6   Synthetic Data - Experiment 1     2  0.988764  0.988430  0.986735   
7   Synthetic Data - Experiment 1     3  0.987294  0.986895  0.983667   
8   Synthetic Data - Experiment 1     4  0.988216  0.987859  0.985653   
9   Synthetic Data - Experiment 1     5  0.987360  0.986981  0.985067   
10       Real Data - Experiment 2     1  0.999491  0.846561  0.808081   
11       Real Data - Experiment 2     2  0.999719  0.913043  0.848485   
12       Real Data - Experiment 2     3  0.999579  

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score



In [10]:
# Load datasets
real_data = pd.read_csv("creditcard.csv")
synthetic_data = pd.read_csv("smote_synthetic_data.csv")

# Split features and target for real data
X_real = real_data.drop(columns=["Class"])
y_real = real_data["Class"]

# Split features and target for synthetic data
X_synthetic = synthetic_data.drop(columns=["Class"])
y_synthetic = synthetic_data["Class"]



In [11]:
# Define XGBoost model evaluation function
def evaluate_model(X, y, dataset_name):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metrics = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        print(f"Running {dataset_name} - Fold {fold}...")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = XGBClassifier(
            scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
            use_label_encoder=False,
            eval_metric="logloss",
            random_state=42
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        metrics.append({
            "Dataset": dataset_name,
            "Fold": fold,
            "Accuracy": accuracy,
            "F1 Score": f1,
            "Recall": recall,
            "Precision": precision,
            "ROC-AUC": roc_auc
        })

    return metrics



In [12]:
# Perform 5-fold cross-validation for real and synthetic data
real_results = evaluate_model(X_real, y_real, "Real Data")
synthetic_results = evaluate_model(X_synthetic, y_synthetic, "Synthetic Data")



Running Real Data - Fold 1...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Fold 2...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Fold 3...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Fold 4...


Parameters: { "use_label_encoder" } are not used.



Running Real Data - Fold 5...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Fold 1...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Fold 2...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Fold 3...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Fold 4...


Parameters: { "use_label_encoder" } are not used.



Running Synthetic Data - Fold 5...


Parameters: { "use_label_encoder" } are not used.



In [13]:
# Combine results
all_results = real_results + synthetic_results

# Convert results to a DataFrame
results_df = pd.DataFrame(all_results)

# Display results in a tabular format
print(results_df)

# Save results to a CSV file
results_df.to_csv("xgboost_5fold_results.csv", index=False)

          Dataset  Fold  Accuracy  F1 Score    Recall  Precision   ROC-AUC
0       Real Data     1  0.999491  0.846561  0.808081   0.888889  0.980448
1       Real Data     2  0.999719  0.913043  0.848485   0.988235  0.976429
2       Real Data     3  0.999579  0.872340  0.836735   0.911111  0.994223
3       Real Data     4  0.999526  0.850829  0.785714   0.927711  0.969534
4       Real Data     5  0.999438  0.833333  0.816327   0.851064  0.977082
5  Synthetic Data     1  0.986723  0.986318  0.983893   0.988755  0.999070
6  Synthetic Data     2  0.988764  0.988430  0.986735   0.990130  0.999277
7  Synthetic Data     3  0.987294  0.986895  0.983667   0.990145  0.999228
8  Synthetic Data     4  0.988216  0.987859  0.985653   0.990075  0.999254
9  Synthetic Data     5  0.987360  0.986981  0.985067   0.988903  0.999221


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score



In [15]:
# Load datasets
real_data = pd.read_csv("creditcard.csv")
synthetic_data = pd.read_csv("smote_synthetic_data.csv")

# Split features and target for real data
X_real = real_data.drop(columns=["Class"])
y_real = real_data["Class"]

# Split features and target for synthetic data
X_synthetic = synthetic_data.drop(columns=["Class"])
y_synthetic = synthetic_data["Class"]



In [16]:
# Define Random Forest model evaluation function
def evaluate_model(X, y, dataset_name):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metrics = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        print(f"Running {dataset_name} - Fold {fold}...")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = RandomForestClassifier(
            class_weight="balanced",  # Handle class imbalance
            random_state=42
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        metrics.append({
            "Dataset": dataset_name,
            "Fold": fold,
            "Accuracy": accuracy,
            "F1 Score": f1,
            "Recall": recall,
            "Precision": precision,
            "ROC-AUC": roc_auc
        })

    return metrics



In [17]:
# Perform 5-fold cross-validation for real and synthetic data
real_results = evaluate_model(X_real, y_real, "Real Data")
synthetic_results = evaluate_model(X_synthetic, y_synthetic, "Synthetic Data")



Running Real Data - Fold 1...
Running Real Data - Fold 2...
Running Real Data - Fold 3...
Running Real Data - Fold 4...
Running Real Data - Fold 5...
Running Synthetic Data - Fold 1...
Running Synthetic Data - Fold 2...
Running Synthetic Data - Fold 3...
Running Synthetic Data - Fold 4...
Running Synthetic Data - Fold 5...


In [18]:
# Combine results
all_results = real_results + synthetic_results

# Convert results to a DataFrame
results_df = pd.DataFrame(all_results)

# Display results in a tabular format
print(results_df)

# Save results to a CSV file
results_df.to_csv("random_forest_5fold_results.csv", index=False)

          Dataset  Fold  Accuracy  F1 Score    Recall  Precision   ROC-AUC
0       Real Data     1  0.999456  0.824859  0.737374   0.935897  0.943023
1       Real Data     2  0.999614  0.875000  0.777778   1.000000  0.958493
2       Real Data     3  0.999614  0.876404  0.795918   0.975000  0.962728
3       Real Data     4  0.999508  0.839080  0.744898   0.960526  0.942578
4       Real Data     5  0.999491  0.839779  0.775510   0.915663  0.947355
5  Synthetic Data     1  0.979679  0.978916  0.969906   0.988095  0.997605
6  Synthetic Data     2  0.981237  0.980532  0.971485   0.989749  0.998041
7  Synthetic Data     3  0.980842  0.980116  0.970763   0.989651  0.998128
8  Synthetic Data     4  0.979635  0.978859  0.969276   0.988634  0.998148
9  Synthetic Data     5  0.980096  0.979335  0.969637   0.989229  0.998063


In [19]:
# Import LightGBM
from lightgbm import LGBMClassifier

In [20]:
# Define LightGBM model evaluation function
def evaluate_model_lightgbm(X, y, dataset_name):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metrics = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        print(f"Running {dataset_name} - Fold {fold}...")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = LGBMClassifier(
            class_weight="balanced",  # Handle class imbalance
            random_state=42
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        metrics.append({
            "Dataset": dataset_name,
            "Fold": fold,
            "Accuracy": accuracy,
            "F1 Score": f1,
            "Recall": recall,
            "Precision": precision,
            "ROC-AUC": roc_auc
        })

    return metrics



In [22]:
# Perform 5-fold cross-validation for real and synthetic data using LightGBM
real_results_lightgbm = evaluate_model_lightgbm(X_real, y_real, "Real Data")
synthetic_results_lightgbm = evaluate_model_lightgbm(X_synthetic, y_synthetic, "Synthetic Data")



Running Real Data - Fold 1...
[LightGBM] [Info] Number of positive: 393, number of negative: 227452
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022551 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 227845, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Running Real Data - Fold 2...
[LightGBM] [Info] Number of positive: 393, number of negative: 227452
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 227845, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[Light

In [23]:
# Combine results
all_results_lightgbm = real_results_lightgbm + synthetic_results_lightgbm

# Convert results to a DataFrame
results_df_lightgbm = pd.DataFrame(all_results_lightgbm)

# Display results in a tabular format
print(results_df_lightgbm)

# Save results to a CSV file
results_df_lightgbm.to_csv("lightgbm_5fold_results.csv", index=False)

          Dataset  Fold  Accuracy  F1 Score    Recall  Precision   ROC-AUC
0       Real Data     1  0.999280  0.791878  0.787879   0.795918  0.951359
1       Real Data     2  0.999491  0.852792  0.848485   0.857143  0.964549
2       Real Data     3  0.999526  0.860104  0.846939   0.873684  0.990796
3       Real Data     4  0.999386  0.822335  0.826531   0.818182  0.965546
4       Real Data     5  0.999350  0.812183  0.816327   0.808081  0.970747
5  Synthetic Data     1  0.985012  0.984534  0.980825   0.988271  0.998825
6  Synthetic Data     2  0.986219  0.985786  0.982539   0.989054  0.999017
7  Synthetic Data     3  0.985319  0.984852  0.981231   0.988501  0.998962
8  Synthetic Data     4  0.985692  0.985227  0.980916   0.989577  0.999092
9  Synthetic Data     5  0.985495  0.985032  0.981277   0.988816  0.998948


In [24]:
# Import Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

In [25]:
# Define Gradient Boosting model evaluation function
def evaluate_model_gradient_boosting(X, y, dataset_name):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metrics = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        print(f"Running {dataset_name} - Fold {fold}...")
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = GradientBoostingClassifier(
            random_state=42
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        metrics.append({
            "Dataset": dataset_name,
            "Fold": fold,
            "Accuracy": accuracy,
            "F1 Score": f1,
            "Recall": recall,
            "Precision": precision,
            "ROC-AUC": roc_auc
        })

    return metrics


In [26]:
# Perform 5-fold cross-validation for real and synthetic data using Gradient Boosting
real_results_gb = evaluate_model_gradient_boosting(X_real, y_real, "Real Data")
synthetic_results_gb = evaluate_model_gradient_boosting(X_synthetic, y_synthetic, "Synthetic Data")


Running Real Data - Fold 1...
Running Real Data - Fold 2...
Running Real Data - Fold 3...
Running Real Data - Fold 4...
Running Real Data - Fold 5...
Running Synthetic Data - Fold 1...
Running Synthetic Data - Fold 2...
Running Synthetic Data - Fold 3...
Running Synthetic Data - Fold 4...
Running Synthetic Data - Fold 5...


In [28]:
# Combine results
all_results_gb = real_results_gb + synthetic_results_gb

# Convert results to a DataFrame
results_df_gb = pd.DataFrame(all_results_gb)

# Display results in a tabular format
print(results_df_gb)

# Save results to a CSV file
results_df_gb.to_csv("gradient_boosting_5fold_results.csv", index=False)

          Dataset  Fold  Accuracy  F1 Score    Recall  Precision   ROC-AUC
0       Real Data     1  0.999052  0.703297  0.646465   0.771084  0.828070
1       Real Data     2  0.999052  0.712766  0.676768   0.752809  0.843105
2       Real Data     3  0.999350  0.808290  0.795918   0.821053  0.941922
3       Real Data     4  0.999175  0.734463  0.663265   0.822785  0.795746
4       Real Data     5  0.998490  0.348485  0.234694   0.676471  0.390827
5  Synthetic Data     1  0.977924  0.977109  0.968733   0.985632  0.997873
6  Synthetic Data     2  0.980381  0.979664  0.971576   0.987889  0.998246
7  Synthetic Data     3  0.979526  0.978763  0.970042   0.987643  0.998108
8  Synthetic Data     4  0.978999  0.978212  0.969231   0.987361  0.998320
9  Synthetic Data     5  0.979745  0.978999  0.970629   0.987515  0.998231


In [7]:
import pandas as pd
from scipy.stats import ttest_rel

# Load results for all models
xgboost_results = pd.read_csv("xgboost_5fold_results.csv")
lightgbm_results = pd.read_csv("lightgbm_5fold_results.csv")
random_forest_results = pd.read_csv("random_forest_5fold_results.csv")
gradient_boosting_results = pd.read_csv("gradient_boosting_5fold_results.csv")

# Extract F1 Score and Recall for real and synthetic datasets
# Real: First 5 rows, Synthetic: Last 5 rows
xgboost_f1_real = xgboost_results["F1 Score"][:5]
xgboost_f1_synthetic = xgboost_results["F1 Score"][5:]
xgboost_recall_real = xgboost_results["Recall"][:5]
xgboost_recall_synthetic = xgboost_results["Recall"][5:]

lightgbm_f1_real = lightgbm_results["F1 Score"][:5]
lightgbm_f1_synthetic = lightgbm_results["F1 Score"][5:]
lightgbm_recall_real = lightgbm_results["Recall"][:5]
lightgbm_recall_synthetic = lightgbm_results["Recall"][5:]

random_forest_f1_real = random_forest_results["F1 Score"][:5]
random_forest_f1_synthetic = random_forest_results["F1 Score"][5:]
random_forest_recall_real = random_forest_results["Recall"][:5]
random_forest_recall_synthetic = random_forest_results["Recall"][5:]

gradient_boosting_f1_real = gradient_boosting_results["F1 Score"][:5]
gradient_boosting_f1_synthetic = gradient_boosting_results["F1 Score"][5:]
gradient_boosting_recall_real = gradient_boosting_results["Recall"][:5]
gradient_boosting_recall_synthetic = gradient_boosting_results["Recall"][5:]

# Perform paired t-tests for F1 Score
print("Paired t-tests for F1 Score (Real vs Synthetic):")
print("XGBoost:", ttest_rel(xgboost_f1_real, xgboost_f1_synthetic).pvalue)
print("LightGBM:", ttest_rel(lightgbm_f1_real, lightgbm_f1_synthetic).pvalue)
print("Random Forest:", ttest_rel(random_forest_f1_real, random_forest_f1_synthetic).pvalue)
print("Gradient Boosting:", ttest_rel(gradient_boosting_f1_real, gradient_boosting_f1_synthetic).pvalue)

# Perform paired t-tests for Recall
print("\nPaired t-tests for Recall (Real vs Synthetic):")
print("XGBoost:", ttest_rel(xgboost_recall_real, xgboost_recall_synthetic).pvalue)
print("LightGBM:", ttest_rel(lightgbm_recall_real, lightgbm_recall_synthetic).pvalue)
print("Random Forest:", ttest_rel(random_forest_recall_real, random_forest_recall_synthetic).pvalue)
print("Gradient Boosting:", ttest_rel(gradient_boosting_recall_real, gradient_boosting_recall_synthetic).pvalue)

Paired t-tests for F1 Score (Real vs Synthetic):
XGBoost: 0.0008219474530923674
LightGBM: 0.0002369557697773966
Random Forest: 0.00022060692445166882
Gradient Boosting: 0.016951867843909053

Paired t-tests for Recall (Real vs Synthetic):
XGBoost: 0.00010974704075771666
LightGBM: 0.00014082157444126128
Random Forest: 4.438980820911954e-05
Gradient Boosting: 0.018783706873946652


In [8]:
import pandas as pd
from scipy.stats import f_oneway

# Load results for all models
xgboost_results = pd.read_csv("xgboost_5fold_results.csv")
lightgbm_results = pd.read_csv("lightgbm_5fold_results.csv")
random_forest_results = pd.read_csv("random_forest_5fold_results.csv")
gradient_boosting_results = pd.read_csv("gradient_boosting_5fold_results.csv")

# Aggregate metrics (mean F1 Score and Recall) for real and synthetic datasets
data = {
    "Model": ["XGBoost", "LightGBM", "Random Forest", "Gradient Boosting"],
    "F1 Score (Real)": [
        xgboost_results["F1 Score"][:5].mean(),
        lightgbm_results["F1 Score"][:5].mean(),
        random_forest_results["F1 Score"][:5].mean(),
        gradient_boosting_results["F1 Score"][:5].mean()
    ],
    "F1 Score (Synthetic)": [
        xgboost_results["F1 Score"][5:].mean(),
        lightgbm_results["F1 Score"][5:].mean(),
        random_forest_results["F1 Score"][5:].mean(),
        gradient_boosting_results["F1 Score"][5:].mean()
    ],
    "Recall (Real)": [
        xgboost_results["Recall"][:5].mean(),
        lightgbm_results["Recall"][:5].mean(),
        random_forest_results["Recall"][:5].mean(),
        gradient_boosting_results["Recall"][:5].mean()
    ],
    "Recall (Synthetic)": [
        xgboost_results["Recall"][5:].mean(),
        lightgbm_results["Recall"][5:].mean(),
        random_forest_results["Recall"][5:].mean(),
        gradient_boosting_results["Recall"][5:].mean()
    ]
}

# Convert to DataFrame
aggregated_data = pd.DataFrame(data)

# Perform ANOVA for F1 Score
f1_real = aggregated_data["F1 Score (Real)"]
f1_synthetic = aggregated_data["F1 Score (Synthetic)"]
anova_f1 = f_oneway(f1_real, f1_synthetic)

# Perform ANOVA for Recall
recall_real = aggregated_data["Recall (Real)"]
recall_synthetic = aggregated_data["Recall (Synthetic)"]
anova_recall = f_oneway(recall_real, recall_synthetic)

# Print results
print("ANOVA Results for F1 Score:")
print(f"F-statistic: {anova_f1.statistic:.4f}, p-value: {anova_f1.pvalue:.10f}")

print("\nANOVA Results for Recall:")
print(f"F-statistic: {anova_recall.statistic:.4f}, p-value: {anova_recall.pvalue:.10f}")

ANOVA Results for F1 Score:
F-statistic: 14.8873, p-value: 0.0083777219

ANOVA Results for Recall:
F-statistic: 18.4949, p-value: 0.0050896162
