In [2]:
#Import necessary libraries
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
import time
import numpy as np
import os

# Mapping dataset sizes to their corresponding CSV filenames
files = {
    100: "generated_data_100.csv",
    1000: "generated_data_1000.csv",
    10000: "generated_data_10000.csv",
    100000: "generated_data_1e+05.csv",
    1000000: "generated_data_1e+06.csv",
    10000000: "generated_data_1e+07.csv"
}

# Predictor columns and outcome column
cols_x = ['pregnant', 'glucose', 'pressure', 'triceps', 'insulin', 'mass', 'pedigree', 'age']
col_y = 'outcome'
folds = 5  # Number of folds for cross-validation

def run_cv(X, y, folds=5):
    """
    Runs XGBoost classification using scikit-learn's cross_val_score.

    Parameters:
    X (DataFrame): Predictor variables.
    y (Series): Outcome variable (binary classification).
    folds (int): Number of CV folds.

    Returns:
    Tuple: mean accuracy score, time taken for fitting
    """
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        n_jobs=-1,
        verbosity=0
    )
    cv = KFold(n_splits=folds, shuffle=True, random_state=42)  # 5-fold CV
    start = time.time()
    acc = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
    end = time.time()
    return np.mean(acc), end - start

def main():
    """
    Main function to process all datasets, run XGBoost with 5-fold CV,
    and store results in a summary table.
    """
    results = []

    print("Running XGBoost (Python + 5-fold CV)\n" + "-"*40)

    for sz, fname in files.items():
        print(f"\nDataset Size: {sz} | File: {fname}")

        try:
            # Check if the file exists
            if not os.path.exists(fname):
                raise FileNotFoundError(f"File not found: {fname}")

            # Load data and measure load time
            t_load = time.time()
            df = pd.read_csv(fname)
            print(f"Loaded {len(df)} rows in {time.time() - t_load:.2f}s")

            # Check for necessary columns
            required_cols = cols_x + [col_y]
            if not all(col in df.columns for col in required_cols):
                raise ValueError(f"Missing columns. Found: {df.columns.tolist()}")

            # Drop rows with missing values
            before_rows = len(df)
            df = df.dropna(subset=required_cols)
            dropped = before_rows - len(df)
            if dropped > 0:
                print(f"Dropped {dropped} rows with missing values")

            # Split into predictors and target
            X = df[cols_x]
            y = df[col_y].astype(int)

            print("Running cross-validation...")
            acc, t_cv = run_cv(X, y, folds)
            print(f"CV Time: {t_cv:.2f}s | Accuracy: {acc:.4f}")

            # Append result to list
            results.append({
                "Method": "XGBoost (Python 5-fold CV)",
                "Dataset size": sz,
                "Testing-set predictive performance": round(acc, 4),
                "Time taken for the model to be fit (s)": round(t_cv, 2)
            })

        except Exception as e:
            # Handle errors (e.g., file not found, missing columns, etc.)
            print(f"Error: {e}")
            results.append({
                "Method": "XGBoost (Python 5-fold CV)",
                "Dataset size": sz,
                "Testing-set predictive performance": "Error",
                "Time taken for the model to be fit (s)": "Error"
            })

    # Display and save final results
    print("\n" + "="*50)
    print("Summary of XGBoost (Python 5-fold CV)")
    print("="*50)
    df_results = pd.DataFrame(results)
    print(df_results.to_string(index=False))

    # Save to CSV
    df_results.to_csv("xgb_python_results.csv", index=False)
    print("\nSaved results to xgb_python_results.csv")

# Entry point of the script
if __name__ == "__main__":
    main()


Running XGBoost (Python + 5-fold CV)
----------------------------------------

Dataset Size: 100 | File: generated_data_100.csv
Loaded 100 rows in 0.01s
Running cross-validation...
CV Time: 3.10s | Accuracy: 0.9400

Dataset Size: 1000 | File: generated_data_1000.csv
Loaded 1000 rows in 0.01s
Running cross-validation...
CV Time: 0.42s | Accuracy: 0.9520

Dataset Size: 10000 | File: generated_data_10000.csv
Loaded 10000 rows in 0.02s
Running cross-validation...
CV Time: 1.44s | Accuracy: 0.9755

Dataset Size: 100000 | File: generated_data_1e+05.csv
Loaded 100000 rows in 0.14s
Running cross-validation...
CV Time: 4.46s | Accuracy: 0.9868

Dataset Size: 1000000 | File: generated_data_1e+06.csv
Loaded 1000000 rows in 1.06s
Running cross-validation...
CV Time: 51.62s | Accuracy: 0.9917

Dataset Size: 10000000 | File: generated_data_1e+07.csv
Loaded 10000000 rows in 7.73s
Running cross-validation...




CV Time: 427.75s | Accuracy: 0.9931

Summary of XGBoost (Python 5-fold CV)
                    Method  Dataset size  Testing-set predictive performance  Time taken for the model to be fit (s)
XGBoost (Python 5-fold CV)           100                              0.9400                                    3.10
XGBoost (Python 5-fold CV)          1000                              0.9520                                    0.42
XGBoost (Python 5-fold CV)         10000                              0.9755                                    1.44
XGBoost (Python 5-fold CV)        100000                              0.9868                                    4.46
XGBoost (Python 5-fold CV)       1000000                              0.9917                                   51.62
XGBoost (Python 5-fold CV)      10000000                              0.9931                                  427.75

Saved results to xgb_python_results.csv
