In [4]:
pip install pyreadr




In [5]:
import pyreadr
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
import time
import numpy as np

# Sample size -> name of the RDS file that holds the sample of that size.
# The 100000 entry is spelled "1e+05" in its filename, so it cannot be derived
# from the integer key and must be listed explicitly.
size_to_file = dict([
    (100, "bootstrap_data_100.rds"),
    (1000, "bootstrap_data_1000.rds"),
    (10000, "bootstrap_data_10000.rds"),
    (100000, "bootstrap_data_1e+05.rds"),
    (1000000, "bootstrap_data_1000000.rds"),
    (10000000, "bootstrap_data_10000000.rds"),
])

# Feature columns fed to the model, in the order they are selected.
cols_x = [
    'pregnant',
    'glucose',
    'pressure',
    'triceps',
    'insulin',
    'mass',
    'pedigree',
    'age',
]
# Target column (cast to int before fitting).
col_y = 'outcome'
# Number of cross-validation splits.
folds = 5
# One result record (dict) per sample size is appended here.
res = list()

print("Running XGBoost on bootstrap RDS samples")
print("-" * 40)

def run_cv(X, y, n_splits=None, random_state=42):
    """Cross-validate an XGBoost binary classifier and time the run.

    Parameters
    ----------
    X : feature matrix passed straight to ``cross_val_score``.
    y : target vector passed straight to ``cross_val_score``.
    n_splits : int, optional
        Number of KFold splits. Defaults to the notebook-level ``folds``.
    random_state : int, optional
        Seed for the shuffled KFold split (default 42).

    Returns
    -------
    (mean_accuracy, elapsed_seconds)
        Mean of the per-fold accuracy scores and the elapsed time of the
        ``cross_val_score`` call in seconds.

    Notes
    -----
    NOTE(review): both the estimator and ``cross_val_score`` request
    ``n_jobs=-1``; this nested parallelism may oversubscribe CPU cores and
    affect the reported timings -- confirm this is intended for the benchmark.
    NOTE(review): if the inputs are bootstrap resamples (as the file names
    suggest), identical rows can land in both the training and the test fold,
    which would make the accuracy optimistic -- confirm how the samples were
    generated before interpreting the scores.
    """
    if n_splits is None:
        n_splits = folds

    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        n_jobs=-1
    )
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    acc = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
    elapsed = time.perf_counter() - t0
    return np.mean(acc), elapsed

# Build the method label from `folds` so the reported name cannot drift from
# the number of CV splits actually used (identical to the old text when folds == 5).
method_label = f"XGBoost (Python {folds}-fold CV)"

# For every sample size: load the RDS file, validate its columns, run the
# cross-validation and record one result row in `res`.
for sz, file in size_to_file.items():
    print(f"\nSize: {sz} | File: {file}")
    # Fields shared by the success and the failure record.
    record = {"Method": method_label, "Size": sz}
    try:
        # perf_counter is monotonic, unlike time.time(), so it is the safer
        # clock for measuring how long the load takes.
        t0 = time.perf_counter()
        result = pyreadr.read_r(file)
        df = result[None]  # Only one object in RDS

        print(f"Loaded {len(df)} rows in {time.perf_counter() - t0:.2f}s")

        if not all(c in df.columns for c in cols_x + [col_y]):
            raise ValueError(f"Missing columns. Found columns: {df.columns.tolist()}")

        X = df[cols_x]
        y = df[col_y].astype(int)
        print("Running CV...")

        acc, t_cv = run_cv(X, y)
        print(f"CV Time: {t_cv:.2f}s | Accuracy: {acc:.4f}")

        res.append({**record, "Accuracy": acc, "CV Time (s)": t_cv})

    except Exception as e:
        # Deliberate best-effort: a failure for one size is reported and
        # recorded, and the benchmark continues with the next size.
        # NOTE(review): the "Error" placeholders put strings into otherwise
        # numeric columns of the results table whenever a size fails.
        print(f"Error: {e}")
        res.append({**record, "Accuracy": "Error", "CV Time (s)": "Error"})

# Summary: print every collected result row as a plain-text table and
# persist the same table to CSV (no index column).
df_res = pd.DataFrame(res)
print("\n" + "=" * 40, "Summary", "=" * 40, sep="\n")
print(df_res.to_string(index=False))
df_res.to_csv("xgb_results_python.csv", index=False)
print("\nSaved to xgb_results_python.csv")


Running XGBoost on bootstrap RDS samples
----------------------------------------

Size: 100 | File: bootstrap_data_100.rds
Loaded 100 rows in 0.01s
Running CV...
CV Time: 3.65s | Accuracy: 0.8900

Size: 1000 | File: bootstrap_data_1000.rds
Loaded 1000 rows in 0.01s
Running CV...
CV Time: 0.43s | Accuracy: 0.9440

Size: 10000 | File: bootstrap_data_10000.rds
Loaded 10000 rows in 0.06s
Running CV...
CV Time: 1.42s | Accuracy: 0.9754

Size: 100000 | File: bootstrap_data_1e+05.rds
Loaded 100000 rows in 0.31s
Running CV...
CV Time: 6.68s | Accuracy: 0.9869

Size: 1000000 | File: bootstrap_data_1000000.rds
Loaded 1000000 rows in 1.22s
Running CV...
CV Time: 26.67s | Accuracy: 0.9919

Size: 10000000 | File: bootstrap_data_10000000.rds
Loaded 10000000 rows in 11.87s
Running CV...




CV Time: 288.36s | Accuracy: 0.9931

Summary
                    Method     Size  Accuracy  CV Time (s)
XGBoost (Python 5-fold CV)      100  0.890000     3.647653
XGBoost (Python 5-fold CV)     1000  0.944000     0.430796
XGBoost (Python 5-fold CV)    10000  0.975400     1.418593
XGBoost (Python 5-fold CV)   100000  0.986940     6.675767
XGBoost (Python 5-fold CV)  1000000  0.991862    26.669050
XGBoost (Python 5-fold CV) 10000000  0.993118   288.356472

Saved to xgb_results_python.csv
