# ML (Gradient Boosting) Predictions With p-values Using Bootstrapping

Imports

In [101]:
import numpy as np
import xgboost as xgb
from sklearn.utils import resample
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from scipy.stats import ttest_rel
from sklearn.model_selection import train_test_split


Dataset

In [118]:
# Breast Cancer dataset bundled with scikit-learn: feature matrix and binary target.
data = load_breast_cancer()
X = data.data
y = data.target

# test_size=0.9 holds out 90% of the rows for evaluation, so models are fit on
# the remaining 10%.
# NOTE(review): such a small training share is unusual -- confirm it is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.9,
    random_state=42,
)

In [119]:
# Number of bootstrap samples
n_bootstrap_samples = 100

# Base seed for the bootstrap draws. Each iteration uses base + index so every
# resample is different, yet a Restart & Run All reproduces the same results.
bootstrap_seed = 42

# Store the predictions from each bootstrap sample
bootstrap_predictions = []

for sample_idx in range(n_bootstrap_samples):
    # Draw a bootstrap sample of the training set with replacement
    # (the same row can appear multiple times).
    bootstrap_X, bootstrap_y = resample(
        X_train,
        y_train,
        replace=True,
        random_state=bootstrap_seed + sample_idx,
    )

    # Create and train an XGBoost classifier on this bootstrap sample
    model = xgb.XGBClassifier()
    model.fit(bootstrap_X, bootstrap_y)

    # Predict on the held-out test set and keep only the probability of
    # class 1 (second column of predict_proba).
    y_pred = model.predict_proba(X_test)[:, 1]
    bootstrap_predictions.append(y_pred)

# Stack into a 2-D array: one row per bootstrap model, one column per test sample.
bootstrap_predictions = np.asarray(bootstrap_predictions)

In [120]:
from sklearn.metrics import roc_auc_score

# Print the test-set ROC AUC of every bootstrap model. Iterating over the rows
# directly (instead of a hard-coded range(100)) keeps this cell in step with
# however many bootstrap samples were actually generated.
for sample_predictions in bootstrap_predictions:
    print(roc_auc_score(y_test, sample_predictions))

0.9673575129533679
0.9481865284974094
0.9639572538860104
0.9767487046632124
0.9641677461139896
0.9514329663212435
0.9703853626943004
0.9672603626943005
0.9788374352331607
0.8987613341968912
0.9783516839378238
0.9741013601036269
0.9728465025906735
0.9660702720207255
0.8977331606217617
0.9676408678756476
0.8864637305699481
0.9379695595854922
0.8664022020725388
0.9777606865284975
0.9666531735751295
0.9648639896373057
0.9767163212435234
0.9655926165803108
0.9641110751295336
0.8664022020725388
0.9473040803108808
0.8987613341968912
0.9645725388601036
0.9791207901554404
0.9821567357512954
0.964872085492228
0.9775825777202073
0.9733403497409326
0.8742956606217616
0.9734132124352332
0.9635038860103626
0.9735589378238343
0.9730650906735752
0.9643296632124352
0.8977331606217617
0.9483646373056994
0.966839378238342
0.9733889248704662
0.9725631476683939
0.9612856217616581
0.9609536917098445
0.9380829015544042
0.9713406735751295
0.9733079663212436
0.9815495466321243
0.9726360103626943
0.897733160621

In [121]:
from scipy.stats import ttest_1samp
import matplotlib.pyplot as plt

# One-sample t-test of the bootstrap predictions against 0.5.
# ttest_1samp works along axis 0 by default, so this yields one t statistic and
# one two-sided p-value per column of bootstrap_predictions (i.e. per test sample).
# NOTE(review): bootstrap replicates share training data, so treating them as
# independent draws for a t-test is an approximation -- confirm this is acceptable.
t_stat, p_value_two_tailed = ttest_1samp(bootstrap_predictions, 0.5)

# One-tailed p-value for the alternative "mean prediction > 0.5".
# Halving the two-sided p-value is only valid when the t statistic is positive;
# when it is negative the one-tailed p-value is 1 - p/2. Halving unconditionally
# would report strong evidence for "> 0.5" on samples whose mean is well below 0.5.
# NaN statistics (e.g. zero variance across bootstraps) stay NaN.
p_value_one_tailed = np.where(
    t_stat > 0,
    p_value_two_tailed / 2,
    1 - p_value_two_tailed / 2,
)


In [125]:
# Sanity check: display the shape of the held-out label array.
y_test.shape

(513,)

In [2]:
import pandas as pd

# Toy table for the groupby demo: three groups ('A', 'B', 'C') with two values each.
# NOTE: this rebinds `data`, which earlier in the notebook held the breast-cancer dataset.
data = dict(
    Group=['A', 'A', 'B', 'B', 'C', 'C'],
    Value=[10, 15, 20, 25, 30, 35],
)

df = pd.DataFrame(data)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Group,Value
0,A,10
1,A,15
2,B,20
3,B,25
4,C,30
5,C,35


In [3]:
def custom_function(x):
    """Summarise the 'Value' column of one group.

    Parameters
    ----------
    x : DataFrame-like
        The rows of a single group; must expose a 'Value' column.

    Returns
    -------
    pd.Series
        Two entries labelled 'Total' (sum of 'Value') and 'Average'
        (mean of 'Value'), in that order.
    """
    values = x['Value']
    # Label the two statistics so they become column names after groupby.apply.
    return pd.Series({'Total': values.sum(), 'Average': values.mean()})
# Run the per-group summary, then turn the 'Group' index back into a regular column.
per_group = df.groupby('Group').apply(custom_function)
per_group.reset_index()


Unnamed: 0,Group,Total,Average
0,A,25.0,12.5
1,B,45.0,22.5
2,C,65.0,32.5
