In [1]:
import pandas as pd
from scipy.stats import zscore
from sklearn.model_selection import train_test_split

In [2]:
# Load the simple dataset; the strings 'NA' and '?' are read as missing values.
df = pd.read_csv(
    'https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv',
    na_values=['NA', '?'])

# Generate dummies for job, area and product, replacing each source column.
# dtype=int keeps the dummy columns numeric: recent pandas versions return
# bool dummies by default, and mixing bool with float columns turns
# DataFrame.values into an object array that Keras cannot convert to a tensor.
for col in ('job', 'area', 'product'):
    encoded = pd.get_dummies(df[col], prefix=col, dtype=int)
    df = pd.concat([df, encoded], axis=1).drop(columns=col)

# Missing values for income: fill with the column median
med = df['income'].median()
df['income'] = df['income'].fillna(med)

# Standardize ranges (z-score) of the continuous predictors
for col in ('income', 'aspect', 'save_rate', 'subscriptions'):
    df[col] = zscore(df[col])

# Convert to numpy - regression: age is the target, id is not a feature
x_columns = df.columns.drop('age').drop('id')
x = df[x_columns].values.astype(float)  # guarantee a numeric feature matrix
y = df['age'].values

The following code performs the bootstrap. The architecture of the neural network can be adjusted to
compare many different configurations.

In [3]:
import os
import numpy as np
import time
import statistics
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import ShuffleSplit

In [6]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format an elapsed time in seconds as ``H:MM:SS.ss``.

    The value is rounded to hundredths of a second *before* it is split into
    hours, minutes and seconds, so a value such as 59.999 carries over to
    ``0:01:00.00`` instead of being rendered as ``0:00:60.00``.

    Args:
        sec_elapsed: Elapsed time in seconds (int or float); intended for
            non-negative durations such as ``time.time()`` differences.

    Returns:
        A string with unpadded hours, two-digit minutes and seconds shown
        with two decimal places.
    """
    # Work in whole centiseconds so rounding happens once, up front.
    total_cs = int(round(sec_elapsed * 100))
    h, rem_cs = divmod(total_cs, 60 * 60 * 100)
    m, cs = divmod(rem_cs, 60 * 100)
    s = cs / 100
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

In [7]:
SPLITS = 50
MAX_EPOCHS = 1000

# Bootstrap: SPLITS random 90/10 train/test partitions with a fixed seed
boot = ShuffleSplit(n_splits=SPLITS, test_size=0.1, random_state=42)

# Track progress
mean_benchmark = []
epochs_needed = []

# Loop through samples
for num, (train, test) in enumerate(boot.split(x), start=1):
    start_time = time.time()

    # Split train and test
    x_train, y_train = x[train], y[train]
    x_test, y_test = x[test], y[test]

    # Construct neural network (single linear output unit for regression)
    model = Sequential()
    model.add(Dense(20, input_dim=x_train.shape[1], activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1))
    # No 'accuracy' metric here: accuracy is not meaningful for a
    # continuous target trained with mean squared error.
    model.compile(loss='mean_squared_error', optimizer='adam')

    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3,
                            patience=5, verbose=0, mode='auto',
                            restore_best_weights=True)

    # Train on the bootstrap sample. The held-out split doubles as the
    # early-stopping validation set, so the score below is measured on the
    # same rows that decided when to stop.
    model.fit(x_train, y_train, validation_data=(x_test, y_test),
              callbacks=[monitor], verbose=0, epochs=MAX_EPOCHS)
    # stopped_epoch stays 0 when early stopping never triggers; fall back to
    # the index of the final epoch so the running mean is not dragged down.
    epochs = monitor.stopped_epoch or (MAX_EPOCHS - 1)
    epochs_needed.append(epochs)

    # Predict on the out of boot (validation)
    pred = model.predict(x_test)

    # Measure this bootstrap's RMSE (y_true first, predictions flattened to 1-D)
    score = np.sqrt(metrics.mean_squared_error(y_test, pred.flatten()))
    mean_benchmark.append(score)
    m1 = statistics.mean(mean_benchmark)
    m2 = statistics.mean(epochs_needed)
    mdev = statistics.pstdev(mean_benchmark)

    # Record this iteration
    time_took = time.time() - start_time
    print(f"#{num}: score={score:.6f}, mean score={m1:.6f}, stdev={mdev:.6f},epochs={epochs}, mean epochs={int(m2)},  time={hms_string(time_took)}")

#1: score=0.693469, mean score=0.693469, stdev=0.000000,epochs=101, mean epochs=101,  time=0:00:16.77
#2: score=0.741775, mean score=0.717622, stdev=0.024153,epochs=121, mean epochs=111,  time=0:00:20.29
#3: score=0.792233, mean score=0.742492, stdev=0.040323,epochs=139, mean epochs=120,  time=0:00:22.07
#4: score=0.758366, mean score=0.746461, stdev=0.035591,epochs=134, mean epochs=123,  time=0:00:21.33
#5: score=0.725727, mean score=0.742314, stdev=0.032896,epochs=131, mean epochs=125,  time=0:00:21.03
#6: score=1.142235, mean score=0.808967, stdev=0.152037,epochs=135, mean epochs=126,  time=0:00:21.84
#7: score=0.659789, mean score=0.787656, stdev=0.150127,epochs=93, mean epochs=122,  time=0:00:14.90
#8: score=0.556555, mean score=0.758769, stdev=0.159882,epochs=113, mean epochs=120,  time=0:00:18.69
#9: score=0.693585, mean score=0.751526, stdev=0.152124,epochs=127, mean epochs=121,  time=0:00:20.66
#10: score=1.166042, mean score=0.792978, stdev=0.190504,epochs=78, mean epochs=117

# bootstrapping for classification

Classification bootstrapping uses the **StratifiedShuffleSplit** object to perform the splits. This is similar to StratifiedKFold
for cross-validation, as the classes are balanced so that the sampling has no effect on proportions. To demonstrate this technique,
we will attempt to predict the product column for the simple dataset. This data is loaded below.

In [8]:
# Load the simple dataset; the strings 'NA' and '?' are read as missing values.
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA', '?'])

# Generate dummies for job and area, replacing each source column.
# dtype=int keeps the dummy columns numeric: recent pandas versions return
# bool dummies by default, and mixing bool with float columns turns
# DataFrame.values into an object array that Keras cannot convert to a tensor.
for col in ('job', 'area'):
    encoded = pd.get_dummies(df[col], prefix=col, dtype=int)
    df = pd.concat([df, encoded], axis=1).drop(columns=col)

# Missing values for income: fill with the column median
med = df['income'].median()
df['income'] = df['income'].fillna(med)

# Standardize ranges (z-score) of the continuous predictors
for col in ('income', 'aspect', 'save_rate', 'age', 'subscriptions'):
    df[col] = zscore(df[col])

# Convert to numpy - Classification: product is the target, id is not a feature
x_columns = df.columns.drop('product').drop('id')
x = df[x_columns].values.astype(float)  # guarantee a numeric feature matrix
dummies = pd.get_dummies(df['product'], dtype=int)  # one column per product class
products = dummies.columns
y = dummies.values

In [18]:
# Shape (rows, features) of the most recent training split. x_train is only
# assigned inside the bootstrap loops, so the value shown depends on which
# loop cell ran last.
x_train.shape

(1800, 47)

In [17]:
# Shape (rows, classes) of the one-hot target matrix built from the product column.
y.shape

(2000, 7)

In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

SPLITS = 50
MAX_EPOCHS = 1000

# Bootstrap: stratified 90/10 splits so class proportions are preserved
boot = StratifiedShuffleSplit(n_splits=SPLITS, test_size=0.1,
                              random_state=42)

# Track progress
mean_benchmark = []
epochs_needed = []

# Column indices of the one-hot target; passed to log_loss so scoring still
# works when a test split happens to miss one of the classes.
class_ids = np.arange(y.shape[1])

# Loop through samples
for num, (train, test) in enumerate(boot.split(x, df['product']), start=1):
    start_time = time.time()

    # Split train and test
    x_train, y_train = x[train], y[train]
    x_test, y_test = x[test], y[test]

    # Construct neural network
    model = Sequential()
    model.add(Dense(50, input_dim=x.shape[1], activation='relu'))  # Hidden 1
    model.add(Dense(25, activation='relu'))  # Hidden 2
    model.add(Dense(y.shape[1], activation='softmax'))  # Output
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3,
                            patience=25, verbose=0, mode='auto',
                            restore_best_weights=True)

    # Train on the bootstrap sample. The held-out split doubles as the
    # early-stopping validation set.
    model.fit(x_train, y_train, validation_data=(x_test, y_test),
              callbacks=[monitor], verbose=0, epochs=MAX_EPOCHS)
    # stopped_epoch stays 0 when early stopping never triggers; fall back to
    # the index of the final epoch so the running mean is not dragged down.
    epochs = monitor.stopped_epoch or (MAX_EPOCHS - 1)
    epochs_needed.append(epochs)

    # Predict on the out of boot (validation)
    pred = model.predict(x_test)

    # Measure this bootstrap's log loss
    y_compare = np.argmax(y_test, axis=1)  # class index per row
    score = metrics.log_loss(y_compare, pred, labels=class_ids)
    mean_benchmark.append(score)
    m1 = statistics.mean(mean_benchmark)
    m2 = statistics.mean(epochs_needed)
    mdev = statistics.pstdev(mean_benchmark)

    # Record this iteration
    time_took = time.time() - start_time
    print(f"#{num}: score={score:.6f}, mean score={m1:.6f},"
          f"stdev={mdev:.6f}, epochs={epochs}, mean epochs={int(m2)},"
          f" time={hms_string(time_took)}")

#1: score=0.662159, mean score=0.662159,stdev=0.000000, epochs=78, mean epochs=78, time=0:00:12.41
#2: score=0.662452, mean score=0.662306,stdev=0.000146, epochs=65, mean epochs=71, time=0:00:09.51
#3: score=0.671561, mean score=0.665391,stdev=0.004365, epochs=56, mean epochs=66, time=0:00:07.59
#4: score=0.656017, mean score=0.663048,stdev=0.005546, epochs=78, mean epochs=69, time=0:00:10.26
#5: score=0.657234, mean score=0.661885,stdev=0.005479, epochs=97, mean epochs=74, time=0:00:12.85
#6: score=0.690042, mean score=0.666578,stdev=0.011625, epochs=55, mean epochs=71, time=0:00:07.80
#7: score=0.713074, mean score=0.673220,stdev=0.019508, epochs=52, mean epochs=68, time=0:00:07.20
#8: score=0.766360, mean score=0.684863,stdev=0.035802, epochs=51, mean epochs=66, time=0:00:07.02
#9: score=0.627125, mean score=0.678447,stdev=0.038323, epochs=56, mean epochs=65, time=0:00:07.64
#10: score=0.641238, mean score=0.674726,stdev=0.038031, epochs=99, mean epochs=68, time=0:00:13.02
#11: scor

#  benchmarking

Now that we've seen how to bootstrap with both classification and regression, we can start to optimize the hyperparameters for the simple dataset. For this example, we will encode for classification of the product column. Evaluation will be by log loss.

In [12]:
import pandas as pd
from scipy.stats import zscore

# Read the data set; the strings 'NA' and '?' are read as missing values.
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA', '?'])

# Generate dummies for job and area, replacing each source column.
# dtype=int keeps the dummy columns numeric: recent pandas versions return
# bool dummies by default, and mixing bool with float columns turns
# DataFrame.values into an object array that Keras cannot convert to a tensor.
for col in ('job', 'area'):
    encoded = pd.get_dummies(df[col], prefix=col, dtype=int)
    df = pd.concat([df, encoded], axis=1).drop(columns=col)

# Missing values for income: fill with the column median
med = df['income'].median()
df['income'] = df['income'].fillna(med)

# Standardize ranges (z-score) of the continuous predictors
for col in ('income', 'aspect', 'save_rate', 'age', 'subscriptions'):
    df[col] = zscore(df[col])

# Convert to numpy - Classification: product is the target, id is not a feature
x_columns = df.columns.drop('product').drop('id')
x = df[x_columns].values.astype(float)  # guarantee a numeric feature matrix
dummies = pd.get_dummies(df['product'], dtype=int)  # one column per product class
products = dummies.columns
y = dummies.values

I performed some optimization, and the code is currently set to the best settings that I came up with. Later, we can use an automatic process to optimize the hyperparameters.

In [15]:
import pandas as pd
import os
import numpy as np
import time
import tensorflow.keras.initializers
import statistics
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedShuffleSplit
from tensorflow.keras.layers import LeakyReLU,PReLU

SPLITS = 100
MAX_EPOCHS = 1000

# Bootstrap: stratified 90/10 splits. random_state is fixed, as in the
# earlier bootstrap cells, so the same partitions are drawn on every run.
boot = StratifiedShuffleSplit(n_splits=SPLITS, test_size=0.1,
                              random_state=42)

# Track progress
mean_benchmark = []
epochs_needed = []

# Column indices of the one-hot target; passed to log_loss so scoring still
# works when a test split happens to miss one of the classes.
class_ids = np.arange(y.shape[1])

# Loop through samples
for num, (train, test) in enumerate(boot.split(x, df['product']), start=1):
    start_time = time.time()

    # Split train and test
    x_train, y_train = x[train], y[train]
    x_test, y_test = x[test], y[test]

    # Construct neural network: three 100-unit PReLU layers with L2
    # regularization, dropout after the first two.
    # NOTE(review): Hidden 1 uses kernel_regularizer while Hidden 2/3 use
    # activity_regularizer - confirm this difference is intentional.
    model = Sequential()
    model.add(Dense(100, input_dim=x.shape[1], activation=PReLU(),
                    kernel_regularizer=regularizers.l2(1e-4)))  # Hidden 1
    model.add(Dropout(0.5))
    model.add(Dense(100, activation=PReLU(),
                    activity_regularizer=regularizers.l2(1e-4)))  # Hidden 2
    model.add(Dropout(0.5))
    model.add(Dense(100, activation=PReLU(),
                    activity_regularizer=regularizers.l2(1e-4)))  # Hidden 3
    # model.add(Dropout(0.5)) - Usually better performance
    # without dropout on final layer
    model.add(Dense(y.shape[1], activation='softmax'))  # Output
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3,
                            patience=100, verbose=0, mode='auto',
                            restore_best_weights=True)

    # Train on the bootstrap sample. The held-out split doubles as the
    # early-stopping validation set.
    model.fit(x_train, y_train, validation_data=(x_test, y_test),
              callbacks=[monitor], verbose=0, epochs=MAX_EPOCHS)
    # stopped_epoch stays 0 when early stopping never triggers; fall back to
    # the index of the final epoch so the running mean is not dragged down.
    epochs = monitor.stopped_epoch or (MAX_EPOCHS - 1)
    epochs_needed.append(epochs)

    # Predict on the out of boot (validation)
    pred = model.predict(x_test)

    # Measure this bootstrap's log loss
    y_compare = np.argmax(y_test, axis=1)  # class index per row
    score = metrics.log_loss(y_compare, pred, labels=class_ids)
    mean_benchmark.append(score)
    m1 = statistics.mean(mean_benchmark)
    m2 = statistics.mean(epochs_needed)
    mdev = statistics.pstdev(mean_benchmark)

    # Record this iteration
    time_took = time.time() - start_time

    print(f"#{num}: score={score:.6f}, mean score={m1:.6f},"
          f"stdev={mdev:.6f}, epochs={epochs}, mean epochs={int(m2)},"
          f" time={hms_string(time_took)}")

#1: score=0.626487, mean score=0.626487,stdev=0.000000, epochs=230, mean epochs=230, time=0:00:57.87
#2: score=0.585000, mean score=0.605743,stdev=0.020743, epochs=491, mean epochs=360, time=0:01:58.10
#3: score=0.644065, mean score=0.618517,stdev=0.024763, epochs=174, mean epochs=298, time=0:00:40.58
#4: score=0.627637, mean score=0.620797,stdev=0.021806, epochs=221, mean epochs=279, time=0:00:50.85
#5: score=0.632618, mean score=0.623161,stdev=0.020069, epochs=199, mean epochs=263, time=0:00:46.38
#6: score=0.599026, mean score=0.619139,stdev=0.020409, epochs=164, mean epochs=246, time=0:00:40.31
#7: score=0.640858, mean score=0.622241,stdev=0.020366, epochs=170, mean epochs=235, time=0:00:39.81
#8: score=0.685518, mean score=0.630151,stdev=0.028300, epochs=277, mean epochs=240, time=0:01:03.92
#9: score=0.651465, mean score=0.632519,stdev=0.027509, epochs=156, mean epochs=231, time=0:00:36.41
#10: score=0.698371, mean score=0.639104,stdev=0.032732, epochs=140, mean epochs=222, time=

#82: score=0.640171, mean score=0.662487,stdev=0.052130, epochs=184, mean epochs=191, time=0:00:46.74
#83: score=0.627748, mean score=0.662068,stdev=0.051954, epochs=177, mean epochs=191, time=0:01:10.98
#84: score=0.640439, mean score=0.661811,stdev=0.051697, epochs=227, mean epochs=192, time=0:01:53.10
#85: score=0.720661, mean score=0.662503,stdev=0.051782, epochs=133, mean epochs=191, time=0:01:06.45
#86: score=0.660071, mean score=0.662475,stdev=0.051481, epochs=214, mean epochs=191, time=0:01:35.99
#87: score=0.680771, mean score=0.662685,stdev=0.051221, epochs=181, mean epochs=191, time=0:01:21.17
#88: score=0.658859, mean score=0.662641,stdev=0.050931, epochs=127, mean epochs=190, time=0:00:57.88
#89: score=0.697581, mean score=0.663034,stdev=0.050778, epochs=193, mean epochs=190, time=0:01:27.27
#90: score=0.629913, mean score=0.662666,stdev=0.050614, epochs=185, mean epochs=190, time=0:01:23.69
#91: score=0.681532, mean score=0.662873,stdev=0.050374, epochs=173, mean epochs=1