In [1]:
import pandas as pd
import openml
import numpy as np

dataset = openml.datasets.get_dataset(44061)
X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)

X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
columns_to_encode = ['X3', 'X4', 'X6']
X = pd.get_dummies(X, columns=columns_to_encode)
X = X.astype(float)
y = y.astype(float)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



In [13]:
X_train.shape

(2946, 379)

In [2]:
from sklearn.ensemble import RandomForestRegressor
from uncertainty_analysis import bootstrap

n_rubin_samples = 600
n_pbf_btstrp_smpls = 2946 #600

# Bayesian Forest

In [3]:
merc_bf_results = [] # List[ensemble np.array[regressors np.array[predictions]]
for i in range(0, 10):
    m = RandomForestRegressor(criterion='poisson', max_depth=None, max_features=50,
                              min_samples_split=30, min_samples_leaf=5, n_estimators=600
                              ).fit(X_train, y_train)

    predictions = np.empty((m.n_estimators, len(X_test)))

    for j, tree in enumerate(m.estimators_):
        predictions[j] = tree.predict(X_test)

    bootstrapped_predictions = np.empty((len(X_test), 600))
    for k, result_set in enumerate(predictions.T):
        bootstrapped_predictions[k] = bootstrap.bayesian_bootstrap(result_set, n_rubin_samples)

    merc_bf_results.append(bootstrapped_predictions)



In [4]:
from uncertainty_analysis import project_helper_functions as phf

In [5]:
best_model_results = merc_bf_results[phf.bf_get_best_model_idx(merc_bf_results, y_test)]
phf.write_ensemble_model_results(best_model_results, "merc_bf_results.txt")

# Proper Bayesian Forest, w=0

In [15]:
from uncertainty_analysis import proper_bayesian_forest as pbf

k_w0 = pbf.get_k(0, n_pbf_btstrp_smpls)

In [16]:
# entire dataset has binary values; set params manually
def create_priors(X):
    n_features = X.shape[1]
    priors = np.empty(n_features, dtype=pbf.UniformPrior)
    for i in range(0, n_features):
        priors[i] = pbf.UniformPrior(0, 1)

    return priors

In [17]:
def proper_bf(k):
    pbf_results = []

    for i in range(0, 3):
        m = pbf.ProperBayesianForest(X_train, y_train,
                                     create_priors(X_train),
                                     k_values=np.array([k] * X_train.shape[1], dtype=np.int32),
                                     n_galvani_samples=n_pbf_btstrp_smpls,
                                     criterion='poisson', max_depth=None, max_features=50,
                                     min_samples_split=30, min_samples_leaf=5, n_estimators=600)
        m.fit()

        predictions = np.empty((len(X_test), n_rubin_samples))

        for j, xs in enumerate(X_test):
            predictions[j] = m.get_prediction_distribution(xs.reshape(1, -1), n_rubin_samples=n_rubin_samples)

        pbf_results.append(predictions)

    return pbf_results

In [18]:
# pbf class uses np arrays, not dataframes - convert to arrays
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [19]:
pbf_w0_results = proper_bf(k_w0)
best_model_results = pbf_w0_results[phf.bf_get_best_model_idx(pbf_w0_results, y_test)]
phf.write_ensemble_model_results(best_model_results, "merc_pbf_w0_full_set_results.txt")

# Proper Bayesian Forest, w=0.1

In [11]:
k_w1 = pbf.get_k(0.1, n_pbf_btstrp_smpls)

In [20]:
pbf_w1_results = proper_bf(k_w1)
best_model_results = pbf_w1_results[phf.bf_get_best_model_idx(pbf_w1_results, y_test)]
phf.write_ensemble_model_results(best_model_results, "merc_pbf_w01_full_set_results.txt")