In [3]:
import joblib
import nbimporter
import numpy as np

from data_notebooks  import Santander, may, Cat, Rain, heartDisease, spaceshipTitanic, starClassification
from sklearn.metrics import accuracy_score

In [4]:
fileprefs =  ["Rain", "Santander", "Cat", "May", "heartDisease", "spaceshipTitanic", "starClassification"] 

models = {i: {} for i in fileprefs}

# Read models
for filepref in fileprefs:
#     for suff in ["co2", "rf"]:
    for suff in ["rf"]:
        with open("Models/" + filepref + "_" + suff + ".joblib", 'rb') as file: 
            models[filepref][suff] = joblib.load(file) 

In [5]:
for filepref in fileprefs:
    print(models[filepref]["rf"].best_estimator_.get_params)

<bound method BaseEstimator.get_params of RandomForestClassifier(max_depth=50, min_samples_leaf=10, min_samples_split=40,
                       n_jobs=-1, random_state=0)>
<bound method BaseEstimator.get_params of RandomForestClassifier(max_depth=5, min_samples_leaf=10, min_samples_split=40,
                       n_jobs=-1, random_state=0)>
<bound method BaseEstimator.get_params of RandomForestClassifier(max_depth=50, min_samples_leaf=10, min_samples_split=40,
                       n_jobs=-1, random_state=0)>
<bound method BaseEstimator.get_params of RandomForestClassifier(max_depth=50, min_samples_leaf=10, min_samples_split=40,
                       n_jobs=-1, random_state=0)>
<bound method BaseEstimator.get_params of RandomForestClassifier(max_depth=50, min_samples_leaf=10, min_samples_split=40,
                       n_jobs=-1, random_state=0)>
<bound method BaseEstimator.get_params of RandomForestClassifier(max_depth=20, min_samples_leaf=10, min_samples_split=40,
              

In [20]:
data = {i: [] for i in fileprefs}

# Read test data
for filepref, notebook in zip(fileprefs, [Rain, Santander, Cat, may, heartDisease, spaceshipTitanic, starClassification]):
    data[filepref] = notebook.DatasetName.load().prepare()[2:]

In [21]:
for filepref in fileprefs:
    for model in ["co2", "rf"]:
        %time print(f"{filepref}:   {model}: score {models[filepref][model].score(*data[filepref])}")

Rain:   co2: score 0.8543471286493424
Wall time: 36.4 s
Rain:   rf: score 0.8519180530730097
Wall time: 228 ms
Santander:   co2: score 0.960185915987021
Wall time: 5.94 s
Santander:   rf: score 0.9604490046478997
Wall time: 90.5 ms
Cat:   co2: score 0.7156111111111111
Wall time: 1min 58s
Cat:   rf: score 0.7299333333333333
Wall time: 550 ms
May:   co2: score 0.745225925925926
Wall time: 11min 36s
May:   rf: score 0.7705333333333333
Wall time: 1.88 s
heartDisease:   co2: score 0.905813097866078
Wall time: 23.8 s
heartDisease:   rf: score 0.9072847682119205
Wall time: 275 ms
spaceshipTitanic:   co2: score 0.7680214723926381
Wall time: 2.55 s
spaceshipTitanic:   rf: score 0.7795245398773006
Wall time: 33.7 ms
starClassification:   co2: score 0.9671883230531342
Wall time: 1min 34s
starClassification:   rf: score 0.9805780700412907
Wall time: 174 ms


In [22]:
for filepref in fileprefs:
    print(models[filepref]["co2"].get_params)

<bound method BaseEstimator.get_params of CO2_forest(max_depth=40, max_features='auto', max_iter=1000, max_samples=0.7,
           min_samples_split=0.0001, n_estimators=40, n_jobs=-1,
           random_state=100, splitter='best')>
<bound method BaseEstimator.get_params of CO2_forest(max_depth=40, max_features='auto', max_iter=1000, n_estimators=20,
           n_jobs=-1, random_state=100, splitter='best')>
<bound method BaseEstimator.get_params of CO2_forest(max_depth=40, max_features='auto', max_iter=1000,
           min_samples_split=50, n_jobs=-1, random_state=100, splitter='best',
           tol=0.0001)>
<bound method BaseEstimator.get_params of CO2_forest(max_depth=50, max_features='auto', max_iter=1000,
           min_samples_split=10, n_jobs=-1, random_state=100, splitter='best',
           tol=0.0001)>
<bound method BaseEstimator.get_params of CO2_forest(max_depth=30, max_features='auto', max_iter=1200,
           min_samples_split=1e-05, n_jobs=-1, random_state=100,
          

In [27]:
impurities_co2 = {i: {} for i in fileprefs}

for model in fileprefs:
    strees = models[model]["co2"].estimators_
    for tree in strees:
        stack = [(tree.tree_, 0)]
        while stack:
            node, depth = stack[0]
            stack.pop(0)
            if not node.is_leaf():
                left = node.get_down()
                right = node.get_up()
                left_size = len(left._y)
                right_size = len(right._y)
                sum_size = left_size + right_size
                left_size /= sum_size
                right_size /= sum_size
                stack.append((left, depth+1))
                stack.append((right, depth+1))
                new_impurity = node.get_impurity() - left_size * left.get_impurity() - right_size * right.get_impurity()
                if depth not in impurities_co2[model].keys():
                    impurities_co2[model][depth] = [new_impurity]
                else:
                    impurities_co2[model][depth].append(new_impurity)
            else:
                if depth not in impurities_co2[model].keys():
                    impurities_co2[model][depth] = [node.get_impurity()]
                else:
                    impurities_co2[model][depth].append(node.get_impurity())

In [45]:
impurities_rf = {i: {} for i in fileprefs}


for model in fileprefs:
    for i, estimator in enumerate(models[model]["rf"].best_estimator_.estimators_):
        impurities_rf[model][i] = estimator.tree_.impurity

In [29]:
impurities_co2_means = {name: {key: np.mean(value) for key, value in impurities_co2[name].items() if value} for name in fileprefs}

In [30]:
print(f"Mean: {np.mean(list(impurities_co2_means['Rain'].values()))}")
impurities_co2_means['Rain']

Mean: 0.04937699984564938


{0: 0.028809363376542753,
 1: 0.025137604005293836,
 2: 0.019527705523150175,
 3: 0.013204179022203518,
 4: 0.012497340210061592,
 5: 0.014102297492712123,
 6: 0.023997039904220488,
 7: 0.058353661196561935,
 8: 0.09880788563909348,
 9: 0.11712726237664162,
 10: 0.1058080897118589,
 11: 0.0824601187071726,
 12: 0.06341997289589958,
 13: 0.05488222106707897,
 14: 0.04826482965469394,
 15: 0.04724625013856893,
 16: 0.04563707332806832,
 17: 0.04144070881634605,
 18: 0.0461209699973494,
 19: 0.03550529050800487,
 20: 0.07154743881788707,
 21: 0.032396694214876044}

In [31]:
impurities_rf_tree_means = {name: {key: np.mean(value) for key, value in impurities_rf[name].items()} for name in fileprefs}

In [32]:
mean_impurities_rf = np.array(list(impurities_rf_tree_means["Rain"].values()))
print(mean_impurities_rf.mean(), mean_impurities_rf.std())

0.24549365329054734 0.003362559039495975


In [33]:
print(f"Mean: {np.mean(list(impurities_co2_means['Santander'].values()))}")
impurities_co2_means['Santander']

Mean: 0.04519160162394691


{0: 0.0008066756873748181,
 1: 0.0006771384674254491,
 2: 0.0019490515609000218,
 3: 0.01715842925395591,
 4: 0.02790871177474608,
 5: 0.03510620889615751,
 6: 0.04039916181606644,
 7: 0.05296825932636962,
 8: 0.044672123302931514,
 9: 0.047049572846522475,
 10: 0.05423379462168863,
 11: 0.0507264180962416,
 12: 0.05861760358883811,
 13: 0.06345767806644559,
 14: 0.06632234419174782,
 15: 0.0592656632236473,
 16: 0.06742052830240897,
 17: 0.055497152442129064,
 18: 0.051213311556625,
 19: 0.06007923270882167,
 20: 0.0537737067817177,
 21: 0.049160468005351864,
 22: 0.051256642930968266,
 23: 0.06138443208876527,
 24: 0.03734146843593688,
 25: 0.04997059029912551,
 26: 0.06149787640314556,
 27: 0.059644960694149075,
 28: 0.03579080925271691,
 29: 0.04175411180436713,
 30: 0.04061592213535757,
 31: 0.027683295540438396,
 32: 0.1111111111111111,
 33: 0.0}

In [34]:
mean_impurities_rf = np.array(list(impurities_rf_tree_means["Santander"].values()))
print(mean_impurities_rf.mean(), mean_impurities_rf.std())

0.12865797171007243 0.023748464578711232


In [35]:
print(f"Mean: {np.mean(list(impurities_co2_means['Cat'].values()))}")
impurities_co2_means['Cat']

Mean: 0.1735028482786607


{0: 0.023111213576424375,
 1: 0.029565085570200433,
 2: 0.0303867828454832,
 3: 0.0216231952940741,
 4: 0.02987800512874652,
 5: 0.04040515730676241,
 6: 0.0613674918938844,
 7: 0.11045805471795624,
 8: 0.12435487872323385,
 9: 0.1387397872260572,
 10: 0.1762658212606883,
 11: 0.22329189488826204,
 12: 0.25709714762097774,
 13: 0.2882338332829986,
 14: 0.29923525626316755,
 15: 0.3082348168201295,
 16: 0.3252996113157437,
 17: 0.34751489773925553,
 18: 0.2885036654903696,
 19: 0.346490368608799}

In [36]:
mean_impurities_rf = np.array(list(impurities_rf_tree_means["Cat"].values()))
print(mean_impurities_rf.mean(), mean_impurities_rf.std())

0.3245430032646801 0.001147726611373176


In [37]:
print(f"Mean: {np.mean(list(impurities_co2_means['May'].values()))}")
impurities_co2_means['May']

Mean: 0.09065221885235493


{0: 0.0019711726979558366,
 1: 0.0031703689448091352,
 2: 0.009759289387940677,
 3: 0.017864054598794584,
 4: 0.02399508212242724,
 5: 0.02935383378357131,
 6: 0.04477203335656807,
 7: 0.06301703098286267,
 8: 0.07916582544061143,
 9: 0.08995689536853388,
 10: 0.10362734610614155,
 11: 0.11434373948267416,
 12: 0.12241320532982156,
 13: 0.12737819422430247,
 14: 0.13167854666150525,
 15: 0.13363938906016465,
 16: 0.12909426950315883,
 17: 0.12975996776342924,
 18: 0.12314575293967524,
 19: 0.1213876662651995,
 20: 0.11739583987470745,
 21: 0.11634330975364855,
 22: 0.11004971562410545,
 23: 0.10947714596635687,
 24: 0.09792254798296506,
 25: 0.10007448354162705,
 26: 0.09662331890870883,
 27: 0.09337055989934018,
 28: 0.09541803266146241,
 29: 0.09244583879380427,
 30: 0.11456248621114945,
 31: 0.10130342884668174,
 32: 0.0883780608836981,
 33: 0.12485608030726669,
 34: 0.06749261459774696,
 35: 0.13295209761753618,
 36: 0.10347880165200551,
 37: 0.12261262187449734,
 38: 0.07239366063

In [38]:
mean_impurities_rf = np.array(list(impurities_rf_tree_means["May"].values()))
print(mean_impurities_rf.mean(), mean_impurities_rf.std())

0.32899034042583664 0.0022026921544092064
