In [None]:
# importing the libraries 
# make sure that tree_functions_2.py is in the same directory as this notebook

from tree_functions_2 import *

# importing the appropriate tools for regression

from sklearn import tree

from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score

from sklearn.preprocessing import PolynomialFeatures, StandardScaler

from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.model_selection import train_test_split

import graphviz

In [None]:
# change figure configurations

%matplotlib inline

import matplotlib

font = {'size':16}

matplotlib.rc('font', **font)

matplotlib.rc('figure', figsize = (5.0, 5.0))

In [None]:
# n: the order (number of vertices) of the trees
# l: ceil(n / 2); added to the target-column index when rows are logged as k = m + l
n = 12
l = (n + 1) // 2

In [None]:
# setting a random seed
# NOTE(review): `random` is not imported explicitly in this notebook; presumably it is
# re-exported by `from tree_functions_2 import *` -- confirm which module this seeds

random.seed(42)

## Generating the Tree List and the Associated Data Frame

In [None]:
# creating the list of all non-isomorphic trees of order n
# (the generator returned by networkx is materialised so the list can be reused)

Tree_List = list(nx.nonisomorphic_trees(n))

In [None]:
# classes for trees (0 is path-like and 1 is star-like)
# classes[0] is assigned to the first half of the ordered list, classes[1] to the rest

classes = [0, 1] 

In [None]:
# evaluation-based total ordering on Tree_List
# NOTE(review): the arguments (2, 1) presumably are the evaluation point of P(.;2,1)
# mentioned in the data-frame cell -- confirm against tree_functions_2.
# Later cells read entry[0] (evaluation value) and entry[1] (the tree) and append a class label.

total_tree_evaluation_list = get_total_list_evaluation_based(Tree_List, 2, 1)

In [None]:
# label every tree by its position in the total ordering:
# positions below the midpoint get classes[0] ('path-like'), the rest classes[1] ('star-like')

for j, entry in enumerate(total_tree_evaluation_list):
    entry.append(classes[0] if j < len(total_tree_evaluation_list)/2 else classes[1])

In [None]:
# feature table with one row per tree; columns, in order:
# log_{10}(P(.;2,1)), radius, diameter, degree centrality,
# closeness centrality, betweenness centrality,
# Stirling Numbers of the First Kind for Trees, number of leaves, and class

rows = []

for entry in total_tree_evaluation_list:
    # entry[0]: evaluation value, entry[1]: the tree, entry[3]: class label appended above
    tree_graph = entry[1]
    rows.append([np.log10(float(entry[0])), nx.radius(tree_graph), nx.diameter(tree_graph),
                 get_degree_centrality(tree_graph),
                 get_closeness_centrality(tree_graph),
                 get_betweenness_centrality(tree_graph),
                 get_stirling_trees(tree_graph, n),
                 get_leaf_number(tree_graph),
                 entry[3]])

df = pd.DataFrame(rows, columns = ['Log_Dist', 'Rad', 'Diam', 'Deg_Cent', 
                                   'Cls_Cent', 'Btw_Cent', 'Stirling', 'Leaf_Num', 'Class'])

In [None]:
# preview the first rows of the feature table
df.head()

## Splitting the Tree List to Train and Test Sets

In [None]:
# features: positional columns 4, 5, 8 of df, i.e. 'Cls_Cent', 'Btw_Cent' and 'Class'
X = df.iloc[:, [4, 5, 8]]

In [None]:
# targets: the per-tree 'Stirling' entries are expanded into one column per element
# (assumes every entry is a sequence of the same length -- TODO confirm against get_stirling_trees);
# iloc[:, 0:-1] then drops the last of those columns
y = pd.DataFrame(np.array(list(df.loc[:, 'Stirling']))).iloc[:, 0:-1]

In [None]:
# 50/50 train/test split with a fixed integer seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 42)

In [None]:
# hand the estimators plain numpy arrays for the features (the targets stay DataFrames)
X_train, X_test = np.array(X_train), np.array(X_test)

## Training and Testing Regressor

In [None]:
# setting a random seed
# NOTE(review): this one RandomState instance is passed to every estimator below; per the
# scikit-learn documentation an instance is advanced by each fit, so the results depend on
# the order (and number of times) the cells are run -- an integer seed would avoid that.

random_state =  np.random.RandomState(seed = 42)

In [None]:
# global results log: the *_reg functions below append one row per fitted model;
# re-run this cell before re-running them to avoid duplicated rows
DF = []

## Trees

### Decision Tree

#### Decision Tree

In [None]:
# decision tree without pruning, fitted on all target columns of y_train at once

dtr = DecisionTreeRegressor(random_state = random_state)

# constructor defaults, for reference:
# DecisionTreeRegressor(*, criterion='squared_error', splitter='best', 
#                       max_depth=None, min_samples_split=2, min_samples_leaf=1, 
#                       min_weight_fraction_leaf=0.0, max_features=None, 
#                       random_state=None, max_leaf_nodes=None, 
#                       min_impurity_decrease=0.0, ccp_alpha=0.0)

dtr.fit(X_train, y_train)

y_train_dtr_pred, y_test_dtr_pred = dtr.predict(X_train), dtr.predict(X_test)

# rows of the printed report; None stands for a separator line
report = [('Train MSE:', mean_squared_error(y_train, y_train_dtr_pred)),
          ('Test MSE:', mean_squared_error(y_test, y_test_dtr_pred)),
          None,
          ('Train Score:', dtr.score(X_train, y_train)),
          ('Test Score:', dtr.score(X_test, y_test)),
          None,
          ('Train EVS:', explained_variance_score(y_train, y_train_dtr_pred)),
          ('Test EVS:', explained_variance_score(y_test, y_test_dtr_pred)),
          None]

for row in report:
    if row is None:
        print('--------')
    else:
        print(*row)

# optional rendering of the fitted tree, see
# https://scikit-learn.org/stable/modules/tree.html#tree

#tree_data = tree.export_graphviz(dtr, out_file = None) 

#graph = graphviz.Source(tree_data) 

#graph  

In [None]:
# cost-complexity pruning path of the decision tree on the training set, following
# https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html

path = dtr.cost_complexity_pruning_path(X_train, y_train)

ccp_alphas = path.ccp_alphas
impurities = path.impurities

In [None]:
# total leaf impurity along the pruning path (the last path entry is left out of the plot)
fig, ax = plt.subplots()

ax.plot(ccp_alphas[:-1], impurities[:-1], marker = 'o', drawstyle = 'steps-post')
ax.set(xlabel = 'effective alpha', ylabel = 'total impurity of leaves')
ax.set_title('Total Impurity vs effective alpha for training set')

In [None]:
# one decision tree per effective alpha of the pruning path
dtrs = []

for ccp_alpha in ccp_alphas:
    dtr = DecisionTreeRegressor(ccp_alpha = ccp_alpha, random_state = random_state).fit(X_train, y_train)
    dtrs.append(dtr)

In [None]:
# drop the entry for the largest alpha from both lists, then plot tree size against alpha
# NOTE: this cell shortens dtrs / ccp_alphas in place, so running it twice drops two entries
dtrs = dtrs[:-1]
ccp_alphas = ccp_alphas[:-1]

node_counts = [model.tree_.node_count for model in dtrs]
depth = [model.tree_.max_depth for model in dtrs]

fig, ax = plt.subplots(2, 1)

panels = zip(ax, (node_counts, depth),
             ('number of nodes', 'depth of tree'),
             ('Number of nodes vs alpha', 'Depth vs alpha'))

for panel, values, ylabel, title in panels:
    panel.plot(ccp_alphas, values, marker = 'o', drawstyle = 'steps-post')
    panel.set_xlabel('alpha')
    panel.set_ylabel(ylabel)
    panel.set_title(title)

fig.tight_layout()

In [None]:
# train / test score of every tree in the alpha sweep;
# for a regressor .score returns the coefficient of determination R^2, not an accuracy
train_scores = [dtr.score(X_train, y_train) for dtr in dtrs]

test_scores = [dtr.score(X_test, y_test) for dtr in dtrs]

fig, ax = plt.subplots()

ax.set_xlabel('alpha')

ax.set_ylabel('R^2 score')

ax.set_title('R^2 score vs alpha for training and testing sets')

ax.plot(ccp_alphas, train_scores, marker = 'o', label = 'train', drawstyle = 'steps-post')

ax.plot(ccp_alphas, test_scores, marker = 'o', label = 'test', drawstyle = 'steps-post')

ax.legend()

plt.show()

In [None]:
# decision tree with cost-complexity pruning (ccp_alpha = 1), fitted on all target columns

dtr = DecisionTreeRegressor(random_state = random_state, ccp_alpha = 1)

# constructor defaults, for reference:
# DecisionTreeRegressor(*, criterion='squared_error', splitter='best', 
#                       max_depth=None, min_samples_split=2, min_samples_leaf=1, 
#                       min_weight_fraction_leaf=0.0, max_features=None, 
#                       random_state=None, max_leaf_nodes=None, 
#                       min_impurity_decrease=0.0, ccp_alpha=0.0)

dtr.fit(X_train, y_train)

y_train_dtr_pred, y_test_dtr_pred = dtr.predict(X_train), dtr.predict(X_test)

# rows of the printed report; None stands for a separator line
report = [('Train MSE:', mean_squared_error(y_train, y_train_dtr_pred)),
          ('Test MSE:', mean_squared_error(y_test, y_test_dtr_pred)),
          None,
          ('Train Score:', dtr.score(X_train, y_train)),
          ('Test Score:', dtr.score(X_test, y_test)),
          None,
          ('Train EVS:', explained_variance_score(y_train, y_train_dtr_pred)),
          ('Test EVS:', explained_variance_score(y_test, y_test_dtr_pred)),
          None]

for row in report:
    if row is None:
        print('--------')
    else:
        print(*row)

# optional rendering of the fitted tree, see
# https://scikit-learn.org/stable/modules/tree.html#tree

#tree_data = tree.export_graphviz(dtr, out_file = None) 

#graph = graphviz.Source(tree_data) 

#graph 

In [None]:
def dt_reg(X_train, X_test, y_train, y_test, prune_list = None, prune = False):
    """Fit one DecisionTreeRegressor per target column and log its scores in the global DF.

    Every column of y_train except the last is used as a separate regression target.
    NOTE(review): y already had its last column dropped when it was built; confirm that
    skipping one more column here is intended.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices.
    y_train, y_test : pandas.DataFrame
        Target tables; columns are selected positionally.
    prune_list : sequence of float, optional
        ccp_alpha for each fitted column. Defaults to zeros, sized from the y_train
        that is actually passed in (not from a global captured at definition time).
    prune : bool
        When False, ccp_alpha is 0 and pruning diagnostics are plotted for every column.
        When True, prune_list[m] is used as ccp_alpha and no plots are drawn.

    Returns
    -------
    list
        The global DF, extended in place by one row per column:
        ['dtr', prune flag, m + l, train score, test score, train EVS, test EVS].

    Uses the notebook globals random_state, DF and l.
    """
    n_targets = y_train.shape[1] - 1

    if prune_list is None:
        prune_list = np.zeros(n_targets)

    for m in range(n_targets):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))
        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        dtr = DecisionTreeRegressor(random_state = random_state, ccp_alpha = prune_list[m] * prune)
        dtr.fit(X_train, y_train_trunc)

        y_pred_train = dtr.predict(X_train)
        y_pred_test = dtr.predict(X_test)

        if not prune:

            path = dtr.cost_complexity_pruning_path(X_train, y_train_trunc)
            ccp_alphas, impurities = path.ccp_alphas, path.impurities

            fig, ax = plt.subplots()
            ax.plot(ccp_alphas[:-1], impurities[:-1], marker = 'o', drawstyle = 'steps-post')
            ax.set_xlabel('effective alpha')
            ax.set_ylabel('total impurity of leaves')
            ax.set_title('Total Impurity vs effective alpha for training set')

            # The sweep models get their own name: rebinding `dtr` here would make the
            # scores logged below belong to the most heavily pruned tree instead of the
            # model that produced y_pred_train / y_pred_test.
            sweep = []
            for ccp_alpha in ccp_alphas:
                pruned = DecisionTreeRegressor(random_state = random_state, ccp_alpha = ccp_alpha)
                pruned.fit(X_train, y_train_trunc)
                sweep.append(pruned)

            # the entry for the largest alpha is left out of the plots
            sweep = sweep[:-1]
            ccp_alphas = ccp_alphas[:-1]

            node_counts = [model.tree_.node_count for model in sweep]
            depth = [model.tree_.max_depth for model in sweep]

            fig, ax = plt.subplots(2, 1)
            ax[0].plot(ccp_alphas, node_counts, marker = 'o', drawstyle = 'steps-post')
            ax[0].set_xlabel('alpha')
            ax[0].set_ylabel('number of nodes')
            ax[0].set_title('Number of nodes vs alpha')
            ax[1].plot(ccp_alphas, depth, marker = 'o', drawstyle = 'steps-post')
            ax[1].set_xlabel('alpha')
            ax[1].set_ylabel('depth of tree')
            ax[1].set_title('Depth vs alpha')
            fig.tight_layout()

            # .score of a regressor is R^2, so the axis is labelled accordingly
            train_scores = [model.score(X_train, y_train_trunc) for model in sweep]
            test_scores = [model.score(X_test, y_test_trunc) for model in sweep]

            fig, ax = plt.subplots()
            ax.set_xlabel('alpha')
            ax.set_ylabel('R^2 score')
            ax.set_title('R^2 score vs alpha for training and testing sets')
            ax.plot(ccp_alphas, train_scores, marker = 'o', label = 'train', drawstyle = 'steps-post')
            ax.plot(ccp_alphas, test_scores, marker = 'o', label = 'test', drawstyle = 'steps-post')
            ax.legend()
            plt.show()

        DF.append(['dtr', 1 * prune, m + l,
                   round(dtr.score(X_train, y_train_trunc), 5),
                   round(dtr.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_pred_train), 5),
                   round(explained_variance_score(y_test_trunc, y_pred_test), 5)])

    return DF

In [None]:
# unpruned run: appends one 'dtr' row per target column to DF and draws the pruning diagnostics
dt_reg(X_train, X_test, y_train, y_test)

In [None]:
# pruned run: one ccp_alpha per target column
# (values presumably read off the diagnostic plots of the unpruned run -- confirm)
dt_reg(X_train, X_test, y_train, y_test, [0.002, 0.5, 2, 2, 1], prune = True)

### Extra Tree

#### Extra Tree

In [None]:
# extra tree (no pruning), fitted on all target columns of y_train at once

etr = ExtraTreeRegressor(random_state = random_state)

# ExtraTreeRegressor(*, criterion='squared_error', splitter='random', 
#                    max_depth=None, min_samples_split=2, min_samples_leaf=1, 
#                    min_weight_fraction_leaf=0.0, max_features=1.0, 
#                    random_state=None, min_impurity_decrease=0.0, 
#                    max_leaf_nodes=None, ccp_alpha=0.0)

etr.fit(X_train, y_train)

y_train_etr_pred = etr.predict(X_train)

y_test_etr_pred = etr.predict(X_test)

# MSE and EVS are computed from the extra tree's own predictions
# (not from the decision-tree predictions left over from earlier cells)

print('Train MSE:', mean_squared_error(y_train, y_train_etr_pred))

print('Test MSE:', mean_squared_error(y_test, y_test_etr_pred))

print('--------')

print('Train Score:', etr.score(X_train, y_train))

print('Test Score:', etr.score(X_test, y_test))

print('--------')

print('Train EVS:', explained_variance_score(y_train, y_train_etr_pred))

print('Test EVS:', explained_variance_score(y_test, y_test_etr_pred))

#print('--------')

#tree_data = tree.export_graphviz(etr, out_file = None) 

#graph = graphviz.Source(tree_data) 

#graph 

In [None]:
# cost-complexity pruning path of the extra tree on the training set
path = etr.cost_complexity_pruning_path(X_train, y_train)

ccp_alphas = path.ccp_alphas
impurities = path.impurities

In [None]:
# total leaf impurity along the pruning path (the last path entry is left out of the plot)
fig, ax = plt.subplots()

ax.plot(ccp_alphas[:-1], impurities[:-1], marker = 'o', drawstyle = 'steps-post')
ax.set(xlabel = 'effective alpha', ylabel = 'total impurity of leaves')
ax.set_title('Total Impurity vs effective alpha for training set')

In [None]:
# one extra tree per effective alpha of the pruning path
etrs = []

for ccp_alpha in ccp_alphas:
    etr = ExtraTreeRegressor(ccp_alpha = ccp_alpha, random_state = random_state).fit(X_train, y_train)
    etrs.append(etr)

In [None]:
# drop the entry for the largest alpha from both lists, then plot tree size against alpha
# NOTE: this cell shortens etrs / ccp_alphas in place, so running it twice drops two entries
etrs = etrs[:-1]
ccp_alphas = ccp_alphas[:-1]

node_counts = [model.tree_.node_count for model in etrs]
depth = [model.tree_.max_depth for model in etrs]

fig, ax = plt.subplots(2, 1)

panels = zip(ax, (node_counts, depth),
             ('number of nodes', 'depth of tree'),
             ('Number of nodes vs alpha', 'Depth vs alpha'))

for panel, values, ylabel, title in panels:
    panel.plot(ccp_alphas, values, marker = 'o', drawstyle = 'steps-post')
    panel.set_xlabel('alpha')
    panel.set_ylabel(ylabel)
    panel.set_title(title)

fig.tight_layout()

In [None]:
# train / test score of every tree in the alpha sweep;
# for a regressor .score returns the coefficient of determination R^2, not an accuracy
train_scores = [etr.score(X_train, y_train) for etr in etrs]

test_scores = [etr.score(X_test, y_test) for etr in etrs]

fig, ax = plt.subplots()

ax.set_xlabel('alpha')

ax.set_ylabel('R^2 score')

ax.set_title('R^2 score vs alpha for training and testing sets')

ax.plot(ccp_alphas, train_scores, marker = 'o', label = 'train', drawstyle = 'steps-post')

ax.plot(ccp_alphas, test_scores, marker = 'o', label = 'test', drawstyle = 'steps-post')

ax.legend()

plt.show()

In [None]:
# extra tree with cost-complexity pruning (ccp_alpha = 0.5), fitted on all target columns

etr = ExtraTreeRegressor(random_state = random_state, ccp_alpha = 0.5)

# ExtraTreeRegressor(*, criterion='squared_error', splitter='random', 
#                    max_depth=None, min_samples_split=2, min_samples_leaf=1, 
#                    min_weight_fraction_leaf=0.0, max_features=1.0, 
#                    random_state=None, min_impurity_decrease=0.0, 
#                    max_leaf_nodes=None, ccp_alpha=0.0)

etr.fit(X_train, y_train)

y_train_etr_pred = etr.predict(X_train)

y_test_etr_pred = etr.predict(X_test)

# MSE and EVS are computed from the extra tree's own predictions
# (not from the decision-tree predictions left over from earlier cells)

print('Train MSE:', mean_squared_error(y_train, y_train_etr_pred))

print('Test MSE:', mean_squared_error(y_test, y_test_etr_pred))

print('--------')

print('Train Score:', etr.score(X_train, y_train))

print('Test Score:', etr.score(X_test, y_test))

print('--------')

print('Train EVS:', explained_variance_score(y_train, y_train_etr_pred))

print('Test EVS:', explained_variance_score(y_test, y_test_etr_pred))

#print('--------')

#tree_data = tree.export_graphviz(etr, out_file = None) 

#graph = graphviz.Source(tree_data) 

#graph  

In [None]:
def et_reg(X_train, X_test, y_train, y_test, prune_list = None, prune = False):
    """Fit one ExtraTreeRegressor per target column and log its scores in the global DF.

    Every column of y_train except the last is used as a separate regression target.
    NOTE(review): y already had its last column dropped when it was built; confirm that
    skipping one more column here is intended.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices.
    y_train, y_test : pandas.DataFrame
        Target tables; columns are selected positionally.
    prune_list : sequence of float, optional
        ccp_alpha for each fitted column. Defaults to zeros, sized from the y_train
        that is actually passed in (not from a global captured at definition time).
    prune : bool
        When False, ccp_alpha is 0 and pruning diagnostics are plotted for every column.
        When True, prune_list[m] is used as ccp_alpha and no plots are drawn.

    Returns
    -------
    list
        The global DF, extended in place by one row per column:
        ['etr', prune flag, m + l, train score, test score, train EVS, test EVS].

    Uses the notebook globals random_state, DF and l.
    """
    n_targets = y_train.shape[1] - 1

    if prune_list is None:
        prune_list = np.zeros(n_targets)

    for m in range(n_targets):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))
        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        etr = ExtraTreeRegressor(random_state = random_state, ccp_alpha = prune_list[m] * prune)
        etr.fit(X_train, y_train_trunc)

        y_pred_train = etr.predict(X_train)
        y_pred_test = etr.predict(X_test)

        if not prune:

            path = etr.cost_complexity_pruning_path(X_train, y_train_trunc)
            ccp_alphas, impurities = path.ccp_alphas, path.impurities

            fig, ax = plt.subplots()
            ax.plot(ccp_alphas[:-1], impurities[:-1], marker = 'o', drawstyle = 'steps-post')
            ax.set_xlabel('effective alpha')
            ax.set_ylabel('total impurity of leaves')
            ax.set_title('Total Impurity vs effective alpha for training set')

            # The sweep fits extra trees (the model family under study), under their own
            # name: rebinding `etr` here would make the scores logged below belong to the
            # most heavily pruned sweep model instead of the one that produced y_pred_*.
            sweep = []
            for ccp_alpha in ccp_alphas:
                pruned = ExtraTreeRegressor(random_state = random_state, ccp_alpha = ccp_alpha)
                pruned.fit(X_train, y_train_trunc)
                sweep.append(pruned)

            # the entry for the largest alpha is left out of the plots
            sweep = sweep[:-1]
            ccp_alphas = ccp_alphas[:-1]

            node_counts = [model.tree_.node_count for model in sweep]
            depth = [model.tree_.max_depth for model in sweep]

            fig, ax = plt.subplots(2, 1)
            ax[0].plot(ccp_alphas, node_counts, marker = 'o', drawstyle = 'steps-post')
            ax[0].set_xlabel('alpha')
            ax[0].set_ylabel('number of nodes')
            ax[0].set_title('Number of nodes vs alpha')
            ax[1].plot(ccp_alphas, depth, marker = 'o', drawstyle = 'steps-post')
            ax[1].set_xlabel('alpha')
            ax[1].set_ylabel('depth of tree')
            ax[1].set_title('Depth vs alpha')
            fig.tight_layout()

            # .score of a regressor is R^2, so the axis is labelled accordingly
            train_scores = [model.score(X_train, y_train_trunc) for model in sweep]
            test_scores = [model.score(X_test, y_test_trunc) for model in sweep]

            fig, ax = plt.subplots()
            ax.set_xlabel('alpha')
            ax.set_ylabel('R^2 score')
            ax.set_title('R^2 score vs alpha for training and testing sets')
            ax.plot(ccp_alphas, train_scores, marker = 'o', label = 'train', drawstyle = 'steps-post')
            ax.plot(ccp_alphas, test_scores, marker = 'o', label = 'test', drawstyle = 'steps-post')
            ax.legend()
            plt.show()

        DF.append(['etr', 1 * prune, m + l,
                   round(etr.score(X_train, y_train_trunc), 5),
                   round(etr.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_pred_train), 5),
                   round(explained_variance_score(y_test_trunc, y_pred_test), 5)])

    return DF

In [None]:
# unpruned run: appends one 'etr' row per target column to DF and draws the pruning diagnostics
et_reg(X_train, X_test, y_train, y_test)

In [None]:
# pruned run: one ccp_alpha per target column
# (values presumably read off the diagnostic plots of the unpruned run -- confirm)
et_reg(X_train, X_test, y_train, y_test, prune_list = [0.002, 0.2, 2, 2, 1], prune = True)

## Ensembles

### Bagging

In [None]:
# bagging ensemble with default settings, fitted on all target columns of y_train at once

br = BaggingRegressor(random_state = random_state)

# constructor defaults, for reference:
# BaggingRegressor(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, 
#                  bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, 
#                 n_jobs=None, random_state=None, verbose=0)

br.fit(X_train, y_train)

y_train_br_pred, y_test_br_pred = br.predict(X_train), br.predict(X_test)

# rows of the printed report; None stands for a separator line
report = [('Train MSE:', mean_squared_error(y_train, y_train_br_pred)),
          ('Test MSE:', mean_squared_error(y_test, y_test_br_pred)),
          None,
          ('Train Score:', br.score(X_train, y_train)),
          ('Test Score:', br.score(X_test, y_test)),
          None,
          ('Train EVS:', explained_variance_score(y_train, y_train_br_pred)),
          ('Test EVS:', explained_variance_score(y_test, y_test_br_pred))]

for row in report:
    if row is None:
        print('--------')
    else:
        print(*row)

In [None]:
def b_reg(X_train, X_test, y_train, y_test):
    """Fit one BaggingRegressor per target column and log its scores in the global DF.

    Every column of y_train except the last is used as a separate regression target.
    NOTE(review): y already had its last column dropped when it was built; confirm that
    skipping one more column here is intended.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices.
    y_train, y_test : pandas.DataFrame
        Target tables; columns are selected positionally.

    Returns
    -------
    list
        The global DF, extended in place by one row per column:
        ['br', 'na', m + l, train score, test score, train EVS, test EVS]
        ('na' because bagging is not pruned here).

    Uses the notebook globals random_state, DF and l.
    """
    # the column count is read from the argument itself rather than via a star-imported helper
    for m in range(y_train.shape[1] - 1):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))
        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        br = BaggingRegressor(random_state = random_state)
        br.fit(X_train, y_train_trunc)

        y_pred_train = br.predict(X_train)
        y_pred_test = br.predict(X_test)

        DF.append(['br', 'na', m + l,
                   round(br.score(X_train, y_train_trunc), 5),
                   round(br.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_pred_train), 5),
                   round(explained_variance_score(y_test_trunc, y_pred_test), 5)])

    return DF

In [None]:
# appends one 'br' row per target column to DF
b_reg(X_train, X_test, y_train, y_test)

### Random Forest

#### Random Forest

In [None]:
# random forest (50 trees, no pruning), fitted on all target columns of y_train at once

rfr = RandomForestRegressor(n_estimators = 50, random_state = random_state)

# constructor defaults (scikit-learn 1.1+; newer releases add further keywords):
# RandomForestRegressor(n_estimators=100, *, criterion='squared_error', max_depth=None, 
#                       min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
#                       max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.0, 
#                       bootstrap=True, oob_score=False, n_jobs=None, 
#                       random_state=None, verbose=0, warm_start=False, 
#                       ccp_alpha=0.0, max_samples=None)

rfr.fit(X_train, y_train)

y_train_rfr_pred = rfr.predict(X_train)

y_test_rfr_pred = rfr.predict(X_test)

# MSE and EVS are computed from the forest's own predictions
# (not from the decision-tree predictions left over from earlier cells)

print('Train MSE:', mean_squared_error(y_train, y_train_rfr_pred))

print('Test MSE:', mean_squared_error(y_test, y_test_rfr_pred))

print('--------')

print('Train Score:', rfr.score(X_train, y_train))

print('Test Score:', rfr.score(X_test, y_test))

print('--------')

print('Train EVS:', explained_variance_score(y_train, y_train_rfr_pred))

print('Test EVS:', explained_variance_score(y_test, y_test_rfr_pred))

In [None]:
# random forest (50 trees) with cost-complexity pruning (ccp_alpha = 0.5)

rfr = RandomForestRegressor(n_estimators = 50, random_state = random_state, ccp_alpha = 0.5)

# constructor defaults (scikit-learn 1.1+; newer releases add further keywords):
# RandomForestRegressor(n_estimators=100, *, criterion='squared_error', max_depth=None, 
#                       min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
#                       max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.0, 
#                       bootstrap=True, oob_score=False, n_jobs=None, 
#                       random_state=None, verbose=0, warm_start=False, 
#                       ccp_alpha=0.0, max_samples=None)

rfr.fit(X_train, y_train)

y_train_rfr_pred = rfr.predict(X_train)

y_test_rfr_pred = rfr.predict(X_test)

# MSE and EVS are computed from the forest's own predictions
# (not from the decision-tree predictions left over from earlier cells)

print('Train MSE:', mean_squared_error(y_train, y_train_rfr_pred))

print('Test MSE:', mean_squared_error(y_test, y_test_rfr_pred))

print('--------')

print('Train Score:', rfr.score(X_train, y_train))

print('Test Score:', rfr.score(X_test, y_test))

print('--------')

print('Train EVS:', explained_variance_score(y_train, y_train_rfr_pred))

print('Test EVS:', explained_variance_score(y_test, y_test_rfr_pred))

In [None]:
def rf_reg(X_train, X_test, y_train, y_test, prune_list = None, prune = False):
    """Fit one 50-tree RandomForestRegressor per target column and log its scores in the global DF.

    Every column of y_train except the last is used as a separate regression target.
    NOTE(review): y already had its last column dropped when it was built; confirm that
    skipping one more column here is intended.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices.
    y_train, y_test : pandas.DataFrame
        Target tables; columns are selected positionally.
    prune_list : sequence of float, optional
        ccp_alpha for each fitted column. Defaults to zeros, sized from the y_train
        that is actually passed in (not from a global captured at definition time).
    prune : bool
        When True, prune_list[m] is used as ccp_alpha; when False, ccp_alpha is 0.

    Returns
    -------
    list
        The global DF, extended in place by one row per column:
        ['rfr', prune flag, m + l, train score, test score, train EVS, test EVS].

    Uses the notebook globals random_state, DF and l.
    """
    n_targets = y_train.shape[1] - 1

    if prune_list is None:
        prune_list = np.zeros(n_targets)

    for m in range(n_targets):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))
        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        rfr = RandomForestRegressor(n_estimators = 50, random_state = random_state, 
                                    ccp_alpha = prune_list[m] * prune)
        rfr.fit(X_train, y_train_trunc)

        y_pred_train = rfr.predict(X_train)
        y_pred_test = rfr.predict(X_test)

        DF.append(['rfr', 1 * prune, m + l,
                   round(rfr.score(X_train, y_train_trunc), 5),
                   round(rfr.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_pred_train), 5),
                   round(explained_variance_score(y_test_trunc, y_pred_test), 5)])

    return DF

In [None]:
# unpruned run: appends one 'rfr' row per target column to DF
rf_reg(X_train, X_test, y_train, y_test)

In [None]:
# pruned run: one ccp_alpha per target column (how these values were chosen is not shown here)
rf_reg(X_train, X_test, y_train, y_test, prune_list = [0.002, 0.4, 2, 2, 1], prune = True)

### Extra Trees 

#### Extra Trees with Default (Squared-Error) Criterion

In [None]:
# extra trees (50 trees, default squared-error criterion, no pruning),
# fitted on all target columns of y_train at once

etsr = ExtraTreesRegressor(n_estimators = 50, random_state = random_state)

# constructor defaults (scikit-learn 1.1+; newer releases add further keywords):
# ExtraTreesRegressor(n_estimators=100, *, criterion='squared_error', max_depth=None, 
#                     min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
#                     max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.0, 
#                     bootstrap=False, oob_score=False, n_jobs=None, 
#                     random_state=None, verbose=0, warm_start=False, 
#                     ccp_alpha=0.0, max_samples=None)

etsr.fit(X_train, y_train)

y_train_etsr_pred = etsr.predict(X_train)

y_test_etsr_pred = etsr.predict(X_test)

# MSE and EVS are computed from this ensemble's own predictions
# (not from the decision-tree predictions left over from earlier cells)

print('Train MSE:', mean_squared_error(y_train, y_train_etsr_pred))

print('Test MSE:', mean_squared_error(y_test, y_test_etsr_pred))

print('--------')

print('Train Score:', etsr.score(X_train, y_train))

print('Test Score:', etsr.score(X_test, y_test))

print('--------')

print('Train EVS:', explained_variance_score(y_train, y_train_etsr_pred))

print('Test EVS:', explained_variance_score(y_test, y_test_etsr_pred))

In [None]:
# extra trees (50 trees) with cost-complexity pruning (ccp_alpha = 0.5)

etsr = ExtraTreesRegressor(n_estimators = 50, random_state = random_state, ccp_alpha = 0.5)

# constructor defaults (scikit-learn 1.1+; newer releases add further keywords):
# ExtraTreesRegressor(n_estimators=100, *, criterion='squared_error', max_depth=None, 
#                     min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
#                     max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.0, 
#                     bootstrap=False, oob_score=False, n_jobs=None, 
#                     random_state=None, verbose=0, warm_start=False, 
#                     ccp_alpha=0.0, max_samples=None)

etsr.fit(X_train, y_train)

y_train_etsr_pred = etsr.predict(X_train)

y_test_etsr_pred = etsr.predict(X_test)

# MSE and EVS are computed from this ensemble's own predictions
# (not from the decision-tree predictions left over from earlier cells)

print('Train MSE:', mean_squared_error(y_train, y_train_etsr_pred))

print('Test MSE:', mean_squared_error(y_test, y_test_etsr_pred))

print('--------')

print('Train Score:', etsr.score(X_train, y_train))

print('Test Score:', etsr.score(X_test, y_test))

print('--------')

print('Train EVS:', explained_variance_score(y_train, y_train_etsr_pred))

print('Test EVS:', explained_variance_score(y_test, y_test_etsr_pred))

In [None]:
def ets_reg(X_train, X_test, y_train, y_test, prune_list = None, prune = False):
    """Fit one 50-tree ExtraTreesRegressor per target column and log its scores in the global DF.

    Every column of y_train except the last is used as a separate regression target.
    NOTE(review): y already had its last column dropped when it was built; confirm that
    skipping one more column here is intended.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices.
    y_train, y_test : pandas.DataFrame
        Target tables; columns are selected positionally.
    prune_list : sequence of float, optional
        ccp_alpha for each fitted column. Defaults to zeros, sized from the y_train
        that is actually passed in (not from a global captured at definition time).
    prune : bool
        When True, prune_list[m] is used as ccp_alpha; when False, ccp_alpha is 0.

    Returns
    -------
    list
        The global DF, extended in place by one row per column:
        ['etsr', prune flag, m + l, train score, test score, train EVS, test EVS].

    Uses the notebook globals random_state, DF and l.
    """
    n_targets = y_train.shape[1] - 1

    if prune_list is None:
        prune_list = np.zeros(n_targets)

    for m in range(n_targets):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))
        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        etsr = ExtraTreesRegressor(n_estimators = 50, random_state = random_state,
                                   ccp_alpha = prune_list[m] * prune)
        etsr.fit(X_train, y_train_trunc)

        y_pred_train = etsr.predict(X_train)
        y_pred_test = etsr.predict(X_test)

        DF.append(['etsr', 1 * prune, m + l,
                   round(etsr.score(X_train, y_train_trunc), 5),
                   round(etsr.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_pred_train), 5),
                   round(explained_variance_score(y_test_trunc, y_pred_test), 5)])

    return DF

In [None]:
# unpruned run: appends one 'etsr' row per target column to DF
ets_reg(X_train, X_test, y_train, y_test)

In [None]:
# pruned run: one ccp_alpha per target column (how these values were chosen is not shown here)
ets_reg(X_train, X_test, y_train, y_test, prune_list = [0.002, 0.2, 2, 2, 1], prune = True)

## Table

In [None]:
# results table built straight from the list of rows: routing the mixed str / int / float
# rows through np.array would coerce every value (including the scores) to a string
DF2 = pd.DataFrame(DF, columns = ['Method', 'Pruning', 'k', 'Train_Score',
                                  'Test_Score', 'Train_EVS', 'Test_EVS'])

In [None]:
# display the full results table
DF2

In [None]:
# save the results table (with its index column, the to_csv default) in the working directory
DF2.to_csv('Regression--Tree--Subset--Full_Set.csv')