In [None]:
# importing the libraries 
# make sure that tree_functions_2.py is in the same directory as this notebook

from tree_functions_2 import *

# calling the appropriate tools for regression

from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet

from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score

from sklearn.preprocessing import PolynomialFeatures, StandardScaler

from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.model_selection import train_test_split

import graphviz

In [None]:
# change figure configurations

%matplotlib inline

import matplotlib

font = {'size':16}

matplotlib.rc('font', **font)

matplotlib.rc('figure', figsize = (5.0, 5.0))

In [None]:
# the order (number of vertices) of the trees

n = 18

# l = ceil(n / 2), written as an integer ceiling division
l = (n + 1) // 2

In [None]:
# Seed the random number generator, presumably so that the tree sampling
# below is reproducible.
# NOTE(review): `random` is only in scope through the star import of
# tree_functions_2 -- confirm it is the generator that get_spanning_tree_u_w
# actually draws from.

random.seed(42)

## Generating the Tree List and the Associated Data Frame

In [None]:
# sampling 500 trees using uniform sampling: every tree is produced by
# get_spanning_tree_u_w on the complete graph K_n and converted to a networkx graph

K_n = nx.complete_graph(n)

Tree_List = [nx.to_networkx_graph(get_spanning_tree_u_w(K_n)) for _ in range(500)]

In [None]:
# class labels for the trees: 0 marks a 'path-like' tree, 1 a 'star-like' tree

classes = [0, 1] 

In [None]:
# evaluation-based total ordering on Tree_List
# NOTE(review): the arguments 2 and 1 presumably are the point at which the
# tree polynomial P(.; 2, 1) is evaluated (cf. the Log_Dist column built later)
# -- confirm against get_total_list_evaluation_based in tree_functions_2.py

total_tree_evaluation_list = get_total_list_evaluation_based(Tree_List, 2, 1)

In [None]:
# Label the trees from their position in the total ordering: entries in the
# first half get classes[0] ('path-like'), the remaining ones classes[1]
# ('star-like'). The label is appended to each entry in place.

half = len(total_tree_evaluation_list) / 2

for position, entry in enumerate(total_tree_evaluation_list):

    entry.append(classes[0] if position < half else classes[1])

In [None]:
# Data frame with one row of graph statistics per tree:
# log_{10}(P(.;2,1)), radius, diameter, degree centrality,
# closeness centrality, betweenness centrality,
# Stirling numbers of the first kind for trees, number of leaves, and class

def _tree_row(entry):
    """Return the list of statistics for one entry of total_tree_evaluation_list.

    entry[0] is the evaluation value, entry[1] the tree and entry[3] the class label.
    """
    tree = entry[1]

    return [np.log10(float(entry[0])),
            nx.radius(tree),
            nx.diameter(tree),
            get_degree_centrality(tree),
            get_closeness_centrality(tree),
            get_betweenness_centrality(tree),
            get_stirling_trees(tree, n),
            get_leaf_number(tree),
            entry[3]]

df = pd.DataFrame([_tree_row(entry) for entry in total_tree_evaluation_list],
                  columns = ['Log_Dist', 'Rad', 'Diam', 'Deg_Cent',
                             'Cls_Cent', 'Btw_Cent', 'Stirling', 'Leaf_Num', 'Class'])

In [None]:
# preview the first rows of the statistics table
df.head()

## Splitting the Tree List to Train and Test Sets

In [None]:
# features: closeness centrality, betweenness centrality and class label
# (columns 4, 5 and 8 of df, selected by name)
X = df[['Cls_Cent', 'Btw_Cent', 'Class']]

In [None]:
# targets: stack the per-tree 'Stirling' entries into one table (one row per
# tree; assumes every entry is a sequence of the same length) and drop the
# last column of that table
stirling_table = pd.DataFrame(np.array(list(df['Stirling'])))

y = stirling_table.iloc[:, :-1]

In [None]:
# 50/50 train/test split of features and targets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 42)

In [None]:
# convert both feature tables to plain NumPy arrays
X_train, X_test = (np.array(part) for part in (X_train, X_test))

## Training and Testing Regressors

In [None]:
# shared NumPy random generator (seed 42) handed to the estimators below
# NOTE: the same RandomState instance is reused by every estimator that
# receives it, so any randomness drawn from it depends on the order in which
# the cells are executed

random_state =  np.random.RandomState(seed = 42)

In [None]:
# accumulator for the result rows appended by the regression helper functions;
# re-run this cell before re-running them, otherwise their rows are appended again
DF = []

In [None]:
# index of the target column used by the single-target demonstration cells
k = 6

In [None]:
# 1-D target vectors: column k of the train and test target tables
y_train_trunc, y_test_trunc = (np.ravel(np.array(list(part.iloc[:, k])))
                               for part in (y_train, y_test))

## Linear Regression

In [None]:
# Linear Regression on the single target column k
#
# signature, for reference:
# LinearRegression(*, fit_intercept=True, normalize='deprecated', copy_X=True, n_jobs=None, positive=False)

reg = LinearRegression().fit(X_train, y_train_trunc)

y_pred_train, y_pred_test = reg.predict(X_train), reg.predict(X_test)

# mean squared error
print('Train MSE:', mean_squared_error(y_train_trunc, y_pred_train))
print('Test MSE:', mean_squared_error(y_test_trunc, y_pred_test))
print('--------')

# estimator score
print('Train Score:', reg.score(X_train, y_train_trunc))
print('Test Score:', reg.score(X_test, y_test_trunc))
print('--------')

# explained variance
print('Train EVS:', explained_variance_score(y_train_trunc, y_pred_train))
print('Test EVS:', explained_variance_score(y_test_trunc, y_pred_test))
print('--------')

# fitted parameters
print('Reg. Coef.:', reg.coef_)
print('Reg. Intercept:', reg.intercept_)

In [None]:
def linear_reg(X_train, X_test, y_train, y_test):
    """Fit one ordinary least-squares model per target column and record its scores.

    For every column index ``m`` of ``y_train`` except the last one, a fresh
    ``LinearRegression`` is fitted on ``X_train`` and evaluated on both splits.
    A row ``['linear', m + l, train score, test score, train EVS, test EVS]``
    (metrics rounded to 5 decimals, ``l`` being the notebook-level offset) is
    appended to the notebook-level list ``DF``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` / ``predict`` / ``score``.
    y_train, y_test : pandas.DataFrame
        Target tables; column ``m`` is selected with ``.iloc[:, m]``.

    Returns
    -------
    list
        The shared ``DF`` list itself (mutated in place); calling the function
        twice therefore appends the rows twice.
    """
    # NOTE(review): the "- 1" skips the last target column although `y` already
    # had its last column dropped when it was built -- confirm this is intended.
    for m in range(y_train.shape[1] - 1):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))

        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        reg = LinearRegression()

        reg.fit(X_train, y_train_trunc)

        y_pred_train = reg.predict(X_train)

        y_pred_test = reg.predict(X_test)

        DF.append(['linear', m + l,
                   round(reg.score(X_train, y_train_trunc), 5),
                   round(reg.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_pred_train), 5),
                   round(explained_variance_score(y_test_trunc, y_pred_test), 5)])

    return DF

In [None]:
# append the 'linear' rows to DF (the cell output echoes the whole accumulated list)
linear_reg(X_train, X_test, y_train, y_test)

In [None]:
# Ridge Regression on the single target column k
#
# signature, for reference:
# Ridge(alpha=1.0, *, fit_intercept=True, normalize='deprecated',
#       copy_X=True, max_iter=None, tol=0.001, solver='auto',
#       positive=False, random_state=None)

rng = Ridge(random_state = random_state).fit(X_train, y_train_trunc)

y_pred_train, y_pred_test = rng.predict(X_train), rng.predict(X_test)

# mean squared error
print('Train MSE:', mean_squared_error(y_train_trunc, y_pred_train))
print('Test MSE:', mean_squared_error(y_test_trunc, y_pred_test))
print('--------')

# estimator score
print('Train Score:', rng.score(X_train, y_train_trunc))
print('Test Score:', rng.score(X_test, y_test_trunc))
print('--------')

# explained variance
print('Train EVS:', explained_variance_score(y_train_trunc, y_pred_train))
print('Test EVS:', explained_variance_score(y_test_trunc, y_pred_test))
print('--------')

# fitted parameters
print('Reg. Coef.:', rng.coef_)
print('Reg. Intercept:', rng.intercept_)

In [None]:
def ridge(X_train, X_test, y_train, y_test):
    """Fit one ridge regression per target column and record its scores.

    For every column index ``m`` of ``y_train`` except the last one, a fresh
    ``Ridge`` (using the notebook-level ``random_state``) is fitted on
    ``X_train`` and evaluated on both splits. A row
    ``['ridge', m + l, train score, test score, train EVS, test EVS]``
    (metrics rounded to 5 decimals, ``l`` being the notebook-level offset) is
    appended to the notebook-level list ``DF``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` / ``predict`` / ``score``.
    y_train, y_test : pandas.DataFrame
        Target tables; column ``m`` is selected with ``.iloc[:, m]``.

    Returns
    -------
    list
        The shared ``DF`` list itself (mutated in place); calling the function
        twice therefore appends the rows twice.
    """
    # NOTE(review): the "- 1" skips the last target column although `y` already
    # had its last column dropped when it was built -- confirm this is intended.
    for m in range(y_train.shape[1] - 1):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))

        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        rng = Ridge(random_state = random_state)

        rng.fit(X_train, y_train_trunc)

        y_pred_train = rng.predict(X_train)

        y_pred_test = rng.predict(X_test)

        DF.append(['ridge', m + l,
                   round(rng.score(X_train, y_train_trunc), 5),
                   round(rng.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_pred_train), 5),
                   round(explained_variance_score(y_test_trunc, y_pred_test), 5)])

    return DF

In [None]:
# append the 'ridge' rows to DF (the cell output echoes the whole accumulated list)
ridge(X_train, X_test, y_train, y_test)

In [None]:
# Lasso Regression on the single target column k
#
# signature, for reference:
# Lasso(alpha=1.0, *, fit_intercept=True, normalize='deprecated',
#       precompute=False, copy_X=True, max_iter=1000, tol=0.0001,
#       warm_start=False, positive=False,
#       random_state=None, selection='cyclic')

lss = Lasso(random_state = random_state).fit(X_train, y_train_trunc)

y_pred_train, y_pred_test = lss.predict(X_train), lss.predict(X_test)

# mean squared error
print('Train MSE:', mean_squared_error(y_train_trunc, y_pred_train))
print('Test MSE:', mean_squared_error(y_test_trunc, y_pred_test))
print('--------')

# estimator score
print('Train Score:', lss.score(X_train, y_train_trunc))
print('Test Score:', lss.score(X_test, y_test_trunc))
print('--------')

# explained variance
print('Train EVS:', explained_variance_score(y_train_trunc, y_pred_train))
print('Test EVS:', explained_variance_score(y_test_trunc, y_pred_test))
print('--------')

# fitted parameters
print('Reg. Coef.:', lss.coef_)
print('Reg. Intercept:', lss.intercept_)

In [None]:
def lasso(X_train, X_test, y_train, y_test):
    """Fit one lasso regression per target column and record its scores.

    For every column index ``m`` of ``y_train`` except the last one, a fresh
    ``Lasso`` (using the notebook-level ``random_state``) is fitted on
    ``X_train`` and evaluated on both splits. A row
    ``['lasso', m + l, train score, test score, train EVS, test EVS]``
    (metrics rounded to 5 decimals, ``l`` being the notebook-level offset) is
    appended to the notebook-level list ``DF``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` / ``predict`` / ``score``.
    y_train, y_test : pandas.DataFrame
        Target tables; column ``m`` is selected with ``.iloc[:, m]``.

    Returns
    -------
    list
        The shared ``DF`` list itself (mutated in place); calling the function
        twice therefore appends the rows twice.
    """
    # NOTE(review): the "- 1" skips the last target column although `y` already
    # had its last column dropped when it was built -- confirm this is intended.
    for m in range(y_train.shape[1] - 1):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))

        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        lss = Lasso(random_state = random_state)

        lss.fit(X_train, y_train_trunc)

        y_pred_train = lss.predict(X_train)

        y_pred_test = lss.predict(X_test)

        DF.append(['lasso', m + l,
                   round(lss.score(X_train, y_train_trunc), 5),
                   round(lss.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_pred_train), 5),
                   round(explained_variance_score(y_test_trunc, y_pred_test), 5)])

    return DF

In [None]:
# append the 'lasso' rows to DF (the cell output echoes the whole accumulated list)
lasso(X_train, X_test, y_train, y_test)

In [None]:
# Elastic Net on the single target column k
#
# signature, for reference:
# ElasticNet(alpha=1.0, *, l1_ratio=0.5, fit_intercept=True,
#            normalize='deprecated', precompute=False, max_iter=1000,
#            copy_X=True, tol=0.0001, warm_start=False,
#            positive=False, random_state=None, selection='cyclic')

ent = ElasticNet(random_state = random_state).fit(X_train, y_train_trunc)

y_pred_train, y_pred_test = ent.predict(X_train), ent.predict(X_test)

# mean squared error
print('Train MSE:', mean_squared_error(y_train_trunc, y_pred_train))
print('Test MSE:', mean_squared_error(y_test_trunc, y_pred_test))
print('--------')

# estimator score
print('Train Score:', ent.score(X_train, y_train_trunc))
print('Test Score:', ent.score(X_test, y_test_trunc))
print('--------')

# explained variance
print('Train EVS:', explained_variance_score(y_train_trunc, y_pred_train))
print('Test EVS:', explained_variance_score(y_test_trunc, y_pred_test))
print('--------')

# fitted parameters
print('Reg. Coef.:', ent.coef_)
print('Reg. Intercept:', ent.intercept_)

In [None]:
def elastic(X_train, X_test, y_train, y_test):
    """Fit one elastic-net regression per target column and record its scores.

    For every column index ``m`` of ``y_train`` except the last one, a fresh
    ``ElasticNet`` (using the notebook-level ``random_state``) is fitted on
    ``X_train`` and evaluated on both splits. A row
    ``['elasticnet', m + l, train score, test score, train EVS, test EVS]``
    (metrics rounded to 5 decimals, ``l`` being the notebook-level offset) is
    appended to the notebook-level list ``DF``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` / ``predict`` / ``score``.
    y_train, y_test : pandas.DataFrame
        Target tables; column ``m`` is selected with ``.iloc[:, m]``.

    Returns
    -------
    list
        The shared ``DF`` list itself (mutated in place); calling the function
        twice therefore appends the rows twice.
    """
    # NOTE(review): the "- 1" skips the last target column although `y` already
    # had its last column dropped when it was built -- confirm this is intended.
    for m in range(y_train.shape[1] - 1):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))

        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        ent = ElasticNet(random_state = random_state)

        ent.fit(X_train, y_train_trunc)

        y_pred_train = ent.predict(X_train)

        y_pred_test = ent.predict(X_test)

        DF.append(['elasticnet',  m + l,
                   round(ent.score(X_train, y_train_trunc), 5),
                   round(ent.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_pred_train), 5),
                   round(explained_variance_score(y_test_trunc, y_pred_test), 5)])

    return DF

In [None]:
# append the 'elasticnet' rows to DF (the cell output echoes the whole accumulated list)
elastic(X_train, X_test, y_train, y_test)

In [None]:
# Quadratic Regression on the single target column k:
# degree-2 polynomial feature expansion followed by linear regression
#
# signature, for reference:
# PolynomialFeatures(degree=2, *, interaction_only=False, include_bias=True, order='C')

model = Pipeline([('poly', PolynomialFeatures(degree = 2)),
                  ('reg', LinearRegression())]).fit(X_train, y_train_trunc)

y_poly_train_pred, y_poly_test_pred = model.predict(X_train), model.predict(X_test)

# mean squared error
print('Train MSE:', mean_squared_error(y_train_trunc, y_poly_train_pred))
print('Test MSE:', mean_squared_error(y_test_trunc, y_poly_test_pred))
print('--------')

# estimator score
print('Train Score:', model.score(X_train, y_train_trunc))
print('Test Score:', model.score(X_test, y_test_trunc))
print('--------')

# explained variance
print('Train EVS:', explained_variance_score(y_train_trunc, y_poly_train_pred))
print('Test EVS:', explained_variance_score(y_test_trunc, y_poly_test_pred))

In [None]:
def poly_2(X_train, X_test, y_train, y_test):
    """Fit one quadratic regression per target column and record its scores.

    For every column index ``m`` of ``y_train`` except the last one, a fresh
    pipeline (degree-2 ``PolynomialFeatures`` followed by ``LinearRegression``)
    is fitted on ``X_train`` and evaluated on both splits. A row
    ``['quadratic', m + l, train score, test score, train EVS, test EVS]``
    (metrics rounded to 5 decimals, ``l`` being the notebook-level offset) is
    appended to the notebook-level list ``DF``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` / ``predict`` / ``score``.
    y_train, y_test : pandas.DataFrame
        Target tables; column ``m`` is selected with ``.iloc[:, m]``.

    Returns
    -------
    list
        The shared ``DF`` list itself (mutated in place); calling the function
        twice therefore appends the rows twice.
    """
    # NOTE(review): the "- 1" skips the last target column although `y` already
    # had its last column dropped when it was built -- confirm this is intended.
    for m in range(y_train.shape[1] - 1):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))

        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        model = Pipeline([('poly', PolynomialFeatures(degree = 2)), ('reg', LinearRegression())])

        model.fit(X_train, y_train_trunc)

        y_poly_train_pred = model.predict(X_train)

        y_poly_test_pred = model.predict(X_test)

        # The explained variance must be computed from this model's own
        # predictions, not from whatever prediction arrays an earlier cell
        # left in the notebook namespace.
        DF.append(['quadratic',  m + l,
                   round(model.score(X_train, y_train_trunc), 5),
                   round(model.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_poly_train_pred), 5),
                   round(explained_variance_score(y_test_trunc, y_poly_test_pred), 5)])

    return DF

In [None]:
# append the 'quadratic' rows to DF (the cell output echoes the whole accumulated list)
poly_2(X_train, X_test, y_train, y_test)

In [None]:
# Stochastic Gradient Descent on the single target column k:
# features are standardised before being passed to SGDRegressor
#
# signature, for reference:
# SGDRegressor(loss='squared_error', *, penalty='l2', alpha=0.0001,
#              l1_ratio=0.15, fit_intercept=True, max_iter=1000,
#              tol=0.001, shuffle=True, verbose=0, epsilon=0.1,
#              random_state=None, learning_rate='invscaling', eta0=0.01,
#              power_t=0.25, early_stopping=False, validation_fraction=0.1,
#              n_iter_no_change=5, warm_start=False, average=False)

sgd = make_pipeline(StandardScaler(),
                    SGDRegressor(random_state = random_state)).fit(X_train, y_train_trunc)

y_sgd_train_pred, y_sgd_test_pred = sgd.predict(X_train), sgd.predict(X_test)

# mean squared error
print('Train MSE:', mean_squared_error(y_train_trunc, y_sgd_train_pred))
print('Test MSE:', mean_squared_error(y_test_trunc, y_sgd_test_pred))
print('--------')

# estimator score
print('Train Score:', sgd.score(X_train, y_train_trunc))
print('Test Score:', sgd.score(X_test, y_test_trunc))
print('--------')

# explained variance
print('Train EVS:', explained_variance_score(y_train_trunc, y_sgd_train_pred))
print('Test EVS:', explained_variance_score(y_test_trunc, y_sgd_test_pred))

In [None]:
def stochastic(X_train, X_test, y_train, y_test):
    """Fit one SGD regression per target column and record its scores.

    For every column index ``m`` of ``y_train`` except the last one, a fresh
    pipeline (``StandardScaler`` followed by ``SGDRegressor`` using the
    notebook-level ``random_state``) is fitted on ``X_train`` and evaluated on
    both splits. A row
    ``['sgd', m + l, train score, test score, train EVS, test EVS]``
    (metrics rounded to 5 decimals, ``l`` being the notebook-level offset) is
    appended to the notebook-level list ``DF``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` / ``predict`` / ``score``.
    y_train, y_test : pandas.DataFrame
        Target tables; column ``m`` is selected with ``.iloc[:, m]``.

    Returns
    -------
    list
        The shared ``DF`` list itself (mutated in place); calling the function
        twice therefore appends the rows twice.
    """
    # NOTE(review): the "- 1" skips the last target column although `y` already
    # had its last column dropped when it was built -- confirm this is intended.
    for m in range(y_train.shape[1] - 1):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))

        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        sgd = make_pipeline(StandardScaler(), SGDRegressor(random_state = random_state))

        sgd.fit(X_train, y_train_trunc)

        y_sgd_train_pred = sgd.predict(X_train)

        y_sgd_test_pred = sgd.predict(X_test)

        # Score the pipeline fitted above and use its own predictions; the
        # metrics must not come from a different estimator or from prediction
        # arrays left in the notebook namespace by an earlier cell.
        DF.append(['sgd',  m + l,
                   round(sgd.score(X_train, y_train_trunc), 5),
                   round(sgd.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_sgd_train_pred), 5),
                   round(explained_variance_score(y_test_trunc, y_sgd_test_pred), 5)])

    return DF

In [None]:
# append the 'sgd' rows to DF (the cell output echoes the whole accumulated list)
stochastic(X_train, X_test, y_train, y_test)

## Support Vector Machines

In [None]:
# Support vector regression with a linear kernel
#
# signature, for reference:
# SVR(*, kernel='rbf', degree=3, gamma='scale', coef0=0.0,
#    tol=0.001, C=1.0, epsilon=0.1, shrinking=True,
#    cache_size=200, verbose=False, max_iter=-1)

def sv_reg(X_train, X_test, y_train, y_test):
    """Fit one linear-kernel SVR per target column and record its scores.

    For every column index ``m`` of ``y_train`` except the last one, a fresh
    ``SVR(kernel='linear')`` is fitted on ``X_train`` and evaluated on both
    splits. A row
    ``['svr', m + l, train score, test score, train EVS, test EVS]``
    (metrics rounded to 5 decimals, ``l`` being the notebook-level offset) is
    appended to the notebook-level list ``DF``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` / ``predict`` / ``score``.
    y_train, y_test : pandas.DataFrame
        Target tables; column ``m`` is selected with ``.iloc[:, m]``.

    Returns
    -------
    list
        The shared ``DF`` list itself (mutated in place); calling the function
        twice therefore appends the rows twice.
    """
    # NOTE(review): the "- 1" skips the last target column although `y` already
    # had its last column dropped when it was built -- confirm this is intended.
    for m in range(y_train.shape[1] - 1):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))

        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        svr = SVR(kernel = 'linear')

        svr.fit(X_train, y_train_trunc)

        y_pred_train = svr.predict(X_train)

        y_pred_test = svr.predict(X_test)

        DF.append(['svr', m + l,
                   round(svr.score(X_train, y_train_trunc), 5),
                   round(svr.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_pred_train), 5),
                   round(explained_variance_score(y_test_trunc, y_pred_test), 5)])

    return DF

In [None]:
# append the 'svr' rows to DF (the cell output echoes the whole accumulated list)
sv_reg(X_train, X_test, y_train, y_test)

In [None]:
# Support vector regression with a degree-2 polynomial kernel
#
# signature, for reference:
# SVR(*, kernel='rbf', degree=3, gamma='scale', coef0=0.0,
#    tol=0.001, C=1.0, epsilon=0.1, shrinking=True,
#    cache_size=200, verbose=False, max_iter=-1)

def sv2_reg(X_train, X_test, y_train, y_test):
    """Fit one quadratic-kernel SVR per target column and record its scores.

    For every column index ``m`` of ``y_train`` except the last one, a fresh
    ``SVR(kernel='poly', degree=2, gamma='auto')`` is fitted on ``X_train``
    and evaluated on both splits. A row
    ``['svr2', m + l, train score, test score, train EVS, test EVS]``
    (metrics rounded to 5 decimals, ``l`` being the notebook-level offset) is
    appended to the notebook-level list ``DF``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices passed to ``fit`` / ``predict`` / ``score``.
    y_train, y_test : pandas.DataFrame
        Target tables; column ``m`` is selected with ``.iloc[:, m]``.

    Returns
    -------
    list
        The shared ``DF`` list itself (mutated in place); calling the function
        twice therefore appends the rows twice.
    """
    # NOTE(review): the "- 1" skips the last target column although `y` already
    # had its last column dropped when it was built -- confirm this is intended.
    for m in range(y_train.shape[1] - 1):

        y_train_trunc = np.ravel(np.array(list(y_train.iloc[:, m])))

        y_test_trunc = np.ravel(np.array(list(y_test.iloc[:, m])))

        svr2 = SVR(kernel = 'poly', degree = 2, gamma = 'auto')

        svr2.fit(X_train, y_train_trunc)

        y_pred_train = svr2.predict(X_train)

        y_pred_test = svr2.predict(X_test)

        DF.append(['svr2', m + l,
                   round(svr2.score(X_train, y_train_trunc), 5),
                   round(svr2.score(X_test, y_test_trunc), 5),
                   round(explained_variance_score(y_train_trunc, y_pred_train), 5),
                   round(explained_variance_score(y_test_trunc, y_pred_test), 5)])

    return DF

In [None]:
# append the 'svr2' rows to DF (the cell output echoes the whole accumulated list)
sv2_reg(X_train, X_test, y_train, y_test)

## Table

In [None]:
# Collect every result row into one table.
# The frame is built directly from the list of rows: routing it through
# np.array would coerce the mixed str/number rows to strings, leaving 'k' and
# the four metric columns as text. Built this way they stay numeric (so they
# can be sorted, filtered and aggregated), and an empty DF still yields an
# empty table with the right columns.
DF2 = pd.DataFrame(DF, columns = ['Method', 'k',
                                  'Train_Score', 'Test_Score',
                                  'Train_EVS', 'Test_EVS'])

In [None]:
# display the results table
DF2

In [None]:
# save the results table as a CSV file in the current working directory
DF2.to_csv('Regression--Subset--Uniform_Sampling.csv')