In [1]:
import glob
from IPython.display import display
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
iris = load_iris()
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.tree import _tree, DecisionTreeClassifier
from sklearn import tree

import numpy as np
import pandas as pd
from pathlib import Path
import random
from scipy import stats
from scipy.io.arff import loadarff

from nodes import Node, GaussianLeaf
from spn import SPN
from learning import LearnSPN
from utils import learncats
from trees import Tree, RandomForest
from sktrees import SKRandomForest, tree2spn

%load_ext autoreload
%autoreload 2

In [2]:
def print_spn(node):
    node.print()
    for child in node.children:
        print_spn(child)

In [3]:
def get_dummies(data):
    for col in data.columns:
        data[col] = pd.Categorical(data[col])
        data[col] = data[col].cat.codes
    return data

In [61]:
def fp(y, y_pred):
    mask = y_pred >= 0
    return np.mean(y_pred[mask] != y[mask])

In [62]:
def fn(y, y_pred):
    mask = y_pred <= 0
    return np.mean(y_pred[mask] != y[mask])

In [69]:
def tp(y, y_pred):
    mask = y_pred >= 0
    return np.mean(y_pred[mask] == y[mask])

In [66]:
def tn(y, y_pred):
    mask = y_pred <= 0
    return np.mean(y_pred[mask] == y[mask])

## Outlier Detection

In [97]:
data = pd.read_csv('../data/csv/winequality_red.csv').values
data[:, -1] -= 3
X, y = data[:, :-1], data[:, -1]

In [102]:
inliers = data[(y == 1) + (y == 2) + (y == 3) + (y == 4)]
outliers = data[(y == 0) + (y == 5)]

In [104]:
shuffle = np.random.choice(np.arange(inliers.shape[0]), inliers.shape[0], replace=False)
train_data = inliers[shuffle[:int(.7*inliers.shape[0])], :]
test_data = inliers[shuffle[int(.7*inliers.shape[0]):], :]

In [105]:
X_train, y_train = train_data[:, :-1], train_data[:, -1]
X_test, y_test = test_data[:, :-1], test_data[:, -1]

In [108]:
rf = RandomForest(n_estimators=10)
rf.fit(X_train, y_train)
spn = rf.tospn()

In [110]:
y_pred, logprs = spn.classify(X_test, classcol=-1, return_prob=True)

In [114]:
oy_pred, ologprs = spn.classify(outliers[:, :-1], classcol=-1, return_prob=True)

In [117]:
print(max(logprs))
print(np.mean(logprs))
print(min(logprs))

37.484327105089356
1.5122587053099386
-49.77613637308498


In [116]:
print(max(ologprs))
print(np.mean(ologprs))
print(min(ologprs))

4.703242020121373
-7.106883801310878
-26.718932610181948


In [118]:
isof = IsolationForest(n_estimators=100)
y_pred = isof.fit_predict(data)

In [119]:
y_true = np.ones(y.shape)
y_true[(y == 0) + (y == 5)] = -1

In [120]:
tp(y_true, y_pred)

0.9917469050894085

In [121]:
fp(y_true, y_pred)

0.008253094910591471

In [122]:
tn(y_true, y_pred)

0.1103448275862069

In [123]:
fn(y_true, y_pred)

0.8896551724137931

In [76]:
rf = RandomForest(n_estimators=10)
rf.fit(X, y)

## Playing with SPN Code

In [296]:
file = '../data/arff/breast-cancer.arff'
data = pd.DataFrame(loadarff(file)[0])
data = get_dummies(data).values
X = data[:, :-1]
y = data[:, -1]

In [458]:
data = pd.read_csv('../data/csv/winequality_red.csv').values
X = data[:, :-1]
y = data[:, -1]-3

In [449]:
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [450]:
tree.ncat

array([1., 1., 1., 1., 3.])

In [460]:
tree = Tree(criterion='gini')
tree.fit(X, y)

In [423]:
for i in range(tree.n_nodes_):
    print(tree.feature[i], tree.threshold[i])

3 0.8
-2 -2
3 1.75
2 4.95
3 1.55
-2 -2
-2 -2
3 1.55
-2 -2
-2 -2
2 4.85
-2 -2
-2 -2


In [462]:
np.mean(tree.predict(X)==y)

1.0

In [433]:
sktree.tree_.value

array([[[50., 50., 50.]],

       [[50.,  0.,  0.]],

       [[ 0., 50., 50.]],

       [[ 0., 49.,  5.]],

       [[ 0., 47.,  1.]],

       [[ 0., 47.,  0.]],

       [[ 0.,  0.,  1.]],

       [[ 0.,  2.,  4.]],

       [[ 0.,  0.,  3.]],

       [[ 0.,  2.,  1.]],

       [[ 0.,  2.,  0.]],

       [[ 0.,  0.,  1.]],

       [[ 0.,  1., 45.]],

       [[ 0.,  1.,  2.]],

       [[ 0.,  0.,  2.]],

       [[ 0.,  1.,  0.]],

       [[ 0.,  0., 43.]]])

In [425]:
np.mean(tree.predict(iris.data)==iris.target)

1.0

In [427]:
sktree = DecisionTreeClassifier()
sktree.fit(iris.data, iris.target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [444]:
for i in range(sktree.tree_.node_count):
    print(sktree.tree_.feature[i], sktree.tree_.threshold[i])

3 0.800000011920929
-2 -2.0
3 1.75
2 4.950000047683716
3 1.6500000357627869
-2 -2.0
-2 -2.0
3 1.550000011920929
-2 -2.0
0 6.949999809265137
-2 -2.0
-2 -2.0
2 4.8500001430511475
1 3.100000023841858
-2 -2.0
-2 -2.0
-2 -2.0


In [416]:
values = np.array([1, 2, 3])
((values + np.roll(values, 1))/2)[1:]

array([1.5, 2.5])

In [379]:
sktree.tree_.value

array([[[ 10.,  53., 681., 638., 199.,  18.]],

       [[  7.,  29., 519., 260.,  25.,   2.]],

       [[  4.,  19., 262.,  66.,   2.,   0.]],

       ...,

       [[  0.,   0.,   0.,   3.,   0.,   3.]],

       [[  0.,   0.,   0.,   3.,   0.,   0.]],

       [[  0.,   0.,   0.,   0.,   0.,   3.]]])

In [370]:
np.mean(tree.predict(X) == y)

0.8986866791744841

In [364]:
tree.value

array([[ 10.,  53., 681., 638., 199.,  18.],
       [  7.,  34., 575., 328.,  37.,   2.],
       [  3.,  19., 235.,  62.,   1.,   0.],
       ...,
       [  0.,   0.,   2.,   3.,   0.,   0.],
       [  0.,   0.,   0.,   3.,   0.,   0.],
       [  0.,   0.,   2.,   0.,   0.,   0.]])

In [361]:
y

array([2., 2., 2., ..., 3., 2., 3.])

In [358]:
X[:, :].shape

(1599, 11)

In [283]:
a, b = np.unique([1, 1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4], return_counts=True)

In [286]:
np.sum(np.square(b/np.sum(b)))

0.3984375

In [264]:
RF.fit(iris.data, iris.target)

In [295]:
np.mean(tree.predict(X) == y)

0.8849280800500313

In [186]:
spn = tree2spn(tree, iris.data, iris.target)

In [157]:
tree.children_left

array([ 1., -2.,  3.,  4., -2.,  6., -2., -2., -2.])

In [158]:
tree.children_right

array([ 2., -2.,  8.,  5., -2.,  7., -2., -2., -2.])

In [187]:
np.mean(tree.predict(iris.data) == iris.target)

0.98

In [188]:
tree = DecisionTreeClassifier()
tree.fit(iris.data, iris.target)
np.mean(tree.predict(iris.data) == iris.target)
#spn = tree2spn(tree, iris.data, iris.target)

1.0

In [50]:
left.shape

(150,)

In [5]:
print_spn(spn.root) # (Depth) (Node type) s: (Scope) w: (Weights) n:(Number of datapoints) (Node ID)

 0 S s:  [0, 1, 2, 3, 4]  w:  [0.33552632 0.66447368]  n:  151 4690910
  1 P s:  [0, 1, 2, 3, 4]  n:  51 9024279
   2 L s:  [2] leq value:  2.449999988079071  n:  51 9968185
   2 P s:  [0, 1, 2, 3, 4]  n:  51 2424872
    3 G s:  [0]  n:  51 347081
    3 G s:  [1]  n:  51 5047867
    3 G s:  [2]  n:  51 4690348
    3 G s:  [3]  n:  51 2873827
    3 S s:  [4]  w:  [0.92857143 0.03571429 0.03571429]  n:  51 8392112
     4 L s:  [4] eq value:  0.0  n:  52 8442205
     4 L s:  [4] eq value:  1.0  n:  2 9419671
     4 L s:  [4] eq value:  2.0  n:  2 230303
  1 P s:  [0, 1, 2, 3, 4]  n:  101 8098566
   2 L s:  [2] g value:  2.449999988079071  n:  101 882936
   2 S s:  [0, 1, 2, 3, 4]  w:  [0.53921569 0.46078431]  n:  101 221386
    3 P s:  [0, 1, 2, 3, 4]  n:  55 7215363
     4 L s:  [3] leq value:  1.75  n:  55 1972496
     4 S s:  [0, 1, 2, 3, 4]  w:  [0.875 0.125]  n:  55 1892854
      5 P s:  [0, 1, 2, 3, 4]  n:  49 4035176
       6 L s:  [2] leq value:  4.950000047683716  n:  49 4361852


In [6]:
# Check if they are equivalent
# spn.classify returns the classes at position 0 and the log-probabilities at position 1
# classcol is needed to retrieve the number of classes internally (bad design, working on it)
tree.predict(iris.data) == spn.classify(iris.data, classcol=4)[0]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

## Playing with Custom Random Forest -> SPN Code

In [25]:
sktree = SKRandomForest(n_estimators=10)
sktree.fit(iris.data, iris.target)
spn = sktree.tospn()
rf = sktree.torf()

In [26]:
rf.predict(iris.data)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [8]:
sktree.predict(iris.data, vote=False) # Returns probabilities

array([[1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. 

In [9]:
# The spn should return less skewed results as it models the joint
spn.probs(iris.data, classcol=4) # Returns probabilities

array([[2.90181711e+00, 1.08051054e-01, 1.08051054e-01],
       [1.41976400e+00, 5.14103543e-02, 5.14103543e-02],
       [1.15055612e+00, 4.14508709e-02, 4.14508709e-02],
       [1.03446931e+00, 3.72884200e-02, 3.72884200e-02],
       [2.70597500e+00, 1.00716527e-01, 1.00716527e-01],
       [1.12511194e-01, 4.26768517e-03, 4.26768517e-03],
       [1.45750920e+00, 5.27770328e-02, 5.27770328e-02],
       [3.05318406e+00, 1.13477398e-01, 1.13477398e-01],
       [2.68780258e-01, 9.50510980e-03, 9.50510980e-03],
       [8.08917412e-01, 2.91256117e-02, 2.91256117e-02],
       [1.38780833e+00, 5.28240835e-02, 5.28240835e-02],
       [1.75727467e+00, 6.49650209e-02, 6.49650209e-02],
       [5.48036906e-01, 1.95006511e-02, 1.95006511e-02],
       [1.26078700e-02, 4.35667687e-04, 4.35667687e-04],
       [2.96050780e-02, 1.11698353e-03, 1.11698353e-03],
       [1.06108045e-02, 4.09173223e-04, 4.09173223e-04],
       [2.06612605e-01, 7.66789763e-03, 7.66789763e-03],
       [2.79313056e+00, 1.03865

## Analysing a scikit-learn Random Forest

In [10]:
# Run cross-validation on a tree
def tree_cv(data, n_folds, classcol=None, stratify=True, seed=42):
    n, m = data.shape[0], data.shape[1]  # Number of rows and columns
    np.random.seed(seed)

    if classcol is None:
        classcol = m-1
        print("Taking last column as the class variable.")

    if n/n_folds < 30:
        n_folds = np.floor(n/30).astype(int)
        print("Too many folds for this dataset. Using %d " % n_folds)

    folds = np.zeros(n)
    if stratify:
        for c in np.unique(data[ :, classcol]):
            nn = np.sum(data[ :, classcol] == c)
            ind = np.tile(np.arange(n_folds), int(np.ceil(nn/n_folds)))[:nn]
            folds[data[:, classcol] == c] = np.random.choice(ind, nn, replace=False)
    else:
        ind = np.tile(np.arange(n_folds), int(np.ceiling(n/nfolds)))[:n]
        folds = np.random.choice(ind, n, replace=False)

    y, y_pred = np.array([]), np.array([])
    for fold in range(n_folds):
        train_data = data[np.where(folds!=fold)[0], :].copy()
        test_data = data[np.where(folds==fold)[0], :].copy()
        mask = np.delete(np.arange(m), classcol)
        X_test = test_data[:, mask]
        y_test = test_data[:, classcol]
        clf = Tree(classcol)
        clf.fit(train_data)
        y_pred = np.append(y_pred, clf.classify(X_test))
        y = np.append(y, y_test)
    return y_pred, y

In [11]:
# Run cross-validation on a scikit-learn Random Forest
def rf_cv(data, n_folds, classcol=None, stratify=True, seed=42, n_estimators=10, max_depth=None):
    n, m = data.shape[0], data.shape[1]  # Number of rows and columns
    np.random.seed(seed)

    if classcol is None:
        classcol = m-1
        print("Taking last column as the class variable.")
        
    if max_depth is None:
        max_depth = data.shape[1]-1

    if n/n_folds < 30:
        n_folds = np.floor(n/30).astype(int)
        print("Too many folds for this dataset. Using %d " % n_folds)

    folds = np.zeros(n)
    if stratify:
        for c in np.unique(data[ :, classcol]):
            nn = np.sum(data[ :, classcol] == c)
            ind = np.tile(np.arange(n_folds), int(np.ceil(nn/n_folds)))[:nn]
            folds[data[:, classcol] == c] = np.random.choice(ind, nn, replace=False)
    else:
        ind = np.tile(np.arange(n_folds), int(np.ceiling(n/nfolds)))[:n]
        folds = np.random.choice(ind, n, replace=False)

    y, y_pred = np.array([]), np.array([])
    for fold in range(n_folds):
        train_data = data[np.where(folds!=fold)[0], :]
        test_data = data[np.where(folds==fold)[0], :]
        mask = np.delete(np.arange(m), classcol)
        X_train, X_test = train_data[:, mask], test_data[:, mask]
        y_train, y_test = train_data[:, classcol], test_data[:, classcol]
        clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
        clf.fit(X_train, y_train)
        y_pred = np.append(y_pred, clf.predict(X_test).astype(int))
        y = np.append(y, y_test)
    return y_pred, y

In [12]:
# Run Cross-validation on custom Random Forest wrapper
def sk_rf_cv(data, n_folds, classcol=None, stratify=True, seed=42, n_estimators=10, max_depth=None):
    n, m = data.shape[0], data.shape[1]  # Number of rows and columns
    np.random.seed(seed)

    if classcol is None:
        classcol = m-1
        print("Taking last column as the class variable.")
        
    if max_depth is None:
        max_depth = data.shape[1]-1

    if n/n_folds < 30:
        n_folds = np.floor(n/30).astype(int)
        print("Too many folds for this dataset. Using %d " % n_folds)

    folds = np.zeros(n)
    if stratify:
        for c in np.unique(data[ :, classcol]):
            nn = np.sum(data[ :, classcol] == c)
            ind = np.tile(np.arange(n_folds), int(np.ceil(nn/n_folds)))[:nn]
            folds[data[:, classcol] == c] = np.random.choice(ind, nn, replace=False)
    else:
        ind = np.tile(np.arange(n_folds), int(np.ceiling(n/nfolds)))[:n]
        folds = np.random.choice(ind, n, replace=False)

    y, y_pred, y_pred_2 = np.array([]), np.array([]), None
    for fold in range(n_folds):
        train_data = data[np.where(folds!=fold)[0], :]
        test_data = data[np.where(folds==fold)[0], :]
        mask = np.delete(np.arange(m), classcol)
        X_train, X_test = train_data[:, mask], test_data[:, mask]
        y_train, y_test = train_data[:, classcol], test_data[:, classcol]
        clf = SKRandomForest(n_estimators=n_estimators)
        clf.fit(X_train, y_train)
        spn = clf.tospn()
        y_pred = np.append(y_pred, np.argmax(clf.predict(X_test, vote=False), axis=1).astype(int))
        if y_pred_2 is None:
            y_pred_2 = spn.classify(X_test, classcol=X_test.shape[1])[0]
        else:
            y_pred_2 = np.append(y_pred_2, spn.classify(X_test, classcol=X_test.shape[1])[0])
        y = np.append(y, y_test)
    return y_pred, y_pred_2, y

In [13]:
datasets = ['colic',
            'car',
            'breast-cancer',
            'heart-h',
            'bridges_version1',
            'ecoli',
            'autos',
            'liver-disorders',
            'cmc',
            'diabetes',
            'tic-tac-toe',
            'solar-flare_2',
            'zoo',
            'flags',
            'lymph',
            'segment',
            'hypothyroid',
            'vowel',
            'soybean',
            'vehicle',
            'dermatology',
            'spambase',
            'sick',
            'balance-scale',
            'nursery']

## Run Comparison with single train-test split
#### (Faster results)

In [32]:
Acc = pd.DataFrame(columns=['Dataset', 
                            'Number of Instances', 
                            'Number of Variables', 
                            'Number of Classes',
                            'RandomForest',
                            'Derived SPN'])
datafiles = glob.glob('../data/arff/*.{}'.format('arff'))
for file in datafiles:
    d = {}
    name = Path(file).stem
    if name in datasets:
        print(name)
        n_trees = 10
        data = pd.DataFrame(loadarff(file)[0])
        data = get_dummies(data).values
        d['Dataset'] = name
        d['Number of Instances'] = data.shape[0]
        d['Number of Variables'] = data.shape[1]
        d['Number of Classes'] = len(np.unique(data[:, -1]))
        _, counts = np.unique(data[:, -1], return_counts=True)
        
        shuffle = np.random.choice(np.arange(data.shape[0]), data.shape[0], replace=False)
        train_data = data[shuffle[:int(.7*data.shape[0])], :]
        test_data = data[shuffle[int(.7*data.shape[0]):], :]

        clf = SKRandomForest(n_estimators=2)
        clf.fit(train_data[:, :-1], train_data[:, -1])
        y_pred = np.argmax(clf.predict(test_data[:, :-1], vote=False), axis=1).astype(int)
        
        spn = clf.tospn()
        y_pred_2 = spn.classify(test_data[:, :-1], classcol=data.shape[1]-1)[0]
        
        y = test_data[:, -1]
        
        d['RandomForest'] = np.mean(y_pred == y)
        d['Derived SPN'] = np.mean(y_pred_2 == y)
        Acc = Acc.append(d, ignore_index=True)

colic
ecoli
breast-cancer
flags
heart-h
solar-flare_2
zoo
tic-tac-toe
lymph
liver-disorders
nursery
bridges_version1
hypothyroid
segment
autos
sick
vehicle
spambase
vowel
car
dermatology
balance-scale
cmc
soybean
diabetes


In [41]:
Acc

Unnamed: 0,Dataset,Number of Instances,Number of Variables,Number of Classes,RandomForest,Derived SPN
0,colic,368,23,2,0.738739,0.774775
1,ecoli,336,6,8,0.712871,0.732673
2,breast-cancer,286,10,2,0.639535,0.627907
3,flags,194,29,8,0.440678,0.474576
4,heart-h,294,12,2,0.752809,0.752809
5,solar-flare_2,1066,12,6,0.715625,0.696875
6,zoo,101,17,7,0.870968,0.967742
7,tic-tac-toe,958,10,2,0.850694,0.822917
8,lymph,148,18,4,0.688889,0.755556
9,liver-disorders,345,7,2,0.586538,0.557692


## Run 5-fold cross-validation

In [44]:
Acc = pd.DataFrame(columns=['Dataset', 
                            'Number of Instances', 
                            'Number of Variables', 
                            'Number of Classes',
                            'RandomForest',
                            'Derived SPN'])
datafiles = glob.glob('../data/arff/*.{}'.format('arff'))
for file in datafiles:
    d = {}
    name = Path(file).stem
    if name in datasets:
        print(name)
        n_trees = 10
        data = pd.DataFrame(loadarff(file)[0])
        data = get_dummies(data).values
        d['Dataset'] = name
        d['Number of Instances'] = data.shape[0]
        d['Number of Variables'] = data.shape[1]
        d['Number of Classes'] = len(np.unique(data[:, -1]))
        _, counts = np.unique(data[:, -1], return_counts=True)
        
        y_pred, y_pred_2, y = sk_rf_cv(data.copy(), n_folds=5, n_estimators=n_trees)
        d['RandomForest'] = np.mean(y_pred == y)
        d['Derived SPN'] = np.mean(y_pred_2 == y)
        Acc = Acc.append(d, ignore_index=True)

colic
Taking last column as the class variable.
ecoli
Taking last column as the class variable.
breast-cancer
Taking last column as the class variable.
flags
Taking last column as the class variable.
heart-h
Taking last column as the class variable.
solar-flare_2
Taking last column as the class variable.
zoo
Taking last column as the class variable.
Too many folds for this dataset. Using 3 
tic-tac-toe
Taking last column as the class variable.
lymph
Taking last column as the class variable.
Too many folds for this dataset. Using 4 
liver-disorders
Taking last column as the class variable.
nursery
Taking last column as the class variable.
bridges_version1
Taking last column as the class variable.
Too many folds for this dataset. Using 3 
hypothyroid
Taking last column as the class variable.
segment
Taking last column as the class variable.
autos
Taking last column as the class variable.
sick
Taking last column as the class variable.
vehicle
Taking last column as the class variable.
spam

In [45]:
Acc

Unnamed: 0,Dataset,Number of Instances,Number of Variables,Number of Classes,RandomForest,Derived SPN
0,colic,368,23,2,0.820652,0.842391
1,ecoli,336,6,8,0.729167,0.72619
2,breast-cancer,286,10,2,0.699301,0.716783
3,flags,194,29,8,0.623711,0.603093
4,heart-h,294,12,2,0.795918,0.782313
5,solar-flare_2,1066,12,6,0.728893,0.730769
6,zoo,101,17,7,0.950495,0.930693
7,tic-tac-toe,958,10,2,0.934238,0.911273
8,lymph,148,18,4,0.790541,0.797297
9,liver-disorders,345,7,2,0.631884,0.64058
