In [1]:
%matplotlib inline
import os.path
import sys
import time

import matplotlib.pyplot as plt
import numpy
import pandas
import sklearn
import sklearn.ensemble

sys.path.insert(0, '..')
import emtrees

In [2]:
from sklearn import model_selection, metrics, datasets

seed = 1  # shared random_state for dataset generation and the train/test split


def get_data(n_samples=200000, n_features=6, test_size=0.5):
    """Generate a synthetic classification dataset and split it into train/test sets.

    The dataset comes from ``datasets.make_classification`` and the split from
    ``model_selection.train_test_split``; both use the module-level ``seed`` as
    their ``random_state``.

    Parameters
    ----------
    n_samples : int
        Total number of samples to generate (before splitting).
    n_features : int
        Number of feature columns to generate.
    test_size : float or int
        Passed through to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    X, Y = datasets.make_classification(
        n_features=n_features,
        n_samples=n_samples,
        random_state=seed,
    )

    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=test_size, random_state=seed
    )

    return X_train, X_test, Y_train, Y_test


# NOTE(review): `metric` is not referenced anywhere in this cell; kept in case
# another cell reads it.
metric = 'average_precision'

# Hyperparameters applied identically to both forests so the comparison is fair.
parameters = {
    'n_estimators': 100,
    'min_samples_split': 0.01,
}

X_train, X_test, Y_train, Y_test = get_data()

# Seed the sklearn forest so the reported accuracy is reproducible across runs.
sk = sklearn.ensemble.RandomForestClassifier(random_state=seed, **parameters)
sk.fit(X_train, Y_train)
# NOTE(review): emtrees.RandomForest is left unseeded because its constructor
# signature is not visible from this notebook -- pass the seed if it is supported.
em = emtrees.RandomForest(**parameters)
em.fit(X_train, Y_train)

print('Instances', X_test.shape)
tests = { 'sklearn': sk, 'emtrees': em }
for name, estimator in tests.items():

    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # short durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    # Named Y_pred (not `pred`) so it cannot be confused with, or overwritten by,
    # the `pred()` profiling helper defined in a later cell.
    Y_pred = estimator.predict(X_test)
    end = time.perf_counter()

    a = metrics.accuracy_score(Y_test, Y_pred)
    print('accuracy', a)
    # Sanity check: a timing comparison is meaningless if a model is broken.
    assert a > 0.90, '{} accuracy {} is not above 0.90'.format(name, a)
    t = end - start
    print('{}: {} seconds'.format(name, t))




Instances (100000, 6)
accuracy 0.94856
sklearn: 3.02668833732605 seconds




accuracy 0.94588
emtrees: 1.7071821689605713 seconds


In [3]:
# Ratio between the lengths of the first two entries of the fitted forest_ attribute.
# NOTE(review): presumably forest_[0] is the flattened node list and forest_[1] the
# per-tree roots, which would make this the average node count per tree -- confirm
# against the emtrees implementation.
len(em.forest_[0])/len(em.forest_[1])

170.06

In [4]:
%load_ext line_profiler
def pred():
    """Run ``em.predict`` on ``X_test`` and discard the predictions.

    Zero-argument entry point so the profiler has a single call to execute.
    """
    _ = em.predict(X_test)
    return None

In [5]:
# Line-by-line profile of em.predict while executing pred(); relies on the
# line_profiler extension loaded in the previous cell.
%lprun -f em.predict pred()

