In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.datasets import make_classification

from classification_random_forest import MyForestClf

In [2]:
df = pd.read_csv('https://archive.ics.uci.edu/static/public/267/banknote+authentication.zip', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']
X, y = df.iloc[:,:4], df['target']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
X_train.shape, X_test.shape

((1097, 4), (275, 4))

In [5]:
forest = MyForestClf()
forest

MyForestClf class: n_estimators=10, max_features=0.5, max_samples=0.5, max_depth=5, min_samples_split=2, max_leaves=20, bins=16, criterion=entropy, random_state=42

In [6]:
forest.fit(X_train, y_train)

In [7]:
forest.fi

{'variance': 1.4012194865629808,
 'skewness': 0.9702719291847656,
 'curtosis': 0.3862815599252041,
 'entropy': 0.23195695093785698}

In [8]:
preds = forest.predict(X_test, type='vote')
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0])

In [9]:
confusion_matrix(y_test, preds)

array([[143,   5],
       [ 10, 117]])

In [10]:
forest_2 = MyForestClf(20, 0.5, 0.5, 7, 10, 20, criterion='gini', oob_score='roc_auc')

In [11]:
forest_2.fit(X_train, y_train)

In [12]:
preds_2 = forest_2.predict(X_test, type='mean')

In [13]:
forest_2.fi

{'variance': 1.1923838892904595,
 'skewness': 1.2125853164350848,
 'curtosis': 0.5356387583591643,
 'entropy': 0.26978705604785136}

In [14]:
forest_2.oob_score_

0.9890916570565346

In [15]:
confusion_matrix(y_test, preds_2)

array([[144,   4],
       [ 16, 111]])

In [16]:
X, y = make_classification(n_samples=1000, n_features=14, n_informative=10, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]
X_test = X.sample(n=150, random_state = 42)
y_test = y.loc[X_test.index]

In [17]:
forest_3 = MyForestClf(n_estimators=100, bins=10, oob_score='roc_auc')

In [18]:
forest_3.fit(X, y)

In [19]:
forest_3.fi

{'col_0': 1.9569621433565227,
 'col_1': 0.23353414853761553,
 'col_2': 1.4293833056958798,
 'col_3': 2.0683896591620323,
 'col_4': 4.595902877088849,
 'col_5': 2.549618134952868,
 'col_6': 1.883909530419694,
 'col_7': 1.8069883446624209,
 'col_8': 1.3971425401628097,
 'col_9': 1.066787919590144,
 'col_10': 2.1637151869399722,
 'col_11': 0.7858758075118963,
 'col_12': 0.1957449627469463,
 'col_13': 3.367344565040658}

In [20]:
forest_3.oob_score_

0.9679358717434869

In [21]:
%%timeit
forest_3 = MyForestClf(100, bins=10, oob_score = 'roc_auc')
forest_3.fit(X, y)

1min 7s ± 1.55 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
from classification_multiprocess_random_forest import MultiProcessForestClf

In [23]:
forest_4 = MultiProcessForestClf(n_estimators=100, bins=10, oob_score='roc_auc')

In [24]:
forest_4.fit(X, y)

In [25]:
forest_4.fi == forest_3.fi

True

In [26]:
forest_4.oob_score_

0.9679358717434869

In [27]:
%%timeit
forest_4 = MultiProcessForestClf(100, bins=10, oob_score = 'roc_auc')
forest_4.fit(X, y)

33.9 s ± 5.68 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
preds = forest_4.predict(X_test, type='vote')

In [30]:
confusion_matrix(y_test, preds)

array([[68,  3],
       [ 2, 77]])