In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.datasets import make_classification

from classification_tree import MyTreeClf

In [2]:
df = pd.read_csv('https://archive.ics.uci.edu/static/public/267/banknote+authentication.zip', header=None)
df.columns = ['variance', 'skewness', 'curtosis', 'entropy', 'target']
X, y = df.iloc[:,:4], df['target']

In [4]:
X.head()

Unnamed: 0,variance,skewness,curtosis,entropy
0,3.6216,8.6661,-2.8073,-0.44699
1,4.5459,8.1674,-2.4586,-1.4621
2,3.866,-2.6383,1.9242,0.10645
3,3.4566,9.5228,-4.0112,-3.5944
4,0.32924,-4.4552,4.5718,-0.9888


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
X_train.shape, X_test.shape

((1097, 4), (275, 4))

In [7]:
tree = MyTreeClf()
tree.fit(X_train, y_train)

In [8]:
tree.print_tree()

variance 0.320165
 skewness 5.86535
  curtosis 3.0642
   leaf_left 1.0
   skewness -1.81995
    variance -0.651195
     leaf_left 1.0
     leaf_right 0.6
    variance -2.15635
     leaf_left 0.6666666666666666
     leaf_right 0.0
  variance -3.4448999999999996
   curtosis 2.1114
    leaf_left 1.0
    leaf_right 0.0
   leaf_right 0.0
 variance 2.2354000000000003
  curtosis -2.2721999999999998
   skewness 6.41995
    leaf_left 1.0
    leaf_right 0.0
   entropy 0.22994
    variance 0.42002
     leaf_left 0.07142857142857142
     leaf_right 0.0
    curtosis 0.28118
     leaf_left 0.8181818181818182
     leaf_right 0.034482758620689655
  leaf_right 0.0


In [15]:
preds = tree.predict(X_test)

In [16]:
preds = np.array(preds)

In [17]:
preds = np.where(preds>0.5, 1, 0)
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0])

In [18]:
confusion_matrix(y_test, preds)

array([[147,   1],
       [  5, 122]])

In [21]:
tree2 = MyTreeClf(5, 10, 20, criterion='gini')
tree2.fit(X_train, y_train)

In [22]:
tree2

MyTreeClf class: max_depth=5, min_samples_split=10, max_leaves=20

In [23]:
tree2.print_tree()

variance 0.320165
 skewness 7.76395
  variance -0.458565
   curtosis 6.21865
    skewness 7.2249
     leaf_left 1.0
     leaf_right 0.8
    skewness -4.6745
     leaf_left 1.0
     leaf_right 0.08333333333333333
   curtosis 2.62465
    skewness 5.45355
     leaf_left 1.0
     leaf_right 0.0
    leaf_right 0.0
  variance -4.726
   leaf_left 1.0
   leaf_right 0.0
 curtosis -4.45585
  variance 3.22215
   leaf_left 1.0
   leaf_right 0.0
  variance 1.5652
   curtosis -2.2721999999999998
    skewness 5.6574
     leaf_left 1.0
     leaf_right 0.0
    entropy 0.22994
     leaf_left 0.009433962264150943
     leaf_right 0.45454545454545453
   variance 2.0388
    curtosis -2.6483499999999998
     leaf_left 0.75
     leaf_right 0.0
    leaf_right 0.0


In [24]:
preds_2 = tree2.predict(X_test)

In [25]:
preds_2 = np.array(preds_2)
preds_2 = np.where(preds_2>0.5, 1, 0)
preds_2

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0])

In [26]:
confusion_matrix(y_test, preds_2)

array([[148,   0],
       [  9, 118]])

In [44]:
X, y = make_classification(n_samples=1000, n_features=14, n_informative=10, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)
X.columns = [f'col_{col}' for col in X.columns]
X_test = X.sample(n=150, random_state = 42)
y_test = y.loc[X_test.index]

In [45]:
X_test.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13
521,-7.076086,-0.45326,-0.636779,2.767567,0.487817,4.505408,1.827859,3.830747,-2.269701,2.164781,-2.407617,0.539655,1.71844,-1.616646
737,5.707872,0.003583,0.420771,0.839401,5.012524,-1.403803,0.203846,1.354653,4.154947,-1.576621,1.201844,0.772307,-0.194664,0.992817
740,4.970085,-0.125819,-0.250745,-2.620766,1.345045,1.875148,-4.563262,2.697279,3.098685,-1.75191,4.050941,-1.315866,-1.073091,1.157727
660,1.429822,0.915654,-2.512073,1.38145,0.881182,-0.361891,-0.058583,-0.422694,4.356675,-2.311557,-1.865138,4.473568,1.186416,0.180454
411,4.223676,0.440494,1.380958,-3.104249,-2.525465,-1.239507,-2.358298,2.097283,2.462375,-1.042123,1.43078,0.790309,-0.075027,-0.369003


In [50]:
tree3 = MyTreeClf(3, 20, 10)
tree3.fit(X, y)

In [51]:
tree3.print_tree()

col_4 -0.032813398968507546
 col_13 -1.7098345996002615
  col_9 1.66622032336373
   leaf_left 0.46511627906976744
   leaf_right 0.0
  col_10 0.256667471292098
   leaf_left 0.6889763779527559
   leaf_right 0.9554140127388535
 col_10 0.6409800682452367
  col_6 0.37836029954673145
   leaf_left 0.3076923076923077
   leaf_right 0.0392156862745098
  col_8 -1.1426770157567585
   leaf_left 0.15384615384615385
   leaf_right 0.8641975308641975


In [52]:
preds_3 = tree3.predict(X_test)
preds_3 = np.array(preds_3)
preds_3 = np.where(preds_3>0.5, 1, 0)
preds_3

array([0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1])

In [53]:
confusion_matrix(y_test, preds_3)

array([[55, 16],
       [17, 62]])