# Arboria Demo

Import the libraries used throughout this demo:

In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

Import the Arboria estimators and the accuracy helper:

In [2]:
from arboria import DecisionTree, RandomForest, accuracy

Load the dataset:

In [3]:
# Load the Breast Cancer Wisconsin dataset bundled with scikit-learn.
bc = load_breast_cancer()

# Cast features to float32 and labels to int32
# (presumably the dtypes arboria works with — TODO confirm).
X = bc.data.astype(np.float32)
y = bc.target.astype(np.int32)

# X is a 2-D array: one row per sample, one column per feature.
n_samples, n_features = X.shape

print("Breast Cancer Wisconsin Dataset")
print(f"Number of samples : {n_samples}")
print(f"Number of features : {n_features}")

Breast Cancer Wisconsin Dataset
Number of samples : 569
Number of features : 30


In [4]:
# Hold out a test set; random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=10,
)

### DecisionTree

DecisionTree with default hyperparameters:
- **max_depth** defaults to `None` (no depth limit)
- **min_sample_split** defaults to `None` (no limit on the minimum number of samples per leaf)

In [5]:
# Baseline tree: every constructor hyperparameter is left at its default.
tree = DecisionTree()

# Fit on the training split, using the Gini criterion to choose splits.
tree.fit(
    x_train,
    y_train,
    criterion="gini",
)

In [6]:
# Score the baseline tree on the same data it was fitted on.
train_pred = tree.predict(x_train)
training_acc = accuracy(
    train_pred,
    y_train,
)

In [7]:
# Score the baseline tree on the held-out test split.
tree_pred = tree.predict(x_test)
testing_acc = accuracy(
    tree_pred,
    y_test,
)

In [8]:
# Report both accuracies, rounded to two decimals, one per line.
print(
    "DecisionTree :",
    f"Training accuracy : {training_acc:.2f}",
    f"Testing accuracy : {testing_acc:.2f}",
    sep="\n",
)

DecisionTree :
Training accuracy : 1.00
Testing accuracy : 0.92


With no depth limit and no constraint on the minimum number of samples per leaf, decision trees tend to overfit: here the training accuracy is 1.00 while the testing accuracy is only 0.92.

#### DecisionTree with hyperparameters

In [9]:
# Constrained tree: cap the depth and set min_sample_split to limit growth.
param_tree = DecisionTree(
    max_depth=5,
    min_sample_split=10,
)

# Fit on the training split (no criterion passed, so fit uses its default).
param_tree.fit(x_train, y_train)

In [10]:
# Training split: predict, then score against the training labels.
train_pred = param_tree.predict(x_train)
training_acc = accuracy(
    train_pred,
    y_train,
)

# Held-out split: predict, then score against the test labels.
test_pred = param_tree.predict(x_test)
testing_acc = accuracy(
    test_pred,
    y_test,
)

In [11]:
# Report both accuracies of the constrained tree, rounded to two decimals.
print(
    "DecisionTree with hyperparameters :",
    f"Training accuracy : {training_acc:.2f}",
    f"Testing accuracy : {testing_acc:.2f}",
    sep="\n",
)

DecisionTree with hyperparameters :
Training accuracy : 0.98
Testing accuracy : 0.97


### RandomForest

In [71]:
# Forest hyperparameters, gathered in one place so they are easy to tune.
# `seed` is fixed so repeated runs are comparable
# (presumably it drives arboria's sampling — TODO confirm).
rf_params = {
    "n_estimators": 80,
    "max_depth": 10,
    "max_features": 6,
    "max_samples": 0.9,
    "min_sample_split": 10,
    "seed": 10,
}
rf = RandomForest(**rf_params)

# Fit the forest on the training split.
rf.fit(x_train, y_train)

In [72]:
# Predict on both splits with the fitted forest.
train_pred = rf.predict(x_train)
rf_pred = rf.predict(x_test)

# Accuracy on each split, plus the forest's out-of-bag score
# computed from the training data.
training_acc = accuracy(train_pred, y_train)
testing_acc = accuracy(rf_pred, y_test)
oob = rf.out_of_bag(x_train, y_train)

# Report all three scores, rounded to two decimals, one per line.
print(
    "RandomForest :",
    f"Training accuracy : {training_acc:.2f}",
    f"Testing accuracy : {testing_acc:.2f}",
    f"Out-of-bag accuracy : {oob:.2f}",
    sep="\n",
)

RandomForest :
Training accuracy : 0.99
Testing accuracy : 0.99
Out-of-bag accuracy : 0.95


In [51]:
# Probability outputs of the fitted forest for the test split.
# NOTE(review): the next cell formats element 0 as a single float, so this
# looks like one probability per sample — confirm which class it refers to.
rf_pred_probs = rf.predict_proba(x_test)

In [52]:
# Show the forest's probability output for the first test sample.
# The value is a probability (formatted as a float), not a class label,
# so the message says "probability" rather than "class".
# NOTE(review): presumably the probability of the positive class (label 1) —
# confirm against arboria's predict_proba contract.
print(f"Predicted probability for the first test sample : {rf_pred_probs[0]:.2f}")

Predicted class for the first test sample : 0.16
