In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported source files are picked up without a restart.
%reload_ext autoreload
%autoreload 2

In [2]:
import sys

# Put the parent directory on the import path -- presumably so the local
# `algosfromscratch` package imported in a later cell can be resolved.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate ".." entry each time.
if ".." not in sys.path:
    sys.path.append("..")

In [3]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier as AdaBoostClassifier_sk
from sklearn.tree import export_text as export_text_sk
from sklearn.tree import plot_tree as plot_tree_sk

In [4]:
# Load the iris dataset once. The returned Bunch is kept because later cells
# read iris['feature_names']; X and y are taken from the same object instead
# of calling load_iris() a second time.
iris = load_iris()
X, y = iris.data, iris.target

In [5]:
# Hold out 30% of the samples for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [6]:
from algosfromscratch.supervised_learning import AdaBoostClassifier
from algosfromscratch.utils.tree import export_text

In [7]:
# From-scratch AdaBoost (algosfromscratch) with 10 estimators and a fixed seed.
clf = AdaBoostClassifier(
    n_estimators=10,
    learning_rate=0.8,
    random_state=647,
)

In [8]:
# Fit the from-scratch AdaBoost model on the training split.
clf.fit(X_train, y_train)

In [9]:
# Predict labels for the held-out test split with the from-scratch model.
y_pred = clf.predict(X_test)

In [10]:
# Display the from-scratch model's predicted test labels.
y_pred

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0,
       0])

In [11]:
# scikit-learn reference model, configured with the same n_estimators,
# learning_rate and random_state as the from-scratch classifier above.
clf_sk = AdaBoostClassifier_sk(
    n_estimators=10,
    learning_rate=0.8,
    algorithm='SAMME',
    random_state=647,
)

In [12]:
# Fit the scikit-learn reference model on the same training split.
clf_sk.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME', learning_rate=0.8, n_estimators=10,
                   random_state=647)

In [13]:
# Predict labels for the test split with the scikit-learn reference model.
y_pred_sk = clf_sk.predict(X_test)

In [14]:
# Display the scikit-learn model's predicted test labels.
y_pred_sk

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0,
       0])

In [15]:
# Element-wise comparison of the two models' predictions (True where they agree).
y_pred == y_pred_sk

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [16]:
# Display the from-scratch model's `estimator_weights_` array (one entry per fitted estimator).
clf.estimator_weights_

array([1.21429658, 1.56113637, 1.7361966 , 1.800409  , 1.53748926,
       1.21810039, 1.31617677, 1.77458047, 1.63620761, 1.07657972])

In [17]:
# Display the from-scratch model's `estimator_errors_` array (one entry per fitted estimator).
clf.estimator_errors_

array([0.3047619 , 0.22127128, 0.18586606, 0.17402521, 0.22640652,
       0.30375539, 0.27846796, 0.17871492, 0.20552525, 0.34240895])

In [18]:
# Display the scikit-learn model's per-estimator weights, for comparison with the from-scratch values.
clf_sk.estimator_weights_

array([1.21429658, 1.58705341, 1.90200506, 1.65433868, 1.49776533,
       1.58603614, 1.72480323, 1.30252824, 1.58786825, 1.15295796])

In [19]:
# Display the scikit-learn model's per-estimator errors, for comparison with the from-scratch values.
clf_sk.estimator_errors_

array([0.3047619 , 0.2157395 , 0.15651911, 0.20184928, 0.23522135,
       0.21595472, 0.18803076, 0.28190877, 0.21556721, 0.32124628])

In [20]:
# Print a text rendering of every fitted estimator's tree, starting from its
# `root` node and labelling splits with the iris feature names.
for fitted_tree in clf.estimators_:
    rendered = export_text(fitted_tree.root, feature_names=iris['feature_names'])
    print(rendered)

|---  petal length (cm) <= 2.35
|   |--- n_obs: 32 class: 0
|---  petal length (cm) > 2.35
|   |--- n_obs: 73 class: 2

|---  petal length (cm) <= 5.05
|   |--- n_obs: 86 class: 1
|---  petal length (cm) > 5.05
|   |--- n_obs: 19 class: 2

|---  petal length (cm) <= 2.3
|   |--- n_obs: 51 class: 0
|---  petal length (cm) > 2.3
|   |--- n_obs: 54 class: 1

|---  petal width (cm) <= 1.45
|   |--- n_obs: 32 class: 0
|---  petal width (cm) > 1.45
|   |--- n_obs: 73 class: 2

|---  petal length (cm) <= 4.95
|   |--- n_obs: 83 class: 1
|---  petal length (cm) > 4.95
|   |--- n_obs: 22 class: 2

|---  petal width (cm) <= 0.75
|   |--- n_obs: 29 class: 0
|---  petal width (cm) > 0.75
|   |--- n_obs: 76 class: 1

|---  petal width (cm) <= 1.55
|   |--- n_obs: 46 class: 1
|---  petal width (cm) > 1.55
|   |--- n_obs: 59 class: 2

|---  petal width (cm) <= 0.7
|   |--- n_obs: 41 class: 0
|---  petal width (cm) > 0.7
|   |--- n_obs: 64 class: 2

|---  petal width (cm) <= 0.75
|   |--- n_obs: 23 cl

In [21]:
# Print scikit-learn's text rendering of every fitted estimator, including the
# per-class weights at each leaf (show_weights=True).
for fitted_tree in clf_sk.estimators_:
    rendered = export_text_sk(
        fitted_tree,
        feature_names=iris['feature_names'],
        show_weights=True,
    )
    print(rendered)

|--- petal length (cm) <= 2.35
|   |--- weights: [0.32, 0.00, 0.00] class: 0
|--- petal length (cm) >  2.35
|   |--- weights: [0.00, 0.30, 0.37] class: 2

|--- petal width (cm) <= 0.75
|   |--- weights: [0.19, 0.00, 0.00] class: 0
|--- petal width (cm) >  0.75
|   |--- weights: [0.00, 0.60, 0.22] class: 1

|--- petal length (cm) <= 4.95
|   |--- weights: [0.10, 0.31, 0.04] class: 1
|--- petal length (cm) >  4.95
|   |--- weights: [0.00, 0.01, 0.53] class: 2

|--- petal width (cm) <= 0.75
|   |--- weights: [0.36, 0.00, 0.00] class: 0
|--- petal width (cm) >  0.75
|   |--- weights: [0.00, 0.20, 0.44] class: 2

|--- petal width (cm) <= 0.75
|   |--- weights: [0.20, 0.00, 0.00] class: 0
|--- petal width (cm) >  0.75
|   |--- weights: [0.00, 0.57, 0.24] class: 1

|--- petal width (cm) <= 1.75
|   |--- weights: [0.11, 0.31, 0.10] class: 1
|--- petal width (cm) >  1.75
|   |--- weights: [0.00, 0.01, 0.48] class: 2

|--- petal length (cm) <= 2.35
|   |--- weights: [0.29, 0.00, 0.00] class: 0
|

# Titanic

In [22]:
import pandas as pd

In [23]:
# Read the Titanic passenger data from a CSV file located relative to the notebook.
df = pd.read_csv('../data/titanic.csv')

In [24]:
# Display the loaded DataFrame to inspect its columns and a sample of rows.
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


In [25]:
# Feature columns used for the Titanic model, in the order they appear in X.
feature_columns = [
    'Pclass',
    'Sex',
    'Age',
    'Siblings/Spouses Aboard',
    'Parents/Children Aboard',
    'Fare',
]
X = df[feature_columns].to_numpy()

# Encode the string-valued Sex column numerically: 'female' -> 0, anything
# else (i.e. 'male') -> 1. The column position is looked up rather than
# hard-coded so it stays correct if the feature list is reordered.
sex_idx = feature_columns.index('Sex')
X[:, sex_idx] = [0 if s == 'female' else 1 for s in X[:, sex_idx]]

In [26]:
# Target vector: the 'Survived' column as a NumPy array.
y = df['Survived'].to_numpy()

In [27]:
# Split the Titanic data in half (train/test); the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

In [28]:
# From-scratch AdaBoost (algosfromscratch) for the Titanic data: 100 estimators, fixed seed.
clf = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.8,
    random_state=647,
)

In [29]:
# Fit the from-scratch AdaBoost model on the Titanic training split.
clf.fit(X_train, y_train)

In [30]:
# Predict survival labels for the Titanic test split with the from-scratch model.
y_pred = clf.predict(X_test)

In [31]:
# scikit-learn reference model for the Titanic data, configured with the same
# n_estimators, learning_rate and random_state as the from-scratch classifier.
clf_sk = AdaBoostClassifier_sk(
    n_estimators=100,
    learning_rate=0.8,
    algorithm='SAMME',
    random_state=647,
)

In [32]:
# Fit the scikit-learn reference model on the same Titanic training split.
clf_sk.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME', learning_rate=0.8, n_estimators=100,
                   random_state=647)

In [33]:
# Predict survival labels for the Titanic test split with the scikit-learn model.
y_pred_sk = clf_sk.predict(X_test)

In [34]:
# Element-wise comparison of the two models' Titanic predictions (True where they agree).
y_pred == y_pred_sk

array([ True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,

In [35]:
# Display the from-scratch model's `estimator_errors_` array for the Titanic fit.
clf.estimator_errors_

array([0.22573363, 0.38574093, 0.3705298 , 0.40722652, 0.47931857,
       0.4561147 , 0.45094045, 0.48182184, 0.44756656, 0.48215268,
       0.47669339, 0.47925757, 0.48354915, 0.47206417, 0.48652911,
       0.47408448, 0.44650916, 0.46731863, 0.47916991, 0.45954717,
       0.489741  , 0.4587026 , 0.48985172, 0.4892124 , 0.50663558,
       0.4827233 , 0.4593135 , 0.47136969, 0.48807177, 0.46500118,
       0.47577156, 0.47558395, 0.47698939, 0.47077173, 0.47814326,
       0.48645939, 0.48029976, 0.44245206, 0.48436354, 0.48100979,
       0.47161944, 0.49538909, 0.45979545, 0.46945093, 0.50740508,
       0.47817431, 0.49443467, 0.49021073, 0.48440719, 0.47225466,
       0.49552762, 0.46935527, 0.48374242, 0.5011801 , 0.49105267,
       0.48668331, 0.47974434, 0.48001508, 0.47643288, 0.46266468,
       0.49302171, 0.46466398, 0.5001962 , 0.49331572, 0.47749408,
       0.50753061, 0.49034352, 0.4885253 , 0.48540842, 0.47440882,
       0.47492392, 0.48836435, 0.50000747, 0.47133568, 0.48644

In [36]:
# Display the scikit-learn model's per-estimator errors for the Titanic fit, for comparison.
clf_sk.estimator_errors_

array([0.22573363, 0.34494019, 0.43450839, 0.39091556, 0.43215253,
       0.45351045, 0.44258008, 0.46294882, 0.48285012, 0.43841174,
       0.4670365 , 0.45941952, 0.49186673, 0.4693662 , 0.47999182,
       0.46331835, 0.48595296, 0.49311384, 0.49178642, 0.49217596,
       0.49221472, 0.49232165, 0.492412  , 0.48136798, 0.48426152,
       0.4919828 , 0.4884475 , 0.49381398, 0.49282342, 0.48867005,
       0.48985153, 0.49369191, 0.49300674, 0.49323489, 0.48296609,
       0.48560687, 0.48774185, 0.48610019, 0.49059879, 0.48998508,
       0.49433532, 0.49353378, 0.49377173, 0.49379816, 0.49386587,
       0.49392383, 0.49398238, 0.49403948, 0.48961235, 0.48132541,
       0.48763609, 0.48085342, 0.48275175, 0.48757566, 0.46703292,
       0.47988441, 0.49402528, 0.49474757, 0.49465672, 0.49472897,
       0.49476743, 0.49481182, 0.49379108, 0.49008945, 0.49583376,
       0.48693195, 0.48541751, 0.49199066, 0.49093318, 0.49597395,
       0.49045219, 0.48791086, 0.49663691, 0.49492216, 0.49531

In [37]:
from sklearn.metrics import accuracy_score

In [38]:
# Test-set accuracy of the from-scratch model. scikit-learn's signature is
# accuracy_score(y_true, y_pred), so the ground-truth labels are passed first.
accuracy_score(y_test, y_pred)

0.8243243243243243

In [39]:
# Test-set accuracy of the scikit-learn reference model. scikit-learn's signature
# is accuracy_score(y_true, y_pred), so the ground-truth labels are passed first.
accuracy_score(y_test, y_pred_sk)

0.8243243243243243