In [1]:
import pandas as pd
import numpy as np
import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from model import DecisionTree

# Seed NumPy's global RNG so that the np.random.shuffle call used to shuffle
# the Titanic rows further down gives the same order on a fresh run.
# NOTE(review): predictions are returned as torch tensors; if model.DecisionTree
# draws from torch's RNG it would need its own seed — confirm.
np.random.seed(0)

# Iris data

In [2]:

# Load Iris and wrap it in a DataFrame for inspection. Column names come from
# the dataset's own metadata rather than a hand-typed list, so they cannot
# drift from the data.
iris = datasets.load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df["target"] = iris.target

# Short feature names (unit suffix stripped) passed to print_tree as node labels.
list_feature = [name.replace(" (cm)", "") for name in iris.feature_names]
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [3]:
# Hold out 20% of the Iris samples for testing; the fixed random_state keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)
for features in (X_train, X_test):
    print(features.shape)

(120, 4)
(30, 4)


## Impurity function - Entropy

In [4]:
# Fit the decision tree on the Iris training split using entropy as the
# impurity function, then predict the held-out samples. The bare last
# expression displays the predictions.
DecisionTreeModel = DecisionTree(
    min_sample=2,
    max_depth=5,
    impurity_function="entropy",
)
DecisionTreeModel.fit(X_train, y_train)
y_pred = DecisionTreeModel.predict(X_test)
y_pred

tensor([1., 0., 2., 1., 1., 0., 1., 2., 1., 1., 2., 0., 0., 0., 0., 1., 2., 1.,
        1., 2., 0., 2., 0., 2., 2., 2., 2., 2., 0., 0.], dtype=torch.float64)

In [5]:
# Print the fitted tree as text, starting from its root. Internal nodes are
# labelled "<feature name>, <split value>" using the names in list_feature.
# NOTE: list_feature is reassigned for the Titanic data further down, so
# re-running this cell after that one would label the nodes with the wrong names.
DecisionTreeModel.print_tree(DecisionTreeModel.root, list_feature=list_feature)

                  ┌0.0
 petal length, 1.9┤
                  |                                  ┌1.0
                  |                 ┌petal width, 1.6┤
                  |                 |                └2.0
                  └petal length, 4.7┤
                                    |                                  ┌1.0
                                    |                ┌petal length, 4.9┤
                                    |                |                 |                ┌2.0
                                    |                |                 └petal width, 1.5┤
                                    |                |                                  |                 ┌1.0
                                    |                |                                  └sepal length, 6.7┤
                                    |                |                                                    └2.0
                                    └petal width, 1.7┤
                               

In [6]:
# Accuracy of the entropy-based tree on the Iris test split
# (fraction of predictions equal to the true label).
accuracy_score(y_test, y_pred)

1.0

## Impurity function - Gini

In [7]:
# Same experiment as the entropy run, but with Gini as the impurity function.
# Note this rebinds DecisionTreeModel and y_pred from the previous run.
DecisionTreeModel = DecisionTree(
    min_sample=2,
    max_depth=5,
    impurity_function="gini",
)
DecisionTreeModel.fit(X_train, y_train)
y_pred = DecisionTreeModel.predict(X_test)
y_pred

tensor([1., 0., 2., 1., 1., 0., 1., 2., 1., 1., 2., 0., 0., 0., 0., 1., 2., 1.,
        1., 2., 0., 2., 0., 2., 2., 2., 2., 2., 0., 0.], dtype=torch.float64)

In [8]:
# Print the Gini-based tree as text, starting from its root. Internal nodes are
# labelled "<feature name>, <split value>" using the names in list_feature.
# NOTE: list_feature is reassigned for the Titanic data further down, so
# re-running this cell after that one would label the nodes with the wrong names.
DecisionTreeModel.print_tree(DecisionTreeModel.root, list_feature=list_feature)

                  ┌0.0
 petal length, 1.9┤
                  |                                  ┌1.0
                  |                 ┌petal width, 1.6┤
                  |                 |                └2.0
                  └petal length, 4.7┤
                                    |                                  ┌1.0
                                    |                ┌petal length, 4.9┤
                                    |                |                 |                ┌2.0
                                    |                |                 └petal width, 1.5┤
                                    |                |                                  |                 ┌1.0
                                    |                |                                  └sepal length, 6.7┤
                                    |                |                                                    └2.0
                                    └petal width, 1.7┤
                               

In [9]:
# Accuracy of the Gini-based tree on the Iris test split
# (fraction of predictions equal to the true label).
accuracy_score(y_test, y_pred)

1.0

# Titanic data

In [10]:
# Load the preprocessed Titanic table, indexed by passenger id.
path_csv = "data/titanic_modified_dataset.csv"
titanic_data_df = pd.read_csv(path_csv, index_col="PassengerId")

# The features/label split further down is positional (last column = label),
# so make that layout explicit and derive the feature names from the same
# column order instead of maintaining a separate hand-typed list.
assert titanic_data_df.columns[-1] == "Survived", "label column must be last"
list_feature = list(titanic_data_df.columns[:-1])
titanic_data_df

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,0,22.0,1,0,7.2500,0,0,0
2,1,1,38.0,1,0,71.2833,1,1,1
3,3,1,26.0,0,0,7.9250,0,2,1
4,1,1,35.0,1,0,53.1000,0,1,1
5,3,0,35.0,0,0,8.0500,0,0,0
...,...,...,...,...,...,...,...,...,...
887,2,0,27.0,0,0,13.0000,0,5,0
888,1,1,19.0,0,0,30.0000,0,2,1
889,3,1,28.0,1,2,23.4500,0,2,0
890,1,0,26.0,0,0,30.0000,1,0,1


In [11]:
# Turn the frame into a float32 matrix (feature columns followed by the label column).
titanic_data_arr = titanic_data_df.to_numpy().astype(np.float32)

# Shuffle the rows: build the row-index vector, permute it in place with the
# global NumPy RNG, then reorder the matrix by that permutation.
idx = np.arange(len(titanic_data_arr))
np.random.shuffle(idx)
titanic_data_arr = titanic_data_arr[idx, :]

In [12]:
# Divide the matrix column-wise: every column except the last is a feature (X),
# the last column is the label (y).
X = titanic_data_arr[:, :-1]
y = titanic_data_arr[:, -1]
for part in (X, y):
    print(part.shape)

(891, 8)
(891,)


In [13]:
# Sequential 70/20/10 train/validation/test split of the Titanic rows.
TRAIN_SIZE = 0.7
VAL_SIZE = 0.2
# The remaining ~10% of the rows form the test split.

n_samples = titanic_data_arr.shape[0]
# TRAIN_SAMPLES is the number of training rows. VAL_SAMPLES is the *end index*
# of the validation slice (train rows + validation rows), not a row count.
TRAIN_SAMPLES = int(TRAIN_SIZE * n_samples)
VAL_SAMPLES = TRAIN_SAMPLES + int(VAL_SIZE * n_samples)

X_train, y_train = X[:TRAIN_SAMPLES], y[:TRAIN_SAMPLES]
X_val, y_val = X[TRAIN_SAMPLES:VAL_SAMPLES], y[TRAIN_SAMPLES:VAL_SAMPLES]
X_test, y_test = X[VAL_SAMPLES:], y[VAL_SAMPLES:]
x_val = X_val  # backward-compatible alias for the original lowercase name

# Every row must land in exactly one split.
assert len(X_train) + len(X_val) + len(X_test) == n_samples
assert len(y_train) + len(y_val) + len(y_test) == n_samples

print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

(623, 8) (178, 8) (90, 8)
(623,) (178,) (90,)


## Impurity function - Entropy

In [14]:
# Fit the decision tree on the Titanic training split using entropy as the
# impurity function, then predict the test split. The bare last expression
# displays the predictions.
DecisionTreeModel = DecisionTree(
    min_sample=5,
    max_depth=7,
    impurity_function="entropy",
)
DecisionTreeModel.fit(X_train, y_train)
y_pred = DecisionTreeModel.predict(X_test)
y_pred

tensor([1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 0.,
        1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1.,
        1., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1.,
        1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
        1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0.])

In [15]:
# Print the entropy-based Titanic tree as text, starting from its root.
# Internal nodes are labelled "<feature name>, <split value>" using the
# Titanic feature names currently held in list_feature.
DecisionTreeModel.print_tree(DecisionTreeModel.root, list_feature=list_feature)

                                                                                            ┌0.0
                                                                                 ┌Fare, 7.12┤
                                                                                 |          └0.0
                                                                      ┌Fare, 13.0┤
                                                                      |          └0.0
                                                          ┌Fare, 15.55┤
                                                          |           └1.0
                                              ┌Fare, 15.74┤
                                              |           └0.0
                                  ┌Fare, 21.68┤
                                  |           └1.0
                      ┌Fare, 23.25┤
                      |           └0.0
           ┌Fare, 26.0┤
           |          |         ┌0.0
           |          └Age, 21.0┤
    

In [16]:
# Accuracy of the entropy-based tree on the Titanic test split.
# NOTE(review): the validation split (x_val / y_val) is not used by any visible
# cell; the score is reported on the test split directly.
accuracy_score(y_test, y_pred)

0.6444444444444445

## Impurity function - Gini

In [17]:
# Same Titanic experiment as the entropy run, but with Gini as the impurity
# function. Note this rebinds DecisionTreeModel and y_pred from the previous run.
DecisionTreeModel = DecisionTree(
    min_sample=5,
    max_depth=7,
    impurity_function="gini",
)
DecisionTreeModel.fit(X_train, y_train)
y_pred = DecisionTreeModel.predict(X_test)
y_pred

tensor([1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0.,
        1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1.,
        1., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 0., 0., 0.,
        1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 0., 1., 0.])

In [18]:
# Print the Gini-based Titanic tree as text, starting from its root.
# Internal nodes are labelled "<feature name>, <split value>" using the
# Titanic feature names currently held in list_feature.
DecisionTreeModel.print_tree(DecisionTreeModel.root, list_feature=list_feature)

                             ┌1.0
                  ┌SibSp, 1.0┤
                  |          |          ┌0.0
                  |          └Parch, 1.0┤
                  |                     └1.0
         ┌Age, 9.0┤
         |        |                    ┌0.0
         |        |          ┌Age, 12.0┤
         |        |          |         |                                 ┌0.0
         |        |          |         |                     ┌Fare, 12.52┤
         |        |          |         |                     |           |          ┌0.0
         |        |          |         |                     |           └Fare, 13.0┤
         |        |          |         |                     |                      └0.0
         |        |          |         |           ┌Age, 57.0┤
         |        |          |         |           |         └0.0
         |        |          |         └Pclass, 2.0┤
         |        |          |                     |                              ┌0.0
         |  

In [19]:
# Accuracy of the Gini-based tree on the Titanic test split.
# NOTE(review): the validation split (x_val / y_val) is not used by any visible
# cell; the score is reported on the test split directly.
accuracy_score(y_test, y_pred)

0.6666666666666666