In [1]:
import torch
import pandas as pd
from sklearn import datasets
from model import XGBoost

# Titanic Dataset

In [2]:
path_csv = "data/titanic_modified_dataset.csv"
titanic_data_df = pd.read_csv(path_csv, index_col="PassengerId")
titanic_data_df

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,0,22.0,1,0,7.2500,0,0,0
2,1,1,38.0,1,0,71.2833,1,1,1
3,3,1,26.0,0,0,7.9250,0,2,1
4,1,1,35.0,1,0,53.1000,0,1,1
5,3,0,35.0,0,0,8.0500,0,0,0
...,...,...,...,...,...,...,...,...,...
887,2,0,27.0,0,0,13.0000,0,5,0
888,1,1,19.0,0,0,30.0000,0,2,1
889,3,1,28.0,1,2,23.4500,0,2,0
890,1,0,26.0,0,0,30.0000,1,0,1


In [3]:
# convert to tensor array
titanic_data_arr = torch.tensor(titanic_data_df.values, dtype=torch.float32)

# devide features to X, label to y 
X, y = titanic_data_arr[:, :-1], titanic_data_arr[:, -1]
print(X.shape)
print(y.shape)

torch.Size([891, 8])
torch.Size([891])


In [4]:
#shuffle data
idx = torch.randperm(X.shape[0])
X, y = X[idx], y[idx]

# split data to train and test
TRAIN_SIZE = 0.7
VAL_SIZE = 0.2
#TEST_SIZE = 0.1
TRAIN_SAMPLES = int(TRAIN_SIZE * titanic_data_arr.shape[0])
VAL_SAMPLES = TRAIN_SAMPLES + int(VAL_SIZE * titanic_data_arr.shape[0])

X_train, y_train = X[:TRAIN_SAMPLES], y[:TRAIN_SAMPLES]
X_val, y_val = X[TRAIN_SAMPLES:VAL_SAMPLES], y[TRAIN_SAMPLES:VAL_SAMPLES]
X_test, y_test = X[VAL_SAMPLES:], y[VAL_SAMPLES:]

print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

torch.Size([623, 8]) torch.Size([178, 8]) torch.Size([90, 8])
torch.Size([623]) torch.Size([178]) torch.Size([90])


In [5]:
tree = XGBoost()
tree.fit(X_train, y_train, learning_rate=0.001, max_depth=5, n_estimators=50, min_sample=2, gamma=0, lambda_=1)
y_pred = tree.predict(X)
print(sum(y_pred == y) / len(y))


  X = torch.tensor(X)
  y = torch.tensor(y)


tensor(0.6330)


In [6]:
for i in range(2):
    tree.trees[i].print_tree(tree.trees[i].root, list_feature=titanic_data_df.columns[:-1])

                                            ┌0.22475241124629974
                                  ┌Age, 26.0┤
                                  |         |                    ┌-0.5955445170402527
                                  |         |          ┌Age, 45.0┤
                                  |         |          |         └-1.8844407796859741
                                  |         └Fare, 30.0┤
                                  |                    └-2.6033575534820557
                      ┌Pclass, 1.0┤
                      |           └-3.1504104137420654
           ┌Fare, 52.0┤
           |          |                      ┌0.22475241124629974
           |          |          ┌Fare, 52.55┤
           |          |          |           |                   ┌-0.6109404563903809
           |          |          |           |         ┌Age, 19.0┤
           |          |          |           |         |         └0.15182217955589294
           |          |          |           └Age,

In [7]:
y_test = tree.predict(X_val)
print(sum(y_test == y_val) / len(y_val))

tensor(0.6573)


# Iris Dataset

In [8]:

iris = datasets.load_iris()
iris_df = pd.DataFrame(iris.data, columns = ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"])
iris_df['target'] = iris.target
list_feature = ["sepal length", "sepal width", "petal length", "petal width"]
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [9]:
# convert to numpy array
iris_arr = torch.tensor(iris_df.values, dtype=torch.float32)

# devide features to X, label to y 
X, y = iris_arr[:, :-1], iris_arr[:, -1]
print(X.shape)
print(y.shape)

torch.Size([150, 4])
torch.Size([150])


In [10]:
#shuffle data
idx = torch.randperm(X.shape[0])
X, y = X[idx], y[idx]

# split data to train and test
TRAIN_SIZE = 0.7
VAL_SIZE = 0.2
#TEST_SIZE = 0.1
TRAIN_SAMPLES = int(TRAIN_SIZE * iris_arr.shape[0])
VAL_SAMPLES = TRAIN_SAMPLES + int(VAL_SIZE * iris_arr.shape[0])

X_train, y_train = X[:TRAIN_SAMPLES], y[:TRAIN_SAMPLES]
X_val, y_val = X[TRAIN_SAMPLES:VAL_SAMPLES], y[TRAIN_SAMPLES:VAL_SAMPLES]
X_test, y_test = X[VAL_SAMPLES:], y[VAL_SAMPLES:]

print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

torch.Size([105, 4]) torch.Size([30, 4]) torch.Size([15, 4])
torch.Size([105]) torch.Size([30]) torch.Size([15])


In [11]:
tree = XGBoost()
tree.fit(X_train, y_train, learning_rate=0.5, max_depth=3, n_estimators=70, min_sample=2, gamma=0, lambda_=1)
y_pred = tree.predict(X)
print(sum(y_pred == y) / len(y))


  X = torch.tensor(X)
  y = torch.tensor(y)


tensor(0.6533)


In [12]:
y_val_pred = tree.predict(X_val)
print(sum(y_val_pred == y_val) / len(y_val))

tensor(0.7000)


In [13]:
y_test_pred = tree.predict(X_test)
print(sum(y_test_pred == y_test) / len(y_test))

tensor(0.7333)


In [14]:
print(y)

tensor([2., 2., 1., 1., 1., 2., 0., 1., 1., 2., 0., 2., 2., 2., 0., 1., 0., 2.,
        2., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 0., 2., 0., 1., 0., 2., 2.,
        2., 2., 1., 0., 0., 2., 2., 0., 2., 1., 1., 2., 0., 1., 2., 1., 0., 0.,
        2., 0., 0., 1., 2., 0., 0., 1., 1., 2., 2., 0., 2., 0., 2., 0., 1., 1.,
        0., 2., 2., 2., 0., 0., 1., 2., 2., 2., 1., 1., 2., 0., 1., 2., 0., 1.,
        1., 2., 0., 0., 0., 2., 0., 2., 1., 0., 0., 0., 2., 1., 2., 1., 0., 2.,
        1., 0., 1., 1., 2., 1., 1., 2., 1., 0., 1., 0., 0., 2., 2., 2., 0., 1.,
        2., 0., 0., 1., 1., 2., 0., 0., 1., 1., 2., 1., 0., 1., 2., 1., 0., 1.,
        1., 2., 1., 2., 1., 0.])


In [15]:
for i in range(2):
    tree.trees[i].print_tree(tree.trees[i].root, list_feature=list_feature)

                  ┌-3.268923282623291
 petal length, 1.9┤
                  |                 ┌1.3145508766174316
                  └petal length, 4.7┤
                                    └5.574119567871094
                                    ┌-1.4337188005447388
                  ┌petal length, 1.9┤
                  |                 └1.1477336883544922
 petal length, 4.7┤
                  └20.75336265563965
