<a href="https://colab.research.google.com/github/gil2rok/radsketch/blob/master/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import xgboost as xgb #gradient boosted decision trees library
import numpy as np

In [0]:
from sklearn import datasets #import common datasets from scikit learn
iris = datasets.load_iris() #loading the iris flowers dataset. This is a classification problem.
X = iris.data #setting x to the flower data. The four columns correspond to the following: Sepal Length, Sepal Width, Petal Length and Petal Width.
Y = iris.target #setting y to the flower label. The single column has a number that corresponds to the type of iris: Setosa, Versicolour, and Virginica

In [6]:
X[:10] #printing out first 10 entries of x. Note that each entry is an array with 4 values.

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [7]:
Y[:10] #printing out first 10 entries of y. Note that each entry is a single value.

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [0]:
from sklearn.model_selection import train_test_split #importing the train_test_split function
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2) #split our data and target into 80% training data and 20% testing data

In [0]:
#For the XGBoost model to work, we need our data to be in a data structure to be in a DMatrix.
D_train = xgb.DMatrix(X_train, label = Y_train)
D_test = xgb.DMatrix(X_test, label = Y_test)

In [0]:
param = {
    'eta': 0.3, #when we add the predictions of new trees to the ensemble, we multiply them by this value to reduce complexity and prevent overfitting
    'max_depth': 3, #max depth of the decision trees
    'objective': 'multi:softprob', #selecting the loss function
    'num_class': 3 #num of classes in the dataset   
}

steps = 20 #num of training iterations

In [0]:
model = xgb.train(param, D_train, steps) #training our model

In [14]:
from sklearn.metrics import precision_score, recall_score, accuracy_score #importing metrics to determine how well our model is working
preds = model.predict(D_test) #predicting from our testing data
best_preds = np.asarray([np.argmax(line) for line in preds])

print("Precision = {}".format(precision_score(Y_test, best_preds, average = 'macro')))
print("Accuracy = {}".format(accuracy_score(Y_test, best_preds)))
print("Recall = {}".format(recall_score(Y_test, best_preds, average = 'macro')))

Precision = 0.9444444444444445
Accuracy = 0.9333333333333333
Recall = 0.9393939393939394
