# Gradient Boosted: synthetic 2-dimensional dataset

### Initial imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [15]:
from sklearn.datasets import make_blobs, make_circles
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [9]:
# Two noisy concentric circles: `factor` sets the inner/outer radius ratio and
# `noise` the Gaussian jitter; the fixed seed keeps the data reproducible.
X, y = make_circles(
    factor=0.5,
    noise=0.25,
    random_state=1,
)

In [10]:
# Display the raw integer class labels (0 or 1) returned by make_circles.
y

array([1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0])

In [11]:
# Translate the integer labels into colour names: 0 -> 'blue', 1 -> 'red'.
y_named = np.take(np.array(['blue', 'red']), y)
y_named

array(['red', 'red', 'blue', 'red', 'red', 'blue', 'blue', 'red', 'red',
       'red', 'red', 'blue', 'red', 'red', 'red', 'blue', 'blue', 'blue',
       'red', 'blue', 'blue', 'red', 'red', 'red', 'blue', 'blue', 'red',
       'blue', 'blue', 'blue', 'red', 'red', 'red', 'red', 'red', 'blue',
       'blue', 'red', 'blue', 'blue', 'red', 'red', 'red', 'blue', 'red',
       'blue', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'blue', 'blue',
       'red', 'blue', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'red',
       'blue', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'red', 'blue',
       'red', 'red', 'blue', 'red', 'blue', 'red', 'red', 'blue', 'blue',
       'blue', 'blue', 'blue', 'blue', 'red', 'blue', 'blue', 'red', 'red',
       'blue', 'blue', 'red', 'red', 'blue', 'red', 'blue', 'red', 'blue',
       'blue'], 
      dtype='<U4')

In [13]:
# Split the features, the named labels and the integer labels in a single call
# so that all three are partitioned with the same shuffled indices.
(X_train, X_test,
 y_train_named, y_test_named,
 y_train, y_test) = train_test_split(
    X, y_named, y,
    test_size=.25,
    random_state=0,
)

In [16]:
# Train on the string labels ('blue'/'red'); fit() returns the estimator itself,
# so the fitted model can be bound directly and then shown as the cell output.
gbc = GradientBoostingClassifier(random_state=0).fit(X_train, y_train_named)
gbc

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=0,
              subsample=1.0, verbose=0, warm_start=False)

## The Decision Function

In [17]:
# For this two-class problem decision_function yields a single score per test
# sample, so its shape is (n_samples,) rather than (n_samples, n_classes).
print("X_test.shape: {}".format(X_test.shape))
print("Decision Function shape: {}".format(gbc.decision_function(X_test).shape))

X_test.shape: (25, 2)
Decision Function shape: (25,)


In [19]:
# Show the first six scores: the sign indicates the predicted class and the
# magnitude how strongly the model prefers it.
print("Decision Function:\n{}".format(gbc.decision_function(X_test)[:6]))

Decision Function:
[ 4.13592629 -1.7016989  -3.95106099 -3.62599351  4.28986668  3.66166106]


In [21]:
# Compare the sign of the decision function with the predicted labels:
# a positive score (True) lines up with 'red', a negative one (False) with 'blue'.
print("Thresholded decision function:\n{}".format(gbc.decision_function(X_test) > 0))
print("\nPredictions:\n{}".format(gbc.predict(X_test)))

Thresholded decision function:
[ True False False False  True  True False  True  True  True False  True
  True False  True False False False  True  True  True  True  True False
 False]

Predictions:
['red' 'blue' 'blue' 'blue' 'red' 'red' 'blue' 'red' 'red' 'red' 'blue'
 'red' 'red' 'blue' 'red' 'blue' 'blue' 'blue' 'red' 'red' 'red' 'red'
 'red' 'blue' 'blue']


In [25]:
# A positive score selects index 1 of gbc.classes_, a non-positive score index 0.
greater_zero = np.greater(gbc.decision_function(X_test), 0).astype(int)

# Look the class names up by index and compare them with predict() directly.
pred = np.take(gbc.classes_, greater_zero)
print("pred is equal to predictions: {}".format((pred == gbc.predict(X_test)).all()))

pred is equal to predictions: True


In [26]:
# The scores are unbounded and on an arbitrary scale; report the observed range.
decision_func = gbc.decision_function(X_test)
print("Decision Function min: {:.2f} | max: {:.2f}".format(decision_func.min(), decision_func.max()))

Decision Function min: -7.69 | max: 4.29


## Predicting Probabilities

In [27]:
# predict_proba returns one column per class, hence (n_samples, 2) here.
print("Shape of predict_proba: {}".format(gbc.predict_proba(X_test).shape))

Shape of predict_proba: (25, 2)


In [35]:
# Probabilities for the first four test samples, rounded to three decimals;
# each row holds the two class probabilities and sums to 1.
print("first 4 predicted probabilities:\n{}".format(gbc.predict_proba(X_test[:4]).round(3)))

first 4 predicted probabilities:
[[ 0.016  0.984]
 [ 0.846  0.154]
 [ 0.981  0.019]
 [ 0.974  0.026]]
