In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_blobs
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.preprocessing import KBinsDiscretizer

X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv("../data/X_test.csv")
Y_train = pd.read_csv("../data/Y_train.csv")
Y_test = pd.read_csv("../data/Y_test.csv")

# Decision Tree Classifier

Let's try two different approaches to the encoding of the labels derived from the rang of prices Y:
<ol> <li> enc1 </li><li> enc2 </li></ol>

In [2]:
enc1 = KBinsDiscretizer(n_bins=20, encode="ordinal", strategy="quantile")
Y_train_binned1 = enc1.fit_transform(Y_train)
Y_test_binned1 = enc1.fit_transform(Y_test)

Y_train_binned1 = Y_train_binned1.ravel()
Y_test_binned1 = Y_test_binned1.ravel()

######

enc2 = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="kmeans")
Y_train_binned2 = enc2.fit_transform(Y_train)
Y_test_binned2 = enc2.fit_transform(Y_test)

Now we try to apply a DecisionTreeClassifier model to our datasets

In [3]:
for max_leaves in range(2,100):
    # train and predict
    dt1 = DecisionTreeClassifier(max_leaf_nodes=max_leaves)
    dt1.fit(X_train,Y_train_binned1)

    #f_names = list(X_train.columns.values)
    #c_names = ["Class 0","Class 1", "Class 2", "Class 3", "Class 4", "Class 5", "Class 6"]
    #fig, ax = plt.subplots(figsize=(20,10))

    # compute Accuracy
    train_acc = accuracy_score(y_true = Y_train_binned1, y_pred = dt1.predict(X_train))
    test_acc  = accuracy_score(y_true = Y_test_binned1,  y_pred = dt1.predict(X_test))
    print ("n_max_leaves: {:.3f} Train Accuracy: {:.3f} - Test Accuracy: {:.3f}".format(max_leaves, train_acc,test_acc))



n_max_leaves: 2.000 Train Accuracy: 0.102 - Test Accuracy: 0.101
n_max_leaves: 3.000 Train Accuracy: 0.135 - Test Accuracy: 0.118
n_max_leaves: 4.000 Train Accuracy: 0.166 - Test Accuracy: 0.142
n_max_leaves: 5.000 Train Accuracy: 0.191 - Test Accuracy: 0.157
n_max_leaves: 6.000 Train Accuracy: 0.200 - Test Accuracy: 0.164
n_max_leaves: 7.000 Train Accuracy: 0.215 - Test Accuracy: 0.172
n_max_leaves: 8.000 Train Accuracy: 0.229 - Test Accuracy: 0.183
n_max_leaves: 9.000 Train Accuracy: 0.232 - Test Accuracy: 0.184
n_max_leaves: 10.000 Train Accuracy: 0.235 - Test Accuracy: 0.186
n_max_leaves: 11.000 Train Accuracy: 0.241 - Test Accuracy: 0.189
n_max_leaves: 12.000 Train Accuracy: 0.245 - Test Accuracy: 0.195
n_max_leaves: 13.000 Train Accuracy: 0.251 - Test Accuracy: 0.188
n_max_leaves: 14.000 Train Accuracy: 0.257 - Test Accuracy: 0.186
n_max_leaves: 15.000 Train Accuracy: 0.262 - Test Accuracy: 0.184
n_max_leaves: 16.000 Train Accuracy: 0.267 - Test Accuracy: 0.188
n_max_leaves: 17.0

In [4]:
for max_leaves in range(2,100):
    # train and predict
    dt2 = DecisionTreeClassifier(max_leaf_nodes=max_leaves)
    dt2.fit(X_train,Y_train_binned2)

    # compute Accuracy
    train_acc = accuracy_score(y_true = Y_train_binned2, y_pred = dt2.predict(X_train))
    test_acc  = accuracy_score(y_true = Y_test_binned2,  y_pred = dt2.predict(X_test))
    print ("n_max_leaves: {:.3f} Train Accuracy: {:.3f} - Test Accuracy: {:.3f}".format(max_leaves, train_acc,test_acc))



n_max_leaves: 2.000 Train Accuracy: 0.383 - Test Accuracy: 0.406
n_max_leaves: 3.000 Train Accuracy: 0.419 - Test Accuracy: 0.432
n_max_leaves: 4.000 Train Accuracy: 0.466 - Test Accuracy: 0.468
n_max_leaves: 5.000 Train Accuracy: 0.506 - Test Accuracy: 0.468
n_max_leaves: 6.000 Train Accuracy: 0.534 - Test Accuracy: 0.510
n_max_leaves: 7.000 Train Accuracy: 0.548 - Test Accuracy: 0.527
n_max_leaves: 8.000 Train Accuracy: 0.570 - Test Accuracy: 0.544
n_max_leaves: 9.000 Train Accuracy: 0.585 - Test Accuracy: 0.565
n_max_leaves: 10.000 Train Accuracy: 0.585 - Test Accuracy: 0.538
n_max_leaves: 11.000 Train Accuracy: 0.596 - Test Accuracy: 0.543
n_max_leaves: 12.000 Train Accuracy: 0.596 - Test Accuracy: 0.543
n_max_leaves: 13.000 Train Accuracy: 0.600 - Test Accuracy: 0.549
n_max_leaves: 14.000 Train Accuracy: 0.600 - Test Accuracy: 0.549
n_max_leaves: 15.000 Train Accuracy: 0.606 - Test Accuracy: 0.558
n_max_leaves: 16.000 Train Accuracy: 0.608 - Test Accuracy: 0.570
n_max_leaves: 17.0

In [5]:
#what is this? Do we need this?

def model_decision_boundary(model, X, y):
    x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
    y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
    zz = [ [xx,yy] for xx in np.linspace(x_min, x_max, 40)
           for yy in np.linspace(y_min, y_max, 40) ]
    zz = np.array(zz)
    z_labels = model.predict(zz)

    plt.figure()
    plt.scatter(zz[:,0], zz[:,1], c=z_labels, marker='+', alpha=0.3)
    plt.scatter(X[:,0], X[:,1], c=y, alpha=0.6)

model_decision_boundary(dt1, X_train, Y_train)
model_decision_boundary(dt2, X_train, Y_train)

InvalidIndexError: (slice(None, None, None), 0)