In [15]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_blobs
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.preprocessing import KBinsDiscretizer

X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv("../data/X_test.csv")
Y_train = pd.read_csv("../data/Y_train.csv")
Y_test = pd.read_csv("../data/Y_test.csv")

# Decision Tree Classifier
Proviamo due diversi approcci alla codifica delle etichette derivate dall'intervallo di prezzi Y: eseguiamo un primo tentativo con degli intervalli di prezzo basati sui quantili e di seguito un secondo tentativo usando degli intervalli di prezzo discretizzati con la strategia "kmeans".

In [16]:
enc1 = KBinsDiscretizer(n_bins=20, encode="ordinal", strategy="quantile")
Y_train_binned1 = enc1.fit_transform(Y_train)
Y_test_binned1 = enc1.fit_transform(Y_test)

Y_train_binned1 = Y_train_binned1.ravel()
Y_test_binned1 = Y_test_binned1.ravel()

######

enc2 = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="kmeans")
Y_train_binned2 = enc2.fit_transform(Y_train)
Y_test_binned2 = enc2.fit_transform(Y_test)

Ora proviamo ad applicare un modello DecisionTreeClassifier ai nostri dataset e osserviamo i diversi valori di Train Accuracy e di Test Accuracy che possiamo ottenere al variare dell'iper-parametro max_leaf_nodes del modello.

In [17]:
for max_leaves in range(2,100):
    # train and predict
    dt1 = DecisionTreeClassifier(max_leaf_nodes=max_leaves)
    dt1.fit(X_train,Y_train_binned1)

    # compute Accuracy
    train_acc = accuracy_score(y_true = Y_train_binned1, y_pred = dt1.predict(X_train))
    test_acc  = accuracy_score(y_true = Y_test_binned1,  y_pred = dt1.predict(X_test))
    print ("n_max_leaves: {:d} Train Accuracy: {:.3f} - Test Accuracy: {:.3f}".format(max_leaves, train_acc,test_acc))

n_max_leaves: 2 Train Accuracy: 0.105 - Test Accuracy: 0.097
n_max_leaves: 3 Train Accuracy: 0.143 - Test Accuracy: 0.143
n_max_leaves: 4 Train Accuracy: 0.160 - Test Accuracy: 0.164
n_max_leaves: 5 Train Accuracy: 0.183 - Test Accuracy: 0.191
n_max_leaves: 6 Train Accuracy: 0.198 - Test Accuracy: 0.203
n_max_leaves: 7 Train Accuracy: 0.208 - Test Accuracy: 0.208
n_max_leaves: 8 Train Accuracy: 0.210 - Test Accuracy: 0.205
n_max_leaves: 9 Train Accuracy: 0.220 - Test Accuracy: 0.203
n_max_leaves: 10 Train Accuracy: 0.221 - Test Accuracy: 0.217
n_max_leaves: 11 Train Accuracy: 0.226 - Test Accuracy: 0.213
n_max_leaves: 12 Train Accuracy: 0.232 - Test Accuracy: 0.217
n_max_leaves: 13 Train Accuracy: 0.237 - Test Accuracy: 0.222
n_max_leaves: 14 Train Accuracy: 0.243 - Test Accuracy: 0.234
n_max_leaves: 15 Train Accuracy: 0.249 - Test Accuracy: 0.246
n_max_leaves: 16 Train Accuracy: 0.254 - Test Accuracy: 0.247
n_max_leaves: 17 Train Accuracy: 0.261 - Test Accuracy: 0.254
n_max_leaves: 18

Si noti che, eseguendo il modello con la divisione dei prezzi in intervalli basati sui quantili, la migliore coppia di performance si ottiene quando l'iper-parametro max_leaf_nodes è pari a 99, in particolare si ha Train accuracy = 0.435 e Test Accuracy = 0.247

In [18]:
for max_leaves in range(2,100):
    # train and predict
    dt2 = DecisionTreeClassifier(max_leaf_nodes=max_leaves)
    dt2.fit(X_train,Y_train_binned2)

    # compute Accuracy
    train_acc = accuracy_score(y_true = Y_train_binned2, y_pred = dt2.predict(X_train))
    test_acc  = accuracy_score(y_true = Y_test_binned2,  y_pred = dt2.predict(X_test))
    print ("n_max_leaves: {:d} Train Accuracy: {:.3f} - Test Accuracy: {:.3f}".format(max_leaves, train_acc,test_acc))

n_max_leaves: 2 Train Accuracy: 0.345 - Test Accuracy: 0.162
n_max_leaves: 3 Train Accuracy: 0.393 - Test Accuracy: 0.217
n_max_leaves: 4 Train Accuracy: 0.426 - Test Accuracy: 0.177
n_max_leaves: 5 Train Accuracy: 0.462 - Test Accuracy: 0.201
n_max_leaves: 6 Train Accuracy: 0.483 - Test Accuracy: 0.159
n_max_leaves: 7 Train Accuracy: 0.490 - Test Accuracy: 0.125
n_max_leaves: 8 Train Accuracy: 0.506 - Test Accuracy: 0.164
n_max_leaves: 9 Train Accuracy: 0.520 - Test Accuracy: 0.167
n_max_leaves: 10 Train Accuracy: 0.533 - Test Accuracy: 0.167
n_max_leaves: 11 Train Accuracy: 0.539 - Test Accuracy: 0.174
n_max_leaves: 12 Train Accuracy: 0.555 - Test Accuracy: 0.155
n_max_leaves: 13 Train Accuracy: 0.566 - Test Accuracy: 0.179
n_max_leaves: 14 Train Accuracy: 0.576 - Test Accuracy: 0.162
n_max_leaves: 15 Train Accuracy: 0.586 - Test Accuracy: 0.167
n_max_leaves: 16 Train Accuracy: 0.586 - Test Accuracy: 0.167
n_max_leaves: 17 Train Accuracy: 0.589 - Test Accuracy: 0.155
n_max_leaves: 18

Si noti che, eseguendo il modello con la divisione dei prezzi in intervalli discretizzati con la strategia "kmeans", la migliore coppia di performance si ottiene quando l'iper-parametro max_leaf_nodes è pari a 99, in particolare si ha Train accuracy = 0.774 e Test Accuracy = 0.191

In conclusione possiamo dire che le suddette performance non sono soddisfacenti e pertanto proseguiamo lo studio del dataset ai fini di prevedere il prezzo delle proprietà immobiliari in esso contenuto.