In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_blobs
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.preprocessing import KBinsDiscretizer

X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv("../data/X_test.csv")
Y_train = pd.read_csv("../data/Y_train.csv")
Y_test = pd.read_csv("../data/Y_test.csv")

# Decision Tree Classifier
Proviamo due diversi approcci alla codifica delle etichette derivate dall'intervallo di prezzi Y: eseguiamo un primo tentativo con degli intervalli di prezzo basati sui quantili e di seguito un secondo tentativo usando degli intervalli di prezzo discretizzati con la strategia "kmeans".


In [2]:
enc1 = KBinsDiscretizer(n_bins=20, encode="ordinal", strategy="quantile")
Y_train_binned1 = enc1.fit_transform(Y_train)
Y_test_binned1 = enc1.fit_transform(Y_test)

Y_train_binned1 = Y_train_binned1.ravel()
Y_test_binned1 = Y_test_binned1.ravel()

######

enc2 = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="kmeans")
Y_train_binned2 = enc2.fit_transform(Y_train)
Y_test_binned2 = enc2.fit_transform(Y_test)



Ora proviamo ad applicare un modello DecisionTreeClassifier ai nostri dataset e osserviamo i diversi valori di Train Accuracy e di Test Accuracy che possiamo ottenere al variare dell'iper-parametro max_leaf_nodes del modello.

In [3]:
for max_leaves in range(2,100):
    # train and predict
    dt1 = DecisionTreeClassifier(max_leaf_nodes=max_leaves)
    dt1.fit(X_train,Y_train_binned1)

    # compute Accuracy
    train_acc = accuracy_score(y_true = Y_train_binned1, y_pred = dt1.predict(X_train))
    test_acc  = accuracy_score(y_true = Y_test_binned1,  y_pred = dt1.predict(X_test))

    print ("n_max_leaves: {:.3f} Train Accuracy: {:.3f} - Test Accuracy: {:.3f}".format(max_leaves, train_acc,test_acc))

n_max_leaves: 2.000 Train Accuracy: 0.100 - Test Accuracy: 0.101
n_max_leaves: 3.000 Train Accuracy: 0.137 - Test Accuracy: 0.135
n_max_leaves: 4.000 Train Accuracy: 0.158 - Test Accuracy: 0.164
n_max_leaves: 5.000 Train Accuracy: 0.185 - Test Accuracy: 0.191
n_max_leaves: 6.000 Train Accuracy: 0.197 - Test Accuracy: 0.203
n_max_leaves: 7.000 Train Accuracy: 0.204 - Test Accuracy: 0.205
n_max_leaves: 8.000 Train Accuracy: 0.207 - Test Accuracy: 0.203
n_max_leaves: 9.000 Train Accuracy: 0.219 - Test Accuracy: 0.206
n_max_leaves: 10.000 Train Accuracy: 0.231 - Test Accuracy: 0.213
n_max_leaves: 11.000 Train Accuracy: 0.241 - Test Accuracy: 0.210
n_max_leaves: 12.000 Train Accuracy: 0.243 - Test Accuracy: 0.210
n_max_leaves: 13.000 Train Accuracy: 0.246 - Test Accuracy: 0.205
n_max_leaves: 14.000 Train Accuracy: 0.253 - Test Accuracy: 0.215
n_max_leaves: 15.000 Train Accuracy: 0.255 - Test Accuracy: 0.217
n_max_leaves: 16.000 Train Accuracy: 0.263 - Test Accuracy: 0.217
n_max_leaves: 17.0

Si noti che, eseguendo il modello con la divisione dei prezzi in intervalli basati sui quantili, la migliore coppia di performance si ottiene quando l'iper-parametro max_leaf_nodes è pari a 32, in particolare si ha Train accuracy = 0.314 e Test Accuracy = 0.254

In [5]:
for max_leaves in range(2,100):
    # train and predict
    dt2 = DecisionTreeClassifier(max_leaf_nodes=max_leaves)
    dt2.fit(X_train,Y_train_binned2)

    # compute Accuracy
    train_acc = accuracy_score(y_true = Y_train_binned2, y_pred = dt2.predict(X_train))
    test_acc  = accuracy_score(y_true = Y_test_binned2,  y_pred = dt2.predict(X_test))

    print ("n_max_leaves: {:.3f} Train Accuracy: {:.3f} - Test Accuracy: {:.3f}".format(max_leaves, train_acc,test_acc))


n_max_leaves: 2.000 Train Accuracy: 0.333 - Test Accuracy: 0.263
n_max_leaves: 3.000 Train Accuracy: 0.363 - Test Accuracy: 0.314
n_max_leaves: 4.000 Train Accuracy: 0.391 - Test Accuracy: 0.304
n_max_leaves: 5.000 Train Accuracy: 0.429 - Test Accuracy: 0.386
n_max_leaves: 6.000 Train Accuracy: 0.447 - Test Accuracy: 0.398
n_max_leaves: 7.000 Train Accuracy: 0.472 - Test Accuracy: 0.386
n_max_leaves: 8.000 Train Accuracy: 0.500 - Test Accuracy: 0.372
n_max_leaves: 9.000 Train Accuracy: 0.509 - Test Accuracy: 0.437
n_max_leaves: 10.000 Train Accuracy: 0.512 - Test Accuracy: 0.478
n_max_leaves: 11.000 Train Accuracy: 0.512 - Test Accuracy: 0.478
n_max_leaves: 12.000 Train Accuracy: 0.532 - Test Accuracy: 0.471
n_max_leaves: 13.000 Train Accuracy: 0.540 - Test Accuracy: 0.457
n_max_leaves: 14.000 Train Accuracy: 0.554 - Test Accuracy: 0.473
n_max_leaves: 15.000 Train Accuracy: 0.566 - Test Accuracy: 0.461
n_max_leaves: 16.000 Train Accuracy: 0.571 - Test Accuracy: 0.464
n_max_leaves: 17.0

Si noti che, eseguendo il modello con la divisione dei prezzi in intervalli discretizzati con la strategia "kmeans", la migliore coppia di performance si ottiene quando l'iper-parametro max_leaf_nodes è pari a 10, in particolare si ha Train accuracy = 0.512 e Test Accuracy = 0.478

 In conclusione possiamo dire che le suddette performance non sono soddisfacenti e pertanto proseguiamo lo studio del dataset ai fini di prevedere il prezzo delle proprietà immobiliari in esso contenuto.