## Import dataset with pandas


In [None]:
import pandas
import numpy as np
from scipy.io import arff

# Parse the ARFF file; loadarff returns a (data, meta) pair and only the
# data records are turned into a DataFrame.
dataset = arff.loadarff('ames_housing.arff')
records = dataset[0]
data = pandas.DataFrame(records)
data.columns

# Range of the target variable.
sale_price = data.Sale_Price
print(sale_price.min(), sale_price.max())


In [None]:
def refineData(df):
    """Return a new frame with three categorical columns recoded as integers.

    The byte-string levels of ``Kitchen_Qual``, ``Utilities`` and
    ``Land_Slope`` are replaced by the integer codes listed below. Values that
    do not appear in a column's mapping are left untouched, and ``df`` itself
    is not modified.
    """
    kitchen_codes = {b'Poor': 0, b'Fair': 1, b'Typical': 2, b'Good': 3, b'Excellent': 4}
    utilities_codes = {b'AllPub': 1, b'NoSewr': 2, b'NoSeWa': 3}
    slope_codes = {b'Gtl': 1, b'Mod': 2, b'Sev': 3}

    # Nested dict form: outer key selects the column, inner dict maps old -> new.
    return df.replace({
        "Kitchen_Qual": kitchen_codes,
        "Utilities": utilities_codes,
        "Land_Slope": slope_codes,
    })

# Recode the categorical columns handled by refineData.
data = refineData(data)
data.describe()
data.head()

# Distribution of the recoded Land_Slope levels.
slope_counts = data.Land_Slope.value_counts()
print(slope_counts)

## Controllo features

Andiamo a vedere le caratteristiche di alcune feature.

In [None]:
# Keep only the columns that are not part of the numeric sub-frame.
non_numeric = data.loc[:, ~data.columns.isin(data._get_numeric_data())]

# Print the level frequencies of each remaining column, separated by a blank line.
for col in non_numeric:
    print()
    print(data[col].value_counts())


In [None]:
# Print the non-numeric features where a single level covers at least 95% of the rows.
for i in data.loc[:, ~data.columns.isin(data._get_numeric_data())]:
    # value_counts() sorts by frequency, so position 0 is the dominant level.
    # .iloc is needed because the index holds the level labels, not integers;
    # a plain [0] relies on a deprecated positional fallback.
    if data[i].value_counts().iloc[0] * 100 / len(data[i]) >= 95:
        print(i)

In [None]:
import matplotlib.pyplot as plt

# Scatter of Sale_Price against the Misc_Feature levels.
fig, axs = plt.subplots()

axs.plot(data.Misc_Feature, data['Sale_Price'], 'o', alpha=0.1)
# The label must name the plotted column (it previously said 'Land_Slope').
axs.set_xlabel('Misc_Feature')
axs.set_ylabel('Sale_Price')

# Last expression of the cell: summary of the feature.
data.Misc_Feature.describe()

La feature "Street" non ha una distribuzione uniforme tra le varie classi e le altre classi non hanno un prezzo interessante rispetto a quello di "Pave", quindi non ha un buon valore predittivo.
La feature "Utilities" non ha una distribuzione uniforme tra le varie classi e le altre classi non hanno un prezzo interessante rispetto a quello di "AllPub", quindi non ha un buon valore predittivo.

Andiamo momentaneamente ad escludere le feature:

- Street
- Utilities
- Land_Slope
- Condition_2
- Roof_Matl
- Heating
- Pool_QC
- Misc_Feature

in quanto almeno il 95% delle istanze cade nello stesso valore della feature.

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Target as a single-column 2-D array, the layout passed to the discretizer.
Y = data['Sale_Price'].to_numpy()
Y = Y.reshape(-1, 1)

# One-hot encode the price into 10 bins.
enc = KBinsDiscretizer(n_bins=10, encode="onehot")
Y_binned = enc.fit_transform(Y)

print(Y_binned)

# Number of samples per bin. getnnz() without an axis returns one scalar
# (total non-zeros), which would plot a single bar; axis=0 counts the
# non-zero entries of each one-hot column, i.e. of each bin.
bin_counts = Y_binned.getnnz(axis=0)

fig, axs = plt.subplots()
axs.bar(np.arange(len(bin_counts)), bin_counts)
axs.set_xlabel('Sale_Price bin')
axs.set_ylabel('Number of samples');



## Plotting

In [None]:
import matplotlib.pyplot as plt

# Sale_Price against the recoded kitchen quality.
fig, axs = plt.subplots()

axs.plot(data.Kitchen_Qual, data['Sale_Price'], 'o', alpha=0.2)
axs.set_xlabel('Kitchen_Qual')
axs.set_ylabel('Sale_Price')

# Sale_Price against the Kitchen_AbvGr column.
fig, axs = plt.subplots()

axs.plot(data.Kitchen_AbvGr, data['Sale_Price'], 'o', alpha=0.2)
axs.set_xlabel('Kitchen_AbvGr')
axs.set_ylabel('Sale_Price')

# Kitchen_AbvGr against Kitchen_Qual.
fig, axs = plt.subplots()

axs.plot(data.Kitchen_Qual, data['Kitchen_AbvGr'], 'o', alpha=0.2)
# Labels follow the plotted data: x is Kitchen_Qual, y is Kitchen_AbvGr
# (they were previously swapped).
axs.set_xlabel('Kitchen_Qual')
axs.set_ylabel('Kitchen_AbvGr');

In [None]:
import matplotlib.pyplot as plt

# One scatter plot of Sale_Price per column of the frame.
# The enumerate() index was never used, so the loop iterates the columns directly.
for i in data.columns:
    fig, axs = plt.subplots()
    axs.plot(data[i], data['Sale_Price'], 'o', alpha=0.2)
    axs.set_xlabel(i)
    axs.set_ylabel('Sale_Price')


# Decision trees
> Applicazione dei decision tree al dataset

In [None]:
import pandas
from scipy.io import arff
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Reload the raw dataset so this section does not depend on the recoding above.
dataset = arff.loadarff('ames_housing.arff')
data = pandas.DataFrame(dataset[0])

# Feature matrix: numeric columns only, minus the target and three excluded
# columns, then minus every column with fewer than 13 distinct values.
X = data._get_numeric_data()
X = X.loc[:, ~X.columns.isin(
    ['Sale_Price', 'Longitude', 'Latitude', 'TotRms_AbvGrd'])]
X = X.loc[:, ~X.columns.isin(
    [col for col in X if len(X[col].value_counts()) < 13])]

y = data['Sale_Price']

# split dataset in train set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Rows of [max_leaf_nodes, train MSE, test MSE], one per tree size.
errors = []

for max_leaves in range(5, 600):
    # train and predict; random_state is fixed so the error curves are
    # reproducible across runs (the tree permutes features at each split).
    dt = DecisionTreeRegressor(max_leaf_nodes=max_leaves, random_state=42)
    dt.fit(X_train, y_train)

    # compute the mean squared error on both splits
    train_mse = mean_squared_error(y_true=y_train, y_pred=dt.predict(X_train))
    test_mse = mean_squared_error(y_true=y_test, y_pred=dt.predict(X_test))

    errors.append([max_leaves, train_mse, test_mse])

errors = np.array(errors)

fig, ax = plt.subplots()
ax.plot(errors[:, 0], errors[:, 1], "o-", label="DT Train", alpha=0.3)
ax.plot(errors[:, 0], errors[:, 2], "o-", label="DT Test", alpha=0.3)
ax.set_ylabel("MSE")
ax.set_xlabel("Number of Leaves")
ax.grid()
ax.legend();

# Comparazione con regressione lineare

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: linear regression fitted on the same train/test split.
reg = LinearRegression()
reg.fit(X_train, y_train)

reg_train_err = mean_squared_error(y_true=y_train, y_pred=reg.predict(X_train))
reg_test_err = mean_squared_error(y_true=y_test, y_pred=reg.predict(X_test))

# Decision tree error curves with the regression errors as horizontal lines.
fig, ax = plt.subplots()
ax.plot(errors[:, 0], errors[:, 1], "o-", label="DT Train", alpha=0.3)
ax.plot(errors[:, 0], errors[:, 2], "o-", label="DT Test", alpha=0.3)

ax.axhline(y=reg_train_err, c='red', ls="--", label="Reg Train")
ax.axhline(y=reg_test_err, c='green', ls="--", label="Reg Test")

ax.set_ylabel("MSE")
# The x values are the tree sizes stored in errors[:, 0].
ax.set_xlabel("Number of Leaves")
ax.grid()
ax.legend();

# Comparazione con KNeighborsRegressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Rows of [k, train MSE, test MSE], one per neighbourhood size.
errors_knn = []
for k in range(5, 300):
    kNN = KNeighborsRegressor(n_neighbors=k)
    kNN.fit(X_train, y_train)

    knn_train_err = mean_squared_error(
        y_true=y_train, y_pred=kNN.predict(X_train))
    knn_test_err = mean_squared_error(
        y_true=y_test, y_pred=kNN.predict(X_test))

    errors_knn.append([k, knn_train_err, knn_test_err])

errors_knn = np.array(errors_knn)

# All three models on one figure.
fig, ax = plt.subplots()
ax.plot(errors[:, 0], errors[:, 1], "o-", label="DT Train", alpha=0.3)
ax.plot(errors[:, 0], errors[:, 2], "o-", label="DT Test", alpha=0.3)

ax.axhline(y=reg_train_err, c='red', ls="--", label="Reg Train")
ax.axhline(y=reg_test_err, c='green', ls="--", label="Reg Test")

ax.plot(errors_knn[:, 0], errors_knn[:, 1], c='blue', ls="dotted", label="kNN Train")
ax.plot(errors_knn[:, 0], errors_knn[:, 2], c='purple', ls="dotted", label="kNN Test")

ax.set_ylabel("MSE")
# The x axis carries two different hyper-parameters, so the label names both.
ax.set_xlabel("Number of Leaves (DT) / k (kNN)")
ax.grid()
ax.legend();