In [203]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from utils import get_scores, plot_predict_result
%matplotlib notebook

In [204]:
df = pd.read_csv('../data/ready/dataset_houses.csv')
df = df[df['precio']<1000000]

features = df.drop(columns='precio')
price = df['precio']

# Usefull methods to explore features
- df.description()
- df.shape
- df.count()
- df.sum()
- df.mean()
- df.std()
- df.corr()

- `df['feature'].unique()`
- `df.nunique()`
- `df['feature'].isna()`



# Modeling

In [205]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import Normalizer
# Train/test
from sklearn.model_selection import train_test_split
# Descomposition
from sklearn.decomposition import PCA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.model_selection import cross_val_score

 

## Split train and test dataset

In [206]:
X_train, X_test = train_test_split(features, test_size=0.2, random_state=0)
y_train, y_test = train_test_split(price, test_size=0.2, random_state=0)

## Check train and test distributions

In [209]:
plt.figure()
plt.hist(y_train, bins=100, density=True, alpha=0.5);
plt.hist(y_test, bins=100, density=True, alpha=0.5);

<IPython.core.display.Javascript object>

## Scale dataset

In [210]:
# scaler =  RobustScaler(quantile_range=(25, 75))
scaler_target = QuantileTransformer(output_distribution='normal', ) 
# scaler = PowerTransformer(method='box-cox')  # Non Linear | Strictly positive data
# scaler = PowerTransformer(method='yeo-johnson')  # Non Linear 
scaler_feature = MinMaxScaler()


In [211]:
scale_train = scaler_feature.fit_transform(X_train)
# scale_train = scaler_feature.transform(X_train)
scale_test = scaler_feature.transform(X_test)

scale_train_price = scaler_target.fit_transform(y_train.to_numpy()[:, None])
scale_test_price = scaler_target.transform(y_test.to_numpy()[:, None])

## PCA analysis

In [212]:
pca = PCA(n_components=.98, )
pca.fit(scale_train)
arr_train = pca.transform(scale_train)
arr_test = pca.transform(scale_test)

In [213]:
pca.explained_variance_ratio_

array([0.16931876, 0.11887327, 0.08923624, 0.0823609 , 0.06028471,
       0.0511455 , 0.04575569, 0.03981486, 0.03205461, 0.02772286,
       0.02554175, 0.02389725, 0.02314492, 0.02093171, 0.01742225,
       0.01586492, 0.01573012, 0.01445018, 0.01335048, 0.01207   ,
       0.01143276, 0.01113541, 0.01036122, 0.01007787, 0.00912704,
       0.00845178, 0.00807326, 0.00719249, 0.00547484])

In [214]:
sns.pairplot(pd.DataFrame(arr_train[:, 0:4]))

<IPython.core.display.Javascript object>

<seaborn.axisgrid.PairGrid at 0x7feb8f2c0d30>

In [215]:
# color = features.loc[features.index, 'zona'].astype('category').cat.codes
color = X_train.iloc[:, 1]
f, ax = plt.subplots(2,2)
ax[0, 0].scatter(arr_train[:,0], arr_train[:,1], c=color)
ax[0, 1].scatter(arr_train[:,1], arr_train[:,2], c=color)
ax[1, 0].scatter(arr_train[:,0], arr_train[:,2], c=color)
ax[1, 1].scatter(arr_train[:,3], arr_train[:,5], c=color)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7feb8e68f828>

# Choosing a model

## Linear Model

In [182]:
model = LinearRegression(fit_intercept=True)
model.fit(arr_train, y_train)
#model.coef_, model.intercept_, model.rank_

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [183]:
predict_price = model.predict(arr_test)
get_scores(y_test, predict_price)


'mse=10174509825.974245, mea=64879.65297842749, r2=0.6760774214697185'

In [184]:
f, ax = plot_predict_result(y_test, predict_price)


<IPython.core.display.Javascript object>

## Transformed Target Regressor

In [185]:
from sklearn.linear_model import RidgeCV
from sklearn.compose import TransformedTargetRegressor

In [186]:
f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)

regr = RidgeCV()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

ax0.scatter(y_test, y_pred)
ax0.plot(ax0.get_xbound(), ax0.get_xbound(), '--k')
ax0.set_ylabel('Target predicted')
ax0.set_xlabel('True Target')
ax0.set_title('Ridge regression \n without target transformation')
ax0.text(1, 9, r'$R^2$={:.2f}, MAE={:.2f}'.format(r2_score(y_test, y_pred),
                                                  median_absolute_error(y_test, y_pred)))


regr_trans = TransformedTargetRegressor(
    regressor=RidgeCV(),
    transformer=QuantileTransformer(n_quantiles=300,
                                    output_distribution='normal'))
regr_trans.fit(X_train, y_train)
y_pred = regr_trans.predict(X_test)

ax1.scatter(y_test, y_pred)
ax1.plot(ax1.get_xbound(), ax1.get_xbound(), '--k')
ax1.set_ylabel('Target predicted')
ax1.set_xlabel('True Target')
ax1.set_title('Ridge regression \n with target transformation')
ax1.text(1, 9, r'$R^2$={:.2f}, MAE={:.2f}'.format(r2_score(y_test, y_pred),
                                                  median_absolute_error(y_test, y_pred)))
# ax1.set_xlim([0, 10])
# ax1.set_ylim([0, 10])

f.suptitle("Boston housing data: distance to employment centers", y=0.035)
f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

plt.show()

<IPython.core.display.Javascript object>

# Regression Tree

In [187]:
cls_tree = DecisionTreeRegressor()
cls_tree.fit(X_train, y_train)


predic_price_tree = cls_tree.predict(X_test)
get_scores(predic_price_tree, y_test)



scores_accuracy = cross_val_score(cls_tree, features, price, cv=10)
# scores_balanced = cross_val_score(cls_tree, features, price, cv=10, scoring="balanced_accuracy")

scores_accuracy, scores_accuracy.mean()

plot_predict_result(y_test, predic_price_tree)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [192]:
f, ax = plt.subplots()
(pd
 .Series(cls_tree.feature_importances_, index=train_features.columns)
 .sort_values()[-10:]
 .plot(ax=ax, kind='barh')
)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7feb7b1ca2b0>

## Random Forest

In [193]:
from sklearn.ensemble import RandomForestRegressor

In [194]:
f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)

rfr = RandomForestRegressor(n_estimators=100,)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

ax0.scatter(y_test, y_pred)
ax0.plot(ax0.get_xbound(), ax0.get_xbound(), '--k')
ax0.set_ylabel('Target predicted')
ax0.set_xlabel('True Target')
ax0.set_title('RandomForestRegressor')
ax0.text(1, 9, r'$R^2$={:.2f}, MAE={:.2f}'.format(r2_score(y_test, y_pred),
                                                  median_absolute_error(y_test, y_pred)))

<IPython.core.display.Javascript object>

Text(1, 9, '$R^2$=0.83, MAE=22236.36')

In [202]:
pd.Series(rfr.feature_importances_, index=X_train.columns).sort_values(ascending=False)

m2_edificados         0.572583
banos                 0.118823
gastos_comunes        0.053793
m2_index              0.035303
garajes               0.034055
banos_extra           0.033856
ZN__carrasco          0.025980
estado                0.014592
dormitorios           0.014427
m2_de_la_terraza      0.011543
ZN__punta carretas    0.010434
ZN__pocitos           0.010023
ZN__otros             0.009428
parrilero             0.006725
plantas               0.006061
ZN__centro            0.005485
cerca_rambla          0.004268
DISP__otro            0.003684
ZN__malvin            0.003252
DISP__al frente       0.003148
ZN__cordon            0.002583
ZN__prado             0.002496
ZN__pocitos nuevo     0.002283
dormitorios_extra     0.002085
garajes_extra         0.001779
ZN__buceo             0.001376
plantas_extra         0.001347
ZN__la blanqueada     0.001291
ZN__parque batlle     0.001066
vivienda_social       0.000923
ZN__aguada            0.000913
penthouse             0.000895
DISP__co