### Housing prices training using:
* Decision Tree
* Random Forest
* Gradient boosting: XGBoost

First, we load the data, then select the features and the target,
and split the result into training and validation sets.

In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# --- Load data ---------------------------------------------------------------
# Both CSVs are indexed by their `Id` column.
train_raw = pd.read_csv("data/train.csv", index_col="Id")
X_test = pd.read_csv("data/test.csv", index_col="Id")

# Separate the target from the features without mutating the frame that was
# read (no `inplace=True`), so `train_raw` stays available for inspection.
y_full = train_raw["SalePrice"]
X_full = train_raw.drop(columns=["SalePrice"])

# --- Plot defaults -----------------------------------------------------------
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=18, titlepad=18)

print(X_full.shape)
print(X_test.shape)
# To preview the data, evaluate `X_full.head()` as the LAST expression of a
# cell; in the middle of a cell its result is discarded and nothing is shown.

# --- Column selection --------------------------------------------------------
# Categorical columns with this many unique values or more are left out, which
# bounds the number of columns produced by one-hot encoding.
MAX_CATEGORICAL_CARDINALITY = 10

cols_numerical = [col for col in X_full.columns
                  if X_full[col].dtype in ['int64', 'float64']]
cols_categorical = [col for col in X_full.columns
                    if X_full[col].dtype == 'object'
                    and X_full[col].nunique() < MAX_CATEGORICAL_CARDINALITY]
print(cols_numerical)
print(cols_categorical)
used_cols = cols_categorical + cols_numerical

# Work on copies restricted to the selected columns so the frames above are
# never modified by later steps.
X_working_full = X_full[used_cols].copy()
X_working_test = X_test[used_cols].copy()

# --- Train / validation split ------------------------------------------------
# 80 % of the rows go to training and 20 % to validation; the fixed
# `random_state` makes the split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_working_full,
    y_full,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

(1460, 79)
(1459, 79)
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQ

### Now that the data is loaded and split, build the preprocessor that prepares it for the models
* SimpleImputer + StandardScaler for numerical columns (ints and floats)
* One-hot encoding (OHE) for categorical columns

In [108]:
## Data preprocessing
# Numerical columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: one-hot encode. `handle_unknown='ignore'` lets the
# encoder accept categories at transform time that were not present during fit
# instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, cols_numerical),
        ('cat', categorical_transformer, cols_categorical),
    ]
)

### With the preprocessor defined, build the model and train it on the training data

In [135]:
# Decision-tree baseline: preprocessing followed by a single regression tree
# limited to 100 leaf nodes.
# The final step is named 'classifier' even though the estimator is a
# regressor; the name is kept so `named_steps['classifier']` keeps working.
# NOTE(review): the pipeline holds the shared `preprocessor` object itself, not
# a copy, so fitting this model also fits that shared object.
model_DTR = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)),
    ]
)

model_DTR.fit(X_train, y_train)

# Predictions on both splits; only the validation predictions are scored below.
predicted_train = model_DTR.predict(X_train)
predicted_validation = model_DTR.predict(X_valid)

mae_validation = mean_absolute_error(y_valid, predicted_validation)
mse_validation = mean_squared_error(y_valid, predicted_validation)
print("MAE {}".format(mae_validation))
print("MSE {}".format(mse_validation))

MAE 26290.426919867234
MSE 1948230183.7805803


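1. Got an MAE of 17178.613 with the configuration below (note: the run recorded in the next cell reports an MAE of 17289.907 for the same settings, so this figure may come from an earlier run):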
```python
    model_RF = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('classifier', RandomForestRegressor(
            n_estimators = 250,
            max_depth = 23,
            random_state = 1
        ))
    ])
```

In [164]:
# Random forest: preprocessing followed by 250 trees, each limited to depth 23.
# The final step is named 'classifier' even though the estimator is a
# regressor; the name is kept so `named_steps['classifier']` keeps working.
# NOTE(review): the pipeline holds the shared `preprocessor` object itself, not
# a copy, so fitting this model also (re)fits that shared object.
model_RF = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestRegressor(n_estimators=250,
                                             max_depth=23,
                                             random_state=1)),
    ]
)
model_RF.fit(X_train, y_train)

# Predictions on both splits; only the validation predictions are scored below.
predicted_train = model_RF.predict(X_train)
predicted_validation = model_RF.predict(X_valid)

mae_validation = mean_absolute_error(y_valid, predicted_validation)
mse_validation = mean_squared_error(y_valid, predicted_validation)
print("MAE {}".format(mae_validation))
print("MSE {}".format(mse_validation))

MAE 17289.906583333337
MSE 1128221776.615161
