### Housing prices training using:
* Decision Tree
* Random Forest
* Gradient boosting: XGBoost

First we have to get the data and
we have to select features & target + get the data ready

In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

X_full = pd.read_csv("data/train.csv", index_col = "Id")
X_test = pd.read_csv("data/test.csv", index_col = "Id")
y_full = X_full.SalePrice
X_full.drop(['SalePrice'], axis = 1, inplace = True)

plt.rc('figure', autolayout = True)
plt.rc('axes', labelweight = 'bold', labelsize = 'large',
       titleweight = 'bold', titlesize = 18, titlepad = 18)
#plt.rc('animation', html = 'html5')
print(X_full.shape)
print(X_test.shape)
X_full.head()

cols_numerical = [col for col in X_full.columns
                  if X_full[col].dtype in ['int64', 'float64']]
cols_categorical = [col for col in X_full.columns
                    if X_full[col].dtype == 'object']
                    #and X_full[col].nunique() < 10]
print(cols_numerical)
print(cols_categorical)
used_cols = cols_categorical + cols_numerical

X_col_full = X_full[used_cols].copy()
X_working_test = X_test[used_cols].copy()

# X_full.describe()
## Splitting the data

(1460, 79)
(1459, 79)
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'G

### Now that we got the data ready, we have to prepare the preprocessor for getting the data ready
* SimpleImputer + StandardScaler for ints
* OHE for categorical data

In [85]:
## Data prelucations
numeric_transformer = Pipeline( steps = [
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore', sparse = False)

## So we use SimpleImputer and StandardScaler for numbers and OHE for cat
preprocessor = ColumnTransformer(transformers = [
    ('num', numeric_transformer, cols_numerical),
    ('cat', categorical_transformer, cols_categorical)
])

### Since we have the preprocessor we need the model & train data

In [86]:
model_DTR = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeRegressor(
        max_leaf_nodes = 100,
        random_state = 1
    ))
])

answer_mean = -1 * cross_val_score(model_DTR, X_full, y_full,
                                   cv = 5,
                                   scoring = 'neg_mean_absolute_error')

print("MSE {}".format(answer_mean.mean()))
#predicted_train.plot()

MSE 24335.008061980527


1. Got 17650 MAE with
```python
    model_RF = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('classifier', RandomForestRegressor(
            n_estimators = 200,
            max_depth = 21,
            n_jobs = 2,
            random_state = 1
        ))
    ])
```

In [87]:
model_RF = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestRegressor(
        n_estimators = 200,
        max_depth = 21,
        n_jobs = 2,
        random_state = 1
    ))
])
answer_mean_RF = -1 * cross_val_score(model_RF, X_full, y_full,
                                   cv = 5,
                                   scoring = 'neg_mean_absolute_error')

print("MSE {}".format(answer_mean_RF.mean()))


MSE 17628.90453362497


In [96]:
X_train, X_valid, y_train, y_valid =  train_test_split(X_col_full,
                                                       y_full,
                                                       train_size=0.8,
                                                       test_size=0.2,
                                                       random_state=0)
OH_X_train_cols = pd.DataFrame(categorical_transformer.fit_transform(X_train[cols_categorical]))
OH_X_valid_cols = pd.DataFrame(categorical_transformer.transform(X_valid[cols_categorical]))
OH_X_test = pd.DataFrame(categorical_transformer.transform(X_working_test[cols_categorical]))
OH_X_full = pd.DataFrame(categorical_transformer.transform(X_col_full[cols_categorical]))

OH_X_train_cols.index = X_train.index
OH_X_valid_cols.index = X_valid.index
OH_X_test.index = X_working_test.index
OH_X_full.index = X_col_full.index

num_X_train = pd.DataFrame(numeric_transformer.fit_transform(X_train[cols_numerical]))
num_X_valid = pd.DataFrame(numeric_transformer.transform(X_valid[cols_numerical]))
num_X_test = pd.DataFrame(numeric_transformer.transform(X_working_test[cols_numerical]))
num_X_full = pd.DataFrame(numeric_transformer.transform(X_col_full[cols_numerical]))

num_X_train.columns = X_train[cols_numerical].columns
num_X_valid.columns = X_valid[cols_numerical].columns
num_X_test.columns = X_working_test[cols_numerical].columns
num_X_full.columns = X_col_full[cols_numerical].columns

num_X_train.index = X_train[cols_numerical].index
num_X_valid.index = X_valid[cols_numerical].index
num_X_test.index = X_working_test[cols_numerical].index
num_X_full.index = X_col_full[cols_numerical].index

fixed_X_train = pd.concat([OH_X_train_cols, num_X_train], axis = 1)
fixed_X_valid = pd.concat([OH_X_valid_cols, num_X_valid], axis = 1)
final_X_test = pd.concat([OH_X_test, num_X_test], axis = 1)
final_X_full = pd.concat([OH_X_full, num_X_full], axis = 1)
# MSE 16489.772206763697
model_XGBR = XGBRegressor(
    n_estimators = 295,
    learning_rate = 0.05,
    #gamma=0.2, #???? OMEGALUL
    #min_child_weight=2,
    #max_depth=20,
    n_jobs = 3,
    random_state = 1
)
# answer_mean_XGBR = -1 * cross_val_score(model_XGBR, X_full, y_full,
#                                    cv = 5,
#                                    scoring = 'neg_mean_absolute_error')
model_XGBR.fit(final_X_full, y_full,
               # early_stopping_rounds = 30,
               # eval_metric="mae",
               # eval_set = [(fixed_X_valid, y_valid)],
               verbose = False
               )
answer = model_XGBR.predict(fixed_X_valid)
answer_mean_XGBR = mean_absolute_error(y_valid, answer)

test_answer = model_XGBR.predict(final_X_test)
print("MSE {}".format(answer_mean_XGBR))
print(test_answer.shape)
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': test_answer})
print(output.head())
output.to_csv('submission_XGBR.cvs', index = False)

MSE 3454.9249785958905
(1459,)
     Id      SalePrice
0  1461  124581.750000
1  1462  157772.671875
2  1463  186010.937500
3  1464  184028.562500
4  1465  186309.015625


In [89]:
# tester = np.sum([X_full[col].nunique() for col in X_full.columns if X_full[col].dtype == 'object'and X_full[col].nunique() < 10] )
# tester
fixed_X_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.434743,-0.742575,0.874116,-0.364703,-0.115333,4.546911,-0.058085,-0.092588,0.256396,-0.613562
871,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,-0.791166,-0.742575,-0.700461,-0.364703,-0.115333,-0.269109,-0.058085,-0.092588,0.623394,0.884118
93,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,-0.198864,-0.742575,-0.700461,0.332315,-0.115333,-0.269109,-0.058085,-0.092588,0.623394,0.884118
818,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.831204,0.436137,0.159725,-0.364703,-0.115333,-0.269109,-0.058085,-0.092588,0.256396,0.135278
303,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.764331,2.935007,0.480472,-0.364703,-0.115333,-0.269109,-0.058085,-0.092588,-1.945593,-1.362401
