## Estudo de Caso - Precificação das casas em Boston

Dataset disponibilizado pelo sklearn, muito conhecido pelos estudantes de Data Science 

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [None]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the pinned scikit-learn version before re-running this notebook.
housing = datasets.load_boston()

In [None]:
# Print the loaded dataset object in full (data, target and metadata).
# NOTE(review): this dumps every array; consider inspecting keys/shapes instead.
print(housing)

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]]), 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 1

In [None]:
# Tabular view of the feature matrix, one named column per feature.
df = pd.DataFrame(
    data=housing.data,
    columns=housing.feature_names,
)

In [None]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [None]:
# List the fields available on the dataset object.
housing.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [None]:
# Print the description text shipped with the dataset.
print(housing.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [None]:
# Print the target values (the quantity the models below are trained to predict).
# NOTE(review): prints the whole array; a slice would keep the output short.
print(housing.target)

[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 36.4 31.1 29.1 50.
 33.3 3

In [None]:
# X: house characteristics (features); y: known house prices (target).
X, y = housing.data, housing.target

# MODELING TECHNIQUES

1. Regressão Linear do SKLEARN
<https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression>
2. Support Vector Regression do SKLEARN
<https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR>
3. Decision Tree Regression do XGBoost
<https://xgboost.readthedocs.io/en/latest/python/python_api.html>

Modeling Assumptions:

Apenas variáveis numéricas

# TEST DESIGN

## Dataset split:

Separação padrão do dataset em treino/teste, com 20% dos dados reservados para teste, via método `train_test_split` do SKLEARN

## Métrica de avaliação do modelo:

Avaliação pelas métricas MSE e RMSE, que penalizam grandes erros de previsão.
Utilizando o método do SKLEARN.


<https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error>

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

TÉCNICA 1. REGRESSÃO LINEAR

In [None]:
# Linear-regression baseline fitted on the training split.
# fit() returns the estimator itself, so re-binding keeps the same object.
regLinear = LinearRegression()
regLinear = regLinear.fit(X_train, y_train)

In [None]:
# Predict targets for the held-out test split with the linear model.
yLinear = regLinear.predict(X_test)

In [None]:
# Mean squared error of the linear model on the test split.
MSELinear = mean_squared_error(y_true=y_test, y_pred=yLinear)

In [None]:
# Report the linear model's MSE and its square root (RMSE).
print("MSE Linear:", MSELinear)
print("RMSE Linear:", np.sqrt(MSELinear))

MSE Linear: 24.291119474973616
RMSE Linear: 4.9286021826653466


TÉCNICA 2. SVR

In [None]:
# SVR's default RBF kernel is distance-based, and the features here span very
# different ranges (e.g. NOX around 0.5 vs TAX around 300), so unscaled inputs
# let the large-magnitude columns dominate the kernel. Standardise inside a
# pipeline so the scaler is fitted on the training split only and re-applied
# automatically whenever regSVR.predict() is called.
regSVR = make_pipeline(StandardScaler(), SVR()).fit(X_train, y_train)

In [None]:
# Predict targets for the held-out test split with the SVR model.
ySVR = regSVR.predict(X_test)

In [None]:
# Mean squared error of the SVR model on the test split.
MSESVR = mean_squared_error(y_true=y_test, y_pred=ySVR)

In [None]:
# Report the SVR model's MSE and its square root (RMSE).
print("MSE SVR:", MSESVR)
print("RMSE SVR:", np.sqrt(MSESVR))

MSE SVR: 52.8383657679667
RMSE SVR: 7.269000327965785


TÉCNICA 3. Decision Tree Regression (XGBoost)

In [None]:
# XGBoost regressor with library-default hyperparameters, fitted on the training split.
# fit() returns the estimator itself, so re-binding keeps the same object.
regXGB = XGBRegressor()
regXGB = regXGB.fit(X_train, y_train)



In [None]:
# Predict targets for the held-out test split with the default XGBoost model.
yXGB = regXGB.predict(X_test)

In [None]:
# Mean squared error of the default XGBoost model on the test split.
MSEXGB = mean_squared_error(y_true=y_test, y_pred=yXGB)

In [None]:
# Report the default XGBoost model's MSE and its square root (RMSE).
print("MSE XGB:", MSEXGB)
print("RMSE XGB:", np.sqrt(MSEXGB))

MSE XGB: 7.2667278403836315
RMSE XGB: 2.6956868958363156


# OTIMIZAÇÃO DE HIPERPARÂMETROS


Utilizando o método GridSearchCV do SKLEARN.


<https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV>

In [None]:
# List the hyperparameter names exposed by the fitted XGBoost regressor,
# i.e. the keys that may be used in the search grid below.
regXGB.get_params().keys()

dict_keys(['base_score', 'booster', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'gamma', 'importance_type', 'learning_rate', 'max_delta_step', 'max_depth', 'min_child_weight', 'missing', 'n_estimators', 'n_jobs', 'nthread', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'seed', 'silent', 'subsample', 'verbosity'])

In [None]:
# Search grid for the XGBoost regressor. Entries with a single value are held
# fixed; entries with several values are searched exhaustively
# (3 * 3 * 2 * 2 * 2 * 2 = 144 combinations).
parameters = dict(
    max_depth=[5, 6, 7],
    learning_rate=[0.1, 0.2, 0.3],
    objective=["reg:squarederror"],
    booster=["gbtree"],
    n_jobs=[5],
    gamma=[0, 1],
    min_child_weight=[1, 3],
    max_delta_step=[0, 1],
    subsample=[0.5, 1],
)

In [None]:
# Exhaustive cross-validated search over `parameters`.
# The metric used to rank candidates is chosen by `scoring`. Passing a metric
# name through `refit` only has that meaning when several scorers are supplied;
# without `scoring` the search ranks candidates by the estimator's default
# score (R^2) rather than by MSE, which is the metric this notebook evaluates.
xgbGrid = GridSearchCV(
    XGBRegressor(),
    parameters,
    scoring='neg_mean_squared_error',
    refit=True,  # retrain the best candidate on the whole training split
    verbose=True,
)

In [None]:
# Run the grid search on the training split. fit() returns the search object
# itself, so xgbGridModel is simply another name for the fitted xgbGrid.
xgbGrid.fit(X_train, y_train)
xgbGridModel = xgbGrid

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 720 out of 720 | elapsed:  2.0min finished


In [None]:
# Show the hyperparameter combination selected by the grid search.
xgbGridModel.best_params_

{'booster': 'gbtree',
 'gamma': 1,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 7,
 'min_child_weight': 1,
 'n_jobs': 5,
 'objective': 'reg:squarederror',
 'subsample': 1}

In [None]:
# Predict targets for the held-out test split with the tuned model
# (the search object delegates predict() to its refitted best estimator).
yGrid = xgbGridModel.predict(X_test)

In [None]:
# Mean squared error of the tuned XGBoost model on the test split.
MSEGrid = mean_squared_error(y_true=y_test, y_pred=yGrid)

In [None]:
# Report the tuned XGBoost model's MSE and its square root (RMSE).
print('MSE XGB Grid:', MSEGrid)
print('RMSE XGB Grid:', np.sqrt(MSEGrid))

MSE XGB Grid: 6.548085815642261
RMSE XGB Grid: 2.558922784228211
