### Importar bibliotecas

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import load_boston


### Carregar Dados

In [None]:
# Load the Boston housing dataset and expose it as a DataFrame: the following
# cells call DataFrame methods (isnull, drop) on `data` and read the target
# through the 'medv' column, which the raw Bunch returned by load_boston does
# not provide.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so
# this cell needs an older scikit-learn release.
boston = load_boston()
data = pd.DataFrame(boston.data, columns=boston.feature_names)
data['medv'] = boston.target  # target column name used by the preprocessing cell
# Shape of the feature matrix only (the target column is not counted).
print(boston.data.shape)

### Pré-processamento de dados

In [None]:
# Report the number of missing values per column. The result is printed
# explicitly because a notebook only echoes a bare expression when it is the
# last statement of the cell, which is not the case here.
print(data.isnull().sum())
# Separate the feature columns from the target column 'medv'.
x = data.drop('medv', axis=1)
y = data['medv']

### Separar dados para treino e teste 

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=123,
)

### Treinar diferentes modelos

In [None]:
# Fit one regressor of each family on the training split, all with their
# default hyperparameters, so they can be compared on the same test set.

# Ordinary least-squares baseline.
linear_model = LinearRegression().fit(x_train, y_train)

# Single decision tree.
decision_tree = DecisionTreeRegressor().fit(x_train, y_train)

# Random forest.
random_model = RandomForestRegressor().fit(x_train, y_train)

# Gradient boosting (XGBoost).
xgb_model = XGBRegressor().fit(x_train, y_train)

# Gradient boosting (CatBoost).
cat_model = CatBoostRegressor().fit(x_train, y_train)


### Comparar as métricas estatísticas para selecionar o melhor modelo

In [None]:
def evaluate_Regression_models(model, x_test, y_test):
    """Print regression error metrics for ``model`` on a held-out set.

    Predictions come from ``model.predict(x_test)`` and are compared with
    ``y_test``. Four values are printed, in this order: mean absolute error,
    mean squared error, root mean squared error and the R2 score.
    Nothing is returned.
    """
    predicted = model.predict(x_test)

    print("Mean Absolute Error:", mean_absolute_error(y_test, predicted))

    mse = mean_squared_error(y_test, predicted)
    print("Mean Squared Error : ", mse)
    # RMSE is the square root of the MSE computed just above.
    print("Root Mean Squared Error : ", np.sqrt(mse))

    print("R2 Score : ", r2_score(y_test, predicted))

In [None]:
# Test-set metrics for the linear regression model.
evaluate_Regression_models(model=linear_model, x_test=x_test, y_test=y_test)

In [None]:
# Test-set metrics for the decision tree model.
evaluate_Regression_models(model=decision_tree, x_test=x_test, y_test=y_test)

In [None]:
# Test-set metrics for the random forest model.
evaluate_Regression_models(model=random_model, x_test=x_test, y_test=y_test)

In [None]:
# Test-set metrics for the XGBoost model.
evaluate_Regression_models(model=xgb_model, x_test=x_test, y_test=y_test)

In [None]:
# Test-set metrics for the CatBoost model.
evaluate_Regression_models(model=cat_model, x_test=x_test, y_test=y_test)

### Ajustar os hiperparâmetros do melhor modelo

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in the random forest.
n_estimators = [int(count) for count in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at each split. 1.0 (use every feature)
# replaces the former 'auto' option: for regressors it meant the same thing,
# and recent scikit-learn releases no longer accept 'auto'.
max_features = [1.0, 'sqrt']
# Maximum number of levels in each tree; None is added as an extra candidate.
max_depth = [int(depth) for depth in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample of the training data.
bootstrap = [True, False]
# Candidate values the randomized hyperparameter search samples from.
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [None]:
# Use the random grid to look for the best hyperparameters.
rf = RandomForestRegressor()
# Randomized search: 100 sampled combinations, each scored with 3-fold
# cross-validation.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search on the training split.
rf_random.fit(x_train, y_train)

In [None]:
# Evaluate the result of the hyperparameter tuning.
def evaluate(model, test_features, test_labels):
    """Print and return an accuracy score for ``model`` on a held-out set.

    The score is ``100 - MAPE``, where MAPE is the mean absolute percentage
    error of ``model.predict(test_features)`` relative to ``test_labels``.
    The mean absolute error is printed as well, in the units of the labels.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_features: Features passed unchanged to ``model.predict``.
        test_labels: True target values; each error is divided by its label,
            so a label equal to zero makes the percentage error non-finite.

    Returns:
        The accuracy, ``100 - MAPE``, as a percentage.
    """
    predictions = model.predict(test_features)
    errors = np.abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    # No unit is printed: the error is expressed in whatever unit the labels use.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy


# Baseline: a small forest of 10 trees with a fixed seed.
base_model = RandomForestRegressor(n_estimators=10, random_state=42).fit(x_train, y_train)
base_accuracy = evaluate(base_model, x_test, y_test)

# Best estimator found by the randomized search.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, x_test, y_test)

# Relative accuracy gain of the tuned model over the baseline, in percent.
print(
    'Improvement of {:0.2f}%.'.format(
        100 * (random_accuracy - base_accuracy) / base_accuracy
    )
)

### Plotar gráficos para avaliar os modelos...