Baixando o arquivo da URL para o sistema local e extraindo o seu conteúdo

In [None]:
import requests
import tarfile

In [None]:
URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz"

# Download the archive. The timeout prevents the cell from hanging forever, and
# raise_for_status() stops us from saving an HTTP error page as if it were a .tgz.
response = requests.get(URL, timeout=60)
response.raise_for_status()

# Context managers guarantee both files are closed even if a step fails.
with open("housing.tgz", "wb") as archive_file:
    archive_file.write(response.content)

# NOTE(review): extractall() trusts the member paths inside the archive; this is
# acceptable only because the archive comes from a known repository.
with tarfile.open('housing.tgz') as my_tar:
    my_tar.extractall()  # extracts into the current working directory

Etapa de importação dos dados para o script e visualizações preliminares

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the extracted CSV into the working DataFrame.
csv_path = 'housing.csv'
df = pd.read_csv(csv_path)

# Show the first rows as a quick sanity check.
df.head()

In [None]:
# Print column names, dtypes and non-null counts; handy for spotting columns with missing values.
df.info()

In [None]:
# Count how many rows fall into each 'ocean_proximity' category.
proximity = df['ocean_proximity']
proximity.value_counts()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the dataset's columns.
df.describe()

In [None]:
import matplotlib.pyplot as plt

# One histogram per column, 50 bins each, on a large canvas.
df.hist(figsize=(20, 15), bins=50)
plt.show()

Etapa de divisão dos dados em treino e teste; aqui foram selecionados 20% dos dados para teste

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

Etapa de análise exploratória de dados

In [None]:
# Explore on a copy so the analysis below never alters the training split.
train_ead = train_df.copy(deep=True)

In [None]:
# Geographic scatter: marker size tracks population, colour tracks median house value.
train_ead.plot.scatter(
    x='longitude',
    y='latitude',
    alpha=0.4,
    s=train_ead['population'] / 100,
    label='população',
    figsize=(10, 7),
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    colorbar=True,
)

In [None]:
# Pearson correlation of every numeric column against the target.
# The text column 'ocean_proximity' is excluded first: on pandas >= 2.0,
# DataFrame.corr() raises when it meets non-numeric data. select_dtypes is used
# (rather than numeric_only=True) because it works on every pandas version.
corr_matrix = train_ead.select_dtypes(include='number').corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# Pairwise scatter plots for the attributes of most interest.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
selected = train_ead[attributes]
pd.plotting.scatter_matrix(selected, figsize=(12, 8))
plt.show()

Combinando atributos

In [None]:
# Derive ratio features from the raw per-district totals.
train_ead["rooms_per_household"] = train_ead["total_rooms"] / train_ead["households"]
train_ead["bedrooms_per_room"] = train_ead["total_bedrooms"] / train_ead["total_rooms"]
train_ead["population_per_household"] = train_ead["population"] / train_ead["households"]

# Recompute the correlations including the new features.
# The text column 'ocean_proximity' is excluded first: on pandas >= 2.0,
# DataFrame.corr() raises when it meets non-numeric data.
corr_matrix = train_ead.select_dtypes(include='number').corr()
corr_matrix['median_house_value'].sort_values(ascending=False)

Preparando os dados para o modelo

Separando os atributos do label

In [None]:
# Features: every column except the target.
train_attbr = train_df.drop(columns='median_house_value')
# Target: copied so later edits to the labels cannot touch train_df.
train_labels = train_df['median_house_value'].copy()

Lidando com valores nulos

In [None]:
# Drop the rows that have no 'total_bedrooms' value.
train_attbr = train_attbr.dropna(subset=['total_bedrooms'])

# Keep the labels aligned with the surviving rows. Without this, train_labels
# has more rows than train_attbr and model fitting fails with an
# "inconsistent numbers of samples" error.
train_labels = train_labels.loc[train_attbr.index]

Lidando com textos e atributos categóricos

In [None]:
# One-hot encode the categorical column into one indicator column per category.
categorical_columns = ['ocean_proximity']
train_attbr = pd.get_dummies(train_attbr, columns=categorical_columns)

Sequência de Transformação

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Every column is numeric after one-hot encoding, so all of them go through the scaler.
numeric_columns_train = train_attbr.columns
pipeline = ColumnTransformer(
    [('num', StandardScaler(), numeric_columns_train)]
)

# Fit the scaler on the training data and overwrite the columns with the scaled values.
scaled_train = pipeline.fit_transform(train_attbr)
train_attbr[list(train_attbr)] = scaled_train

Seleção e treino de modelo com dados de treino

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the prepared training data.
lin_reg = LinearRegression()
lin_reg.fit(X=train_attbr, y=train_labels)

Avaliação do treino

In [None]:
from sklearn.metrics import mean_squared_error

# Training-set RMSE for the linear model (same units as the target).
housing_predictions = lin_reg.predict(train_attbr)
lin_mse = mean_squared_error(y_true=train_labels, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

Selecionando um modelo mais robusto

In [None]:
from sklearn.tree import DecisionTreeRegressor

# A fixed random_state makes the tree (and its cross-validation scores)
# reproducible across runs; 42 matches the seed used for the train/test split.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train_attbr, train_labels)

In [None]:
# Training-set RMSE for the decision tree.
housing_predictions = tree_reg.predict(train_attbr)
tree_mse = mean_squared_error(y_true=train_labels, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

Um RMSE igual a 0 nos dados de treino indica overfitting; usamos validação cruzada (cross-validation) para avaliar o modelo

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation. The scorer returns negated MSE values,
# so the sign is flipped before taking the square root.
scores = cross_val_score(
    tree_reg,
    train_attbr,
    train_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse_scores = np.sqrt(-scores)
tree_rmse_scores.mean()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A fixed random_state makes the forest (bootstrap samples and feature
# subsets) reproducible across runs; 42 matches the train/test split seed.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(train_attbr, train_labels)

In [None]:
# Training-set RMSE for the random forest.
housing_predictions = forest_reg.predict(train_attbr)
forest_mse = mean_squared_error(y_true=train_labels, y_pred=housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

In [None]:
# 10-fold cross-validation of the random forest. The scorer returns negated
# MSE values, so the sign is flipped before taking the square root.
scores = cross_val_score(
    forest_reg,
    train_attbr,
    train_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
forest_rmse_scores = np.sqrt(-scores)
forest_rmse_scores.mean()

Afinamento de modelo

Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: the first explores forests with the default bootstrap setting,
# the second explores forests built without bootstrapping.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# A fixed random_state keeps every candidate forest reproducible, so the
# selected best_params_ does not change from one run to the next.
forest_reg = RandomForestRegressor(random_state=42)

# 5-fold cross-validated search scored on negated MSE (higher is better).
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

grid_search.fit(train_attbr, train_labels)

grid_search.best_params_

Aplicando nos dados de teste

In [None]:
# Work on a copy so test_df itself stays untouched.
test_work = test_df.copy()

# Same missing-value handling as the training set. Rows are dropped before the
# feature/label split so that both stay aligned.
test_work = test_work.dropna(subset=['total_bedrooms'])

test_attbr = test_work.drop('median_house_value', axis=1)
test_labels = test_work['median_house_value'].copy()

test_attbr = pd.get_dummies(test_attbr, columns=['ocean_proximity'])

# Force the exact column set and order used for training. A rare category that
# is absent from the test split would otherwise leave a dummy column missing
# and break prediction.
test_attbr = test_attbr.reindex(columns=train_attbr.columns, fill_value=0)
numeric_columns_test = test_attbr.columns

# Reuse the scaler fitted on the TRAINING data (transform, not fit_transform).
# Re-fitting on the test set would scale the features with different
# means/variances than the model was trained on and leak test statistics.
test_attbr[list(test_attbr)] = pipeline.transform(test_attbr)

In [None]:
# Best estimator found by the grid search.
final_model = grid_search.best_estimator_

# Predict on the prepared test features and compute the test-set RMSE.
final_predictions = final_model.predict(test_attbr)
final_mse = mean_squared_error(y_true=test_labels, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
# Display the RMSE of the tuned model on the held-out test set.
final_rmse