In [1]:
# Importando Bibliotecas

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import display, Markdown
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from ray import tune
from ray.tune import CLIReporter
from ray.air import session
from ray.air.checkpoint import Checkpoint
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split, TensorDataset, DataLoader
import torchvision
import torchvision.transforms as transforms
import os


%matplotlib inline

In [2]:
# Importando o Dataset
 
from sklearn.datasets import load_boston
boston_dataset = load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [3]:
# Obtendo as chaves da dict 'boston_dataset'

boston_dataset.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename', 'data_module'])

In [4]:
# Carregando o dataset com o Pandas

display(Markdown("**Dataset:**"))
boston_x = pd.DataFrame(data=boston_dataset.data, columns=boston_dataset.feature_names)
boston_x

**Dataset:**

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [5]:
# Carregando a variável target do dataset com o Pandas

display(Markdown("**Variável Target:**"))
boston_y = pd.DataFrame(data=boston_dataset.target, columns=['MEDV'])
#boston_y = boston_y/20

**Variável Target:**

In [6]:
# Exibindo as estatísticas descritivas do dataset

display(Markdown("**Estatísticas descritivas do Dataset:**"))
boston_x.describe()

**Estatísticas descritivas do Dataset:**

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [7]:
# Divisão do dataset em conjunto de treinamento e conjunto de teste (para variáveis independentes e variável target)

x_train, x_valid_test, y_train, y_valid_test = train_test_split(boston_x, boston_y, test_size=0.4, random_state=5)
x_valid, x_test, y_valid, y_test = train_test_split(x_valid_test, y_valid_test, test_size=0.5, random_state=5)

y_train = np.array(y_train).reshape(-1, 1)
y_valid = np.array(y_valid).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

print(x_train.shape, y_train.shape)
print()
print(x_valid.shape, y_valid.shape)
print()
print(x_test.shape, y_test.shape)

(303, 13) (303, 1)

(101, 13) (101, 1)

(102, 13) (102, 1)


In [8]:
# Preparando a normalização min-max (coletando características min-max das features)

scaler_x = MinMaxScaler().fit(x_train)
scaler_y = MinMaxScaler().fit(y_train)

In [9]:
# Aplicando a normalização min-max no dataset sem a variável target

x_train = scaler_x.transform(x_train)
x_valid = scaler_x.transform(x_valid)
x_test = scaler_x.transform(x_test)

y_train = scaler_y.transform(y_train)
y_valid = scaler_y.transform(y_valid)
y_test = scaler_y.transform(y_test)

In [10]:
# Transformando o array das variáveis independentes em tensores

tensor_x_train = torch.tensor(x_train, dtype=torch.float32)
tensor_y_train = torch.tensor(y_train, dtype=torch.float32)

tensor_x_valid = torch.tensor(x_valid, dtype=torch.float32)
tensor_y_valid = torch.tensor(y_valid, dtype=torch.float32)
                             
tensor_x_test = torch.tensor(x_test, dtype=torch.float32)
tensor_y_test = torch.tensor(y_test, dtype=torch.float32)
                             
print(tensor_x_train.shape, tensor_y_train.shape)
print(tensor_x_valid.shape, tensor_y_valid.shape)
print(tensor_x_test.shape, tensor_y_test.shape)

torch.Size([303, 13]) torch.Size([303, 1])
torch.Size([101, 13]) torch.Size([101, 1])
torch.Size([102, 13]) torch.Size([102, 1])


In [11]:
# Cria o dataset e o dataloader para dividir os dados de treinamento em lotes (batchs)

n_batchs = 10

train_dataset = TensorDataset(tensor_x_train, tensor_y_train)
train_dataloader = DataLoader(train_dataset, batch_size = n_batchs, shuffle=True)

In [12]:
class Net(nn.Module):
    def __init__(self, n_hidden=30):
        super(Net, self).__init__()
        self.lr1 = nn.Linear(13, n_hidden)
        self.dp = nn.Dropout(0.2)
        self.tanh = nn.Tanh()
        self.lr2 = nn.Linear(n_hidden, 1)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.lr1(x)
        x = self.dp(x)
        x = self.tanh(x)
        x = self.lr2(x)
        x = self.relu(x)
        
        return x

In [13]:
def train(config):
    net = Net(config["n_hidden"])
    
    loss_function = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=config["lr"], weight_decay=1e-5)
    
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state = torch.load(os.path.join(loaded_checkpoint_dir, "checkpoint.pt"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)   

    for epoch in range(config["n_epochs"]):
        losses = []
        r2_s = []
        for x, y in config["train_dataloader"]:
        
            pred_y = net(x.to(torch.float32))

            optimizer.zero_grad()
            loss = loss_function(pred_y, y.to(torch.float32))
            r2 = r2_score(y.to(torch.float32), pred_y.detach().numpy())
            r2_s.append(r2)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            
        loss_train = np.mean(losses)
        r2_train = np.mean(r2_s)
            
        
    with torch.no_grad():
        
        pred_y = net(config["valid_x"])

        loss_valid = loss_function(pred_y, config["valid_y"])
        r2_valid = r2_score(config["valid_y"], pred_y)
        
    
    os.makedirs("my_model", exist_ok=True)
    torch.save((net.state_dict(), optimizer.state_dict()), "my_model/checkpoint.pt")
    checkpoint = Checkpoint.from_directory("my_model")

    session.report({"r2_train": r2_train, "loss_train": loss_train, "r2_valid": r2_valid, "loss_valid": loss_valid}, checkpoint=checkpoint)

In [14]:
config = {
    "train_dataloader": train_dataloader,
    "valid_x": tensor_x_valid,
    "valid_y": tensor_y_valid,
    "n_epochs": tune.grid_search([500, 1000, 1500]),
    "n_hidden": tune.grid_search([6,8,10]),
    "lr": tune.grid_search([0.005,0.01,0.015])
}

In [15]:
tuner = tune.Tuner(train, param_space=config, tune_config=tune.TuneConfig(metric="r2_valid", mode="max"))
results = tuner.fit()

2022-12-16 10:51:22,979	INFO worker.py:1519 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


0,1
Current time:,2022-12-16 11:00:06
Running for:,00:08:36.05
Memory:,6.8/15.8 GiB

Trial name,status,loc,lr,n_epochs,n_hidden,iter,total time (s),r2_train,loss_train,r2_valid
train_be744_00000,TERMINATED,127.0.0.1:13380,0.005,500,6,1,30.5186,0.485362,0.00825777,0.806936
train_be744_00001,TERMINATED,127.0.0.1:3500,0.01,500,6,1,35.8533,0.626404,0.00977164,0.764041
train_be744_00002,TERMINATED,127.0.0.1:5720,0.015,500,6,1,38.9904,0.482689,0.0100346,0.738247
train_be744_00003,TERMINATED,127.0.0.1:756,0.005,1000,6,1,98.5454,0.661341,0.0107808,0.762574
train_be744_00004,TERMINATED,127.0.0.1:12944,0.01,1000,6,1,106.66,0.633465,0.00886191,0.769352
train_be744_00005,TERMINATED,127.0.0.1:14252,0.015,1000,6,1,111.692,0.568311,0.00993982,0.702089
train_be744_00006,TERMINATED,127.0.0.1:5032,0.005,1500,6,1,186.359,0.682036,0.00921407,0.7972
train_be744_00007,TERMINATED,127.0.0.1:13560,0.01,1500,6,1,191.772,0.611504,0.0101072,0.75725
train_be744_00008,TERMINATED,127.0.0.1:13380,0.015,1500,6,1,180.777,0.443318,0.0104838,0.824645
train_be744_00009,TERMINATED,127.0.0.1:3500,0.005,500,8,1,57.3237,0.718157,0.00917767,0.806815


Trial name,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,loss_train,loss_valid,node_ip,pid,r2_train,r2_valid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_be744_00000,2022-12-16_10-52-21,True,,5a68b6b5002e40e78a04a2a6e8617e80,"0_lr=0.0050,n_epochs=500,n_hidden=6",FCPC-24146,1,0.00825777,0.00927777,127.0.0.1,13380,0.485362,0.806936,True,30.5186,30.5186,30.5186,1671198741,0,,1,be744_00000,0.0
train_be744_00001,2022-12-16_10-52-37,True,,50c2d66cbadd40ee9ccf7b2bbb2c9bb7,"1_lr=0.0100,n_epochs=500,n_hidden=6",FCPC-24146,1,0.00977164,0.0113391,127.0.0.1,3500,0.626404,0.764041,True,35.8533,35.8533,35.8533,1671198757,0,,1,be744_00001,0.0156081
train_be744_00002,2022-12-16_10-52-50,True,,7fd1ec8b55384e28872f7c76ab57aa0c,"2_lr=0.0150,n_epochs=500,n_hidden=6",FCPC-24146,1,0.0100346,0.0125787,127.0.0.1,5720,0.482689,0.738247,True,38.9904,38.9904,38.9904,1671198770,0,,1,be744_00002,0.0156271
train_be744_00003,2022-12-16_10-54-01,True,,7284666db91d4b718a8947fc3d7df709,"3_lr=0.0050,n_epochs=1000,n_hidden=6",FCPC-24146,1,0.0107808,0.0114096,127.0.0.1,756,0.661341,0.762574,True,98.5454,98.5454,98.5454,1671198841,0,,1,be744_00003,0.0156147
train_be744_00004,2022-12-16_10-54-21,True,,1d398e6110e54e1d91969ce7424d88b4,"4_lr=0.0100,n_epochs=1000,n_hidden=6",FCPC-24146,1,0.00886191,0.0110839,127.0.0.1,12944,0.633465,0.769352,True,106.66,106.66,106.66,1671198861,0,,1,be744_00004,0.0156269
train_be744_00005,2022-12-16_10-54-39,True,,725f5d20811b4072b253eb4b374015ad,"5_lr=0.0150,n_epochs=1000,n_hidden=6",FCPC-24146,1,0.00993982,0.0143163,127.0.0.1,14252,0.568311,0.702089,True,111.692,111.692,111.692,1671198879,0,,1,be744_00005,0.015625
train_be744_00006,2022-12-16_10-56-08,True,,c81df167ca754430b8add20d282b0d4a,"6_lr=0.0050,n_epochs=1500,n_hidden=6",FCPC-24146,1,0.00921407,0.00974563,127.0.0.1,5032,0.682036,0.7972,True,186.359,186.359,186.359,1671198968,0,,1,be744_00006,0.0156186
train_be744_00007,2022-12-16_10-56-28,True,,92616637421e4f1c8b1417b12a6d79ab,"7_lr=0.0100,n_epochs=1500,n_hidden=6",FCPC-24146,1,0.0101072,0.0116654,127.0.0.1,13560,0.611504,0.75725,True,191.772,191.772,191.772,1671198988,0,,1,be744_00007,0.0156238
train_be744_00008,2022-12-16_10-56-18,True,,5a68b6b5002e40e78a04a2a6e8617e80,"8_lr=0.0150,n_epochs=1500,n_hidden=6",FCPC-24146,1,0.0104838,0.00842675,127.0.0.1,13380,0.443318,0.824645,True,180.777,180.777,180.777,1671198978,0,,1,be744_00008,0.0
train_be744_00009,2022-12-16_10-54-14,True,,50c2d66cbadd40ee9ccf7b2bbb2c9bb7,"9_lr=0.0050,n_epochs=500,n_hidden=8",FCPC-24146,1,0.00917767,0.00928359,127.0.0.1,3500,0.718157,0.806815,True,57.3237,57.3237,57.3237,1671198854,0,,1,be744_00009,0.0156081


2022-12-16 11:00:07,008	INFO tune.py:777 -- Total run time: 516.55 seconds (515.96 seconds for the tuning loop).


In [16]:
def evaluate_best_model(best_result, test_x, test_y):
    best_trained_model = Net(best_result.config["n_hidden"])
    
    checkpoint_path = os.path.join(best_result.checkpoint.to_directory(), "checkpoint.pt")
        
    model_state, optimizer_state = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)
    
    with torch.no_grad():
        
        pred_y = best_trained_model(test_x)

        loss_test = nn.MSELoss()(pred_y, test_y)
        r2_test = r2_score(test_y, pred_y)
        
        
    return {"loss_test":loss_test, "r2_test":r2_test}

In [17]:

display(Markdown("**Melhores valores para os hiperparâmetros do modelo**"))
print("Melhor valor para o hiperparâmetro 'n_hidden': ",results.get_best_result().config["n_hidden"])
print("Melhor valor para o hiperparâmetro 'n_epochs': ",results.get_best_result().config["n_epochs"])
print("Melhor valor para o hiperparâmetro 'lr': ",results.get_best_result().config["lr"])

print("--------------------------------------")

display(Markdown("**Performance do modelo para dados de treinamento**"))
print("MSE obtido para esses valores de hiperparâmetros: ",(results.get_best_result().metrics["loss_train"]))
print("R2 obtido para esses valores de hiperparâmetros: ",(results.get_best_result().metrics["r2_train"]))

print("--------------------------------------")

display(Markdown("**Performance do modelo para dados de validação**"))
print("MSE obtido para esses valores de hiperparâmetros: ",(results.get_best_result().metrics["loss_valid"].item()))
print("R2 obtido para esses valores de hiperparâmetros: ",(results.get_best_result().metrics["r2_valid"]))

print("--------------------------------------")

display(Markdown("**Performance do modelo para dados de teste**"))
results_best_model = evaluate_best_model(results.get_best_result(), tensor_x_test, tensor_y_test)
print("MSE obtido para esses valores de hiperparâmetros: ",(results_best_model["loss_test"].item()))
print("R2 obtido para esses valores de hiperparâmetros: ",(results_best_model["r2_test"]))


**Melhores valores para os hiperparâmetros do modelo**

Melhor valor para o hiperparâmetro 'n_hidden':  10
Melhor valor para o hiperparâmetro 'n_epochs':  1000
Melhor valor para o hiperparâmetro 'lr':  0.005
--------------------------------------


**Performance do modelo para dados de treinamento**

MSE obtido para esses valores de hiperparâmetros:  0.007896556479540923
R2 obtido para esses valores de hiperparâmetros:  0.7101367653121752
--------------------------------------


**Performance do modelo para dados de validação**

MSE obtido para esses valores de hiperparâmetros:  0.00840953178703785
R2 obtido para esses valores de hiperparâmetros:  0.8250035320174215
--------------------------------------


**Performance do modelo para dados de teste**

MSE obtido para esses valores de hiperparâmetros:  0.011851509101688862
R2 obtido para esses valores de hiperparâmetros:  0.7134748889498002


- Pode-se observar que o modelo explica 64.0% das amostras de treinamento e 73.8% das amostras de teste
- Não há indícios de sobreajuste

0.6802740060437853