In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

data = pd.read_csv('barueri_dengue_filtered.csv').drop('data inicial semana', axis=1).drop('month', axis=1)

data_filled = data.fillna(data.mean())

Q1 = data_filled['casos'].quantile(0.25)
Q3 = data_filled['casos'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

data_filled['casos'] = data_filled['casos'].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))

X, Y = data_filled[['tempmin','tempmax','precipitacao total']], data_filled['casos']
Q1 = data['casos'].quantile(0.25)
Q3 = data['casos'].quantile(0.75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = data[(data['casos'] < lower_bound) | (data['casos'] > upper_bound)]
print(f'Number of outliers: {len(outliers)}')
print(outliers)
data_filled.casos.describe()


Number of outliers: 88
       umidmin    umidmed     umidmax    tempmin    tempmed    tempmax  \
0    63.334366  86.488957   99.304751  18.730769  21.795150  26.653846   
1    85.166191  86.695892   88.244707  21.882353  22.215686  22.588235   
2    94.158321  94.585553   95.012784  20.857143  20.928571  21.000000   
45   56.043362  80.964771   98.259046  17.428571  20.378402  24.571429   
46   61.510815  84.295943   98.259188  18.571429  21.394048  26.142857   
..         ...        ...         ...        ...        ...        ...   
525  67.714286  82.209487   92.285714  19.285714  21.403812  25.285714   
618  47.571429  79.537838   99.142857  18.285714  22.404165  28.142857   
619  46.285714  82.611648  100.000000  16.857143  21.292295  27.857143   
620  60.000000  87.771625  100.000000  16.714286  19.665284  24.714286   
621  58.714286  84.591394   98.285714  19.000000  21.994909  27.571429   

     precipitacao total  casos  
0                  65.0     70  
1                 121.

count    622.000000
mean      12.961415
std       12.475577
min        0.000000
25%        4.000000
50%        8.000000
75%       17.750000
max       38.375000
Name: casos, dtype: float64

In [2]:
def load_and_preprocess_data(X, Y):    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    X_train, X_temp, Y_train, Y_temp = train_test_split(X_scaled, Y, test_size=0.2, random_state=3)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=3)
    
    x_train = torch.FloatTensor(X_train)
    y_train = torch.FloatTensor(Y_train.values)
    x_val = torch.FloatTensor(X_val)
    y_val = torch.FloatTensor(Y_val.values)
    x_test = torch.FloatTensor(X_test)
    y_test = torch.FloatTensor(Y_test.values)
    
    return x_train, y_train, x_val, y_val, x_test, y_test

In [3]:
class Casos(nn.Module):
    def __init__(self, input_dim):
        super(Casos, self).__init__()
        self.rede = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.75),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.75),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Dropout(0.7),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        return self.rede(x)

In [4]:
input_dim = X.shape[1]
model = Casos(input_dim)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/home/eliabe/Dev/Python/Data/DS Bootcamp/.conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/eliabe/Dev/Python/Data/DS Bootcamp/.conda/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/eliabe/Dev/Python/Data/DS Bootcamp/.conda/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/eliabe/Dev/Python/Data/DS Bootcamp/.conda/lib/python3.10/sit

In [None]:
def train_network(model, optimizer, loss_function, train_loader, val_loader, num_epochs, device, patience):
    best_val_loss = float('inf')
    patience_counter = 0

    train_losses = np.zeros(num_epochs)
    val_losses = np.zeros(num_epochs)
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            output_train = model(x_batch)
            loss_train = loss_function(output_train, y_batch.view(-1, 1))
            loss_train.backward()
            optimizer.step()
            
            train_loss += loss_train.item()
        
        train_loss /= len(train_loader)
        train_losses[epoch] = train_loss
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for x_val_batch, y_val_batch in val_loader:
                x_val_batch, y_val_batch = x_val_batch.to(device), y_val_batch.to(device)
                output_val = model(x_val_batch)
                loss_val = loss_function(output_val, y_val_batch.view(-1, 1))
                val_loss += loss_val.item()
        
        val_loss /= len(val_loader)
        val_losses[epoch] = val_loss

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f'Early stopping at epoch {epoch + 1}. Best validation loss: {best_val_loss:.4f}')
            print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
            break
        
        if (epoch + 1) % 500 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    model.load_state_dict(torch.load('best_model.pth'))

In [None]:
batch_size = 128
patience = 150
num_epochs = 30000
train_losses = np.zeros(num_epochs)
val_losses = np.zeros(num_epochs)
x_train, y_train, x_val, y_val, x_test, y_test = load_and_preprocess_data(X, Y)

train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TensorDataset(x_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

train_network(model, optimizer, criterion, train_loader, val_loader, num_epochs, device, patience)

Early stopping at epoch 488. Best validation loss: 99.8065
Epoch 488/30000, Train Loss: 143.0200, Val Loss: 102.4355
Epoch 500/30000, Train Loss: 156.1219, Val Loss: 103.6901
Early stopping at epoch 665. Best validation loss: 98.5208
Epoch 665/30000, Train Loss: 130.6656, Val Loss: 100.1981
Epoch 1000/30000, Train Loss: 133.0383, Val Loss: 98.3362
Early stopping at epoch 1148. Best validation loss: 97.3762
Epoch 1148/30000, Train Loss: 122.2755, Val Loss: 99.1665
Early stopping at epoch 1298. Best validation loss: 97.3762
Epoch 1298/30000, Train Loss: 132.0149, Val Loss: 102.4469
Early stopping at epoch 1448. Best validation loss: 97.3762
Epoch 1448/30000, Train Loss: 139.7183, Val Loss: 101.1288
Epoch 1500/30000, Train Loss: 138.4804, Val Loss: 101.6641
Early stopping at epoch 1598. Best validation loss: 97.3762
Epoch 1598/30000, Train Loss: 128.1979, Val Loss: 101.1162
Early stopping at epoch 1748. Best validation loss: 97.3762
Epoch 1748/30000, Train Loss: 127.2274, Val Loss: 102.06

In [7]:
baseline_prediction = np.mean(Y)
baseline_mse = np.mean((Y - baseline_prediction) ** 2)
print(f'Baseline MSE: {baseline_mse}')

Baseline MSE: 155.38978931669442
