In [23]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

data = pd.read_csv('barueri_dengue_filtered.csv').drop('data inicial semana', axis=1).drop('month', axis=1)

data_filled = data.fillna(data.mean())
 # Identify outliers using IQR method
Q1 = data_filled['casos'].quantile(0.25)
Q3 = data_filled['casos'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cap the outliers
data_filled['casos'] = data_filled['casos'].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))

X, Y = data_filled[['tempmin','tempmed','tempmax','precipitacao total']], data_filled['casos']
# Y.to_csv('casos.csv', index=False)
Q1 = data['casos'].quantile(0.25)
Q3 = data['casos'].quantile(0.75)

# Calculate IQR (Interquartile Range)
IQR = Q3 - Q1

# Define the acceptable range
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = data[(data['casos'] < lower_bound) | (data['casos'] > upper_bound)]
print(f'Number of outliers: {len(outliers)}')
print(outliers)
data_filled.casos.describe()


Number of outliers: 88
       umidmin    umidmed     umidmax    tempmin    tempmed    tempmax  \
0    63.334366  86.488957   99.304751  18.730769  21.795150  26.653846   
1    85.166191  86.695892   88.244707  21.882353  22.215686  22.588235   
2    94.158321  94.585553   95.012784  20.857143  20.928571  21.000000   
45   56.043362  80.964771   98.259046  17.428571  20.378402  24.571429   
46   61.510815  84.295943   98.259188  18.571429  21.394048  26.142857   
..         ...        ...         ...        ...        ...        ...   
525  67.714286  82.209487   92.285714  19.285714  21.403812  25.285714   
618  47.571429  79.537838   99.142857  18.285714  22.404165  28.142857   
619  46.285714  82.611648  100.000000  16.857143  21.292295  27.857143   
620  60.000000  87.771625  100.000000  16.714286  19.665284  24.714286   
621  58.714286  84.591394   98.285714  19.000000  21.994909  27.571429   

     precipitacao total  casos  
0                  65.0     70  
1                 121.

count    622.000000
mean      12.961415
std       12.475577
min        0.000000
25%        4.000000
50%        8.000000
75%       17.750000
max       38.375000
Name: casos, dtype: float64

In [None]:
def load_and_preprocess_data(X, Y):    
    X = data_filled[['tempmin', 'tempmed', 'tempmax', 'precipitacao total']]
    Y = data_filled['casos']
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    X_train, X_temp, Y_train, Y_temp = train_test_split(X_scaled, Y, test_size=0.3, random_state=3)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=3)
    
    x_train = torch.FloatTensor(X_train)
    y_train = torch.FloatTensor(Y_train.values)
    x_val = torch.FloatTensor(X_val)
    y_val = torch.FloatTensor(Y_val.values)
    x_test = torch.FloatTensor(X_test)
    y_test = torch.FloatTensor(Y_test.values)
    
    return x_train, y_train, x_val, y_val, x_test, y_test

In [25]:
class Casos(nn.Module):
    def __init__(self, input_dim):
        super(Casos, self).__init__()
        self.rede = nn.Sequential(
            nn.Linear(input_dim, 256),
            # nn.ReLU(),
            # nn.Dropout(0.5),  # Dropout layer
            # nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(0.5),  # Dropout layer
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.5),  # Dropout layer
            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.rede(x)

In [26]:
input_dim = X.shape[1]
model = Casos(input_dim)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [27]:
def train_network(model, optimizer, loss_function, x_train, y_train, x_val, y_val, num_epochs, train_losses, val_losses, device):
    model.to(device)
    x_train, y_train = x_train.to(device), y_train.to(device)
    x_val, y_val = x_val.to(device), y_val.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        
        # Forward pass
        output_train = model(x_train)

        # Calculate training loss
        loss_train = loss_function(output_train, y_train.view(-1, 1))

        # Backward pass
        loss_train.backward()

        # Update weights
        optimizer.step()

        model.eval()
        with torch.no_grad():
            # Validation phase
            output_val = model(x_val)
            loss_val = loss_function(output_val, y_val.view(-1, 1))

        # Store losses
        train_losses[epoch] = loss_train.item()
        val_losses[epoch] = loss_val.item()

        # Print progress every 500 epochs
        if (epoch + 1) % 500 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {loss_train.item():.4f}, Val Loss: {loss_val.item():.4f}')


In [28]:
num_epochs = 15000
train_losses = np.zeros(num_epochs)
val_losses = np.zeros(num_epochs)
x_train, y_train, x_val, y_val, x_test, y_test = load_and_preprocess_data(X, Y)
train_network(model, optimizer, criterion, x_train, y_train, x_val, y_val, num_epochs, train_losses, val_losses, device)

AttributeError: module 'torch' has no attribute 'FloatFloatTensor'

In [None]:
baseline_prediction = np.mean(Y)
baseline_mse = np.mean((Y - baseline_prediction) ** 2)
print(f'Baseline MSE: {baseline_mse}')

Baseline MSE: 17199.6884234034
