In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import MinMaxScaler 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support

## 0 - Loading the data

In [2]:
folder = "data/15038_O3_2022_processed.csv"
data = pd.read_csv(folder)

value_to_use = 'rawValue'
target = 'state'

columns_to_keep = [value_to_use, 'day_of_week', 'hour_of_day', 'month_of_year', 'day_of_month', target]
data = data[columns_to_keep]
nb_pos_class = (data[target] == 1).sum()
nb_neg_class = (data[target] == 0).sum()
print(nb_pos_class, nb_neg_class)
data.head()

8450 45


Unnamed: 0,rawValue,day_of_week,hour_of_day,month_of_year,day_of_month,state
0,-0.4,5,1,1,1,1
1,-1.1,5,2,1,1,1
2,-0.3,5,3,1,1,1
3,-0.6,5,4,1,1,1
4,-0.4,5,5,1,1,1


## 1 - Processing the data

### 1.1 - Normalizing

In [3]:
scaler = MinMaxScaler()
data[value_to_use+'_scaled'] = scaler.fit_transform(data[value_to_use].values.reshape(-1, 1))
columns_to_keep = [value_to_use+'_scaled', 'day_of_week', 'hour_of_day', 'month_of_year', 'day_of_month', target]
data = data[columns_to_keep]
data.head()

Unnamed: 0,rawValue_scaled,day_of_week,hour_of_day,month_of_year,day_of_month,state
0,0.010365,5,1,1,1,1
1,0.007863,5,2,1,1,1
2,0.010722,5,3,1,1,1
3,0.00965,5,4,1,1,1
4,0.010365,5,5,1,1,1


### 1.2 - Creating sliding window

In [4]:
def sliding_window(input_data, labels, sequence_length):
    X = []
    y = []
    for i in range(len(input_data) - sequence_length):
        X.append(input_data[i:i+sequence_length])  # Sequence of length 48
        y.append(labels[i+sequence_length])       # Label for the next timestep (n+49)
    return np.array(X), np.array(y)


### 1.3 - Defining train and test sets

In [5]:
def split_data(data, split_ratio):
    train_size = int(len(data) * split_ratio)
    # Split the data
    train_data = data[:train_size]
    test_data = data[train_size:]

    # Count the number of '-1' states in each set
    count_train_neg1 = (train_data[target] == 0).sum()
    count_test_neg1 = (test_data[target] == 0).sum()

    print(count_train_neg1, "negative example in train")
    print(count_test_neg1, "negative example in test")

    if count_train_neg1 == 0 or count_test_neg1 == 0:
        print("No negative class in one of the set")
        return None, None

    return train_data, test_data

In [6]:
def get_train_set(train_data, sequence_length):
    labels = train_data[target].values
    input_data = train_data.drop([target], axis=1).values
    X, y = sliding_window(input_data, labels, sequence_length)
    X_train = torch.tensor(X, dtype=torch.float32)
    y_train = torch.tensor(y, dtype=torch.long)  # Convert labels to torch.long
    return X_train, y_train

def get_test_set(test_data, sequence_length):
    labels = test_data[target].values
    input_data = test_data.drop([target], axis=1).values
    X, y = sliding_window(input_data, labels, sequence_length)
    X_test = torch.tensor(X, dtype=torch.float32)
    y_test = torch.tensor(y, dtype=torch.long)  # Convert labels to torch.long
    return X_test, y_test


In [7]:
batch_size = 32 
split_ratio = 0.6
sequence_length = 48
num_epochs = 10

train_data, test_data = split_data(data, split_ratio)
X_train, y_train = get_train_set(train_data, sequence_length)

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size)

X_test, y_test = get_test_set(test_data, sequence_length)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size)


37 negative example in train
8 negative example in test


## 2 - Model

### 2.1 - Implementation

#### Encoder/Decoder

In [8]:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2, dropout=0.5):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        
    def forward(self, input_seq):
        output, (hidden, cell) = self.lstm(input_seq)
        return hidden, cell

class DecoderLSTM(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers=2, dropout=0.5):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, hidden, cell):
        # We didnt have time to fully construct it, so we just use a FC layer
        output = self.fc_out(hidden[-1])  # Using the last hidden state
        return output

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source):
        hidden, cell = self.encoder(source)
        output = self.decoder(hidden, cell)
        return output

### 2.2 - Training

In [9]:
# Initialize the model
encoder = EncoderLSTM(input_size=5, hidden_size=32, num_layers=2, dropout=0.5)
decoder = DecoderLSTM(hidden_size=32, output_size=2, num_layers=2, dropout=0.5)
seq2seq = Seq2Seq(encoder, decoder, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Loss function and optimizer

# Calculate class weights
# class_weights = torch.tensor([nb_pos_class/(nb_pos_class+nb_neg_class), nb_neg_class/(nb_pos_class+nb_neg_class)], dtype=torch.float32).to(seq2seq.device)

# Initialize the loss function with class weights
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(seq2seq.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0

    for X_batch, y_batch in train_loader:
        # Assuming y_batch is of shape [batch_size, sequence_length] and we need the last label
        X_batch, y_batch = X_batch.to(seq2seq.device), y_batch.to(seq2seq.device)

        optimizer.zero_grad()
        output = seq2seq(X_batch)
        loss = criterion(output, y_batch)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(train_loader):.4f}')



Epoch 1/10, Loss: 0.1555
Epoch 2/10, Loss: 0.0459
Epoch 3/10, Loss: 0.0459
Epoch 4/10, Loss: 0.0457
Epoch 5/10, Loss: 0.0457
Epoch 6/10, Loss: 0.0458
Epoch 7/10, Loss: 0.0456
Epoch 8/10, Loss: 0.0456
Epoch 9/10, Loss: 0.0456
Epoch 10/10, Loss: 0.0456


### 2.3 - Testing

In [10]:
# Convert y_test to a NumPy array for easy manipulation
y_test_np = y_test.numpy()

# Count the number of instances of each class
unique, counts = np.unique(y_test_np, return_counts=True)
class_distribution = dict(zip(unique, counts))

print("Class Distribution in y_test:", class_distribution)

Class Distribution in y_test: {0: 8, 1: 3342}


In [None]:
# Put the model in evaluation mode
seq2seq.eval()

all_predictions = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(seq2seq.device), y_batch.to(seq2seq.device)

        # Forward pass
        outputs = seq2seq(X_batch)
        predicted_labels = torch.argmax(outputs, dim=1)
        
        all_predictions.extend(predicted_labels.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())


In [12]:
# Convert lists to arrays for evaluation metrics
all_predictions = np.array(all_predictions).flatten()
all_labels = np.array(all_labels).flatten()

conf_matrix = confusion_matrix(all_labels, all_predictions)
#print(conf_matrix.ravel())
tn, fp, fn, tp = conf_matrix.ravel()

print("True negative :", tn, "False negative :", fn, "\nTrue positive :", tp, "False Positive :", fp, "\n")

# Calculate metrics for the positive class (1)
precision_pos = tp / (tp + fp)
recall_pos = tp / (tp + fn)
f1_pos = 2 * (precision_pos * recall_pos) / (precision_pos + recall_pos)

# Calculate metrics for the negative class (0)
precision_neg = tn / (tn + fn)
recall_neg = tn / (tn + fp)
f1_neg = 2 * (precision_neg * recall_neg) / (precision_neg + recall_neg)

# Print metrics
print(f'Positive Class Precision: {precision_pos:.4f}')
print(f'Positive Class Recall: {recall_pos:.4f}')
print(f'Positive Class F1 Score: {f1_pos:.4f}')
print("--------------------------")
print(f'Negative Class Precision: {precision_neg:.4f}')
print(f'Negative Class Recall: {recall_neg:.4f}')
print(f'Negative Class F1 Score: {f1_neg:.4f}')

True negative : 0 False negative : 0 
True positive : 3342 False Positive : 8 

Positive Class Precision: 0.9976
Positive Class Recall: 1.0000
Positive Class F1 Score: 0.9988
--------------------------
Negative Class Precision: nan
Negative Class Recall: 0.0000
Negative Class F1 Score: nan


  precision_neg = tn / (tn + fn)
