In [94]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import MinMaxScaler 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support

## 0 - Loading the data

In [95]:
folder = "data/15038_O3_2022_processed.csv"
data = pd.read_csv(folder)

value_to_use = 'rawValue'
target = 'state'

columns_to_keep = [value_to_use, 'day_of_week', 'hour_of_day', 'month_of_year', 'day_of_month', target]
data = data[columns_to_keep]
nb_pos_class = (data[target] == 1).sum()
nb_neg_class = (data[target] == 0).sum()
print(nb_pos_class, nb_neg_class)
data.head()

8450 45


Unnamed: 0,rawValue,day_of_week,hour_of_day,month_of_year,day_of_month,state
0,-0.4,5,1,1,1,1
1,-1.1,5,2,1,1,1
2,-0.3,5,3,1,1,1
3,-0.6,5,4,1,1,1
4,-0.4,5,5,1,1,1


## 1 - Processing the data

### 1.1 - Normalizing

In [96]:
scaler = MinMaxScaler()
data[value_to_use+'_scaled'] = scaler.fit_transform(data[value_to_use].values.reshape(-1, 1))
data.drop('rawValue', axis=1, inplace=True)
data.head()

Unnamed: 0,day_of_week,hour_of_day,month_of_year,day_of_month,state,rawValue_scaled
0,5,1,1,1,1,0.010365
1,5,2,1,1,1,0.007863
2,5,3,1,1,1,0.010722
3,5,4,1,1,1,0.00965
4,5,5,1,1,1,0.010365


### 1.2 - Creating sliding window

In [97]:
def sliding_window(X, y, sequence_length):
    X_windows = []
    y_windows = []
    for i in range(len(X) - sequence_length):
        X_windows.append(X.iloc[i:i+sequence_length].values)
        y_windows.append(y.iloc[i:i+sequence_length].values)
    return np.array(X_windows), np.array(y_windows)


### 1.3 - Defining train and test sets

In [98]:
def split_data(data, split_ratio):
    train_size = int(len(data) * split_ratio)
    # Split the data
    train_data = data[:train_size]
    test_data = data[train_size:]

    # Count the number of '-1' states in each set
    count_train_neg1 = (train_data[target] == 0).sum()
    count_test_neg1 = (test_data[target] == 0).sum()

    print(count_train_neg1, "negative example in train")
    print(count_test_neg1, "negative example in test")

    if count_train_neg1 == 0 or count_test_neg1 == 0:
        print("No negative class in one of the set")
        return None, None

    return train_data, test_data

In [99]:
def get_train_set(train_data, sequence_length):
    labels = train_data[target]
    input = train_data.drop([target], axis=1)
    X, y = sliding_window(input, labels, sequence_length)
    X_train = torch.tensor(X, dtype=torch.float32)
    y_train  = torch.tensor(y, dtype=torch.float32)
    return X_train, y_train

def get_test_set(test_data):
    """
    The test set only have windows of size 1. Could also use windows of size 48 like in training time but then,
    the labels set need to be changed so that each element corresponds to the 48-th element of the window. So it was
    not done because of lack of time
    """
    labels = test_data[target]
    input = test_data.drop([target], axis=1)
    return torch.tensor(input.values, dtype=torch.float32), torch.tensor(labels.values, dtype=torch.float32)

## 2 - Model

### 2.1 - Implementation

In [100]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes=1, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x, threshold=0.5):
        out, _ = self.lstm(x)  # out: tensor of shape (batch_size, seq_length, hidden_size)
        out = self.dropout(out)
        # Apply the fully connected layer to each time step
        out = self.fc(out)  # Now out is of shape (batch_size, seq_length, num_classes)
        return out

### 2.2 - Training

#### Training simple LSTM

In [101]:
batch_size = 32 
split_ratio = 0.6
sequence_length = 48
num_epochs = 50

train_data, test_data = split_data(data, split_ratio)
X_train, y_train = get_train_set(train_data, sequence_length)
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size)

37 negative example in train
8 negative example in test


In [102]:
# Model instantiation
model = LSTMClassifier(input_size=X_train.shape[2], hidden_size=32, num_layers=3)
model.train()  # Set the model to training mode

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([nb_neg_class/(nb_pos_class-3000)], dtype=torch.float32))
# criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training loop
for epoch in range(num_epochs):
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()  # Clear existing gradients
        predictions = model(X_batch).squeeze(-1)
        loss = criterion(predictions, y_batch)
        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights

    if (epoch + 1)%10 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')

Epoch 10/50, Loss: 0.0054
Epoch 20/50, Loss: 0.0039
Epoch 30/50, Loss: 0.0041
Epoch 40/50, Loss: 0.0048
Epoch 50/50, Loss: 0.0049


### 2.3 - Testing

In [105]:
# Assuming you have a test_loader similar to your data_loader for training
X_test, y_test = get_test_set(test_data)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size)

# Threshold that separate the negative from the positive class regarding the probability
threshold = 0.5

# Put model in evaluation mode
model.eval()

# Store predictions and actual labels
all_predictions = []
all_labels = []

# Disable gradient computations for evaluation
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        # Forward pass
        predictions = model(X_batch)
        predictions_labels = (predictions > threshold).float()
        
        all_predictions.extend(predictions_labels.numpy())
        all_labels.extend(y_batch.numpy())

In [106]:
# Convert lists to arrays for evaluation metrics
all_predictions = np.array(all_predictions).flatten()
all_labels = np.array(all_labels).flatten()

conf_matrix = confusion_matrix(all_labels, all_predictions)
#print(conf_matrix.ravel())
tn, fp, fn, tp = conf_matrix.ravel()

print("True negative :", tn, "False negative :", fn, "\nTrue positive :", tp, "False Positive :", fp, "\n")

# Calculate metrics for the positive class (1)
precision_pos = tp / (tp + fp)
recall_pos = tp / (tp + fn)
f1_pos = 2 * (precision_pos * recall_pos) / (precision_pos + recall_pos)

# Calculate metrics for the negative class (0)
precision_neg = tn / (tn + fn)
recall_neg = tn / (tn + fp)
f1_neg = 2 * (precision_neg * recall_neg) / (precision_neg + recall_neg)

# Print metrics
print(f'Positive Class Precision: {precision_pos:.4f}')
print(f'Positive Class Recall: {recall_pos:.4f}')
print(f'Positive Class F1 Score: {f1_pos:.4f}')
print("--------------------------")
print(f'Negative Class Precision: {precision_neg:.4f}')
print(f'Negative Class Recall: {recall_neg:.4f}')
print(f'Negative Class F1 Score: {f1_neg:.4f}')

True negative : 4 False negative : 628 
True positive : 2762 False Positive : 4 

Positive Class Precision: 0.9986
Positive Class Recall: 0.8147
Positive Class F1 Score: 0.8973
--------------------------
Negative Class Precision: 0.0063
Negative Class Recall: 0.5000
Negative Class F1 Score: 0.0125


## Inference for the submission csv

In [107]:
def get_test_set_submission(new_data):
    """
    The test set only have windows of size 1. Could also use windows of size 48 like in training time but then,
    the labels set need to be changed so that each element corresponds to the 48-th element of the window. So it was
    not done because of lack of time
    """
    scaler = MinMaxScaler()
    new_data['rawValue_scaled'] = scaler.fit_transform(new_data['rawValue'].values.reshape(-1, 1))
    new_data = new_data[['day_of_week', 'hour_of_day', 'month_of_year', 'day_of_month', 'rawValue_scaled']]
    return torch.tensor(new_data.values, dtype=torch.float32)

In [108]:
def to_processed_filename(original_filename):
    return original_filename.replace(".csv", "_processed.csv")

def to_original_filename(processed_filename):
    return processed_filename.replace("_processed.csv", ".csv")

In [109]:
files = ['data/07004_O3_2022.csv', 'data/15043_O3_2022.csv', 'data/20017_O3_2022.csv', 'data/20037_O3_2022.csv', 'data/20047_O3_2022.csv', 'data/27007_O3_2022.csv', 'data/29439_O3_2022.csv', 'data/33120_O3_2022.csv', 'data/36019_O3_2022.csv', 'data/36021_O3_2022.csv']

for file in files:
    # 0 - Load the new data
    new_data_folder = to_processed_filename(file)  # Replace with your new data file path
    new_data = pd.read_csv(new_data_folder)

    # 1 - Preprocess the new data
    X_new = get_test_set_submission(new_data)  # We don't have labels for the new data

    # 2 - Perform inference
    new_data_loader = DataLoader(TensorDataset(X_new, torch.zeros(len(X_new))), batch_size=batch_size)
    model.eval()  # Set the model to evaluation mode

    predicted_labels = []
    confidence_levels = []

    with torch.no_grad():
        for X_batch, _ in new_data_loader:
            predictions = model(X_batch).squeeze(-1)
            predicted_label_batch = (predictions > threshold).float()
            confidence_level_batch = predictions.sigmoid()  # Sigmoid to get probabilities

            predicted_labels.extend(predicted_label_batch.numpy())
            confidence_levels.extend(confidence_level_batch.numpy())

    # Flatten and convert to correct format
    predicted_labels = np.array(predicted_labels).flatten()
    confidence_levels = np.array(confidence_levels).flatten()

    # 3 - Load the original data file
    original_data_folder = to_original_filename(new_data_folder)  # Replace with your original data file path
    original_data = pd.read_csv(original_data_folder, delimiter=';', decimal=',')

    # 4 - Add 'predicted_label' and 'confidence_level' columns to the original data
    original_data['predicted_label'] = predicted_labels
    original_data['confidence_level'] = confidence_levels

    # 5 - Save the updated original data to a new CSV file
    original_data.to_csv(original_data_folder, index=False)


In [111]:
def update_predicted_label(file_path):
    # Read the CSV file
    data = pd.read_csv(file_path)

    # Replace 0 with 'I' and 1 with 'A' in 'predicted_label'
    data['predicted_label'] = data['predicted_label'].replace({0: 'I', 1: 'A'})

    # Save the updated data back to the CSV file
    data.to_csv(file_path, index=False)

# List of files to update
file_list = ['data/07004_O3_2022.csv', 'data/15043_O3_2022.csv', 'data/20017_O3_2022.csv', 
             'data/20037_O3_2022.csv', 'data/20047_O3_2022.csv', 'data/27007_O3_2022.csv', 
             'data/29439_O3_2022.csv', 'data/33120_O3_2022.csv', 'data/36019_O3_2022.csv', 
             'data/36021_O3_2022.csv']

# Update each file
for file_path in file_list:
    update_predicted_label(file_path)