# Neural network training with masked unreliable labels

This example presents the procedure to train a model with a loss function that ignores labels that have been identified as unreliable. These labels are said to be masked.

In [1]:
import os
import sys

sys.path.append(os.path.join(os.path.abspath(""), ".."))

import numpy as np
from IPython.display import Markdown as md

from torch import nn, optim

from nnbma.networks import FullyConnected
from nnbma.dataset import RegressionDataset, MaskDataset
from nnbma.learning import (
    learning_procedure,
    LearningParameters,
    MaskedMSELoss,
    CauchyLoss,
)

from functions import Fexample as F

### Analytical function

In the following cell, we load and instantiate a vectorial function $f$ implemented as a PyTorch `Module`. For more details on the implementation, see `functions.py`. You can implement your own by following the model.

The function is the following:

In [2]:
f = F()
md(F.latex())

$$\left(\begin{array}{c} t_1\\ t_2 \end{array}\right) \longmapsto \left(\begin{array}{c} t_1+2t_2\\ t_1^2\\ t_1t_2^2 \end{array}\right)$$

### Definition of the model

In [3]:
layers_sizes = [f.n_inputs, 50, 50, f.n_outputs]  # Can be modified
activation = nn.ELU()

net = FullyConnected(layers_sizes, activation)

### Dataset

In [4]:
n_samples = 10_000
test_frac = 0.20

np.random.seed(0)
X = np.random.normal(0, 1, size=(n_samples, F.n_inputs)).astype("float32")
Y = f(X)

X_train, X_test = X[round(test_frac * n_samples) :], X[: round(test_frac * n_samples)]
Y_train_true, Y_test_true = (
    Y[round(test_frac * n_samples) :],
    Y[: round(test_frac * n_samples)],
)

### Damage to certain labels

In [5]:
p_damage = 0.05  # 5%, can be modified

# Select a fraction p_damage of the labels
mask_train = np.random.rand(*Y_train_true.shape) < p_damage
mask_test = np.random.rand(*Y_test_true.shape) < p_damage

# Add an error of 1 to 5 to a fraction p_damage of the labels
Y_train = np.where(mask_train, Y_train_true + np.random.randint(5) + 1, Y_train_true)
Y_test = np.where(mask_test, Y_test_true + np.random.randint(5) + 1, Y_test_true)

In [6]:
train_dataset = RegressionDataset(X_train, Y_train)
test_dataset = RegressionDataset(X_test, Y_test)

train_mask_dataset = MaskDataset(~mask_train)  # Mask is inverted for training
test_mask_dataset = MaskDataset(~mask_test)  # Mask is inverted for training

## Train a network with masked labels

In [7]:
# Epochs
epochs = 100

# Batch size
batch_size = 100

# Loss function (for non-masked training)
loss_no_mask = nn.MSELoss()

# Loss function (for masked training)
loss_mask = MaskedMSELoss()

# Copies of network in order to perform two different trainings
net_no_mask = net.copy()
net_mask = net.copy()

# Optimizers (for both trainings)
learning_rate = 1e-3
optimizer_no_mask = optim.Adam(net.parameters(), learning_rate)
optimizer_mask = optim.Adam(net_mask.parameters(), learning_rate)

### Training without mask

In [8]:
learning_params = LearningParameters(
    loss_no_mask, epochs, batch_size, optimizer_no_mask
)

results = learning_procedure(
    net_no_mask,
    (train_dataset, test_dataset),
    learning_params,
    val_frac=test_frac,
    verbose=False,
)

# Compute outputs for both training and testing sets
Y_train_hat = net_no_mask(X_train)
Y_test_hat = net_no_mask(X_test)

Training initiated
FullyConnected:
	layers_sizes: [2, 50, 50, 3]
	activation: ELU(alpha=1.0)
	batch_norm: False
	inputs_names: None
	outputs_names: None
	inputs_transformer: None
	outputs_transformer: None
	device: cpu
	last_restrictable: True
: 2,853 learnable parameters (11.41 kB)



Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 100%|██████████| 100/100 [00:33<00:00,  2.96it/s, train loss=5.05, val loss=4.3, train error=42460711343034990592.00%, val error=27791656125898686464.00%]





### Training with mask

In [9]:
learning_params = LearningParameters(loss_mask, epochs, batch_size, optimizer_mask)

results = learning_procedure(
    net_mask,
    (train_dataset, test_dataset),
    learning_params,
    mask_dataset=(train_mask_dataset, test_mask_dataset),
    val_frac=test_frac,
    verbose=False,
)

# Compute outputs for both training and testing sets
Y_train_hat_mask = net_mask(X_train)
Y_test_hat_mask = net_mask(X_test)

Training initiated
FullyConnected:
	layers_sizes: [2, 50, 50, 3]
	activation: ELU(alpha=1.0)
	batch_norm: False
	inputs_names: None
	outputs_names: None
	inputs_transformer: None
	outputs_transformer: None
	device: cpu
	last_restrictable: True
: 2,853 learnable parameters (11.41 kB)



Epoch: 100%|██████████| 100/100 [00:41<00:00,  2.39it/s, train loss=0.000929, val loss=0.0027, train error=9.60%, val error=48.31%]                         





### Comparison between labels estimation

In [10]:
def metric(y_hat: np.ndarray, y: np.ndarray, reduce: bool = True):
    if reduce:
        return np.mean((y_hat - y) ** 2)
    return (y_hat - y) ** 2

#### Training set

In [11]:
true = Y_train_true[mask_train]
corrupted = Y_train[mask_train]
estimated = Y_train_hat[mask_train]
estimated_mask = Y_train_hat_mask[mask_train]

#

print(
    "Examples of true, corrupted, estimated without masking and estimated with masking labels"
)
for i in range(10):
    print(
        f"True: {true[i]:+.2f}, corrup: {corrupted[i]:+.2f}, estim. w/o masking: {estimated[i]:+.2f}, estim. with masking: {estimated_mask[i]:+.2f}"
    )

#

print("\nAverage quadratic error between true label and estimated labels")
print(f"Estimated w/o masking vs. true: {metric(estimated, true):.2e}")
print(f"Estimated with masking vs. true: {metric(estimated_mask, true):.2e}")

#

print("\nAverage quadratic error over all labels (corrupted or not)")
print(f"Estimated w/o masking vs. true: {metric(Y_train_hat, Y_train_true):.2e}")
print(f"Estimated with masking vs. true: {metric(Y_train_hat_mask, Y_train_true):.2e}")

Examples of true, corrupted, estimated without masking and estimated with masking labels
True: -1.78, corrup: +3.22, estim. w/o masking: +0.29, estim. with masking: -1.79
True: -0.01, corrup: +4.99, estim. w/o masking: -0.20, estim. with masking: +0.02
True: +3.58, corrup: +8.58, estim. w/o masking: +0.36, estim. with masking: +3.57
True: +2.70, corrup: +7.70, estim. w/o masking: +0.21, estim. with masking: +2.71
True: +0.71, corrup: +5.71, estim. w/o masking: -0.13, estim. with masking: +0.73
True: +0.53, corrup: +5.53, estim. w/o masking: -0.43, estim. with masking: +0.57
True: +2.30, corrup: +7.30, estim. w/o masking: +0.39, estim. with masking: +2.32
True: +1.05, corrup: +6.05, estim. w/o masking: +0.45, estim. with masking: +1.05
True: -1.22, corrup: +3.78, estim. w/o masking: +0.20, estim. with masking: -1.21
True: +2.94, corrup: +7.94, estim. w/o masking: +0.42, estim. with masking: +2.95

Average quadratic error between true label and estimated labels
Estimated w/o masking vs. 

#### Testing set

In [12]:
true = Y_test_true[mask_test]
corrupted = Y_test[mask_test]
estimated = Y_test_hat[mask_test]
estimated_mask = Y_test_hat_mask[mask_test]

#

print(
    "Examples of true, corrupted, estimated without masking and estimated with masking labels"
)
for i in range(10):
    print(
        f"True: {true[i]:+.2f}, corrup: {corrupted[i]:+.2f}, estim. w/o masking: {estimated[i]:+.2f}, estim. with masking: {estimated_mask[i]:+.2f}"
    )

#

print("\nAverage quadratic error between true label and estimated labels")
print(f"Estimated w/o masking vs. true: {metric(estimated, true):.2e}")
print(f"Estimated with masking vs. true: {metric(estimated_mask, true):.2e}")

#

print("\nAverage quadratic error over all labels (corrupted or not)")
print(f"Estimated w/o masking vs. true: {metric(Y_test_hat, Y_test_true):.2e}")
print(f"Estimated with masking vs. true: {metric(Y_test_hat_mask, Y_test_true):.2e}")

Examples of true, corrupted, estimated without masking and estimated with masking labels
True: +0.28, corrup: +4.28, estim. w/o masking: -0.43, estim. with masking: +0.30
True: +3.49, corrup: +7.49, estim. w/o masking: -0.47, estim. with masking: +3.50
True: +0.65, corrup: +4.65, estim. w/o masking: +0.20, estim. with masking: +0.65
True: -0.02, corrup: +3.98, estim. w/o masking: -0.26, estim. with masking: +0.00
True: +0.12, corrup: +4.12, estim. w/o masking: +0.14, estim. with masking: +0.12
True: +1.36, corrup: +5.36, estim. w/o masking: +0.33, estim. with masking: +1.35
True: +2.04, corrup: +6.04, estim. w/o masking: +0.45, estim. with masking: +2.05
True: +0.00, corrup: +4.00, estim. w/o masking: +0.20, estim. with masking: +0.03
True: +3.55, corrup: +7.55, estim. w/o masking: -0.51, estim. with masking: +3.55
True: +3.49, corrup: +7.49, estim. w/o masking: -0.25, estim. with masking: +3.55

Average quadratic error between true label and estimated labels
Estimated w/o masking vs. 

## Advanced example: how to detect anomalies

### 1. Use a robust loss function to detect outliers

In [13]:
net_robust = net.copy()

# Robust loss function (that is likely to ignore outliers)
loss_robust = CauchyLoss()

# Optimizer
optimizer_robust = optim.Adam(net_robust.parameters(), learning_rate)

In [14]:
learning_params = LearningParameters(loss_robust, epochs, batch_size, optimizer_robust)

results = learning_procedure(
    net_robust,
    (train_dataset, test_dataset),
    learning_params,
    val_frac=test_frac,
    verbose=False,
)

# Compute outputs for both training and testing sets
Y_train_hat_robust = net_robust(X_train)
Y_test_hat_robust = net_robust(X_test)

# Compute errors
errors_train = metric(Y_train_hat_robust, Y_train_true, reduce=False)
errors_test = metric(Y_test_hat_robust, Y_test_true, reduce=False)

Training initiated
FullyConnected:
	layers_sizes: [2, 50, 50, 3]
	activation: ELU(alpha=1.0)
	batch_norm: False
	inputs_names: None
	outputs_names: None
	inputs_transformer: None
	outputs_transformer: None
	device: cpu
	last_restrictable: True
: 2,853 learnable parameters (11.41 kB)



Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 100%|██████████| 100/100 [00:36<00:00,  2.74it/s, train loss=0.16, val loss=0.135, train error=480567.91%, val error=50413.62%]                         





### Detection of outliers

For the example, we will consider that we already know the fraction of anomalies in the data. If it wasn't the case, we could have use other methods to automatically or manually segment the samples in two categories (reliable and anomalies). Here, we just consider the first `100*p_damage` % of errors as anomalies.

In [15]:
mask_train_estimated = errors_train > np.quantile(errors_train, 1 - p_damage)
mask_test_estimated = errors_test > np.quantile(errors_test, 1 - p_damage)

#

print(
    f"Fraction of anomalies well detected (train set): {100*np.mean(mask_train_estimated & mask_train):.1f}%"
)
print(
    f"Fraction of false alarms (train set): {100*np.mean(mask_train_estimated & ~mask_train):.1f}%"
)

print(
    f"\nFraction of anomalies well detected (test set): {100*np.mean(mask_test_estimated & mask_test):.1f}%"
)
print(
    f"Fraction of false alarms (test set): {100*np.mean(mask_test_estimated & ~mask_test):.1f}%"
)

Fraction of anomalies well detected (train set): 0.2%
Fraction of false alarms (train set): 4.8%

Fraction of anomalies well detected (test set): 0.4%
Fraction of false alarms (test set): 4.6%


In [None]:
print(mask_train[:10])
print(mask_train_estimated[:10])

### 2. Train a network with a masked non-robust loss function

In [16]:
train_mask_estimated_dataset = MaskDataset(
    ~mask_train_estimated
)  # Mask is inverted for training
test_mask_estimated_dataset = MaskDataset(
    ~mask_test_estimated
)  # Mask is inverted for training

In [17]:
net_mask_2 = net.copy()

# Non-robust masked loss function
loss_mask_2 = MaskedMSELoss()

# Optimizer
optimizer_mask_2 = optim.Adam(net_mask_2.parameters(), learning_rate)

In [18]:
learning_params = LearningParameters(loss_mask_2, epochs, batch_size, optimizer_mask_2)

results = learning_procedure(
    net_mask_2,
    (train_dataset, test_dataset),
    learning_params,
    mask_dataset=(train_mask_estimated_dataset, test_mask_estimated_dataset),
    val_frac=test_frac,
    verbose=False,
)

Training initiated
FullyConnected:
	layers_sizes: [2, 50, 50, 3]
	activation: ELU(alpha=1.0)
	batch_norm: False
	inputs_names: None
	outputs_names: None
	inputs_transformer: None
	outputs_transformer: None
	device: cpu
	last_restrictable: True
: 2,853 learnable parameters (11.41 kB)



Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 100%|██████████| 100/100 [00:49<00:00,  2.04it/s, train loss=1.17, val loss=0.712, train error=282423.75%, val error=27029.81%]       







Now, you have a network trained only on reliable labels which have been detected in a non-supervised way.