In [2]:
import matplotlib.pyplot as plt
from data_generation import MEANS, VARIANCES, KEY, generate_mv_data, NUM_SAMPLES
from plot_utils import plot_scatter, plot_circles, plot_metrics
from train_utils import SimpleDnn, Encoder, Linear_Encoder, create_loaders
import torch
from torch.utils.data import TensorDataset
import numpy as np
import pytorch_lightning as pl
from pytorch_lightning.loggers import CSVLogger
import torch.nn.functional as F
import pandas as pd
from sklearn.datasets import make_circles
import matplotlib.image as image
from matplotlib.offsetbox import (OffsetImage, AnnotationBbox)
import plotly.express as px
from csv_utils import format_csv
from pathlib import Path
import weightwatcher as ww

BATCH_SIZE = 32

PyTorch is available but CUDA is not. Defaulting to SciPy for SVD


# Linear Data Example

1. Generate linear data from three Gaussians

In [2]:
# generate data
data, labels = generate_mv_data(KEY, MEANS, VARIANCES, NUM_SAMPLES, 3)

# format labels and set up dataloaders
labels_one_hot = F.one_hot(torch.Tensor(np.hstack((np.array(labels)))).to(torch.int64), num_classes=3).float() 
data_linear = TensorDataset(
                torch.Tensor(np.vstack((np.array(data)))), labels_one_hot
            )
train_loader_linear, val_loader_linear = create_loaders(data = data_linear, ratio = 0.8, num_workers = 0, shuffle_train = True, shuffle_val = False, batch_size = 32)

2023-04-03 15:27:27.888 | INFO     | train_utils:create_loaders:44 - Creating dataloaders with 80/20train/test split 🔪
2023-04-03 15:27:27.889 | INFO     | train_utils:create_loaders:70 - Successfully created train and validation loader 🤗


### 1. Train 'unbroken' model (simple feed-forward network)

In [10]:
# define model and logger
simple_dnn = SimpleDnn(Encoder(input_dim = 2, output_dim = 3), task_type = 'classification')
csv_logger = CSVLogger(save_dir='metrics_csv', name = 'linear')
# train model on linear data
trainer = pl.Trainer(logger=csv_logger, max_epochs = 50, log_every_n_steps=20)
trainer.fit(model=simple_dnn, train_dataloaders = train_loader_linear, val_dataloaders = val_loader_linear)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name    | Type          | Params
------------------------------------------
0 | encoder | Encoder       | 63    
1 | f1      | BinaryF1Score | 0     
------------------------------------------
63        Trainable params
0         Non-trainable params
63        Total params
0.000     Total estimated model params size (MB)
  rank_zero_warn(


Epoch 49: 100%|██████████| 38/38 [00:00<00:00, 177.47it/s, v_num=0]         

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 38/38 [00:00<00:00, 173.16it/s, v_num=0]


In [11]:
# format, since csv logger captures metrics on step basis
format_csv('metrics_csv/linear/version_0/metrics.csv', ['train_loss', 'val_loss'], Path('reformatted_metrics/linear_data.csv'))

### 2. Broken model: No activation functions

In [43]:
# DNN without ReLUs
simple_dnn_linear = SimpleDnn(Linear_Encoder(input_dim = 2, output_dim = 3), task_type = 'classification')
csv_logger = CSVLogger(save_dir='metrics_csv', name = 'linear_no_relu')
# train model on linear data
trainer = pl.Trainer(logger=csv_logger, max_epochs = 50, log_every_n_steps=20)
trainer.fit(model=simple_dnn_linear, train_dataloaders = train_loader_linear, val_dataloaders = val_loader_linear)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


NameError: name 'train_loader_linear' is not defined

In [13]:
# format, since csv logger captures metrics on step basis
format_csv('metrics_csv/linear_no_relu/version_0/metrics.csv', ['train_loss', 'val_loss'], Path('reformatted_metrics/linear_data_no_relu.csv'))

# Nonlinear Data Example

In [44]:
data_circles, label_circles = make_circles(n_samples=NUM_SAMPLES, factor=0.5, noise=0.05)
# concentric circles
data_tensor_circles = TensorDataset(
                torch.Tensor(data_circles), F.one_hot(torch.Tensor(label_circles).to(torch.int64), num_classes=2).float() 
            )
train_loader_circles, val_loader_circles = create_loaders(data = data_tensor_circles, ratio = 0.8,  batch_size = 32, num_workers = 0,
                                                        shuffle_train = True, shuffle_val = False)

2023-04-08 15:56:39.352 | INFO     | train_utils:create_loaders:58 - Creating dataloaders with 80/20train/test split 🔪
2023-04-08 15:56:39.370 | INFO     | train_utils:create_loaders:84 - Successfully created train and validation loader 🤗


### 1. Train 'unbroken' model (simple feed-forward network)

In [16]:
simple_dnn = SimpleDnn(Encoder(input_dim = 2, output_dim = 2), task_type = 'classification')
csv_logger = CSVLogger(save_dir='metrics_csv', name = 'circles')
# train model on nonlinear data
trainer = pl.Trainer(logger=csv_logger, max_epochs = 50)
trainer.fit(model=simple_dnn, train_dataloaders = train_loader_circles, val_dataloaders = val_loader_circles)
format_csv('metrics_csv/circles/version_0/metrics.csv', ['train_loss', 'val_loss'], Path('reformatted_metrics/circle_data.csv'))

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: metrics_csv/circles

  | Name    | Type          | Params
------------------------------------------
0 | encoder | Encoder       | 52    
1 | f1      | BinaryF1Score | 0     
------------------------------------------
52        Trainable params
0         Non-trainable params
52        Total params
0.000     Total estimated model params size (MB)


                                                                            

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Epoch 49: 100%|██████████| 13/13 [00:00<00:00, 171.54it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 13/13 [00:00<00:00, 161.16it/s, v_num=0]


### 2. Broken model: No activation functions

In [18]:
# DNN without ReLUs
simple_dnn_linear = SimpleDnn(Linear_Encoder(input_dim = 2, output_dim = 2), task_type = 'classification')
# train model on nonlinear data
csv_logger = CSVLogger(save_dir='metrics_csv', name = 'circles_no_relu')
trainer = pl.Trainer(logger=csv_logger, max_epochs = 50)
trainer.fit(model=simple_dnn_linear, train_dataloaders = train_loader_circles, val_dataloaders = val_loader_circles)
format_csv('metrics_csv/circles_no_relu/version_0/metrics.csv', ['train_loss', 'val_loss'], Path('reformatted_metrics/circle_data_no_relu.csv'))

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: metrics_csv/circles_no_relu

  | Name    | Type           | Params
-------------------------------------------
0 | encoder | Linear_Encoder | 52    
1 | f1      | BinaryF1Score  | 0     
-------------------------------------------
52        Trainable params
0         Non-trainable params
52        Total params
0.000     Total estimated model params size (MB)


                                                                            

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Epoch 49: 100%|██████████| 13/13 [00:00<00:00, 142.19it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 13/13 [00:00<00:00, 132.34it/s, v_num=0]


### 3. Broken model: Frozen weights

In [148]:
simple_dnn_frozen_weights = SimpleDnn(Encoder(input_dim = 2, output_dim = 2), task_type = 'classification')

num_params = len([param for param in simple_dnn.parameters()])
for i, param in zip(range(num_params), simple_dnn.parameters()):
    if i in [num_params-1, num_params -2] :
        param.requires_grad = False

csv_logger = CSVLogger(save_dir='metrics_csv', name = 'circles_frozen')
# train model on nonlinear data
trainer = pl.Trainer(logger=csv_logger, max_epochs = 50)
trainer.fit(model=simple_dnn_frozen_weights, train_dataloaders = train_loader_circles, val_dataloaders = val_loader_circles)
format_csv('metrics_csv/circles_frozen/version_0/metrics.csv', ['train_loss', 'val_loss'], Path('reformatted_metrics/circle_data_frozen.csv'))

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type          | Params
------------------------------------------
0 | encoder | Encoder       | 52    
1 | f1      | BinaryF1Score | 0     
------------------------------------------
52        Trainable params
0         Non-trainable params
52        Total params
0.000     Total estimated model params size (MB)


Epoch 49: 100%|██████████| 13/13 [00:00<00:00, 112.61it/s, v_num=0]         

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 13/13 [00:00<00:00, 106.09it/s, v_num=0]


### 4. Broken model: Frozen Bias

In [7]:
simple_dnn = SimpleDnn(Encoder(input_dim = 2, output_dim = 2), task_type = 'classification')

num_params = len([param for param in simple_dnn.parameters()])
for i, param in zip(range(num_params), simple_dnn.parameters()):
    if i == num_params-1 :
        param.requires_grad = False

csv_logger = CSVLogger(save_dir='metrics_csv', name = 'circles_frozen_bias')
# train model on nonlinear data
trainer = pl.Trainer(logger=csv_logger, max_epochs = 50)
trainer.fit(model=simple_dnn, train_dataloaders = train_loader_circles, val_dataloaders = val_loader_circles)
format_csv('metrics_csv/circles_frozen_bias/version_0/metrics.csv', ['train_loss', 'val_loss'], Path('reformatted_metrics/circle_data_frozen_bias.csv'))

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")

  | Name    | Type          | Params
------------------------------------------
0 | encoder | Encoder       | 52    
1 | f1      | BinaryF1Score | 0     
------------------------------------------
50        Trainable params
2         Non-trainable params
52        Total params
0.000     Total estimated model params size (MB)
  rank_zero_warn(


                                                                            

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Epoch 49: 100%|██████████| 13/13 [00:00<00:00, 56.09it/s, v_num=0] 

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 13/13 [00:00<00:00, 53.67it/s, v_num=0]


### 5. Broken model: Trainloader returns single example

In [45]:
simple_dnn = SimpleDnn(Encoder(input_dim = 2, output_dim = 2), task_type = 'classification')
train_loader_circles_broken, val_loader_circles = create_loaders(data = data_tensor_circles, ratio = 0.8,  batch_size = 32, num_workers = 0,
                                                        shuffle_train = True, shuffle_val = False, subset_broken_train = True)

csv_logger = CSVLogger(save_dir='metrics_csv', name = 'circles_dataloader_broken')
# train model on nonlinear data
trainer = pl.Trainer(logger=csv_logger, max_epochs = 50)
trainer.fit(model=simple_dnn, train_dataloaders = train_loader_circles_broken, val_dataloaders = val_loader_circles)
format_csv('metrics_csv/circles_dataloader_broken/version_0/metrics.csv', ['train_loss', 'val_loss'], Path('reformatted_metrics/circle_dataloader_broken.csv'))

2023-04-08 15:56:42.822 | INFO     | train_utils:create_loaders:58 - Creating dataloaders with 80/20train/test split 🔪
2023-04-08 15:56:42.827 | INFO     | train_utils:create_loaders:84 - Successfully created train and validation loader 🤗
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type          | Params
------------------------------------------
0 | encoder | Encoder       | 52    
1 | f1      | BinaryF1Score | 0     
------------------------------------------
52        Trainable params
0         Non-trainable params
52        Total params
0.000     Total estimated model params size (MB)


Epoch 49: 100%|██████████| 7/7 [00:00<00:00, 100.66it/s, v_num=0]          

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 7/7 [00:00<00:00, 93.07it/s, v_num=0] 


# 7. Cleanlab Example

In [1]:
import cleanlab 
from skorch import NeuralNetClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from cleanlab.filter import find_label_issues
import numpy as np

In [8]:
simple_dnn = SimpleDnn(Encoder(input_dim = 2, output_dim = 2), task_type = 'classification')

np.random.seed(96578)
data_circles, label_circles = make_circles(n_samples=NUM_SAMPLES, factor=0.5, noise=0.05)

# create dataloaders
data_tensor_circles = TensorDataset(
                torch.Tensor(data_circles), F.one_hot(torch.Tensor(label_circles).to(torch.int64), num_classes=2).float() 
            )
train_loader_circles, val_loader_circles = create_loaders(data = data_tensor_circles, ratio = 0.8,  batch_size = 32, num_workers = 0,
                                                        shuffle_train = True, shuffle_val = False)
# train model
csv_logger = CSVLogger(save_dir='metrics_csv', name = 'circles_cleanlab')
# train model on nonlinear data
trainer = pl.Trainer(logger=csv_logger, max_epochs = 200)
trainer.fit(model=simple_dnn, train_dataloaders = train_loader_circles, val_dataloaders = val_loader_circles)
format_csv('metrics_csv/circles_cleanlab/version_0/metrics.csv', ['train_loss', 'val_loss'], Path('reformatted_metrics/circles_cleanlab.csv'))

model_skorch = NeuralNetClassifier(simple_dnn.encoder)
cl = cleanlab.classification.CleanLearning(model_skorch)

2023-04-10 12:28:54.726 | INFO     | train_utils:create_loaders:58 - Creating dataloaders with 80/20train/test split 🔪
2023-04-10 12:28:54.731 | INFO     | train_utils:create_loaders:84 - Successfully created train and validation loader 🤗
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type          | Params
------------------------------------------
0 | encoder | Encoder       | 52    
1 | f1      | BinaryF1Score | 0     
------------------------------------------
52        Trainable params
0         Non-trainable params
52        Total params
0.000     Total estimated model params size (MB)


Epoch 199: 100%|██████████| 13/13 [00:00<00:00, 101.77it/s, v_num=0]       

`Trainer.fit` stopped: `max_epochs=200` reached.


Epoch 199: 100%|██████████| 13/13 [00:00<00:00, 96.77it/s, v_num=0] 


In [4]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [5]:
pred_probs_logits = cross_val_predict(
    model_skorch,
    np.float32(data_circles),
    label_circles,
    cv=3,
    method="predict_proba",
)

pred_probs = np.apply_along_axis(sigmoid, 0, pred_probs_logits)

predicted_labels = pred_probs.argmax(axis=1)
acc = accuracy_score(label_circles, predicted_labels)
print(f"Cross-validated estimate of accuracy on held-out data: {acc}")

ranked_label_issues = find_label_issues(
    label_circles,
    pred_probs,
    return_indices_ranked_by="self_confidence",
)

print(f"Cleanlab found {len(ranked_label_issues)} label issues.")
print(f"Top 15 most likely label errors: \n {ranked_label_issues[:15]}")

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.5075[0m           nan  0.0396
      2           nan       0.5075           nan  0.0071
      3           nan       0.5075           nan  0.0061
      4           nan       0.5075           nan  0.0082
      5           nan       0.5075           nan  0.0079
      6           nan       0.5075           nan  0.0097
      7           nan       0.5075           nan  0.0083
      8           nan       0.5075           nan  0.0113
      9           nan       0.5075           nan  0.0096
     10           nan       0.5075           nan  0.0100
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.4925[0m           nan  0.0107
      2           nan       0.4925           nan  0.0085
      3           nan       0.4925           nan  0.0062
      4      

In [9]:
# concentric circles with label mixup
# save uncorrupted data
df = pd.DataFrame({'x1': data_circles[:,0], 'x2': data_circles[:,1], 'labels': label_circles})
df.to_csv('label_mixup/circles_uncorrupted.csv', index = False)

# corrupt 20 labels
corrupted_indices = np.random.choice(range(len(label_circles)), size = 20, replace = False)
label_circles[corrupted_indices] = abs(label_circles[corrupted_indices] - 1)

# save for plotting
df = pd.DataFrame({'x1': data_circles[:,0], 'x2': data_circles[:,1], 'labels': label_circles})
df.to_csv('label_mixup/circles_corrupted.csv', index = False)

# train model on corrupted data
data_tensor_circles = TensorDataset(
                torch.Tensor(data_circles), F.one_hot(torch.Tensor(label_circles).to(torch.int64), num_classes=2).float() 
            )

train_loader_circles, val_loader_circles = create_loaders(data = data_tensor_circles, ratio = 0.8,  batch_size = 32, num_workers = 0,
                                                        shuffle_train = True, shuffle_val = False)

# train model
csv_logger = CSVLogger(save_dir='metrics_csv', name = 'circles_cleanlab_corrupted')
# train model on nonlinear data
trainer = pl.Trainer(logger=csv_logger, max_epochs = 200)
trainer.fit(model=simple_dnn, train_dataloaders = train_loader_circles, val_dataloaders = val_loader_circles)
format_csv('metrics_csv/circles_cleanlab_corrupted/version_0/metrics.csv', ['train_loss', 'val_loss'], Path('reformatted_metrics/circles_cleanlab_corrupted.csv'))


model_skorch = NeuralNetClassifier(simple_dnn.encoder)
cl = cleanlab.classification.CleanLearning(model_skorch)

pred_probs_logits = cross_val_predict(
    model_skorch,
    np.float32(data_circles),
    label_circles,
    cv=3,
    method="predict_proba",
)

pred_probs = np.apply_along_axis(sigmoid, 0, pred_probs_logits)

predicted_labels = pred_probs.argmax(axis=1)
acc = accuracy_score(label_circles, predicted_labels)
print(f"Cross-validated estimate of accuracy on held-out data: {acc}")

ranked_label_issues = find_label_issues(
    label_circles,
    pred_probs,
    return_indices_ranked_by="self_confidence",
)

print(f"Cleanlab found {len(ranked_label_issues)} label issues.")
print(f"Top 15 most likely label errors: \n {ranked_label_issues[:15]}")




2023-04-10 12:31:18.312 | INFO     | train_utils:create_loaders:58 - Creating dataloaders with 80/20train/test split 🔪
2023-04-10 12:31:18.315 | INFO     | train_utils:create_loaders:84 - Successfully created train and validation loader 🤗
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type          | Params
------------------------------------------
0 | encoder | Encoder       | 52    
1 | f1      | BinaryF1Score | 0     
------------------------------------------
52        Trainable params
0         Non-trainable params
52        Total params
0.000     Total estimated model params size (MB)


Epoch 199: 100%|██████████| 13/13 [00:00<00:00, 129.54it/s, v_num=0]        

`Trainer.fit` stopped: `max_epochs=200` reached.


Epoch 199: 100%|██████████| 13/13 [00:00<00:00, 122.33it/s, v_num=0]
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.9403[0m           nan  0.0079
      2           nan       0.9403           nan  0.0066
      3           nan       0.9403           nan  0.0091
      4           nan       0.9403           nan  0.0074
      5           nan       0.9403           nan  0.0079
      6           nan       0.9403           nan  0.0087
      7           nan       0.9403           nan  0.0064
      8           nan       0.9403           nan  0.0074
      9           nan       0.9403           nan  0.0064
     10           nan       0.9403           nan  0.0075
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.9701[0m           nan  0.0076
      2           nan       0.9701           nan  0.0079
 

In [10]:
# save for plotting
df = pd.DataFrame({'x1': data_circles[ranked_label_issues[:20],0], 
                   'x2': data_circles[ranked_label_issues[:20],1],
                   'labels': label_circles[ranked_label_issues[:20]]*0 + 2})
df.to_csv('label_mixup/circles_corrupted_cleanlab_pred.csv', index = False)

In [None]:
px.scatter(         df,
                    x='x1',
                    y='x2',
                    color='labels',
                    size_max=60,
                    color_continuous_scale = px.colors.sequential.Peach,
                    opacity = 0.5,
                    )

In [100]:
# concentric circles with label mixup
data_circles, label_circles = make_circles(n_samples=NUM_SAMPLES, factor=0.5, noise=0.05)
corrupted_indices = np.random.choice(range(len(label_circles)), size = 20, replace = False)
df.to_csv('label_mixup/circles_uncorrupted.csv', index = False)
label_circles[corrupted_indices] = abs(label_circles[corrupted_indices] - 1)

data_tensor_circles = TensorDataset(
                torch.Tensor(data_circles), F.one_hot(torch.Tensor(label_circles).to(torch.int64), num_classes=2).float() 
            )

# save for plotting
df = pd.DataFrame({'x1': data_circles[:,0], 'x2': data_circles[:,1], 'labels': label_circles})
df.to_csv('label_mixup/circles_corrupted.csv', index = False)

train_loader_circles, val_loader_circles = create_loaders(data = data_tensor_circles, ratio = 0.8,  batch_size = 32, num_workers = 0,
                                                        shuffle_train = True, shuffle_val = False)

2023-04-08 16:29:29.811 | INFO     | train_utils:create_loaders:58 - Creating dataloaders with 80/20train/test split 🔪
2023-04-08 16:29:29.841 | INFO     | train_utils:create_loaders:84 - Successfully created train and validation loader 🤗


In [104]:
simple_dnn = SimpleDnn(Encoder(input_dim = 2, output_dim = 2), task_type = 'classification')
csv_logger = CSVLogger(save_dir='metrics_csv', name = 'circles_label_mixup')
# train model on nonlinear data
trainer = pl.Trainer(logger=csv_logger, max_epochs = 100)
trainer.fit(model=simple_dnn, train_dataloaders = train_loader_circles, val_dataloaders = val_loader_circles)
format_csv('metrics_csv/circles_label_mixup/version_0/metrics.csv', ['train_loss', 'val_loss'], Path('reformatted_metrics/circles_label_mixup_data.csv'))

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type          | Params
------------------------------------------
0 | encoder | Encoder       | 52    
1 | f1      | BinaryF1Score | 0     
------------------------------------------
52        Trainable params
0         Non-trainable params
52        Total params
0.000     Total estimated model params size (MB)


Epoch 99: 100%|██████████| 13/13 [00:00<00:00, 113.79it/s, v_num=0]         

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 13/13 [00:00<00:00, 92.24it/s, v_num=0] 


In [98]:
px.scatter(         df,
                    x='x1',
                    y='x2',
                    color='labels',
                    size_max=60,
                    color_continuous_scale = px.colors.sequential.Peach,
                    opacity = 0.5,
                    )

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [10]:
model_skorch = NeuralNetClassifier(simple_dnn.encoder)
cl = cleanlab.classification.CleanLearning(model_skorch)

data_circles = np.float32(data_circles)

pred_probs_logits = cross_val_predict(
    model_skorch,
    np.float32(data_circles),
    label_circles,
    cv=3,
    method="predict_proba",
)

pred_probs = np.apply_along_axis(sigmoid, 0, pred_probs_logits)

predicted_labels = pred_probs.argmax(axis=1)
acc = accuracy_score(label_circles, predicted_labels)
print(f"Cross-validated estimate of accuracy on held-out data: {acc}")

ranked_label_issues = find_label_issues(
    label_circles,
    pred_probs,
    return_indices_ranked_by="self_confidence",
)

print(f"Cleanlab found {len(ranked_label_issues)} label issues.")
print(f"Top 15 most likely label errors: \n {ranked_label_issues[:15]}")

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.4925[0m           nan  0.0076
      2           nan       0.4925           nan  0.0113
      3           nan       0.4925           nan  0.0122
      4           nan       0.4925           nan  0.0079
      5           nan       0.4925           nan  0.0078
      6           nan       0.4925           nan  0.0085
      7           nan       0.4925           nan  0.0066
      8           nan       0.4925           nan  0.0086
      9           nan       0.4925           nan  0.0120
     10           nan       0.4925           nan  0.0083
  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.5522[0m           nan  0.0057
      2           nan       0.5075           nan  0.0101
      3           nan       0.4925       [35m-0.9486[0m  0.0123
    

In [118]:
len(corrupted_indices)

20

In [116]:
len(list(set(corrupted_indices).intersection(ranked_label_issues)))

9

In [110]:
label_circles_allegedly_corrupted = label_circles.copy()
label_circles_allegedly_corrupted[ranked_label_issues] = 2

In [111]:
df2 = pd.DataFrame({'x1': data_circles[:,0], 'x2': data_circles[:,1], 'labels': label_circles_allegedly_corrupted})

In [112]:
px.scatter(         df2,
                    x='x1',
                    y='x2',
                    color='labels',
                    size_max=60,
                    color_continuous_scale = px.colors.sequential.Peach,
                    opacity = 0.5,
                    )

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

## 6. Broken models with weightwatcher

In [298]:
class CustomEncoder(nn.Module):
    """
    Simple encoder
    """

    def __init__(self, input_dim: int, output_dim: int) -> None:
        """Setup model layers"""
        super().__init__()
        self.dense1 =  nn.Linear(input_dim, 10)
        self.dense2 = nn.Linear(10, 9)
        self.dense3 = nn.Linear(10, 8)
        self.dense4 = nn.Linear(10, 7)
        self.dense5 = nn.Linear(10, 6)
        self.dense6 = nn.Linear(10, output_dim)
        self.relu = nn.ReLU()

        
        self.output_dim = output_dim
        self.input_dim = input_dim

    def forward(self, x: torch.Tensor) -> nn.Sequential:
        """Forward pass through layers"""
        x = self.dense1(x)
        x = self.relu(x)
        x = self.dense2(x)
        x = self.relu(x)
        x = self.dense3(x)
        x = self.relu(x)
        x = self.dense4(x)
        x = self.relu(x)
        x = self.dense5(x)
        x = self.relu(x)
        x = self.dense6(x)
        return x

simple_dnn = SimpleDnn(CustomEncoder(input_dim = 2, output_dim = 2), task_type = 'classification')
train_loader_circles_broken, val_loader_circles = create_loaders(data = data_tensor_circles, ratio = 0.8,  batch_size = 32, num_workers = 0,
                                                        shuffle_train = True, shuffle_val = False, subset_broken_train = True)

csv_logger = CSVLogger(save_dir='metrics_csv', name = 'circles_dataloader_broken')
# train model on nonlinear data
trainer = pl.Trainer(logger=csv_logger, max_epochs = 50)
trainer.fit(model=simple_dnn, train_dataloaders = train_loader_circles_broken, val_dataloaders = val_loader_circles)
format_csv('metrics_csv/circles_dataloader_broken/version_0/metrics.csv', ['train_loss', 'val_loss'], Path('reformatted_metrics/circle_dataloader_broken.csv'))

2023-04-06 15:05:59.754 | INFO     | train_utils:create_loaders:44 - Creating dataloaders with 80/20train/test split 🔪
2023-04-06 15:05:59.773 | INFO     | train_utils:create_loaders:70 - Successfully created train and validation loader 🤗
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type          | Params
------------------------------------------
0 | encoder | CustomEncoder | 492   
1 | f1      | BinaryF1Score | 0     
------------------------------------------
492       Trainable params
0         Non-trainable params
492       Total params
0.002     Total estimated model params size (MB)


Epoch 49: 100%|██████████| 7/7 [00:00<00:00, 70.24it/s, v_num=0]            

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 7/7 [00:00<00:00, 59.35it/s, v_num=0]


In [310]:
simple_dnn_everything_ok = SimpleDnn(CustomEncoder(input_dim = 2, output_dim = 2), task_type = 'classification')
train_loader_circles, val_loader_circles = create_loaders(data = data_tensor_circles, ratio = 0.8,  batch_size = 32, num_workers = 0,
                                                        shuffle_train = True, shuffle_val = False, subset_broken_train = False)

csv_logger = CSVLogger(save_dir='metrics_csv', name = 'circles_dataloader_broken')
# train model on nonlinear data
trainer = pl.Trainer(logger=csv_logger, max_epochs = 50)
trainer.fit(model=simple_dnn, train_dataloaders = train_loader_circles, val_dataloaders = val_loader_circles)
format_csv('metrics_csv/circles_dataloader_broken/version_0/metrics.csv', ['train_loss', 'val_loss'], Path('reformatted_metrics/circle_dataloader_broken.csv'))

2023-04-06 15:08:12.919 | INFO     | train_utils:create_loaders:44 - Creating dataloaders with 80/20train/test split 🔪
2023-04-06 15:08:12.926 | INFO     | train_utils:create_loaders:70 - Successfully created train and validation loader 🤗
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type          | Params
------------------------------------------
0 | encoder | CustomEncoder | 492   
1 | f1      | BinaryF1Score | 0     
------------------------------------------
492       Trainable params
0         Non-trainable params
492       Total params
0.002     Total estimated model params size (MB)


Epoch 49: 100%|██████████| 13/13 [00:00<00:00, 101.04it/s, v_num=0]         

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 13/13 [00:00<00:00, 95.04it/s, v_num=0] 


In [2]:
assertTrue(3 <= 2 <= 8)

NameError: name 'assertTrue' is not defined