In [1]:
from torch.utils.data import DataLoader, TensorDataset, Subset
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import StratifiedKFold
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
import math
import time
import multiprocessing

from cleanlab import Datalab

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Method for computing out-of-sample embeddings
def compute_embeddings(model, testloader):
    embeddings_list = []

    with torch.no_grad():
        for data in tqdm(testloader):
            images, labels = data[0].to(device), data[1].to(device)

            embeddings = model.embeddings(images)
            embeddings_list.append(embeddings.cpu())

    return torch.vstack(embeddings_list)


# Method for computing out-of-sample predicted probabilities
def compute_pred_probs(model, testloader):
    pred_probs_list = []

    with torch.no_grad():
        for data in tqdm(testloader):
            images, labels = data[0].to(device), data[1].to(device)

            outputs = model(images)
            pred_probs_list.append(outputs.cpu())

    return torch.vstack(pred_probs_list)

In [5]:
import sys
sys.path.append('../machine-learning')

from train import Classifier, TrainingDataset, my_collate

import fire
from pathlib import Path
import torchvision
from torchvision import transforms
import numpy as np
from sklearn import preprocessing
import torch
import torchmetrics
import pytorch_lightning as pl
import torch.nn as nn
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
import json

from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from torch.utils.data import Dataset, DataLoader
from PIL import Image

from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

data_dir = '/output/data/labeled_4312'
saving_dir = '/output/results/iter_7'

max_epochs = 1
sample = 0.1
#test_size = 0.2
batch_size = 16
learning_rate = 1e-4
threshold = 0.5
num_workers = 1
patience = 5
K = 3

data_dir = Path(data_dir)
saving_dir = Path(saving_dir)
saving_dir.mkdir(exist_ok = True, parents=True)

train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomApply([transforms.Grayscale(num_output_channels=3)], p=0.25),
    #transforms.RandomRotation(degrees=(0, 45)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

test_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])



imgs = np.array([str(p) for p in data_dir.rglob("*/*")])
labels = np.array([Path(p).parent.name for p in imgs])

n = int(imgs.shape[0]*sample)
imgs = imgs[:n]
labels = labels[:n]


le = preprocessing.LabelEncoder()
_labels = le.fit_transform(labels)
classes = le.classes_
labels = F.one_hot(torch.from_numpy(_labels)).float()




# Create k splits of the dataset

kfold = StratifiedKFold(n_splits=K, shuffle=True, random_state=0)
splits = kfold.split(imgs, _labels)

train_id_list, test_id_list = [], []

for fold, (train_ids, test_ids) in enumerate(splits):
    train_id_list.append(train_ids)
    test_id_list.append(test_ids)


pred_probs_list, embeddings_list = [], []
embeddings_model = None

for i in range(K):
    print(f"\nTraining on fold: {i+1} ...")

    # Create train and test sets and corresponding dataloaders

    X_train_val = imgs[train_id_list[i]]
    y_train_val = labels[train_id_list[i]]

    # Create validation split

    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=1.0/K)

    X_test = imgs[test_id_list[i]]
    y_test = labels[test_id_list[i]]

    train_dataset = TrainingDataset(X_train,y_train,transform = train_transform)
    val_dataset = TrainingDataset(X_val,y_val,transform = test_transform)
    test_dataset = TrainingDataset(X_test,y_test,transform = test_transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size,collate_fn=my_collate,num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,collate_fn=my_collate,num_workers=num_workers)
    test_loader = DataLoader(test_dataset, batch_size=batch_size,collate_fn=my_collate,num_workers=num_workers)

    model = Classifier(
        output_dim = len(classes), 
        learning_rate = learning_rate,
        threshold = threshold
    )

    callbacks = [EarlyStopping(monitor="valid_loss",patience=patience, verbose = True)]

    trainer = pl.Trainer(
        #gpus=1,
        accelerator="auto",
        max_epochs = max_epochs,
        log_every_n_steps=100,
        callbacks = callbacks
    )

    trainer.fit(model, train_loader, val_loader)
    trainer.test(dataloaders=test_loader)

    model = model.to(device)

    # Compute out-of-sample embeddings
    print("Computing feature embeddings ...")
    fold_embeddings = compute_embeddings(model, test_loader)
    embeddings_list.append(fold_embeddings)

    print("Computing predicted probabilities ...")
    # Compute out-of-sample predicted probabilities
    fold_pred_probs = compute_pred_probs(model, test_loader)
    pred_probs_list.append(fold_pred_probs)

print("Finished Training")


# Combine embeddings and predicted probabilities from each fold
features = torch.vstack(embeddings_list).numpy()

logits = torch.vstack(pred_probs_list)
#pred_probs = nn.Softmax(dim=1)(logits).numpy()

indices = np.hstack(test_id_list)

imgs = imgs[indices]
_labels = _labels[indices]





dataset = {'image':imgs,'label':_labels}
lab = Datalab(data=dataset, label_name="label", image_key="image")
lab.find_issues(features=features, pred_probs=pred_probs)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type           | Params
--------------------------------------------
0 | model    | ResNet         | 11.2 M
1 | sm       | Softmax        | 0     
2 | accuracy | BinaryAccuracy | 0     
--------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.708    Total estimated model params size (MB)



Training on fold: 1 ...


Sanity Checking: |                                                                                            …

/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (12) is smaller than the logging interval Trainer(log_every_n_steps=100). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

Metric valid_loss improved. New best score: 0.000
`Trainer.fit` stopped: `max_epochs=1` reached.
Restoring states from the checkpoint path at /code/notebooks/lightning_logs/version_23/checkpoints/epoch=0-step=12.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /code/notebooks/lightning_logs/version_23/checkpoints/epoch=0-step=12.ckpt
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Testing: |                                                                                                    …

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_acc_step                 1.0
        test_loss                   0.0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Computing feature embeddings ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:03<00:00,  2.32it/s]


Computing predicted probabilities ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:03<00:00,  2.31it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type           | Params
--------------------------------------------
0 | model    | ResNet         | 11.2 M
1 | sm       | Softmax        | 0     
2 | accuracy | BinaryAccuracy | 0     
--------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.708    Total estimated model params size (MB)



Training on fold: 2 ...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Metric valid_loss improved. New best score: 0.000
`Trainer.fit` stopped: `max_epochs=1` reached.
Restoring states from the checkpoint path at /code/notebooks/lightning_logs/version_24/checkpoints/epoch=0-step=12.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /code/notebooks/lightning_logs/version_24/checkpoints/epoch=0-step=12.ckpt


Testing: |                                                                                                    …

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_acc_step                 1.0
        test_loss                   0.0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Computing feature embeddings ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:04<00:00,  1.80it/s]


Computing predicted probabilities ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:04<00:00,  1.82it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type           | Params
--------------------------------------------
0 | model    | ResNet         | 11.2 M
1 | sm       | Softmax        | 0     
2 | accuracy | BinaryAccuracy | 0     
--------------------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.708    Total estimated model params size (MB)



Training on fold: 3 ...


Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Metric valid_loss improved. New best score: 0.000
`Trainer.fit` stopped: `max_epochs=1` reached.
Restoring states from the checkpoint path at /code/notebooks/lightning_logs/version_25/checkpoints/epoch=0-step=12.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /code/notebooks/lightning_logs/version_25/checkpoints/epoch=0-step=12.ckpt


Testing: |                                                                                                    …

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_acc_step                 1.0
        test_loss                   0.0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Computing feature embeddings ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:04<00:00,  1.90it/s]


Computing predicted probabilities ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:04<00:00,  1.89it/s]

Finished Training
Finding label issues ...
Error in label: Labels must contain at least 2 classes.
Finding outlier issues ...
Fitting OOD estimator based on provided features ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Failed to check for these issue types: [LabelIssueManager]
Finding dark, light, low_information, odd_aspect_ratio, odd_size, grayscale, blurry images ...





  0%|          | 0/431 [00:00<?, ?it/s]

Error in checking for image issues: 'str' object has no attribute 'mode'

Audit complete. 73 issues found in the dataset.


In [8]:

lab.report()

Here is a summary of the different kinds of issues found in the data:

    issue_type  num_issues
near_duplicate          37
       outlier          35
       non_iid           1

Dataset Information: num_examples: 431, num_classes: 1


------------------ near_duplicate issues -------------------

About this issue:
	A (near) duplicate issue refers to two or more examples in
    a dataset that are extremely similar to each other, relative
    to the rest of the dataset.  The examples flagged with this issue
    may be exactly duplicated, or lie atypically close together when
    represented as vectors (i.e. feature embeddings).
    

Number of examples with this issue: 37
Overall dataset quality in terms of this issue: 0.0048

Examples representing most severe instances of this issue:
     is_near_duplicate_issue  near_duplicate_score near_duplicate_sets  distance_to_nearest_neighbor
379                     True              0.000028                [63]                      0.000028
63 

In [4]:
label_issues = lab.get_issues("outlier")
label_issues.head()

ValueError: No columns found for issue type 'label'.

In [19]:




model = Classifier(
    output_dim = 2, 
    learning_rate = 0,
    threshold = 0.5
)

model.embeddings()

TypeError: embeddings() missing 1 required positional argument: 'x'

In [5]:
K = 3  # Number of cross-validation folds. Set to small value here to ensure quick runtimes, we recommend 5 or 10 in practice for more accurate estimates.
n_epochs = 2  # Number of epochs to train model for. Set to a small value here for quick runtime, you should use a larger value in practice.
patience = 2  # Parameter for early stopping. If the validation accuracy does not improve for this many epochs, training will stop.
train_batch_size = 64  # Batch size for training
test_batch_size = 512  # Batch size for testing


num_workers = multiprocessing.cpu_count()  # Number of workers for data loaders

# Create k splits of the dataset
kfold = StratifiedKFold(n_splits=K, shuffle=True, random_state=0)
splits = kfold.split(transformed_dataset, transformed_dataset["label"])



NameError: name 'transformed_dataset' is not defined

In [None]:
pred_probs_list, embeddings_list = [], []
embeddings_model = None

for i in range(K):
    print(f"\nTraining on fold: {i+1} ...")

    # Create train and test sets and corresponding dataloaders
    trainset = Subset(torch_dataset, train_id_list[i])
    testset = Subset(torch_dataset, test_id_list[i])

    trainloader = DataLoader(
        trainset,
        batch_size=train_batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
    )
    testloader = DataLoader(
        testset, batch_size=test_batch_size, shuffle=False, num_workers=num_workers, pin_memory=True
    )

    # Train model
    model = train(trainloader, testloader, n_epochs, patience)
    if embeddings_model is None:
        embeddings_model = model

    # Compute out-of-sample embeddings
    print("Computing feature embeddings ...")
    fold_embeddings = compute_embeddings(embeddings_model, testloader)
    embeddings_list.append(fold_embeddings)

    print("Computing predicted probabilities ...")
    # Compute out-of-sample predicted probabilities
    fold_pred_probs = compute_pred_probs(model, testloader)
    pred_probs_list.append(fold_pred_probs)

print("Finished Training")


# Combine embeddings and predicted probabilities from each fold
features = torch.vstack(embeddings_list).numpy()

logits = torch.vstack(pred_probs_list)
pred_probs = nn.Softmax(dim=1)(logits).numpy()