In [6]:
import torch
from PIL import Image
from cleanlab import Datalab
import matplotlib.pyplot as plt

features = torch.load('/storage/results/features.pt')
pred_probs = torch.load('/storage/results/pred_probs.pt')
dataset = torch.load('/storage/results/dataset.pt')
print(dataset)
imgs = dataset['image']

lab = Datalab(data=dataset, label_name="label", image_key="image")
lab.find_issues(features=features, pred_probs=pred_probs)

lab.report()

issues = lab.get_issues("outlier")
issues = issues.sort_values(by='outlier_score', ascending=False)
issues = issues.tail(18)

for path in imgs[issues.index]:
    img = Image.open(path)
    
    fig,ax = plt.subplots(1,1)
    ax.imshow(img)
    plt.show()


issues = lab.get_issues("label")
issues = issues.loc[issues['is_label_issue'] == True]
issues = issues.head(20)

for i,row in issues.iterrows():
    img = Image.open(imgs[i])

    print(f"gt: {classes[row['given_label']]}, predicted: {classes[row['predicted_label']]}")
    fig,ax = plt.subplots(1,1,figsize = (10,10))
    ax.imshow(img)
    ax.axis('off')
    plt.show()

{'image': array(['/output/data/labeled_4312/watermark/[ph]478[ph]https___www_esbirky_cz_detail_4043982.jpg',
       '/output/data/labeled_4312/watermark/[ph]2048217[ph]europeana_fashion_MUDE_M_D0152_01.jpg',
       '/output/data/labeled_4312/watermark/[ph]2048217[ph]europeana_fashion_MUDE_M_0269_02.jpg',
       ...,
       '/output/data/labeled_4312/no_watermark/[ph]08502[ph]Athena_Update_ProvidedCHO_The_Israel_Museum__Jerusalem_397502.jpg',
       '/output/data/labeled_4312/no_watermark/[ph]519[ph]catalognumber_mnhn_p_p05316349.jpg',
       '/output/data/labeled_4312/no_watermark/[ph]440[ph]item_PP6JNL6KVN6ZQJMZKGEMFRDCTXJAKXWH.jpg'],
      dtype='<U178'), 'label': array([1, 1, 1, ..., 0, 0, 0])}
Finding label issues ...
Finding outlier issues ...
Fitting OOD estimator based on provided features ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding dark, light, low_information, odd_aspect_ratio, odd_size, grayscale, blurry images ...


  0%|          | 0/2587 [00:00<?, ?it/s]

Error in checking for image issues: 'str' object has no attribute 'width'

Audit complete. 408 issues found in the dataset.
Here is a summary of the different kinds of issues found in the data:

    issue_type  num_issues
         label         389
       outlier          18
       non_iid           1
near_duplicate           0

Dataset Information: num_examples: 2587, num_classes: 2


----------------------- label issues -----------------------

About this issue:
	Examples whose given label is estimated to be potentially incorrect
    (e.g. due to annotation error) are flagged as having label issues.
    

Number of examples with this issue: 389
Overall dataset quality in terms of this issue: 0.6444

Examples representing most severe instances of this issue:
      is_label_issue  label_score  given_label  predicted_label
2531            True     0.001507            0                1
764             True     0.001615            0                1
716             True     0.002132     

FileNotFoundError: [Errno 2] No such file or directory: '/output/data/labeled_4312/no_watermark/[ph]447[ph]GEO0017914.jpg'

In [None]:
from torch.utils.data import DataLoader, TensorDataset, Subset
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import StratifiedKFold
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
import math
import time
import multiprocessing

from cleanlab import Datalab

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Method for computing out-of-sample embeddings
def compute_embeddings(model, testloader):
    embeddings_list = []

    with torch.no_grad():
        for data in tqdm(testloader):
            images, labels = data[0].to(device), data[1].to(device)

            embeddings = model.embeddings(images)
            embeddings_list.append(embeddings.cpu())

    return torch.vstack(embeddings_list)


# Method for computing out-of-sample predicted probabilities
def compute_pred_probs(model, testloader):
    pred_probs_list = []

    with torch.no_grad():
        for data in tqdm(testloader):
            images, labels = data[0].to(device), data[1].to(device)

            outputs = model(images)
            pred_probs_list.append(outputs.cpu())

    return torch.vstack(pred_probs_list)

In [None]:
import sys
sys.path.append('../machine-learning')

from train import Classifier, TrainingDataset, my_collate

import fire
from pathlib import Path
import torchvision
from torchvision import transforms
import numpy as np
from sklearn import preprocessing
import torch
import torchmetrics
import pytorch_lightning as pl
import torch.nn as nn
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
import json

from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from torch.utils.data import Dataset, DataLoader
from PIL import Image

from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

class Classifier(pl.LightningModule): 
  
    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()

        output_dim = kwargs.get('output_dim')
        learning_rate = kwargs.get('learning_rate')
        self.threshold = kwargs.get('threshold')

        self.model = torchvision.models.resnet18(pretrained=True)
        self.model.fc = nn.Linear(self.model.fc.in_features, output_dim)
        self.embedding = nn.Sequential(*list(self.model.children())[:-1])

        #self.output = nn.Linear(self.model.fc.out_features, output_dim)
        
        self.sm = nn.Softmax(dim=1)
        self.accuracy = torchmetrics.Accuracy(task='binary')

    def forward(self, x):
        out = self.model(x)
        #out = self.output(x)
        out = self.sm(out)
        return out

    def embeddings(self, x):
        out = self.embedding(x)
        return out

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        y_hat = torch.where(y_hat>self.threshold,1,0).int()
        self.accuracy(y_hat, y.int())
        self.log('valid_acc_step', self.accuracy)
        self.log('valid_loss', loss)

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        
        loss = F.cross_entropy(y_hat, y)
        y_hat = torch.where(y_hat>self.threshold,1,0).int()
        self.accuracy(y_hat, y.int())
        self.log('test_acc_step', self.accuracy)
        self.log('test_loss', loss)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)

In [None]:


data_dir = '/output/data/labeled_4312'
saving_dir = '/output/results/iter_7'

max_epochs = 1
sample = 0.6
#test_size = 0.2
batch_size = 16
learning_rate = 1e-4
threshold = 0.5
num_workers = 1
patience = 5
K = 3

data_dir = Path(data_dir)
saving_dir = Path(saving_dir)
saving_dir.mkdir(exist_ok = True, parents=True)

train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomApply([transforms.Grayscale(num_output_channels=3)], p=0.25),
    #transforms.RandomRotation(degrees=(0, 45)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

test_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])



imgs = np.array([str(p) for p in data_dir.rglob("*/*")])
labels = np.array([Path(p).parent.name for p in imgs])

n = int(imgs.shape[0]*sample)
imgs = imgs[:n]
labels = labels[:n]


le = preprocessing.LabelEncoder()
_labels = le.fit_transform(labels)
classes = le.classes_
labels = F.one_hot(torch.from_numpy(_labels)).float()

print(len(classes))


# Create k splits of the dataset

kfold = StratifiedKFold(n_splits=K, shuffle=True, random_state=0)
splits = kfold.split(imgs, _labels)

train_id_list, test_id_list = [], []

for fold, (train_ids, test_ids) in enumerate(splits):
    train_id_list.append(train_ids)
    test_id_list.append(test_ids)


pred_probs_list, embeddings_list = [], []
embeddings_model = None

for i in range(K):
    print(f"\nTraining on fold: {i+1} ...")

    # Create train and test sets and corresponding dataloaders

    X_train_val = imgs[train_id_list[i]]
    y_train_val = labels[train_id_list[i]]

    # Create validation split

    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=1.0/K)

    X_test = imgs[test_id_list[i]]
    y_test = labels[test_id_list[i]]

    train_dataset = TrainingDataset(X_train,y_train,transform = train_transform)
    val_dataset = TrainingDataset(X_val,y_val,transform = test_transform)
    test_dataset = TrainingDataset(X_test,y_test,transform = test_transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size,collate_fn=my_collate,num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,collate_fn=my_collate,num_workers=num_workers)
    test_loader = DataLoader(test_dataset, batch_size=batch_size,collate_fn=my_collate,num_workers=num_workers)

    model = Classifier(
        output_dim = len(classes), 
        learning_rate = learning_rate,
        threshold = threshold
    )

    callbacks = [EarlyStopping(monitor="valid_loss",patience=patience, verbose = True)]

    trainer = pl.Trainer(
        #gpus=1,
        accelerator="auto",
        max_epochs = max_epochs,
        log_every_n_steps=100,
        callbacks = callbacks
    )

    trainer.fit(model, train_loader, val_loader)
    trainer.test(dataloaders=test_loader)

    model = model.to(device)

    # Compute out-of-sample embeddings
    print("Computing feature embeddings ...")
    fold_embeddings = compute_embeddings(model, test_loader)
    embeddings_list.append(fold_embeddings)

    print("Computing predicted probabilities ...")
    # Compute out-of-sample predicted probabilities
    fold_pred_probs = compute_pred_probs(model, test_loader)
    pred_probs_list.append(fold_pred_probs)

print("Finished Training")


# Combine embeddings and predicted probabilities from each fold
features = torch.vstack(embeddings_list).numpy()
features = np.squeeze(features)

logits = torch.vstack(pred_probs_list)
pred_probs = logits.numpy()

indices = np.hstack(test_id_list)

imgs = imgs[indices]
_labels = _labels[indices]
dataset = {'image':imgs,'label':_labels}

saving_dir

torch.save(features, '/output/results/features.pt')
torch.save(dataset, '/output/results/dataset.pt')
torch.save(pred_probs, '/output/results/pred_probs.pt')


ImportError: Datalab is not available due to missing dependencies. To install Datalab, run `pip install 'cleanlab[datalab]'`. (raised when trying to call with args: (), kwargs: {'data': {'image': array(['/output/data/labeled_4312/watermark/[ph]478[ph]https___www_esbirky_cz_detail_4043982.jpg',
       '/output/data/labeled_4312/watermark/[ph]2048217[ph]europeana_fashion_MUDE_M_D0152_01.jpg',
       '/output/data/labeled_4312/watermark/[ph]2048217[ph]europeana_fashion_MUDE_M_0269_02.jpg',
       ...,
       '/output/data/labeled_4312/no_watermark/[ph]08502[ph]Athena_Update_ProvidedCHO_The_Israel_Museum__Jerusalem_397502.jpg',
       '/output/data/labeled_4312/no_watermark/[ph]519[ph]catalognumber_mnhn_p_p05316349.jpg',
       '/output/data/labeled_4312/no_watermark/[ph]440[ph]item_PP6JNL6KVN6ZQJMZKGEMFRDCTXJAKXWH.jpg'],
      dtype='<U178'), 'label': array([1, 1, 1, ..., 0, 0, 0])}, 'label_name': 'label', 'image_key': 'image'})

In [None]:
classes

