In [12]:
# https://docs.cleanlab.ai/stable/tutorials/image.html

import torch
from torch import nn
import torchvision.models
from sklearn.datasets import fetch_openml
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
# from skorch import NeuralNetBinaryClassifier
from skorch import NeuralNetClassifier
import numpy as np
from torchvision import models

max_epochs = 5
num_crossval_folds = 5  # for efficiency; values like 5 or 10 will generally work better

# define the network
# model.py
models_dict = {'resnet18': torchvision.models.resnet18,
               'resnet34': torchvision.models.resnet34,
               'resnet50': torchvision.models.resnet50,
               'resnet101': torchvision.models.resnet101,
               'resnet152': torchvision.models.resnet152}


# model_specs = {'model_name':'resnet34','n_channels':4,'n_filters':64,'n_classes':len(ann_dict)-1,'kernel_size':3,'stride':1,'padding':1, 'batch_size':32}
class ResNet(nn.Module):
    def __init__(self, model='resnet34',n_channels=4,n_filters=64,n_classes=2,kernel_size=3,stride=1,padding=1): # n_classes=1
        super().__init__()
        self.n_classes = n_classes
        self.base_model = models_dict[model](pretrained=True)
        self._feature_vector_dimension = self.base_model.fc.in_features
        self.base_model.conv1 = nn.Conv2d(n_channels, n_filters, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
        self.base_model = nn.Sequential(*list(self.base_model.children())[:-1]) # Remove the final fully connected layer
        self.fc = nn.Linear(self._feature_vector_dimension, n_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.base_model(x)
        features = x.view(x.size(0), -1)
        return self.fc(features)

    def extract_features(self,x):
        x = self.base_model(x)
        return x.view(x.size(0), -1)

    def get_predictions(self,x):
        x = self.base_model(x)
        features = x.view(x.size(0), -1)
        output = self.fc(features)
        if self.n_classes == 1:
            return torch.sigmoid(output)
        else:
            return torch.softmax(output,dim=1)

    def get_predictions_and_features(self,x):
        x = self.base_model(x)
        features = x.view(x.size(0), -1)
        output = self.fc(features)
        if self.n_classes == 1:
            return torch.sigmoid(output), features
        else:
            return torch.softmax(output,dim=1), features

In [13]:
# make the network sklearn compatible
print(torch.cuda.is_available())
if torch.cuda.is_available():
    model_skorch = NeuralNetClassifier(ResNet,max_epochs=max_epochs,device='cuda')
else:
    model_skorch = NeuralNetClassifier(ResNet,max_epochs=max_epochs,device='cpu')

False


In [16]:
# load data
# load positive data
data_positive = np.load('../data/combined_images_parasite.npy')/255.0
label_positive = np.ones(data_positive.shape[0])
print("pos samples: ", data_positive.shape)

# load negative data
data_negative = np.load('../data/combined_images_neg.npy')/255.0
# random_indexes = np.random.randint(0, data_negative.shape[0], size=min(data_negative.shape[0],2000))
# data_negative = data_negative[random_indexes,:,:,:]
# label_negative = np.zeros(data_negative.shape[0])
label_negative = np.zeros(data_negative.shape[0])
print("neg samples: ", data_negative.shape)

# combine positive and negative
data = np.concatenate((data_positive,data_negative),axis=0) # [:2000] TODO: just for testing, get first 2000 pos
label = np.concatenate((label_positive,label_negative))
print("total samples: ", data.shape)

pos samples:  (76126, 4, 31, 31)
neg samples:  (744187, 4, 31, 31)
total samples:  (820313, 4, 31, 31)


In [None]:
# save the data to file

# import pandas as pd
# df = pd.DataFrame({'index': range(len(label)), 'annotation': label.astype("int64")})
# np.save('combined_images_parasite_and_non-parasite.npy', data)
# np.save('combined_ann_parasite_and_non=parasite.npy', label)
# df.to_csv('combined_ann_parasite_and_non=parasite.csv', index=False)

In [None]:
-# shuffle
indices = np.random.choice(len(data), len(data), replace=False)
data = data[indices,:,:,:]
label = label[indices]

data = data.astype("float32")
label = label.astype("int64")
print(label.shape)

In [7]:
# compute out-of-sample predicted probabilities
pred_probs = cross_val_predict(
    model_skorch,
    data,
    label,
    cv=num_crossval_folds,
    verbose=1,
    method="predict_proba",
)



  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.3141[0m           nan  9.2034
      2           nan       0.2656           nan  9.1737
      3           nan       0.1203           nan  9.1411
      4           nan       0.0953           nan  8.6913
      5           nan       0.0875           nan  7.4293




  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.5000[0m           nan  8.8361
      2           nan       0.5000           nan  7.3281
      3           nan       0.5000           nan  7.4063
      4           nan       0.5000           nan  7.7529
      5           nan       0.5000           nan  7.6279




  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.5000[0m           nan  7.6320
      2           nan       0.5000           nan  9.2833
      3           nan       0.5000           nan  8.9653
      4           nan       0.5000           nan  8.8405
      5           nan       0.5000           nan  9.3633




  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.5000[0m           nan  9.3131
      2           nan       0.5000           nan  9.7748
      3           nan       0.5000           nan  8.5288
      4           nan       0.5000           nan  9.3144
      5           nan       0.5000           nan  9.3797




  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1           nan       [32m0.5000[0m           nan  8.5336
      2           nan       0.5000           nan  8.4787
      3           nan       0.5000           nan  9.4265
      4           nan       0.5000           nan  9.2578
      5           nan       0.5000           nan  9.1326


In [9]:
predicted_labels = pred_probs.argmax(axis=1)

acc = accuracy_score(label, predicted_labels)
print(f"Cross-validated estimate of accuracy on held-out data: {acc}")

Cross-validated estimate of accuracy on held-out data: 0.41225


In [11]:
pred_probs

array([[-21.042198, -10.585772],
       [-20.357294, -20.46756 ],
       [-22.115055, -10.822799],
       ...,
       [-14.730211,  46.487206],
       [-14.644593,  31.0262  ],
       [-38.34266 ,   8.534811]], dtype=float32)

In [None]:
# Use cleanlab to find label issues
from cleanlab.filter import find_label_issues

ranked_label_issues = find_label_issues(
    label,
    pred_probs,
    return_indices_ranked_by="self_confidence",
)

print(f"Cleanlab found {len(ranked_label_issues)} label issues.")
print(f"Top 15 most likely label errors: \n {ranked_label_issues[:15]}")