use activelab: https://github.com/cleanlab/examples/blob/master/active_learning_single_annotator/active_learning_single_annotator.ipynb

In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import DataLoader, TensorDataset
import time

from sklearn.model_selection import train_test_split
from cleanlab.multiannotator import get_label_quality_scores, get_active_learning_scores

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# model.py

models_dict = {'resnet18': torchvision.models.resnet18,
               'resnet34': torchvision.models.resnet34,
               'resnet50': torchvision.models.resnet50,
               'resnet101': torchvision.models.resnet101,
               'resnet152': torchvision.models.resnet152}

class ResNet(nn.Module):
    def __init__(self, model='resnet18',n_channels=4,n_filters=64,n_classes=1,kernel_size=3,stride=1,padding=1):
        super().__init__()
        self.n_classes = n_classes
        self.base_model = models_dict[model](pretrained=True)
        self._feature_vector_dimension = self.base_model.fc.in_features
        self.base_model.conv1 = nn.Conv2d(n_channels, n_filters, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
        self.base_model = nn.Sequential(*list(self.base_model.children())[:-1]) # Remove the final fully connected layer
        self.fc = nn.Linear(self._feature_vector_dimension, n_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.base_model(x)
        features = x.view(x.size(0), -1)
        return self.fc(features)

    def extract_features(self,x):
        x = self.base_model(x)
        return x.view(x.size(0), -1)
    
    def extract_early_features(self, x):  # try earlier layer
        x = self.base_model.conv1(x)
        x = self.base_model.bn1(x)
        x = self.base_model.relu(x)
        x = self.base_model.maxpool(x)
        x = self.base_model.layer1(x)

        return x.view(x.size(0), -1)

    def get_predictions(self,x):
        x = self.base_model(x)
        features = x.view(x.size(0), -1)
        output = self.fc(features)
        if self.n_classes == 1:
            return torch.sigmoid(output)
        else:
            return torch.softmax(output,dim=1)

    def get_predictions_and_features(self,x):
        x = self.base_model(x)
        features = x.view(x.size(0), -1)
        output = self.fc(features)
        if self.n_classes == 1:
            return torch.sigmoid(output), features
        else:
            return torch.softmax(output,dim=1), features

    def get_features(self,x):
        x = self.base_model(x)
        features = x.view(x.size(0), -1)
        return features

In [3]:
# utils.py

def generate_predictions_and_features(model,images,batch_size, verbose=True):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    if images.dtype == np.uint8:
        images = images.astype(np.float32)/255.0 # convert to 0-1 if uint8 input

    # build dataset
    dataset = TensorDataset(
        torch.from_numpy(images), 
        torch.from_numpy(np.ones(images.shape[0]))
        )

    # dataloader
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # run inference 
    all_features = []
    all_predictions = []
    t0 = time.time()

    for k, (images, labels) in enumerate(dataloader):

        images = images.float().to(device)

        predictions, features = model.get_predictions_and_features(images)
        predictions = predictions.detach().cpu().numpy()
        features = features.detach().cpu().numpy().squeeze()

        all_predictions.append(predictions)
        all_features.append(features)

    predictions = np.vstack(all_predictions)
    features = np.vstack(all_features)

    if verbose:
        print('running inference on ' + str(predictions.shape[0]) + ' images took ' + str(time.time()-t0) + ' s')

    return predictions, features

In [4]:
dir_path = '../../npy_v2/'

trained_on = ['PAT-070-3_2023-01-22_15-24-28.812821.npy',
                'PAT-071-3_2023-01-22_15-47-3.096602.npy',
                'PAT-072-1_2023-01-22_17-17-58.363496.npy',
                'PAT-073-1_2023-01-22_16-32-5.192404.npy',
                'PAT-074-1_2023-01-22_16-55-50.887780.npy',
                'PBC-404-1_2023-01-22_19-09-9.267139.npy',
                'PBC-502-1_2023-01-22_17-49-38.429975.npy',
                'PBC-800-1_2023-01-22_21-30-44.794123.npy',
                'PBC-801-1_2023-01-22_22-06-18.047215.npy',
                'PBC-1023-1_2023-01-22_19-59-54.633046.npy']

# exclude above, everything else in npy_v2 should be loaded into images_unlabelled
files_unlabelled = [f for f in os.listdir(dir_path) if f.endswith('.npy') and f not in trained_on and not f.startswith('SBC')]
# TODO: fold in known SBC negatives

In [5]:
images_unlabelled = []
for f in files_unlabelled[:10]: # TODO: do it in batches, 41 slides total
    images_unlabelled.append(np.load(dir_path + f))

images_unlabelled = np.vstack(images_unlabelled)
images_unlabelled.shape

Train model on labeled data and get predicted class probabilites for unlabeled data

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ResNet(model='resnet34', n_channels=4, n_filters=64, n_classes=3, kernel_size=3, stride=1, padding=1)
model.load_state_dict(torch.load('../model_resnet34_1704164366.7755363.pt'))
model = model.to(device)



In [10]:
def active_query(model, images, batch_size):
    """
    Selects a subset of the o.o.d. data to interactively query user to obtain labels, based on active learning.
    """
    # generate predictions and features
    pred_probs_unlabeled, features = generate_predictions_and_features(model,images,batch_size)
    pred_probs_unlabeled = pred_probs_unlabeled.squeeze()

    # compute active learning scores
    _, active_learning_scores_unlabeled = get_active_learning_scores(
        pred_probs_unlabeled=pred_probs_unlabeled
        # df_labeled['label'].to_numpy(), pred_probs_unlabeled=pred_probs_unlabeled  # TODO: in the future can also choose relabeling from labeled dataset (may need ood preds)
    )

    # active_learning_scores_unlabeled[:5]

    return np.argsort(active_learning_scores_unlabeled)[:batch_size]

In [8]:
next_to_label = active_query(model, images_unlabelled)

running inference on 1565289 images took 151.5050084590912 s


In [9]:
next_to_label

array([ 799615, 1088715,  197927,  177382,  320682,    6950,  759281,
       1267554,  336530,  789011, 1244383, 1220176, 1273893,  760205,
        830740, 1235760,  208420, 1351656,  494312,    2584, 1266352,
       1284274, 1289756, 1287595, 1412573,  135135, 1248574,  528091,
       1279330, 1086184,  376974, 1247145])

- show images to label, get user labels
- update model
- remove from unlabeled images

In [10]:
def update(model, images, labels, num_epochs=10, learning_rate=0.1):   # TODO: simple update with higher LR, can weight based on scores/how off

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    if images.dtype == np.uint8:
        images = images.astype(np.float32)/255.0 # convert to 0-1 if uint8 input

    dataset = TensorDataset(
        torch.from_numpy(images), 
        torch.from_numpy(labels)
    )
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    model.train()

    for epoch in range(num_epochs):
        for inputs, labels in dataloader: 

            inputs = inputs.float().to(device)
            labels = labels.long().to(device)

            optimizer.zero_grad() 
            output = model(inputs)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
    
    return model

In [8]:
# simple test run
# test = active_query(data)
# new_model = finetune(model, test)

running inference on 32 images took 0.23706650733947754 s
(32, 4, 31, 31) (32, 3) (32,)


active learning rounds

In [None]:
num_rounds = 10

for _ in range(num_rounds):
    # get next batch to label
    next_to_label = active_query(model, images_unlabelled)
    images = images_unlabelled[next_to_label]

    # get labels from user
    preds, _ = generate_predictions_and_features(model, images, 32) # TODO: already got this from active_query, can just pass in
    labels = np.random.choice(2, len(images)) # TODO: get labels from user

    # update model
    model = update(model, images, labels, 10, 0.1)

    # remove from unlabelled
    images_unlabelled = np.delete(images_unlabelled, next_to_label, axis=0)

    # TODO: evaluate on hold-out set, or stop when user no longer needs to correct

np.save(model, 'model_active_learning.npy')