# Load CIFAR-10H data
This notebook will run our experiments using an expert based on the CIFAR-10H dataset.

Results are for the Confidence approach and our method.

In [None]:
import math
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import argparse
import os
import shutil
import time
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.autograd import Variable

# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)


Download the CIFAR-10H label probabilities from https://github.com/jcpeterson/cifar-10h/blob/master/data/cifar10h-probs.npy; a copy is included in this repository for convenience.

In [None]:

# Per-image label probabilities. Row j is used further down as the multinomial
# distribution from which the simulated expert's label for the j-th image of an
# unshuffled CIFAR-10 test loader is drawn.
# NOTE(review): presumably shape (10000, 10), in test-set order; confirm against the CIFAR-10H repository.
cifar10h = np.load('cifar10h-probs.npy')


In [None]:
def metrics_cifar10h(cifar10h, loader):
    """Estimate the accuracy of the simulated CIFAR-10H expert.

    For every sample yielded by ``loader`` one expert label is drawn from the
    matching row of ``cifar10h`` (a single multinomial draw) and compared with
    the true label. The draws use NumPy's global random state, so results vary
    from call to call unless that state is seeded by the caller.

    Args:
        cifar10h: indexable of class-probability vectors. Row ``j`` is paired
            with the ``j``-th sample yielded by ``loader``, so the loader must
            iterate in the same order as ``cifar10h`` (i.e. not shuffled).
        loader: iterable of ``(images, labels)`` batches; only ``labels`` is read.

    Returns:
        ``(overall, class_wise)``: the overall accuracy in percent and a list
        with the accuracy in percent for each of the 10 classes. A class with
        no samples (and the overall value for an empty loader) is reported as NaN.
    """
    n_classes = 10
    class_hits = [0] * n_classes
    class_counts = [0] * n_classes
    j = 0  # running row index into cifar10h
    for _, labels in loader:
        # Labels are only compared on the host, so one bulk conversion per batch
        # replaces a device copy plus a .item() call per sample.
        for label in labels.tolist():
            exp_prediction = np.argmax(np.random.multinomial(1, cifar10h[j]))
            j += 1
            class_counts[label] += 1
            class_hits[label] += int(exp_prediction == label)

    nan = float('nan')
    total = sum(class_counts)
    class_wise = [100 * hits / count if count else nan
                  for hits, count in zip(class_hits, class_counts)]
    overall = 100 * sum(class_hits) / total if total else nan
    return overall, class_wise


def metrics_cifar10h_class_wise(cifar10h, loader):
    """Return the per-class accuracy (in percent) of the simulated CIFAR-10H expert.

    One expert label per sample is drawn from the matching row of ``cifar10h``
    using NumPy's global random state and compared with the true label.

    Args:
        cifar10h: indexable of class-probability vectors. Row ``j`` is paired
            with the ``j``-th sample yielded by ``loader``, so the loader must
            iterate in the same order as ``cifar10h`` (i.e. not shuffled).
        loader: iterable of ``(images, labels)`` batches; only ``labels`` is read.

    Returns:
        A list of 10 accuracies in percent, one per class. A class that never
        occurs in ``loader`` is reported as NaN.
    """
    n_classes = 10
    class_hits = [0] * n_classes
    class_counts = [0] * n_classes
    j = 0  # running row index into cifar10h
    for _, labels in loader:
        # The comparison happens on the host; convert the whole batch once.
        for label in labels.tolist():
            exp_prediction = np.argmax(np.random.multinomial(1, cifar10h[j]))
            j += 1
            class_counts[label] += 1
            class_hits[label] += int(exp_prediction == label)

    nan = float('nan')
    return [100 * hits / count if count else nan
            for hits, count in zip(class_hits, class_counts)]

# data
# Channel-wise mean/std, written on the 0-255 scale and rescaled to [0, 1],
# used to standardise the image tensors.
normalize = transforms.Normalize(mean=[x/255.0 for x in [125.3, 123.0, 113.9]],
                                     std=[x/255.0 for x in [63.0, 62.1, 66.7]])

# Training-time augmentation (reflect-pad by 4, random 32x32 crop, random
# horizontal flip) is switched off for these experiments; flip the flag to enable it.
use_augmentation = False
if use_augmentation:
    transform_train = transforms.Compose([
        transforms.ToTensor(),
        transforms.Lambda(lambda x: F.pad(x.unsqueeze(0),
                            (4,4,4,4),mode='reflect').squeeze()),
        transforms.ToPILImage(),
        transforms.RandomCrop(32),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
        ])
else:
    transform_train = transforms.Compose([
        transforms.ToTensor(),
        normalize,
        ])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    normalize
    ])

dataset = 'cifar10'
# Resolve the torchvision dataset class from its lower-case name (here datasets.CIFAR10).
dataset_cls = getattr(datasets, dataset.upper())

kwargs = {'num_workers': 1, 'pin_memory': True}
train_loader = torch.utils.data.DataLoader(
    dataset_cls('../data', train=True, download=True,
                transform=transform_train),
    batch_size=128, shuffle=True, **kwargs)
# The test loader is NOT shuffled: the expert metrics below pair the j-th image
# it yields with row j of `cifar10h`.
val_loader = torch.utils.data.DataLoader(
    dataset_cls('../data', train=False, transform=transform_test),
    batch_size=128, shuffle=False, **kwargs)

Print the overall and class-wise accuracies of the CIFAR-10H expert.

In [None]:
# Accuracy of the simulated expert on the CIFAR-10 test loader. Expert labels
# are sampled inside metrics_cifar10h, so the numbers change slightly between runs.
total, class_wise = metrics_cifar10h(cifar10h,val_loader)
print(f'average accuracy of CIFAR-10H expert is: {total}')
print(f'accuracy on the 10 classes is: {class_wise}')

# Load other utilities

In [None]:
class BasicBlock(nn.Module):
    """Residual block: (BatchNorm -> ReLU -> 3x3 conv) twice, added to a shortcut.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: stride of the first 3x3 convolution (and of the 1x1 shortcut).
        dropRate: dropout probability applied between the two convolutions;
            dropout is skipped when it is not greater than 0.

    When ``in_planes == out_planes`` the shortcut is the untouched input;
    otherwise it is a 1x1 convolution of the activated (BN + ReLU) input.
    """

    def __init__(self, in_planes, out_planes, stride, dropRate=0.0):
        super().__init__()
        # Sub-modules are registered in the same order as before: that order
        # fixes the state_dict layout and the order in which modules() visits them.
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_planes, out_planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.droprate = dropRate
        self.equalInOut = (in_planes == out_planes)
        if self.equalInOut:
            self.convShortcut = None
        else:
            self.convShortcut = nn.Conv2d(in_planes, out_planes, kernel_size=1,
                                          stride=stride, padding=0, bias=False)

    def forward(self, x):
        """Apply the block to ``x`` and return residual + shortcut."""
        # relu1 is in-place, but it acts on the fresh tensor produced by bn1,
        # so `x` itself is left unmodified for the identity shortcut.
        activated = self.relu1(self.bn1(x))
        out = self.relu2(self.bn2(self.conv1(activated)))
        if self.droprate > 0:
            out = F.dropout(out, p=self.droprate, training=self.training)
        out = self.conv2(out)
        if self.equalInOut:
            shortcut = x
        else:
            shortcut = self.convShortcut(activated)
        return torch.add(shortcut, out)

class NetworkBlock(nn.Module):
    """A stack of ``nb_layers`` blocks applied one after another.

    Args:
        nb_layers: number of blocks; converted with ``int()`` so a float such
            as ``4.0`` is accepted.
        in_planes: input channels of the first block.
        out_planes: output channels of every block (and input channels of all
            blocks after the first).
        block: callable ``block(in_planes, out_planes, stride, dropRate)``
            returning an ``nn.Module``.
        stride: stride of the first block; later blocks use stride 1.
        dropRate: dropout probability forwarded to every block.
    """

    def __init__(self, nb_layers, in_planes, out_planes, block, stride, dropRate=0.0):
        super(NetworkBlock, self).__init__()
        self.layer = self._make_layer(block, in_planes, out_planes, nb_layers, stride, dropRate)

    def _make_layer(self, block, in_planes, out_planes, nb_layers, stride, dropRate):
        """Build the ``nn.Sequential`` holding the stacked blocks."""
        layers = []
        for i in range(int(nb_layers)):
            is_first = (i == 0)
            # Real conditional expressions instead of the `cond and a or b` idiom,
            # which silently falls through to `b` whenever `a` is falsy.
            layers.append(block(in_planes if is_first else out_planes,
                                out_planes,
                                stride if is_first else 1,
                                dropRate))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through all stacked blocks."""
        return self.layer(x)

class WideResNet(nn.Module):
    """Wide residual network that returns class probabilities.

    Args:
        depth: total depth; ``(depth - 4)`` must be divisible by 6, and each of
            the three stages gets ``(depth - 4) // 6`` blocks.
        num_classes: size of the output layer.
        widen_factor: multiplier for the channel widths of the three stages
            (16, 32 and 64 times ``widen_factor``).
        dropRate: dropout probability forwarded to every block.

    Note:
        ``forward`` ends with a softmax, so it returns probabilities rather
        than logits; losses applied to the output must expect probabilities.
        The fixed 8x8 average pooling followed by the flatten matches 32x32
        inputs (two stride-2 stages reduce 32 -> 16 -> 8).
    """

    def __init__(self, depth, num_classes, widen_factor=1, dropRate=0.0):
        super(WideResNet, self).__init__()
        nChannels = [16, 16*widen_factor, 32*widen_factor, 64*widen_factor]
        assert((depth - 4) % 6 == 0)
        # Integer division: the assert above guarantees an exact result.
        n = (depth - 4) // 6
        block = BasicBlock
        # 1st conv before any network block
        self.conv1 = nn.Conv2d(3, nChannels[0], kernel_size=3, stride=1,
                               padding=1, bias=False)
        # 1st block
        self.block1 = NetworkBlock(n, nChannels[0], nChannels[1], block, 1, dropRate)
        # 2nd block
        self.block2 = NetworkBlock(n, nChannels[1], nChannels[2], block, 2, dropRate)
        # 3rd block
        self.block3 = NetworkBlock(n, nChannels[2], nChannels[3], block, 2, dropRate)
        # global average pooling and classifier
        self.bn1 = nn.BatchNorm2d(nChannels[3])
        self.relu = nn.ReLU(inplace=True)
        self.fc = nn.Linear(nChannels[3], num_classes)
        self.nChannels = nChannels[3]
        # Explicit class dimension; relying on the implicit dim is deprecated
        # and emits a warning. For the 2-D (batch, classes) input it is dim 1.
        self.softmax = nn.Softmax(dim=1)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                # Only the bias is reset; the weight keeps nn.Linear's default init.
                m.bias.data.zero_()

    def forward(self, x):
        """Return a ``(batch, num_classes)`` tensor of softmax probabilities."""
        out = self.conv1(x)
        out = self.block1(out)
        out = self.block2(out)
        out = self.block3(out)
        out = self.relu(self.bn1(out))
        out = F.avg_pool2d(out, 8)
        out = out.view(-1, self.nChannels)
        out = self.fc(out)
        out = self.softmax(out)
        return out



In [None]:

class AverageMeter:
    """Keeps the most recent value and the running weighted mean of a series.

    Attributes are ``val`` (last value passed to ``update``), ``sum`` (weighted
    sum of all values), ``count`` (sum of weights) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running mean."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count

def accuracy(output, target, topk=(1,)):
    """Compute the precision@k, in percent, for each ``k`` in ``topk``.

    Args:
        output: ``(batch, classes)`` tensor of scores; higher means more likely.
        target: ``(batch,)`` tensor of true class indices.
        topk: iterable of ``k`` values.

    Returns:
        A list with one 0-dim float tensor per requested ``k``: the percentage
        of samples whose target is among the ``k`` highest-scoring classes.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # reshape, not view: `correct` inherits the transposed layout of `pred`,
        # so for k > 1 the slice is non-contiguous and view(-1) raises on
        # current PyTorch releases.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


    
import random
def metrics_print(net, expert_fn, n_classes, loader):
    """Print coverage and accuracy figures for a classifier with a defer class.

    ``net`` is expected to output ``n_classes + 1`` scores per sample; a top
    prediction equal to ``n_classes`` means "defer to the expert".

    Args:
        net: model mapping an image batch to per-class scores.
        expert_fn: callable ``expert_fn(images, labels)`` returning an indexable
            with one expert prediction per sample of the batch.
        n_classes: number of real classes (index of the defer output).
        loader: iterable of ``(images, labels)`` batches.

    Prints a dict with:
        coverage: "<non-deferred> out of <all>" samples.
        system accuracy: classifier on kept samples plus expert on deferred ones.
        expert accuracy: expert accuracy on deferred samples.
        classifier accuracy: classifier accuracy on non-deferred samples.
        alone classifier: accuracy when the classifier must always answer,
            using its best score among the first ``n_classes`` outputs.
    """
    correct = 0
    correct_sys = 0
    exp = 0
    exp_total = 0
    total = 0
    real_total = 0
    alone_correct = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            exp_prediction = expert_fn(images, labels)
            # Move the per-sample decisions to the host once per batch instead of
            # synchronising with the device for every sample.
            predicted = torch.max(outputs, 1)[1].tolist()
            # Best real class when deferring is not allowed.
            alone_predicted = torch.max(outputs[:, :n_classes], 1)[1].tolist()
            targets = labels.tolist()
            for i, label in enumerate(targets):
                alone_correct += (alone_predicted[i] == label)
                if predicted[i] == n_classes:
                    # deferred: the expert's answer is the system's answer
                    hit = (exp_prediction[i] == label)
                    exp += hit
                    correct_sys += hit
                    exp_total += 1
                else:
                    hit = (predicted[i] == label)
                    correct += hit
                    correct_sys += hit
                    total += 1
                real_total += 1
    cov = f"{total} out of {real_total}"
    # The small constants in the denominators avoid a division by zero when
    # nothing (or everything) is deferred.
    to_print = {"coverage": cov,
                "system accuracy": 100*correct_sys/real_total,
                "expert accuracy": 100*exp/(exp_total+0.0002),
                "classifier accuracy": 100*correct/(total+0.0001),
                "alone classifier": 100*alone_correct/real_total}
    print(to_print)

def metrics_print_baseline(net_class, expert_fn, n_classes, loader):
    """Print metrics for an oracle deferral rule: defer exactly when the expert is right.

    Samples on which the expert prediction equals the label are counted as
    deferred; all other samples are answered by ``net_class`` using its best
    score among the first ``n_classes`` outputs.

    Args:
        net_class: model mapping an image batch to per-class scores.
        expert_fn: callable ``expert_fn(images, labels)`` returning an indexable
            with one expert prediction per sample of the batch.
        n_classes: number of real classes.
        loader: iterable of ``(images, labels)`` batches.

    Prints a dict with coverage ("<non-deferred> out of <all>"), system
    accuracy, expert accuracy on deferred samples and classifier accuracy on
    non-deferred samples.
    """
    correct = 0
    correct_sys = 0
    exp = 0
    exp_total = 0
    total = 0
    real_total = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs_class = net_class(images)
            exp_prediction = expert_fn(images, labels)
            # Best real class per sample, fetched to the host once per batch.
            class_predicted = torch.max(outputs_class[:, :n_classes], 1)[1].tolist()
            targets = labels.tolist()
            for i, label in enumerate(targets):
                if exp_prediction[i] == label:
                    # expert is right -> defer (by construction always a hit)
                    exp += 1
                    correct_sys += 1
                    exp_total += 1
                else:
                    hit = (class_predicted[i] == label)
                    correct += hit
                    correct_sys += hit
                    total += 1
                real_total += 1
    cov = f"{total} out of {real_total}"
    # The small constants in the denominators avoid a division by zero when
    # nothing (or everything) is deferred.
    to_print = {"coverage": cov,
                "system accuracy": 100*correct_sys/real_total,
                "expert accuracy": 100*exp/(exp_total+0.0002),
                "classifier accuracy": 100*correct/(total+0.0001)}
    print(to_print)

# Baseline: Confidence 

We load a classification model already trained on CIFAR-10. This model can be obtained by running the `cifar10_defer_baselines` notebook; for convenience, a trained model is provided in `./models`.

In [None]:
# Load the pickled baseline classifier. map_location remaps the stored tensors
# to the device chosen above, so a checkpoint saved from a GPU also loads on a
# CPU-only machine.
# NOTE(review): the file is a pickled object, so only load checkpoints from a
# trusted source. Recent PyTorch releases default torch.load to
# weights_only=True, which rejects fully pickled modules; pass
# weights_only=False there if this call fails.
model_classifier = torch.load("./models/model_base", map_location=device)
model_classifier.eval()
model_classifier.to(device)


In [None]:
def my_CrossEntropyLoss(outputs, labels):
    """Mean negative base-2 log-probability assigned to ``labels``.

    Args:
        outputs: ``(batch, classes)`` tensor of probabilities (not logits).
        labels: ``(batch,)`` tensor of class indices.

    Returns:
        A scalar tensor: ``-log2(p + 1e-5)`` of the probability ``p`` given to
        each row's label, averaged over the batch.
    """
    n_rows = outputs.size(0)
    label_probs = outputs[range(n_rows), labels]  # probability of each row's own label
    bits = -torch.log2(label_probs + 0.00001)     # the offset keeps log2 finite at p == 0
    return bits.sum() / n_rows

def train_expert(train_loader_exp, model, optimizer, scheduler, epoch, expert_fn, n_classes):
    """Train the expert-error model for one epoch.

    The training target of every image is looked up in the notebook-level dict
    ``dataset_expert`` (0 = the expert labelled the image correctly, 1 = the
    expert was wrong); the dataset labels yielded by the loader are not used.

    Args:
        train_loader_exp: iterable of ``(images, labels)`` batches. Every image
            must have an entry in ``dataset_expert``.
        model: network returning class probabilities over the expert targets.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler, also stepped once per batch.
        epoch: epoch index, used only in the progress output.
        expert_fn: unused; kept for interface compatibility.
        n_classes: unused; kept for interface compatibility.

    Raises:
        KeyError: if an image of the loader is missing from ``dataset_expert``.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, _) in enumerate(train_loader_exp):
        images = images.to(device)

        # compute output
        output = model(images)

        # Look up the expert targets. The dict is keyed by the string form of
        # each image array, so copy the batch to the host once and stringify
        # each row instead of doing one device-to-host copy per sample.
        images_np = images.cpu().numpy()
        m = torch.tensor([dataset_expert[str(img)] for img in images_np])
        m = m.to(device)

        # compute loss
        loss = my_CrossEntropyLoss(output, m)

        # measure accuracy and record loss
        prec1 = accuracy(output.data, m, topk=(1,))[0]
        losses.update(loss.data.item(), images.size(0))
        top1.update(prec1.item(), images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 10 == 0:
            # Progress is reported against the loader actually being iterated.
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, i, len(train_loader_exp), batch_time=batch_time,
                      loss=losses, top1=top1))


def validate_expert(val_loader_exp, model, epoch, expert_fn, n_classes):
    """Evaluate the expert-error model and return its average top-1 accuracy.

    Targets are looked up in the notebook-level dict ``dataset_expert``
    (0 = expert correct, 1 = expert wrong); the dataset labels yielded by the
    loader are not used.

    Args:
        val_loader_exp: iterable of ``(images, labels)`` batches. Every image
            must have an entry in ``dataset_expert``.
        model: network returning class probabilities over the expert targets.
        epoch: unused; kept for interface compatibility.
        expert_fn: unused; kept for interface compatibility.
        n_classes: unused; kept for interface compatibility.

    Returns:
        The average precision@1 (in percent) over the loader.

    Raises:
        KeyError: if an image of the loader is missing from ``dataset_expert``.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (images, _) in enumerate(val_loader_exp):
        images = images.to(device)

        # compute output
        with torch.no_grad():
            output = model(images)

        # Expert targets: one host copy per batch, then one string key per image.
        images_np = images.cpu().numpy()
        m = torch.tensor([dataset_expert[str(img)] for img in images_np])
        m = m.to(device)

        # compute loss
        loss = my_CrossEntropyLoss(output, m)

        # measure accuracy and record loss
        prec1 = accuracy(output.data, m, topk=(1,))[0]
        losses.update(loss.data.item(), images.size(0))
        top1.update(prec1.item(), images.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 10 == 0:
            # Progress is reported against the loader actually being iterated.
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      i, len(val_loader_exp), batch_time=batch_time, loss=losses,
                      top1=top1))

    print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1))

    return top1.avg
# Best validation precision seen so far. It is module-level state and is NOT
# reset between calls to run_expert.
best_prec1 = 0
def run_expert(model, data_aug, n_dataset, expert_fn, epochs):
    """Train the expert-error model and report its best validation accuracy.

    Besides its arguments the function reads the notebook globals
    ``train_loader`` (only for its length), ``val_loader`` (the data trained
    on) and ``val_loader_rej`` (the data validated on), and it updates the
    global ``best_prec1``.

    Args:
        model: network to train; moved to ``device``.
        data_aug: unused; kept for interface compatibility.
        n_dataset: forwarded to the train/validate helpers as ``n_classes``.
        expert_fn: forwarded to the train/validate helpers.
        epochs: number of passes over ``val_loader``.
    """
    global best_prec1

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum(p.data.nelement() for p in model.parameters())))

    # for training on multiple GPUs.
    # Use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use
    # model = torch.nn.DataParallel(model).cuda()
    model = model.to(device)

    cudnn.benchmark = True

    # define optimizer
    optimizer = torch.optim.SGD(model.parameters(), 0.1,
                                momentum=0.9, nesterov = True,
                                weight_decay=5e-4)

    # cosine learning rate, stepped once per batch by train_expert
    # NOTE(review): the period is len(train_loader)*200 batches although training
    # iterates val_loader for `epochs` epochs, so the rate barely decays here --
    # confirm this is intended.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_loader)*200)

    for epoch in range(0, epochs):
        # train for one epoch
        train_expert(val_loader, model, optimizer, scheduler, epoch, expert_fn, n_dataset)

        # evaluate on validation set
        prec1 = validate_expert(val_loader_rej, model, epoch, expert_fn, n_dataset)

        # remember best prec@1
        best_prec1 = max(prec1, best_prec1)

    print('Best accuracy: ', best_prec1)



In [None]:
def metrics_print_confid_cifar10h(net_mod, net_exp, dataset_expert_probs, n_classes, loader):
    """Evaluate the confidence baseline against the simulated CIFAR-10H expert.

    A sample is deferred to the expert when the classifier's estimated error
    (one minus its top probability) is at least the expert model's estimated
    error (output 1 of ``net_exp``). For deferred samples an expert label is
    drawn from the image's CIFAR-10H distribution with NumPy's global RNG.

    Args:
        net_mod: classifier returning class probabilities.
        net_exp: expert-error model; column 1 of its output is read as the
            probability that the expert is wrong.
        dataset_expert_probs: dict mapping ``str(image_array)`` to the expert's
            label distribution for that image.
        n_classes: unused; kept for interface compatibility.
        loader: iterable of ``(images, labels)`` batches.

    Returns:
        ``[coverage %, system accuracy %, expert accuracy %, classifier
        accuracy %]``; the same figures are printed as a dict.
    """
    correct = 0
    correct_sys = 0
    exp = 0
    exp_total = 0
    total = 0
    real_total = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs_mod = net_mod(images)
            outputs_exp = net_exp(images)
            confidence, predicted = torch.max(outputs_mod, 1)
            # Fetch everything the per-sample loop needs to the host once per batch.
            confidence = confidence.tolist()
            predicted = predicted.tolist()
            expert_error = outputs_exp[:, 1].tolist()
            targets = labels.tolist()
            images_np = images.cpu().numpy()
            for i, label in enumerate(targets):
                # classifier's estimated error minus expert's estimated error
                r_score = (1 - confidence[i]) - expert_error[i]
                if r_score >= 0:
                    # defer: sample the expert's label for this image
                    exp_prediction = np.argmax(np.random.multinomial(
                        1, dataset_expert_probs[str(images_np[i])]))
                    hit = (exp_prediction == label)
                    exp += hit
                    correct_sys += hit
                    exp_total += 1
                else:
                    hit = (predicted[i] == label)
                    correct += hit
                    correct_sys += hit
                    total += 1
                real_total += 1
    cov = f"{total} out of {real_total}"
    # The small constants in the denominators avoid a division by zero when
    # nothing (or everything) is deferred.
    to_print = {"coverage": cov,
                "system accuracy": 100*correct_sys/real_total,
                "expert accuracy": 100*exp/(exp_total+0.0002),
                "classifier accuracy": 100*correct/(total+0.0001)}
    print(to_print)
    return [100*total/real_total, 100*correct_sys/real_total,
            100*exp/(exp_total+0.0002), 100*correct/(total+0.0001)]

Run experiment for confidence model

In [None]:
experiment_data = []
max_trials = 10
for experiment in range(0, max_trials):
    normalize = transforms.Normalize(mean=[x/255.0 for x in [125.3, 123.0, 113.9]],
                                     std=[x/255.0 for x in [63.0, 62.1, 66.7]])

    transform_train = transforms.Compose([
        transforms.ToTensor(),
        normalize,
        ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        normalize
        ])

    n_dataset = 10
    dataset = 'cifar10'
    kwargs = {'num_workers': 1, 'pin_memory': True}

    # Unshuffled pass over the test set: the j-th image yielded here is paired
    # with row j of `cifar10h`.
    val_loader = torch.utils.data.DataLoader(
        datasets.__dict__[dataset.upper()]('../data', train=False, transform=transform_test),
        batch_size=128, shuffle=False, **kwargs)

    # dataset_expert:       image key -> 0 if the sampled expert label is correct, else 1
    # dataset_expert_probs: image key -> the image's CIFAR-10H label distribution
    # The key is str(image_array). NOTE(review): NumPy abbreviates the string of
    # large arrays, so the key only reflects part of the pixels; distinct images
    # could in principle collide. The lookups elsewhere rely on this exact key.
    dataset_expert = {}
    dataset_expert_probs = {}
    j = 0
    for images, labels in val_loader:
        # The tensors are only read on the host here, so they are not moved to
        # the device, and each key is computed once instead of three times.
        for i in range(labels.size(0)):
            key = str(images[i].numpy())
            exp_prediction = np.argmax(np.random.multinomial(1, cifar10h[j]))
            dataset_expert[key] = 0 if exp_prediction == labels[i].item() else 1
            dataset_expert_probs[key] = cifar10h[j]
            j += 1

    train_loader = torch.utils.data.DataLoader(
        datasets.__dict__[dataset.upper()]('../data', train=True, download=True,
                                           transform=transform_train),
        batch_size=128, shuffle=True, **kwargs)

    val_dataset_all = datasets.__dict__[dataset.upper()]('../data', train=False, download=True, transform=transform_test)

    # Random 50/50 split of the test set: one half trains the expert-error
    # model, the other half is used for validation and the reported metrics.
    val_size = int(0.5 * len(val_dataset_all))
    val_size_rej = len(val_dataset_all) - val_size

    val_dataset, val_dataset_rej = torch.utils.data.random_split(val_dataset_all, [val_size, val_size_rej])

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=128, shuffle=True, **kwargs)

    val_loader_rej = torch.utils.data.DataLoader(
        val_dataset_rej,
        batch_size=128, shuffle=True, **kwargs)

    # Two-output network predicting whether the expert is correct (0) or wrong (1).
    model_expert = WideResNet(28, 2, 4, dropRate=0)
    # data_aug=False, expert_fn=5 (a placeholder that run_expert only passes through), epochs=1
    run_expert(model_expert, False, n_dataset, 5, 1)
    batch_data = metrics_print_confid_cifar10h(model_classifier, model_expert, dataset_expert_probs, 10, val_loader_rej)
    experiment_data.append(batch_data)


## Evaluate

In [None]:
# Mean and standard deviation of every reported metric over the trials.
metrics = ['coverage', 'system accuracy', 'expert accuracy', 'classifier accuracy']
print("Results for confidence approach")
for i, metric_name in enumerate(metrics):
    # i-th entry of each trial's result list
    arr = [experiment_data[j][i] for j in range(max_trials)]
    print(f'{metric_name}: avg = {np.average(arr):.3f}, std= {np.std(arr):.3f}  ')

# Our method: 2-stage training

In [None]:
def metrics_print_my_cifar10h(net_mod, dataset_expert_probs, n_classes, loader):
    """Evaluate a classifier with a defer output against the simulated expert.

    ``net_mod`` is expected to output ``n_classes + 1`` scores; a top prediction
    equal to ``n_classes`` defers the sample to the expert, whose label is
    drawn from the image's CIFAR-10H distribution with NumPy's global RNG.

    Args:
        net_mod: model mapping an image batch to per-class scores.
        dataset_expert_probs: dict mapping ``str(image_array)`` to the expert's
            label distribution for that image.
        n_classes: number of real classes (index of the defer output).
        loader: iterable of ``(images, labels)`` batches.

    Returns:
        ``[coverage %, system accuracy %, expert accuracy %, classifier
        accuracy %]``; the same figures are printed as a dict.
    """
    correct = 0
    correct_sys = 0
    exp = 0
    exp_total = 0
    total = 0
    real_total = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs_mod = net_mod(images)
            # Fetch the decisions and the images to the host once per batch.
            predicted = torch.max(outputs_mod, 1)[1].tolist()
            targets = labels.tolist()
            images_np = images.cpu().numpy()
            for i, label in enumerate(targets):
                if predicted[i] == n_classes:
                    # deferred: sample the expert's label for this image
                    exp_prediction = np.argmax(np.random.multinomial(
                        1, dataset_expert_probs[str(images_np[i])]))
                    hit = (exp_prediction == label)
                    exp += hit
                    correct_sys += hit
                    exp_total += 1
                else:
                    hit = (predicted[i] == label)
                    correct += hit
                    correct_sys += hit
                    total += 1
                real_total += 1
    cov = f"{total} out of {real_total}"
    # The small constants in the denominators avoid a division by zero when
    # nothing (or everything) is deferred.
    to_print = {"coverage": cov,
                "system accuracy": 100*correct_sys/real_total,
                "expert accuracy": 100*exp/(exp_total+0.0002),
                "classifier accuracy": 100*correct/(total+0.0001)}
    print(to_print)
    return [100*total/real_total, 100*correct_sys/real_total,
            100*exp/(exp_total+0.0002), 100*correct/(total+0.0001)]

In [None]:
def reject_CrossEntropyLoss(outputs, m, labels, m2, n_classes):
    r'''
    The L_{CE} loss implementation for CIFAR
    ----
    outputs: network outputs, one row of probabilities per sample; column
        n_classes is the "defer to expert" output
    m: per-sample weight of the defer term (I_{m = y}: 1 when the expert is correct)
    labels: target
    m2: per-sample weight of the classifier term (alpha * I_{m = y} + I_{m \neq y})
    n_classes: number of classes

    Returns the batch mean of
        -m * log2(outputs[defer]) - m2 * log2(outputs[label]).
    Probabilities are floored at the smallest positive normal float before the
    logarithm, so a zero probability can no longer yield inf, or NaN through
    0 * inf; all other values are left unchanged.
    '''
    batch_size = outputs.size()[0]  # batch_size
    floor = torch.finfo(outputs.dtype).tiny
    defer_prob = outputs[:, n_classes].clamp(min=floor)
    label_prob = outputs[range(batch_size), labels].clamp(min=floor)
    per_sample = -m * torch.log2(defer_prob) - m2 * torch.log2(label_prob)
    return torch.sum(per_sample) / batch_size

def train_reject(train_loader, model, optimizer, scheduler, epoch, expert_fn, n_classes, alpha):
    """Train the classifier-with-defer model for one epoch.

    Per-sample loss weights come from the notebook-level dict
    ``dataset_expert`` (0 = expert correct, 1 = expert wrong):
      * expert correct:           m = 1 (defer term on),  m2 = alpha
      * expert wrong or unknown:  m = 0 (defer term off), m2 = 1
    Images without an entry in ``dataset_expert`` are treated as "unknown".

    Args:
        train_loader: iterable of ``(images, labels)`` batches.
        model: network returning ``n_classes + 1`` probabilities per sample.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler, also stepped once per batch.
        epoch: epoch index, used only in the progress output.
        expert_fn: unused; kept for interface compatibility.
        n_classes: number of real classes (index of the defer output).
        alpha: weight of the classifier term on samples the expert gets right.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        target = target.to(device)
        images = images.to(device)

        # compute output
        output = model(images)

        # expert-based loss weights; one host copy per batch, one key per image
        images_np = images.cpu().numpy()
        m = []
        m2 = []
        for img in images_np:
            key = str(img)
            expert_correct = key in dataset_expert and not dataset_expert[key]
            m.append(1 if expert_correct else 0)
            m2.append(alpha if expert_correct else 1)
        m = torch.tensor(m).to(device)
        m2 = torch.tensor(m2).to(device)

        # compute loss
        loss = reject_CrossEntropyLoss(output, m, target, m2, n_classes)

        # measure accuracy and record loss
        # NOTE: this is a top-2 precision (the label may rank second behind the
        # defer output) even though the progress line labels it "Prec@1".
        prec1 = accuracy(output.data, target, topk=(2,))[0]
        losses.update(loss.data.item(), images.size(0))
        top1.update(prec1.item(), images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 10 == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      loss=losses, top1=top1))


def validate_reject(val_loader, model, epoch, expert_fn, n_classes, alpha=0.5):
    """Evaluate the classifier-with-defer model and return its average top-2 precision.

    Per-sample loss weights come from the notebook-level dict
    ``dataset_expert`` (0 = expert correct, 1 = expert wrong):
      * expert correct:           m = 1, m2 = alpha
      * expert wrong or unknown:  m = 0, m2 = 1

    Args:
        val_loader: iterable of ``(images, labels)`` batches.
        model: network returning ``n_classes + 1`` probabilities per sample.
        epoch: unused; kept for interface compatibility.
        expert_fn: unused; kept for interface compatibility.
        n_classes: number of real classes (index of the defer output).
        alpha: weight of the classifier term on samples the expert gets right.
            Defaults to 0.5, the value that used to be hard-coded; pass the
            training alpha to make the validation loss comparable.

    Returns:
        The average top-2 precision (in percent) over the loader.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (images, target) in enumerate(val_loader):
        target = target.to(device)
        images = images.to(device)

        # compute output
        with torch.no_grad():
            output = model(images)

        # expert-based loss weights; one host copy per batch, one key per image
        images_np = images.cpu().numpy()
        m = []
        m2 = []
        for img in images_np:
            key = str(img)
            expert_correct = key in dataset_expert and not dataset_expert[key]
            m.append(1 if expert_correct else 0)
            m2.append(alpha if expert_correct else 1)
        m = torch.tensor(m).to(device)
        m2 = torch.tensor(m2).to(device)

        # compute loss
        loss = reject_CrossEntropyLoss(output, m, target, m2, n_classes)

        # measure accuracy and record loss (top-2, printed under the "Prec@1" label)
        prec1 = accuracy(output.data, target, topk=(2,))[0]
        losses.update(loss.data.item(), images.size(0))
        top1.update(prec1.item(), images.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 10 == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      i, len(val_loader), batch_time=batch_time, loss=losses,
                      top1=top1))

    print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1))

    return top1.avg
# Best validation precision seen so far. It is module-level state and is NOT
# reset between calls to run_reject.
best_prec1 = 0
def run_reject(model, data_aug, n_dataset, expert_fn, epochs, alpha):
    """Two-stage training of the classifier-with-defer model.

    Stage 1 runs one epoch over the notebook global ``train_loader``; stage 2
    runs ``epochs`` epochs over the global ``val_loader``. After every epoch
    the system metrics on the global ``val_loader_rej`` are printed, using the
    global ``dataset_expert_probs``. The global ``best_prec1`` is updated from
    the stage-1 validation result.

    Args:
        model: network to train; moved to ``device``.
        data_aug: unused; kept for interface compatibility.
        n_dataset: number of real classes, forwarded as ``n_classes``.
        expert_fn: forwarded to the train/validate helpers.
        epochs: number of stage-2 epochs.
        alpha: loss weight forwarded to ``train_reject``.
    """
    global best_prec1

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum(p.data.nelement() for p in model.parameters())))

    # for training on multiple GPUs.
    # Use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use
    # model = torch.nn.DataParallel(model).cuda()
    model = model.to(device)

    cudnn.benchmark = True

    # define optimizer
    optimizer = torch.optim.SGD(model.parameters(), 0.0001,
                                momentum=0.9, nesterov = True,
                                weight_decay=5e-4)

    # cosine learning rate, stepped once per batch by train_reject
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_loader)*200)

    # Stage 1: a single epoch on the training set.
    epoch = 0
    train_reject(train_loader, model, optimizer, scheduler, epoch, expert_fn, n_dataset, alpha)

    # evaluate on validation set
    prec1 = validate_reject(val_loader_rej, model, epoch, expert_fn, n_dataset)

    # remember best prec@1
    best_prec1 = max(prec1, best_prec1)
    metrics_print_my_cifar10h(model, dataset_expert_probs, n_dataset, val_loader_rej)

    # Stage 2: continue on the expert-labelled half of the test set.
    for epoch in range(0, epochs):
        train_reject(val_loader, model, optimizer, scheduler, epoch, expert_fn, n_dataset, alpha)
        metrics_print_my_cifar10h(model, dataset_expert_probs, n_dataset, val_loader_rej)
    print('Best accuracy: ', best_prec1)


# Our method: impute

## train model of expert

In [None]:
def my_CrossEntropyLoss(outputs, labels):
    """Mean negative base-2 log-probability assigned to ``labels``.

    Args:
        outputs: ``(batch, classes)`` tensor of probabilities (not logits).
        labels: ``(batch,)`` tensor of class indices.

    Returns:
        A scalar tensor: ``-log2(p + 1e-4)`` of the probability ``p`` given to
        each row's label, averaged over the batch.
    """
    n_rows = outputs.size(0)
    label_probs = outputs[range(n_rows), labels]  # probability of each row's own label
    bits = -torch.log2(label_probs + 0.0001)      # the offset keeps log2 finite at p == 0
    return bits.sum() / n_rows

def train_expert(train_loader_exp, model, optimizer, scheduler, epoch, expert_fn, n_classes):
    """Train the expert-error model for one epoch.

    The training target of every image is looked up in the notebook-level dict
    ``dataset_expert`` (0 = the expert labelled the image correctly, 1 = the
    expert was wrong); the dataset labels yielded by the loader are not used.

    Args:
        train_loader_exp: iterable of ``(images, labels)`` batches. Every image
            must have an entry in ``dataset_expert``.
        model: network returning class probabilities over the expert targets.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler, also stepped once per batch.
        epoch: epoch index, used only in the progress output.
        expert_fn: unused; kept for interface compatibility.
        n_classes: unused; kept for interface compatibility.

    Raises:
        KeyError: if an image of the loader is missing from ``dataset_expert``.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, _) in enumerate(train_loader_exp):
        images = images.to(device)

        # compute output
        output = model(images)

        # Look up the expert targets. The dict is keyed by the string form of
        # each image array, so copy the batch to the host once and stringify
        # each row instead of doing one device-to-host copy per sample.
        images_np = images.cpu().numpy()
        m = torch.tensor([dataset_expert[str(img)] for img in images_np])
        m = m.to(device)

        # compute loss
        loss = my_CrossEntropyLoss(output, m)

        # measure accuracy and record loss
        prec1 = accuracy(output.data, m, topk=(1,))[0]
        losses.update(loss.data.item(), images.size(0))
        top1.update(prec1.item(), images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 10 == 0:
            # Progress is reported against the loader actually being iterated.
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, i, len(train_loader_exp), batch_time=batch_time,
                      loss=losses, top1=top1))


def validate_expert(val_loader_exp, model, epoch, expert_fn, n_classes):
    """Evaluate `model` against the labels stored in `dataset_expert`.

    As in training, the reference label of every image is looked up in the
    global `dataset_expert`, keyed by the string form of the image's numpy
    array; the dataset's own targets are ignored. A missing key raises KeyError.

    Args:
        val_loader_exp: iterable of (input, target) batches.
        model: network to evaluate; it is switched to eval mode.
        epoch: unused; kept for interface compatibility.
        expert_fn: unused; kept for interface compatibility.
        n_classes: unused; kept for interface compatibility.

    Returns:
        Running average of the top-1 precision reported by `accuracy`.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (input, _target) in enumerate(val_loader_exp):
        input = input.to(device)

        # compute output
        with torch.no_grad():
            output = model(input)

        # Expert labels: one host copy per batch instead of one per sample;
        # each row stringifies exactly like input[j].cpu().numpy().
        images_np = input.cpu().numpy()
        m = torch.tensor([dataset_expert[str(image)] for image in images_np])
        m = m.to(device)

        # compute loss
        loss = my_CrossEntropyLoss(output, m)

        # measure accuracy and record loss
        prec1 = accuracy(output.data, m, topk=(1,))[0]
        losses.update(loss.item(), input.size(0))
        top1.update(prec1.item(), input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 10 == 0:
            # Report progress against the loader actually being iterated,
            # not the unrelated global `val_loader`.
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      i, len(val_loader_exp), batch_time=batch_time, loss=losses,
                      top1=top1))

    print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1))

    return top1.avg
best_prec1 = 0
def run_expert(model, data_aug, n_dataset, expert_fn, epochs):
    """Train the expert-label model on `val_loader` and validate it on `val_loader_rej`.

    Reads the notebook globals `train_loader`, `val_loader`, `val_loader_rej`
    and `device`, and writes the global `best_prec1`.

    Args:
        model: network to train; moved to `device`.
        data_aug: unused; kept for interface compatibility.
        n_dataset: forwarded to `train_expert` / `validate_expert`.
        expert_fn: forwarded to `train_expert` / `validate_expert`.
        epochs: number of train + validate rounds.
    """
    global best_prec1
    # Start from zero on every call: otherwise the "Best accuracy" printed
    # below could be a value left behind by an earlier run or trial.
    best_prec1 = 0

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    # for training on multiple GPUs.
    # Use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use
    # model = torch.nn.DataParallel(model).cuda()
    model = model.to(device)

    cudnn.benchmark = True

    # define optimizer
    optimizer = torch.optim.SGD(model.parameters(), 0.1,
                                momentum=0.9, nesterov = True,
                                weight_decay=5e-4)

    # cosine learning rate, stepped once per batch by train_expert.
    # NOTE(review): the horizon is 200 epochs of the global `train_loader`,
    # although training iterates `val_loader` for `epochs` epochs -- confirm
    # that this slow decay is intended.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_loader)*200)

    for epoch in range(0, epochs):
        # train for one epoch
        train_expert(val_loader, model, optimizer, scheduler, epoch, expert_fn, n_dataset)

        # evaluate on validation set
        prec1 = validate_expert(val_loader_rej, model, epoch, expert_fn, n_dataset)

        # remember best prec@1
        best_prec1 = max(prec1, best_prec1)

    print('Best accuracy: ', best_prec1)


Augment the dataset with predictions from the expert network.

In [None]:
def reject_CrossEntropyLoss(outputs, m, labels, m2, n_classes):
    """Batch-averaged weighted log2 loss over the column at index `n_classes` and the label column.

    Per sample i the loss is
        -m[i] * log2(outputs[i, n_classes]) - m2[i] * log2(outputs[i, labels[i]])

    Args:
        outputs: tensor indexed as [sample, column]; column `n_classes` is read
            in addition to the label column (presumably the reject/defer
            output -- confirm against the model).
        m: per-sample weight of the `n_classes` column term.
        labels: one column index per sample.
        m2: per-sample weight of the label term.
        n_classes: index of the extra column.

    Returns:
        Scalar tensor: the per-sample losses summed and divided by the batch size.
    """
    n_samples = outputs.size(0)
    # Every row reads the same column, so a plain slice selects the same values
    # as an explicit per-row index would.
    defer_term = m * torch.log2(outputs[:, n_classes])
    label_term = m2 * torch.log2(outputs[range(n_samples), labels])
    per_sample = -defer_term - label_term
    return per_sample.sum() / n_samples
def train_reject_(train_loader, model, optimizer, scheduler, epoch, expert_fn, n_classes, alpha):
    """Run one training epoch of the reject loss with constant weights.

    Every sample gets weight 0 on the `n_classes` column term and weight 1 on
    the label term of `reject_CrossEntropyLoss`. `expert_fn` and `alpha` are
    accepted but not used.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # put the model in training mode
    model.train()

    tick = time.time()
    for step, (images, labels) in enumerate(train_loader):
        labels = labels.to(device)
        images = images.to(device)

        # forward pass
        scores = model(images)

        # constant per-sample weights: 0 for the extra-column term, 1 for the label term
        n_samples = scores.size(0)
        defer_weight = torch.tensor([0] * n_samples).to(device)
        label_weight = torch.tensor([1] * n_samples).to(device)

        loss = reject_CrossEntropyLoss(scores, defer_weight, labels, label_weight, n_classes)

        # track accuracy and loss
        prec1 = accuracy(scores.data, labels, topk=(1,))[0]
        losses.update(loss.data.item(), images.size(0))
        top1.update(prec1.item(), images.size(0))

        # backward pass and parameter / learning-rate update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # track time spent on this batch
        now = time.time()
        batch_time.update(now - tick)
        tick = time.time()

        if step % 10 == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, step, len(train_loader), batch_time=batch_time,
                      loss=losses, top1=top1))


def train_reject_impute(train_loader, model, optimizer, scheduler, epoch, expert_fn, n_classes, alpha):
    """Train for one epoch with the reject loss, weighting samples by stored expert values.

    Each image is looked up in the global `dataset_expert` by the string form
    of its numpy array. If the key is present and its value is falsy (0), the
    sample gets `m = 1` and `m2 = alpha`; otherwise (value truthy, or key
    absent) it gets `m = 0` and `m2 = 1`. `m` and `m2` are the per-sample
    weights passed to `reject_CrossEntropyLoss`.

    Args:
        train_loader: iterable of (input, target) batches.
        model: network whose output is passed to `reject_CrossEntropyLoss`.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler, also stepped once per batch.
        epoch: epoch index, used only in the progress print.
        expert_fn: unused; kept for interface compatibility.
        n_classes: forwarded to `reject_CrossEntropyLoss`.
        alpha: label-term weight for samples whose stored expert value is falsy.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        target = target.to(device)
        input = input.to(device)

        # compute output
        output = model(input)

        # Per-sample loss weights. The batch is copied to the host once and
        # each key is built once; every row stringifies exactly like
        # input[j].cpu().numpy().
        images_np = input.cpu().numpy()
        m = []
        m2 = []
        for image in images_np:
            key = str(image)
            if key in dataset_expert and not dataset_expert[key]:
                m.append(1)
                m2.append(alpha)
            else:
                m.append(0)
                m2.append(1)
        m = torch.tensor(m).to(device)
        m2 = torch.tensor(m2).to(device)

        # compute loss
        loss = reject_CrossEntropyLoss(output, m, target, m2, n_classes)

        # measure accuracy and record loss
        prec1 = accuracy(output.data, target, topk=(1,))[0]
        losses.update(loss.item(), input.size(0))
        top1.update(prec1.item(), input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 10 == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      loss=losses, top1=top1))



def validate_reject_impute(val_loader, model, epoch, expert_fn, n_classes, alpha=0.5):
    """Evaluate `model` with the reject loss and report top-1 precision on the targets.

    Per-sample weights are derived from the global `dataset_expert` (keyed by
    the string form of each image's numpy array): a present key with a falsy
    value gives `m = 1`, `m2 = alpha`; anything else gives `m = 0`, `m2 = 1`.
    The weights only affect the reported loss, not the returned precision.

    Args:
        val_loader: iterable of (input, target) batches.
        model: network to evaluate; it is switched to eval mode.
        epoch: unused; kept for interface compatibility.
        expert_fn: unused; kept for interface compatibility.
        n_classes: forwarded to `reject_CrossEntropyLoss`.
        alpha: label-term weight for samples whose stored expert value is
            falsy. Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        Running average of the top-1 precision reported by `accuracy`.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (input, target) in enumerate(val_loader):
        target = target.to(device)
        input = input.to(device)

        # compute output
        with torch.no_grad():
            output = model(input)

        # Per-sample loss weights. The batch is copied to the host once and
        # each key is built once; every row stringifies exactly like
        # input[j].cpu().numpy().
        images_np = input.cpu().numpy()
        m = []
        m2 = []
        for image in images_np:
            key = str(image)
            if key in dataset_expert and not dataset_expert[key]:
                m.append(1)
                m2.append(alpha)
            else:
                m.append(0)
                m2.append(1)
        m = torch.tensor(m).to(device)
        m2 = torch.tensor(m2).to(device)

        # compute loss
        loss = reject_CrossEntropyLoss(output, m, target, m2, n_classes)

        # measure accuracy and record loss
        prec1 = accuracy(output.data, target, topk=(1,))[0]
        losses.update(loss.item(), input.size(0))
        top1.update(prec1.item(), input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 10 == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      i, len(val_loader), batch_time=batch_time, loss=losses,
                      top1=top1))

    print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1))

    return top1.avg
best_prec1 = 0
def run_reject_impute(model, data_aug, n_dataset, expert_fn, epochs, alpha):
    """Fine-tune `model` with the reject loss in two phases and print metrics after every epoch.

    Phase 1 trains for 10 epochs on the global `train_loader`; phase 2 trains
    for 2 epochs on the global `val_loader`. After every epoch the model is
    validated on the global `val_loader_rej` and `metrics_print_my_cifar10h`
    is called. Writes the global `best_prec1`.

    Args:
        model: network to fine-tune; moved to `device`.
        data_aug: unused; kept for interface compatibility.
        n_dataset: forwarded to the train / validate / metrics helpers.
        expert_fn: forwarded to the train / validate helpers.
        epochs: currently ignored -- the phase lengths are fixed at 10 and 2.
            NOTE(review): callers pass 15; confirm which schedule is intended.
        alpha: forwarded to `train_reject_impute`.
    """
    global best_prec1
    # Start from zero on every call: otherwise the "Best accuracy" printed
    # below could be a value left behind by run_expert or an earlier trial.
    best_prec1 = 0

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    # for training on multiple GPUs.
    # Use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use
    # model = torch.nn.DataParallel(model).cuda()
    model = model.to(device)

    cudnn.benchmark = True

    # define optimizer
    optimizer = torch.optim.SGD(model.parameters(), 0.0001,
                                momentum=0.9, nesterov = True,
                                weight_decay=5e-4)

    # cosine learning rate, stepped once per batch by train_reject_impute
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(train_loader)*200)

    # (loader, number of epochs) per phase; the epoch counter restarts at 0 in
    # each phase and is only used for progress printing.
    phases = ((train_loader, 10), (val_loader, 2))
    for phase_loader, phase_epochs in phases:
        for epoch in range(0, phase_epochs):
            # train for one epoch
            train_reject_impute(phase_loader, model, optimizer, scheduler, epoch, expert_fn, n_dataset, alpha)

            # evaluate on validation set
            prec1 = validate_reject_impute(val_loader_rej, model, epoch, expert_fn, n_dataset)

            # remember best prec@1
            best_prec1 = max(prec1, best_prec1)
            metrics_print_my_cifar10h(model, dataset_expert_probs, n_dataset, val_loader_rej)
    print('Best accuracy: ', best_prec1)


Run the experiment for our method (impute).

In [None]:
# One entry per trial: the value returned by metrics_print_my_cifar10h.
experiment_data = []
max_trials = 10
for experiment in range(0, max_trials):
    normalize = transforms.Normalize(mean=[x/255.0 for x in [125.3, 123.0, 113.9]],
                                     std=[x/255.0 for x in [63.0, 62.1, 66.7]])

    # dataset_expert is keyed by the string form of each transformed image, so
    # train and test transforms must be deterministic and identical for the
    # lookups made during training to find their entries.
    transform_train = transforms.Compose([
        transforms.ToTensor(),
        normalize,
        ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        normalize
        ])

    n_dataset = 10
    dataset = 'cifar10'
    kwargs = {'num_workers': 1, 'pin_memory': True}
    dataset_cls = datasets.__dict__[dataset.upper()]

    # Unshuffled pass over the test set so that row j of cifar10h lines up with
    # the j-th test image. download=True lets this first access work on a
    # fresh data directory.
    val_loader = torch.utils.data.DataLoader(
        dataset_cls('../data', train=False, download=True, transform=transform_test),
        batch_size=128, shuffle=False, **kwargs)

    # Sample one expert prediction per test image from its CIFAR-10H
    # distribution: 0 if it matches the label, 1 otherwise.
    dataset_expert = {}
    dataset_expert_probs = {}
    j = 0
    for images, labels in val_loader:
        images_np = images.numpy()
        labels_list = labels.tolist()
        for i in range(len(labels_list)):
            key = str(images_np[i])
            exp_prediction = np.argmax(np.random.multinomial(1, cifar10h[j]))
            dataset_expert[key] = 0 if exp_prediction == labels_list[i] else 1
            dataset_expert_probs[key] = cifar10h[j]
            j += 1

    train_dataset = dataset_cls('../data', train=True, download=True,
                                transform=transform_train)
    val_dataset_all = dataset_cls('../data', train=False, download=True, transform=transform_test)

    # Split the test set in half: one half is used for training, the other
    # (val_dataset_rej) for evaluation.
    val_size = int(0.5 * len(val_dataset_all))
    val_size_rej = len(val_dataset_all) - val_size

    val_dataset, val_dataset_rej = torch.utils.data.random_split(val_dataset_all, [val_size, val_size_rej])

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=128, shuffle=True, **kwargs)

    val_loader_rej = torch.utils.data.DataLoader(
        val_dataset_rej,
        batch_size=128, shuffle=True, **kwargs)

    combined_data = torch.utils.data.ConcatDataset([train_dataset, val_dataset])
    train_loader = torch.utils.data.DataLoader(combined_data,
                        batch_size=128, shuffle=True, **kwargs)

    # Train a network on the sampled expert labels (run_expert trains on val_loader).
    model_expert = WideResNet(28, 2, 4, dropRate=0)
    run_expert(model_expert, False, n_dataset, 2, 2)

    # Impute an expert value for every image in train_loader with the network's
    # prediction, stored as a plain int like the sampled entries above.
    # NOTE(review): train_loader also contains val_dataset, so the sampled
    # CIFAR-10H values of those images are overwritten here -- confirm intended.
    model_expert.eval()
    with torch.no_grad():
        for images, labels in train_loader:
            images_np = images.numpy()
            outputs_mod = model_expert(images.to(device))
            predicted = torch.max(outputs_mod.data, 1)[1].tolist()
            for image, exp_prediction in zip(images_np, predicted):
                dataset_expert[str(image)] = exp_prediction

    alpha = 1
    # NOTE: torch.load unpickles arbitrary objects; only load checkpoints from
    # a trusted source. The file must already exist at this relative path.
    model = torch.load("models/model_rejector_base")

    run_reject_impute(model, False, n_dataset, 2, 15, alpha)
    batch_data = metrics_print_my_cifar10h(model, dataset_expert_probs, n_dataset, val_loader_rej)
    experiment_data.append(batch_data)
    


## Evaluate

In [None]:
# Display names of the first four values of each trial result, in index order.
metrics = ['coverage', 'system accuracy', 'expert accuracy', 'classifier accuracy']
print("Results for our method Impute ")
for i, metric_name in enumerate(metrics):
    # Collect the i-th value of every trial, then report its mean and standard deviation.
    arr = []
    for j in range(max_trials):
        arr.append(experiment_data[j][i])
    print(f'{metric_name}: avg = {np.average(arr):.3f}, std= {np.std(arr):.3f}  ')