In [1]:
# Set up the environment 
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

compress_out_dim = 256

In [2]:
# Clone the ResNet implementation we are going to use
# state_dict_path = 'resnet_20_64_pretrain.pt'
state_dict_path = 'resnet_20_{}_pretrain.pt'.format(compress_out_dim)
import os
import shutil

if not os.path.exists(state_dict_path):
    !git clone https://github.com/geos98/CS294-082.git
    shutil.copy(os.path.join('CS294-082', state_dict_path), '.')
# !git clone https://github.com/akamaster/pytorch_resnet_cifar10

fatal: destination path 'pytorch_resnet_cifar10' already exists and is not an empty directory.


In [3]:
if compress_out_dim == 256:
    from resnet_256_out import resnet20
if compress_out_dim == 64:
    from resnet_64_out import resnet20

state_dict = torch.load(state_dict_path, map_location=device)

model = resnet20()
miss = model.load_state_dict(state_dict)

print(miss)

model.to(device)
model.eval()


<All keys matched successfully>


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (shortcut): Sequential()
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=

In [4]:
# Create datasets
import torchvision.transforms as transforms
import torchvision

from torch.utils.data import DataLoader

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, 4),
    transforms.ToTensor(),
    normalize,
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    normalize,
])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=train_transform)

# Train data with test set trainsform (i.e., no crop, no flip)
traintestset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=test_transform)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                        download=True, transform=test_transform)

train_loader = DataLoader(trainset, batch_size=128, shuffle=True, pin_memory=True)
test_loader = DataLoader(testset, batch_size=128, shuffle=False, pin_memory=True)
traintest_loader = DataLoader(traintestset, batch_size=128, shuffle=False, pin_memory=True)


Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [5]:
# Evaluation on accuracy
from tqdm import tqdm


with torch.no_grad():
    model.eval()

    correct_test = 0
    total_test = 0
    # Accuracy on test set
    for p, l in tqdm(test_loader):
        p = p.to(device)
        out = model(p)
        _, predicted = out.max(1)

        correct_test += predicted.eq(l).sum().item()
        total_test += l.size(0)
    print('Testing Set: Predicted {} instances out of {}'.format(correct_test, total_test))

    correct_train = 0
    total_train = 0
    # Accuracy on test set
    for p, l in tqdm(traintest_loader):
        p = p.to(device)
        out = model(p)
        _, predicted = out.max(1)

        correct_train += predicted.eq(l).sum().item()
        total_train += l.size(0)
    print('Trainning Set: Predicted {} instances out of {}'.format(correct_train, total_train))

100%|██████████| 79/79 [01:11<00:00,  1.10it/s]


Testing Set: Predicted 9410 instances out of 10000


 83%|████████▎ | 324/391 [05:02<01:03,  1.06it/s]

## Q1: What is the variable the machine learner is supposed to predict? How accurate is the labeling? What is the annotator agreement (measured)?
The CIFAR-10 dataset consists of 60,000 images of 10 classes which are  `airplane`, `automobile`, `bird`, `cat`, `deer`, `dog`, `frog`, `horse`, `ship` and `truck`. The machine learner should take-in an image and predict what is the object in the image by outputing one of the 10 classes listed above.

There are a total of 22 known and validated label error exists in this dataset[1]. The label error was found algorithmically and further verified by human. This is 99.96% label accuracy. For the purpose of this project, we will ignore this error and assume the labelling is correct. 

Researchers at the University of Toronto paid human labler to label the dataset[2]. Each labelling is done by one human labler and verified by the one of the researchers after initial labelling. The in-accuracy could be caused by human error due to the large workload needed to label this dataset.

## Q2: What is the required accuracy metric for success? How much data do we have to train the prediction of the variable? Are the classes balanced? How many modalities could be exploited in the data? Is there temporal information? How much noise are we expecting? Do we expect bias?

Since this dataset serves as a benchmark towards the machine learner's ability to classify subjects in the image, the required accuracy metric is higher the better. However, there were no real-world implication for the success at this time.

We have 60,000 images, splited into a trainning set of 50,000 images and testing set of 10,000 images. 

The classes are perfectly balanced in both trainning set and testing set. Meaning that in total there are 6,000 images of each class.

There is only one modality, the visual, that we can exploit. 

There are no temporal information. 

Since every sample is an image with a background, and we are only interested in the main subject, this means that all the background portion of the image is effectively noise for our purpose. Therefore, we need to apply strategies like convolution to extract important features.

We expect the machine learner to focus on the center of the image since that is the place where most subject would exists in the image.

## Q3: What is the MEC of the data? What is the expected MEC for a neural net?

Since we only care about the MEC of the data after the "compression", we will create a CSV file of data after compression

In [6]:
# Generate .csv from pre-trained ResNet-18 (only using the conv layer to extract deepfeature)
import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from itertools import chain

# Get pre-trained conv output
def run_conv(x):
    out = F.relu(model.bn1(model.conv1(x)))
    out = model.layer1(out)
    out = model.layer2(out)
    out = model.layer3(out)
    out = F.avg_pool2d(out, out.size()[3])
    out = out.view(out.size(0), -1)

    return out

compressed_file = 'ompressed_{}.csv'.format(compress_out_dim)
with open(compressed_file, 'w+') as fout:
    with torch.no_grad():
        model.eval()
        total_len = len(traintestset) + len(testset)
        for p, l in tqdm(chain(traintestset, testset), total = total_len):
            p = torch.unsqueeze(p, 0).to(device)
            conv_data = run_conv(p)
            np_data = conv_data.flatten().cpu().numpy()
            CSV_str = ','.join(['%.15f' % num for num in np_data])
            CSV_str += ',{}'.format(l)
            fout.write(CSV_str + '\n')
            

100%|██████████| 60000/60000 [04:09<00:00, 240.84it/s]


We can either run Brainome on the dataset

In [7]:
# Install Brainome for analysis
#%pip install brainome
#!brainome login
#!brainome compressed.csv -y -headerless -f NN -vvv

Or we can run our own analysis

In [8]:
import pandas as pd
import numpy as np
import math

from IPython.display import display

def memorize(df: pd.DataFrame, drop = 0):
    dfs = pd.DataFrame({'sum': df.iloc[:, :-1].sum(axis = 1), 'label': df.iloc[:, 64]})

    drop_count = int(drop * df.shape[0])
    drops = np.random.choice(dfs.index, drop_count, replace = False)
    dfs = dfs.drop(drops)

    dfs.sort_values('sum')
    thr = (dfs['label'].diff() != 0).sum()
    mec = math.log2((thr + 1))

    return mec

# Implement Alogrithm 4 in the book (page 93)
df = pd.read_csv('compressed.csv', header=None)
dim = df.shape[1] - 1

mec_mem = memorize(df)
mec = (mec_mem * (dim + 1)) + mec_mem + 1
print('MEC = ', mec, ' bits')

MEC =  1038.7105789672364  bits


As calculated above, we find that the MEC of the data is around 15.72 bits.


The decision layer in ResNet-20 is a simple fully connected layer with 64 input and 10 output. That is, 10 neurons each with 64 input.Now, each individual neuron therefore have parameter count of 64 + 1 = 65 and the MEC of the network is  
65 * 10 = 650 bits

## Q4: What is the expected generalization in bits/bit and as a consequence the average resilience in dB? Is the resilience enough for the task? How bad can adversarial examples be? Do we expect data drift?

In [13]:
# Test the data on pre-trained decision layer
from torch.utils.data.dataset import Dataset
class CompressedCifar10(Dataset):
    def __init__(self, type = 'all'):
        df = pd.read_csv('compressed.csv', header=None)

        row_begin = 0
        row_end = 60000

        if type == 'train':
            row_end = 50000
        if type == 'test':
            row_begin = 50000

        col_max = df.shape[1]
        
        x = df.iloc[row_begin:row_end, 0:col_max -  1].values
        y = df.iloc[row_begin:row_end, col_max - 1].values

        self.x_data = torch.tensor(x, dtype=torch.float32, device=device)
        self.y_data = torch.tensor(y, dtype=torch.long, device=device)
    
    def __len__(self):
        return len(self.y_data)
    
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

total_correct = 0
with torch.no_grad():
    dataset = CompressedCifar10()

    for x, y in tqdm(dataset):
        preidct = model.linear(x)
        predict = torch.argmax(preidct)

        if predict == y:
            total_correct += 1

print('Correctly predicted instances: ', total_correct)


100%|██████████| 60000/60000 [00:00<00:00, 98340.34it/s]

Correctly predicted instances:  59038





Therefore, combine MEC = 650 bit, we get the Generalization is

$$G = \frac{59038  \log_2{10}}{MEC} = 188.81 \frac{\text{bits}}{\text{bit}}$$

Which would then traslate to resilience $R = 20 * \log_{10}G = 20 * \log_{10} 188.81 = 45.52 \text{dB}$

Since this is a benchmark for machine learning models, we don't expect adversarial examples nor do we expect data drift. The data is as-is, nothing new would be added.

## Q6: Trian your machine learner for accuracy at memory equivalent capacity

We get MEC of 1038 bits in Q4, 

In [None]:
from linear_with_mec import create_linear_layer

import torch
import torch.nn as nn
import torch.nn.functional as F

net, mec = create_linear_layer(16)
print('The decision layer has MEC {} bits'.format(mec))

net.to(device)
net.train()

print_frequency = 50

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(net.parameters(), 1e-3)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 200])

compressed_train_loader = DataLoader(CompressedCifar10(type='train'), 64, shuffle = True)

def evaluate(model, data_loader):
    model.eval()

    correct_count = 0
    total = 0
    with torch.no_grad():
        for data, label in tqdm(data_loader):
            data = data.to(device)
            label = label.to(device)

            output = model(data)
            _, predicted = output.max(1)
            total += label.size(0)
            correct_count += predicted.eq(label).sum().item()
    return correct_count, total

best_mem = 0

def train(dataloader, model, optimizer, criterion):
    model.train()
    running_loss = 0
    for data, label in dataloader:
        data = data.to(device)
        label = label.to(device)

        output = model(data)
        loss = criterion(output, label)
        running_loss += loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return running_loss / len(dataloader)
    
for epoch in range(240):
    running_loss =  train(compressed_train_loader, net, optimizer, criterion)

    if epoch % 20 == 19:
        print('Finished epoch {}'.format(epoch + 1))
        print('Current LR: {}'.format(optimizer.param_groups[0]['lr']))
        print('Total Loss {}'.format(running_loss / len(compressed_train_loader)))
        correct_count, total = evaluate(net, compressed_train_loader)
        print('Finished evaluating, memorized {} out of {}\n'.format(correct_count, total))

        best_mem = max(best_mem, correct_count / total)
    lr_scheduler.step()

print('Best trainning memorization is {}'.format(best_mem))

The decision layer has MEC 1056 bits
Finished epoch 20
Current LR: 0.001
Total Loss 9.893238711811136e-06


100%|██████████| 782/782 [00:00<00:00, 3268.51it/s]


Finished evaluating, memorized 49874 out of 50000

Finished epoch 40
Current LR: 0.001
Total Loss 7.706622454861645e-06


100%|██████████| 782/782 [00:00<00:00, 3155.95it/s]


Finished evaluating, memorized 49921 out of 50000

Finished epoch 60
Current LR: 0.001
Total Loss 6.502566975541413e-06


100%|██████████| 782/782 [00:00<00:00, 3140.76it/s]


Finished evaluating, memorized 49930 out of 50000

Finished epoch 80
Current LR: 0.001
Total Loss 6.18038711763802e-06


100%|██████████| 782/782 [00:00<00:00, 3186.25it/s]


Finished evaluating, memorized 49923 out of 50000

Finished epoch 100
Current LR: 0.001
Total Loss 5.2952173064113595e-06


100%|██████████| 782/782 [00:00<00:00, 3194.10it/s]


Finished evaluating, memorized 49944 out of 50000

Finished epoch 120
Current LR: 0.0001
Total Loss 4.0207210076914635e-06


100%|██████████| 782/782 [00:00<00:00, 3076.17it/s]


Finished evaluating, memorized 49950 out of 50000

Finished epoch 140
Current LR: 0.0001
Total Loss 3.966811618738575e-06


100%|██████████| 782/782 [00:00<00:00, 3221.45it/s]


Finished evaluating, memorized 49956 out of 50000

Finished epoch 160
Current LR: 0.0001
Total Loss 3.901964191754814e-06


100%|██████████| 782/782 [00:00<00:00, 3174.16it/s]


Finished evaluating, memorized 49949 out of 50000

Finished epoch 180
Current LR: 0.0001
Total Loss 3.825977273663739e-06


100%|██████████| 782/782 [00:00<00:00, 3220.27it/s]


Finished evaluating, memorized 49957 out of 50000

Finished epoch 200
Current LR: 0.0001
Total Loss 3.7760869417979848e-06


100%|██████████| 782/782 [00:00<00:00, 3207.34it/s]


Finished evaluating, memorized 49962 out of 50000

Finished epoch 220
Current LR: 1e-05
Total Loss 3.6837961943092523e-06


100%|██████████| 782/782 [00:00<00:00, 3109.99it/s]


Finished evaluating, memorized 49960 out of 50000

Finished epoch 240
Current LR: 1e-05
Total Loss 3.6799933695874643e-06


100%|██████████| 782/782 [00:00<00:00, 3199.39it/s]

Finished evaluating, memorized 49960 out of 50000

Best trainning memorization is 0.99924





A decision layer with MEC 1056 bit (the data has 1038 bit) could memorize 99.9% of the trainning data in 240 epochs. (should investigate why we cannot get 100%???)

## Q7: Trian for generalization

In [None]:
compressed_train_loader = DataLoader(CompressedCifar10(type='train'), 64, shuffle = True)
compressed_test_loader = DataLoader(CompressedCifar10(type='test'), 64, shuffle = False)

mec_acc = []

for hidden_size in reversed(range(10, 65, 5)):
    net, mec = create_linear_layer(hidden_size)
    print('Testing for decision layer with MEC {} bits'.format(mec))


    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.Adam(net.parameters(), 1e-3)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 200])

    for epoch in range(240):
        running_loss =  train(compressed_train_loader, net, optimizer, criterion)

    print('Finished trainning')
    correct_test, total_test = evaluate(net, compressed_test_loader)
    correct_train, total_train = evaluate(net, compressed_train_loader)
    print('Finished evaluating, predicted {} out of {}\n'.format(correct_test, total_test))

    mec_acc.append((mec, correct_test / total_test, correct_train / total_train))

print(mec_acc)
    

Testing for decision layer with MEC 3960 bits
Finished trainning


100%|██████████| 157/157 [00:00<00:00, 3316.01it/s]
100%|██████████| 782/782 [00:00<00:00, 3198.13it/s]


Finished evaluating, predicted 9161 out of 10000

Testing for decision layer with MEC 3630 bits


KeyboardInterrupt: 

## Reference
[1] https://arxiv.org/abs/2103.14749  
[2] https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf