# Linear Classifier 
to be compared to `cosine similarity classifier `

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing

import torch
import torch.nn as nn

from torch import relu

from torch.utils.data import DataLoader, Dataset

import torchvision.datasets as dsets
import torchvision.transforms as transforms

In [2]:
# GPU settings
device = torch.device(0)
device

device(type='cuda', index=0)

### Few-shot Learning Settings
K-shot N-ways

In [3]:
K = 10
N = 1
left_class = 7

### Get a dataset

In [4]:
data_transform = transforms.Compose([transforms.ToTensor()])

# import the `MNIST datasets`
mnist_train = dsets.MNIST(root='data',
                          train=True,
                          transform=data_transform,
                          download=True)

mnist_test = dsets.MNIST(root='data',
                          train=False,
                          transform=data_transform,
                          download=True)

# build the `DataLoader`
train_data_loader = DataLoader(mnist_train, batch_size=2**10)
test_data_loader = DataLoader(mnist_test, batch_size=mnist_test.data.shape[0])

In [5]:
# Label Encoder
label_encoder  = preprocessing.LabelEncoder()

targets = list(range(0, 10, 1))
targets.pop(left_class)
targets = np.array(targets).reshape(-1, 1)

label_encoder.fit(targets);

  return f(*args, **kwargs)


In [6]:
for x, y in train_data_loader:
    print(x); print(y); print(x.max()); print(x.shape)
    break

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        ...,


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0.

### Build a model

In [7]:
class Model(nn.Module):
    def __init__(self, in_size=28, embedding_feature_size=2, n_classes=10):
        super().__init__()
        
        # Data properties
        in_channels = 1
        
        # Define layers
        # 1
        self.conv1 = nn.Conv2d(in_channels, 12, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(12)
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # 2
        self.conv2 = nn.Conv2d(12, 24, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(24)
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # 3
        self.conv3 = nn.Conv2d(24, 48, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(48)
        self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        # 4
        self.last_embedding_layer = nn.Linear(48*3*3, embedding_feature_size)
        
        # FC
        self.fc1 = nn.Linear(embedding_feature_size, n_classes)
        
    def forward(self, x):
        #============================
        # Feature Extractor
        h1 = self.maxpool1(relu(self.bn1(self.conv1(x))))
        h2 = self.maxpool2(relu(self.bn2(self.conv2(h1))))
        h3 = self.maxpool3(relu(self.bn3(self.conv3(h2))))
        
        batch_size = x.shape[0]
        h3 = h3.view(batch_size, -1)  # 1
        
        z = self.last_embedding_layer(h3)
        
        # linear classifier
        out = self.fc1(z)
        
        return out

In [8]:
model = Model(embedding_feature_size=64, n_classes=9).to(device)

### Compile

In [9]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

### Train

In [10]:
# settings
n_epochs = 6

In [11]:
def leave_out_a_class(x, y, left_class):
    "leave out some class for the few-shot learning"
    indices = (y != left_class)
    return x[indices, :, :, :], y[indices]

In [12]:
train_hist = {"epochs": [], "loss_per_epoch": [], "loss": [], "test_acc": []}

for epoch in range(n_epochs):

    loss_per_epoch = 0
    iters = 0
    for x, y in train_data_loader:
        x, y = leave_out_a_class(x, y, left_class)
        y = torch.tensor(label_encoder.transform(y))
        x, y = x.to(device), y.to(device)
        
        optimizer.zero_grad()
        
        yhat = model(x)
        
        loss = criterion(yhat, y)
        loss.backward()
        
        optimizer.step()
        
        # data storage
        loss_per_epoch += loss
        iters += 1
        train_hist["loss"].append(loss)
        #print(round(loss.item(), 2), end=" ")
            
    train_hist["epochs"].append(epoch)
    train_hist["loss_per_epoch"].append(loss_per_epoch)
    
    # validation
    with torch.no_grad():
        test_acc = 0
        for x_test, y_test in test_data_loader:
            x_test, y_test = leave_out_a_class(x_test, y_test, left_class)
            y_test = torch.tensor(label_encoder.transform(y_test))
            x_test, y_test = x_test.to(device), y_test.to(device)
            yhat = torch.argmax(model(x_test.to(device)), axis=1)
            test_acc += np.mean((yhat.to("cpu") == y_test.to("cpu")).numpy())
        train_hist["test_acc"].append(test_acc)
    
    print("epoch: {}, loss: {:0.3f}, test_acc: {:0.3f}".format(epoch, loss_per_epoch/iters, test_acc))

epoch: 0, loss: 0.546, test_acc: 0.973
epoch: 1, loss: 0.084, test_acc: 0.982
epoch: 2, loss: 0.056, test_acc: 0.986
epoch: 3, loss: 0.043, test_acc: 0.989
epoch: 4, loss: 0.034, test_acc: 0.989
epoch: 5, loss: 0.028, test_acc: 0.989


### Make `support set` and `query set`

In [13]:
indices = (mnist_test.targets == left_class)

class SupportSet(Dataset):
    def __init__(self,):
        super().__init__()
        
        support_set_X = []
        support_set_Y = []
        for c in range(0, 10):
            indices = (mnist_test.targets == c)
            support_set_X.append( mnist_test.data[indices, :, :][:N, :, :].numpy() / 255. )
            support_set_Y.append( mnist_test.targets[indices][:N].numpy() )

        self.support_set_X = torch.from_numpy(np.array(support_set_X)).view(N*10, 1, 28, 28)
        self.support_set_Y = torch.from_numpy(np.array(support_set_Y).flatten())

        self.len = self.support_set_X.shape[0]
        
    def __getitem__(self, idx):
        return self.support_set_X[idx], self.support_set_Y[idx]
    
    def __len__(self):
        return self.len

support_set = SupportSet()
support_set_data_loader = DataLoader(support_set, batch_size=2**10)

### Freeze `Feature Extractor` and replace the last layer
fine-tuning

In [14]:
# check out the layers with `.named_children()`
for i, chd in enumerate(model.named_children()):
    print(i, "|", chd)

0 | ('conv1', Conv2d(1, 12, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))
1 | ('bn1', BatchNorm2d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True))
2 | ('maxpool1', MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False))
3 | ('conv2', Conv2d(12, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))
4 | ('bn2', BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True))
5 | ('maxpool2', MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False))
6 | ('conv3', Conv2d(24, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))
7 | ('bn3', BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True))
8 | ('maxpool3', MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False))
9 | ('last_embedding_layer', Linear(in_features=432, out_features=64, bias=True))
10 | ('fc1', Linear(in_features=64, out_features=9, bias=True))


In [15]:
# freeze all the existing layers
for param in model.parameters():
    param.requires_grad = False

In [16]:
# replace the last layer
model.fc1 = nn.Linear(64, 10).to(device)

In [17]:
for name, param in model.named_parameters():
    print(name, '| grad:{}'.format(param.requires_grad), '| device: {}'.format(param.device))

conv1.weight | grad:False | device: cuda:0
conv1.bias | grad:False | device: cuda:0
bn1.weight | grad:False | device: cuda:0
bn1.bias | grad:False | device: cuda:0
conv2.weight | grad:False | device: cuda:0
conv2.bias | grad:False | device: cuda:0
bn2.weight | grad:False | device: cuda:0
bn2.bias | grad:False | device: cuda:0
conv3.weight | grad:False | device: cuda:0
conv3.bias | grad:False | device: cuda:0
bn3.weight | grad:False | device: cuda:0
bn3.bias | grad:False | device: cuda:0
last_embedding_layer.weight | grad:False | device: cuda:0
last_embedding_layer.bias | grad:False | device: cuda:0
fc1.weight | grad:True | device: cuda:0
fc1.bias | grad:True | device: cuda:0


In [18]:
optimizer = torch.optim.Adam([param for param in model.parameters() if param.requires_grad], 
                              lr=5e-4)

### Fine-Tuning

In [19]:
train_hist = {"epochs": [], "loss_per_epoch": [], "loss": [], "test_acc": []}

In [20]:
n_epochs = 200

In [21]:
for epoch in range(n_epochs):

    loss_per_epoch = 0
    iters = 0
    for x, y in support_set_data_loader:
        x, y = x.to(device, dtype=torch.float), y.to(device)
        
        optimizer.zero_grad()
        
        yhat = model(x)
        
        loss = criterion(yhat, y)
        loss.backward()
        
        optimizer.step()
        
        # data storage
        loss_per_epoch += loss
        iters += 1
        train_hist["loss"].append(loss)
        #print(round(loss.item(), 2), end=" ")
            
    train_hist["epochs"].append(epoch)
    train_hist["loss_per_epoch"].append(loss_per_epoch)
    
    # validation
    if (epoch % 10) == 0:
        with torch.no_grad():
            test_acc = 0
            for x_test, y_test in test_data_loader:
                x_test, y_test = x_test.to(device, dtype=torch.float), y_test.to(device)
                yhat = torch.argmax(model(x_test.to(device)), axis=1)
                test_acc += np.mean((yhat.to("cpu") == y_test.to("cpu")).numpy())
            train_hist["test_acc"].append(test_acc)

        print("epoch: {}, loss: {:0.3f}, test_acc: {:0.3f}".format(epoch, loss_per_epoch/iters, test_acc))

epoch: 0, loss: 3.406, test_acc: 0.060
epoch: 10, loss: 2.789, test_acc: 0.088
epoch: 20, loss: 2.244, test_acc: 0.159
epoch: 30, loss: 1.783, test_acc: 0.357
epoch: 40, loss: 1.406, test_acc: 0.554
epoch: 50, loss: 1.107, test_acc: 0.685
epoch: 60, loss: 0.874, test_acc: 0.747
epoch: 70, loss: 0.696, test_acc: 0.795
epoch: 80, loss: 0.560, test_acc: 0.844
epoch: 90, loss: 0.457, test_acc: 0.875
epoch: 100, loss: 0.379, test_acc: 0.894
epoch: 110, loss: 0.319, test_acc: 0.902
epoch: 120, loss: 0.272, test_acc: 0.908
epoch: 130, loss: 0.236, test_acc: 0.911
epoch: 140, loss: 0.207, test_acc: 0.913
epoch: 150, loss: 0.183, test_acc: 0.916
epoch: 160, loss: 0.164, test_acc: 0.917
epoch: 170, loss: 0.148, test_acc: 0.919
epoch: 180, loss: 0.134, test_acc: 0.920
epoch: 190, loss: 0.122, test_acc: 0.920


### Evaluate the learned representations

`left_class` only

In [22]:
with torch.no_grad():
    
    indices = (mnist_test.targets == left_class)
    
    test_acc = 0
    count = 0
    for x_test, y_test in test_data_loader:
        x_test, y_test = x_test.to(device), y_test.to(device)
        x_test, y_test = x_test[indices,:,:], y_test[indices]
        
        yhat = torch.argmax(model(x_test.to(device)), axis=1)
        
        test_acc += np.mean((yhat.to("cpu") == y_test.to("cpu")).numpy())
        count += 1
        
    test_acc /= count

In [23]:
test_acc

0.6303501945525292

over all classes

In [24]:
with torch.no_grad():
    test_accs = []
    for c in range(10):
        indices = (mnist_test.targets == c)
    
        test_acc = 0
        count = 0
        for x_test, y_test in test_data_loader:
            x_test, y_test = x_test.to(device), y_test.to(device)
            x_test, y_test = x_test[indices,:,:], y_test[indices]

            yhat = torch.argmax(model(x_test.to(device)), axis=1)

            test_acc += np.mean((yhat.to("cpu") == y_test.to("cpu")).numpy())
            count += 1

        test_acc /= count
        test_accs.append(test_acc)
        print(f"class: {c} | acc: {round(test_acc, 3)}")

print("\n overall acc: {:0.3f}".format(np.mean(test_accs)))

class: 0 | acc: 0.978
class: 1 | acc: 0.35
class: 2 | acc: 0.905
class: 3 | acc: 0.739
class: 4 | acc: 0.829
class: 5 | acc: 0.936
class: 6 | acc: 0.817
class: 7 | acc: 0.63
class: 8 | acc: 0.896
class: 9 | acc: 0.962

 overall acc: 0.804


---