Imports...

In [1]:
import torch.nn as nn   # Nueral network modules.
from collections import OrderedDict
import torch  # Base torch library
from torch.utils.data import DataLoader  # Minibathces
import torchvision.datasets as datasets  # MNIST dataset
import torchvision.transforms as transforms
import numpy as np
import torch.nn as nn  # Neural network modules
import torch.optim as optim  # Optimization algorithms
import pandas as pd

Model Class:

In [2]:
class NN(nn.Module):

    def __init__(self, input_size, middle_width, num_classes):


        super(NN, self).__init__()
        self.features = nn.Sequential(OrderedDict([
            ('hidden_layer', nn.Linear(input_size, middle_width)),
            ('hidden_activation', nn.ReLU()),
        ]))
        self.readout = nn.Linear(middle_width, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.readout(x)

        return x

Network Functions:

In [3]:
def set_device():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return device


def mnist_dataset(batch_size, train=True, values=list(range(10))):
    # Initializing MNIST data set.
    dataset = datasets.MNIST(root='dataset/', train=train, transform=transforms.ToTensor(), download=True)

    targets_list = dataset.targets.tolist()
    values_index = [i for i in range(len(dataset)) if targets_list[i] in values]

    # Creating a subset of ### MNIST targets.
    subset = torch.utils.data.Subset(dataset, values_index)
    loader = DataLoader(dataset=subset, batch_size=batch_size, shuffle=True)

    return loader


def train(loader, device, model, loss_function, optimizer_function, values=list(range(10))):
    # Training on each data point.
    for batch_idx, (data, targets) in enumerate(loader):
        data = data.reshape(data.shape[0], -1).to(device=device)
        targets = targets.to(device=device)

        # Forwards.
        scores = model(data)
        loss = loss_function(scores, classify_targets(targets, values))

        # Backwards.
        optimizer_function.zero_grad()
        loss.backward()

        optimizer_function.step()
        
        
        phi = slow_model.features(data)
        
        return targets, phi


def record_accuracy(device, model, train_loader, test_loader, epoch, values=list(range(10))):
    epoch_accuracy = np.array([[
        epoch + 1,
        check_accuracy(device, model, train_loader, values).cpu(),
        check_accuracy(device, model, test_loader, values).cpu()
    ]])

    return epoch_accuracy


def check_accuracy(device, model, loader, values=list(range(10))):
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device)
            y = classify_targets(y, values).to(device=device)
            x = x.reshape(x.shape[0], -1)

            scores = model(x)
            # 64images x 10,

            predictions = scores.argmax(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)

    return 100 - 100. * num_correct / num_samples


def classify_targets(targets, values):
    new_targets = targets.clone()

    # Changing targets to a classifiable number.
    for key, element in enumerate(values):
        new_targets[targets == element] = key
    return new_targets

Main Program:

In [4]:
# Checking & Setting Device Allocation
device = set_device()
print(f"Running on {device}")

# Hyper Parameters
hp = {
    "Input Size": 784,
    "Middle Layer Width": 2000,
    "Num Classes": 2,
    "Regular Learning Rate": 0.01,
    "Slow Learning Rate": 0.001,
    "Batch Size": 200,
    "Epochs": 1
}
print(f"Hyper Parameters: {hp}")

# Initializing Model
slow_model = NN(input_size=hp["Input Size"],
                middle_width=hp["Middle Layer Width"],
                num_classes=hp["Num Classes"]).to(device=device)

reg_model = NN(input_size=hp["Input Size"],
               middle_width=hp["Middle Layer Width"],
               num_classes=hp["Num Classes"]).to(device=device)

# Loading MNIST Dataset
mnist_values = [8, 9]
print(f"MNIST digits {mnist_values}")
train_loader = mnist_dataset(hp["Batch Size"], values=mnist_values)
validate_loader = mnist_dataset(hp["Batch Size"], train=False, values=mnist_values)

# Loss function
loss_function = nn.CrossEntropyLoss()

# Optimizers
sl_optimizer = optim.SGD([{'params': slow_model.features.hidden_layer.parameters()},
                          {'params': slow_model.readout.parameters(),
                           'lr': hp["Regular Learning Rate"]}],
                         lr=hp["Slow Learning Rate"])
r_optimizer = optim.SGD(reg_model.parameters(), lr=hp["Regular Learning Rate"])

# Creating 'empty' arrays for future storing of accuracy metrics
slow_accuracy = np.zeros((1, 3))
regular_accuracy = np.zeros((1, 3))

print("Training models...")
for epoch in range(hp["Epochs"]):

    # Slow Model
    sl_targets, sl_phi = train(train_loader, device, slow_model, loss_function, sl_optimizer, values=mnist_values)
    slow_accuracy_epoch = record_accuracy(device, slow_model, train_loader, validate_loader, epoch, mnist_values)
    slow_accuracy = np.concatenate((slow_accuracy, slow_accuracy_epoch))
    print("Slow: ")
    print(slow_accuracy_epoch)
    # Regular Model
    reg_targets, reg_phi = train(train_loader, device, reg_model, loss_function, r_optimizer, values=mnist_values)
    regular_accuracy_epoch = record_accuracy(device, reg_model, train_loader, validate_loader, epoch, mnist_values)
    regular_accuracy = np.concatenate((regular_accuracy, regular_accuracy_epoch))
    print("Reg: ")
    print(regular_accuracy_epoch)
    print(f"-Finished epoch {epoch + 1}/{hp['Epochs']}")

Running on cpu
Hyper Parameters: {'Input Size': 784, 'Middle Layer Width': 2000, 'Num Classes': 2, 'Regular Learning Rate': 0.01, 'Slow Learning Rate': 0.001, 'Batch Size': 200, 'Epochs': 1}
MNIST digits [8, 9]
Training models...
Slow: 
[[ 1.         46.47457504 48.10892487]]
Reg: 
[[ 1.         46.52542496 47.25164032]]
-Finished epoch 1/1


***
Kernel Alignment Calc:
***

Kernel Matrix: $K_{1}$

In [5]:
def ones(vector):
    for i in range(vector.size()[1]):
        if vector[0][i] == 9:
            vector[0][i] = int(1)
        elif vector[0][i] == 8:
            vector[0][i] = int(-1)
    return vector

In [6]:
sl_targets = torch.t(torch.unsqueeze(sl_targets, -1))
sl_targets = ones(sl_targets)

In [7]:
K1 = torch.matmul(torch.t(sl_targets), sl_targets)

In [8]:
K1.size(), K1

(torch.Size([200, 200]),
 tensor([[ 1, -1, -1,  ..., -1, -1, -1],
         [-1,  1,  1,  ...,  1,  1,  1],
         [-1,  1,  1,  ...,  1,  1,  1],
         ...,
         [-1,  1,  1,  ...,  1,  1,  1],
         [-1,  1,  1,  ...,  1,  1,  1],
         [-1,  1,  1,  ...,  1,  1,  1]]))

***

Kernel Matrix: $K_{2}$

In [9]:
slow_model.features.hidden_activation.eval()

ReLU()

In [10]:
phi = sl_phi

In [22]:
K2 = torch.mm(phi, torch.t(phi))
K2.size(), K2

(torch.Size([200, 200]),
 tensor([[61.2843, 28.7001, 32.7924,  ..., 28.3563, 19.0159, 26.5034],
         [28.7001, 43.8720, 32.2265,  ..., 30.5616, 23.2612, 26.1997],
         [32.7924, 32.2265, 57.8523,  ..., 35.8389, 28.1416, 37.5358],
         ...,
         [28.3563, 30.5616, 35.8389,  ..., 47.1405, 24.3216, 31.8156],
         [19.0159, 23.2612, 28.1416,  ..., 24.3216, 28.0343, 25.8679],
         [26.5034, 26.1997, 37.5358,  ..., 31.8156, 25.8679, 44.0232]],
        grad_fn=<MmBackward0>))

***

Kernel Centering: 

$K_{c} = \left[ I - \frac{11^{T}}{m} \right] K \left[ I - \frac{11^{T}}{m} \right]$

*Note: let 1 denote the vector with all enteries equal to one and I being the identity matrix*

In [25]:
def kernel_centering(K):
    # Lemmna 1
    
    m = K.size()[0]
    I = torch.eye(m)
    l = torch.ones(m, 1)
    
    # I - ll^T / m
    mat = I - torch.matmul(l, torch.t(l))/m
    
    
    return torch.matmul(torch.matmul(mat, K), mat)
    
    

Centering Kernel $K_{1}$

In [26]:
Kc1 = kernel_centering(K1.float())
Kc1, Kc1.size()

(tensor([[ 0.9025, -0.9975, -0.9975,  ..., -0.9975, -0.9975, -0.9975],
         [-0.9975,  1.1025,  1.1025,  ...,  1.1025,  1.1025,  1.1025],
         [-0.9975,  1.1025,  1.1025,  ...,  1.1025,  1.1025,  1.1025],
         ...,
         [-0.9975,  1.1025,  1.1025,  ...,  1.1025,  1.1025,  1.1025],
         [-0.9975,  1.1025,  1.1025,  ...,  1.1025,  1.1025,  1.1025],
         [-0.9975,  1.1025,  1.1025,  ...,  1.1025,  1.1025,  1.1025]]),
 torch.Size([200, 200]))

Centering Kernel $K_{2}$

In [27]:
Kc2 = kernel_centering(K2)
Kc2, Kc2.size()

(tensor([[31.6344, -1.1747, -1.1958,  ..., -1.7508, -4.2313, -2.6194],
         [-1.1747, 13.7724, -1.9866,  ...,  0.2297, -0.2108, -3.1480],
         [-1.1958, -1.9866, 19.5258,  ...,  1.3935,  0.5561,  4.0747],
         ...,
         [-1.7508,  0.2297,  1.3935,  ..., 16.5762,  0.6172,  2.2356],
         [-4.2313, -0.2108,  0.5561,  ...,  0.6172, 11.1899,  3.1478],
         [-2.6194, -3.1480,  4.0747,  ...,  2.2356,  3.1478, 15.4275]],
        grad_fn=<MmBackward0>),
 torch.Size([200, 200]))

***

Kernel Aligment Function: $\hat{p}(K, K') = \frac{\langle K_{c}, K_{c}' \rangle}{\| K_{c} \| \| K_{c}' \|} $

In [28]:
def frobenius_product(K1, K2):
    return torch.trace(torch.mm(K2, torch.t(K1)))

def kernel_alignment(K1, K2):
    return frobenius_product(K1, K2)/((torch.norm(K1, p='fro')*torch.norm(K2, p='fro')))

In [29]:
kernel_alignment(Kc1, Kc2)

tensor(0.3699, grad_fn=<DivBackward0>)