In [1]:
import torch
import numpy as np
import random
from torch.autograd import Variable
import torch.nn as nn
import itertools
from copy import deepcopy
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.datasets as dset

In [2]:
class BinaryClassificationTask():
    def __init__(self):
        self.classes = list(range(10))
        trans = transforms.Compose([transforms.ToTensor()])
        
        self.train_set = dset.MNIST(root='./data/', train=True, transform=trans, download=False)
        self.test_set = dset.MNIST(root='./data/', train=False, transform=trans, download=False)
        
    def salt_and_pepper_noise(self, prob, x):
        probs = torch.rand(*x.size())
        x[probs < prob / 2] = 0
        x[probs > 1 - prob / 2] = 1
        return x
        
    def init(self, batch_size, noise_prob):
        labels = np.random.choice(self.classes, 2, replace=False)

        # For the next task we won't encounter the same label again
        self.classes = [x for x in self.classes if x not in labels] 
        
        print('Binary classification between {} and {}'.format(labels[0], labels[1]))
        
        train_set = self.convert2tensor(self.train_set, labels, 
                                        train=True, noise_prob = noise_prob)
        test_set = self.convert2tensor(self.test_set, labels)
        
        train_loader = DataLoader(train_set, 
                                  batch_size, 
                                  shuffle=True)
        
        test_loader = DataLoader(test_set, 
                                 batch_size, 
                                 shuffle=True)
        
        return train_loader, test_loader

    def convert2tensor(self, dset, labels, train=False, noise_prob=None):
        x_set = []
        y_set = []
        
        for x, y in dset:
            if y == labels[0]:
                x_set.append(x)
                y_set.append(torch.LongTensor([0]))
            elif y == labels[1]:
                x_set.append(x)
                y_set.append(torch.LongTensor([1]))
        
        x_set = torch.cat(x_set, 0)
        x_set = x_set.view(x_set.size()[0], -1)
        if train:
            x_set = self.salt_and_pepper_noise(noise_prob, x_set)
        
        y_set = torch.cat(y_set, 0)
        dataset = TensorDataset(x_set, y_set)
        
        return dataset

In [3]:
np.random.seed(42)

L = 3  # Layers
M = 10 # Modules
N = 3  # Max modules in one layer
P = 64 # Population
G = 100 # Generations

In [4]:
def initialize_pathways():
    layer_configs = list(itertools.combinations_with_replacement(list(range(M)), N))
    layer_configs = np.array(layer_configs)
    indices = np.random.choice(len(layer_configs), (P, L))
    pathways = layer_configs[indices]
        
    return pathways # Shape: P x L x N

In [5]:
class PathNet(nn.Module):
    '''
        The architecture follows the paper specifications
            https://arxiv.org/pdf/1701.08734.pdf
        for SUPERVISED LEARNING tasks
    '''
    def __init__(self, M, L=3):
        super(PathNet, self).__init__()
        self.M = M
        self.L = L
    
        self.relu = nn.ReLU()
        self.layer1 = [nn.Linear(28 * 28, 20) for i in range(M)]
        self.layer2 = [nn.Linear(20, 20) for i in range(M)]
        self.layer3 = [nn.Linear(20, 20) for i in range(M)]
        
        self.optimizer_params = []
        for m in range(self.M):
            self.optimizer_params.append({'params' : self.layer1[m].parameters()})
            self.optimizer_params.append({'params' : self.layer2[m].parameters()})
            self.optimizer_params.append({'params' : self.layer3[m].parameters()})
        
    def sum_layer(self, layer_outputs):
        if len(layer_outputs) == 1:
            return layer_outputs[0]
        
        return [layer_outputs[i] + layer_outputs[i+1] 
                for i in range(len(layer_outputs) - 1)][0]
    
    def forward(self, x, pathway):
        layer1_active_modules_index = list(set(pathway[0]))
        layer2_active_modules_index = list(set(pathway[1]))
        layer3_active_modules_index = list(set(pathway[2]))
        
        layer1_output = [self.relu(self.layer1[m](x)) for m in layer1_active_modules_index]
        layer1_output_sum = self.sum_layer(layer1_output)

        layer2_output = [self.relu(self.layer2[m](layer1_output_sum)) for m in layer2_active_modules_index]
        layer2_output_sum = self.sum_layer(layer2_output)

        layer3_output = [self.relu(self.layer3[m](layer2_output_sum)) for m in layer3_active_modules_index]
        layer3_output_sum = self.sum_layer(layer3_output)

        output = self.last_layer(layer3_output_sum)
        
        return output
        
    def initialize_new_task(self, last_layer):
        self.last_layer = last_layer
        self.optimizer_params.append({'params' : last_layer.parameters()})
        
    def output_shape_calculator(self):
        pass
    
    def get_optimizer_params(self):
        return self.optimizer_params
    
    def done_task(self, best_pathway):
        # Freeze best pathway
        # Re-initialize all others
        layer1_active_modules_index = list(set(best_pathway[0]))
        layer2_active_modules_index = list(set(best_pathway[1]))
        layer3_active_modules_index = list(set(best_pathway[2]))
        
        self.optimizer_params = []
    
        # Freeze and add parameters to train
        for i in range(self.M):
            if i in layer1_active_modules_index:
                self.layer1[i].requires_grad = False
            else:
                self.layer1[i].reset_parameters()
                self.layer1[i].requires_grad = True
                self.optimizer_params.append({'params' : self.layer1[i].parameters()})
        
            if i in layer2_active_modules_index:
                self.layer2[i].requires_grad = False
            else:
                self.layer2[i].reset_parameters()
                self.layer2[i].requires_grad = True
                self.optimizer_params.append({'params' : self.layer2[i].parameters()})
        
            if i in layer3_active_modules_index:
                self.layer3[i].requires_grad = False
            else:
                self.layer3[i].reset_parameters()
                self.layer3[i].requires_grad = True
                self.optimizer_params.append({'params' : self.layer3[i].parameters()})

In [6]:
class BinaryEvolutionTrainer(object):
    def __init__(self, model, optimizer, loss_func, 
                 train_loader, test_loader, convergence_threshold,
                 batch_epochs=50):
        
        self.model = model
        self.loss_func = loss_func
        self.batch_epochs = batch_epochs
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.convergence_threshold = convergence_threshold
        self.optimizer = optimizer
      
    def initialize_pathways():
        layer_configs = list(itertools.combinations_with_replacement(list(range(M)), N))
        layer_configs = np.array(layer_configs)
        indices = np.random.choice(len(layer_configs), (P, L))
        pathways = layer_configs[indices]

        return pathways # Shape: P x L x N
    
    def mutate(self, pathway):
        prob_mutate = 1./ (L * N) # Increase probability of mutation

        # Probability of mutation for every element
        prob = np.random.rand(L, N)

        # Mutations for chosen elements
        permutations = np.random.randint(-2, 2, size=(L, N))
        permutations[prob > prob_mutate] = 0

        # Mutate
        pathway = (pathway + permutations) % M
        
        return pathway
    
    def evaluate(self, pathway, batch_size):
        correct = 0
        
        for x, y in self.test_loader:
            x, y = Variable(x, volatile=True), Variable(y, volatile=True)
            
            output = self.model(x, pathway)
            _, pred = torch.max(output.data, 1)
            
            correct += (pred == y.data).sum()
            
        accuracy = correct * 1.0 / len(self.test_loader) / batch_size

        return accuracy
    
    def train_model(self, pathway):
        for batch_idx, (data, target) in enumerate(self.train_loader):
            # Stop training after 50 batches, evaluate fitness
            if batch_idx >= self.batch_epochs:
                fitness = self.evaluate(pathway, len(target))
                return fitness

            self.optimizer.zero_grad()

            data, target = Variable(data), Variable(target)
            output = self.model(data, pathway)

            loss = self.loss_func(output, target)

            loss.backward()
            self.optimizer.step()
    
    def train(self):
        self.model.train()
        
        fitnesses = []
        best_pathway = None
        best_fitness = -float('inf')
        pathways = initialize_pathways()
        gen = 0
        
        while best_fitness < self.convergence_threshold:
            chosen_pathways = pathways[np.random.choice(P, 2)]
            
            current_fitnesses = []
            
            for pathway in chosen_pathways:
                fitness = self.train_model(pathway)
                
                current_fitnesses.append(fitness)
                
                if fitness > best_fitness:
                    best_fitness = fitness
                    best_pathway = pathway
                
            # All pathways finished evaluating, copy the one with highest fitness
            # to all other ones and mutate
            pathways = np.array([best_pathway] + [self.mutate(deepcopy(best_pathway)) 
                                              for _ in range(P - 1)])
            
            fitnesses.append(max(current_fitnesses))
            
            if gen % 20 == 0:
                print('Generation {} best fitness is {}'.format(gen, best_fitness))
            gen += 1
        
        # Task training is done
        self.model.done_task(best_pathway)
        
        return best_pathway, gen, fitnesses

In [None]:
# Configure high-level pipeline

# Data loader
task = BinaryClassificationTask()
train_loader, test_loader = task.init(batch_size=16, noise_prob=0.5)

# Model initialization
task_layer = nn.Sequential(
                    nn.Linear(20, 2), 
                    nn.Softmax())

pathnet = PathNet(M)
pathnet.initialize_new_task(task_layer)

# Optimizer / loss
optimizer_params = pathnet.get_optimizer_params()

optimizer = optim.SGD(optimizer_params, lr=0.001)
loss_func = nn.CrossEntropyLoss()

Binary classification between 8 and 1


In [None]:
# PathNet Trainer
evol_trainer = BinaryEvolutionTrainer(pathnet, optimizer, loss_func, 
                                      train_loader, test_loader, 
                                      convergence_threshold = 0.98)

best_task_pathway, converge_generation, fitnesses = evol_trainer.train()

Generation 0 best fitness is 0.537405303030303
Generation 20 best fitness is 0.8129734848484849
Generation 40 best fitness is 0.8129734848484849
Generation 60 best fitness is 0.9597537878787878
Generation 80 best fitness is 0.9668560606060606
Generation 100 best fitness is 0.96875
Generation 120 best fitness is 0.9725378787878788
Generation 140 best fitness is 0.9753787878787878
Generation 160 best fitness is 0.9753787878787878
Generation 180 best fitness is 0.9753787878787878
Generation 200 best fitness is 0.9753787878787878
Generation 220 best fitness is 0.9753787878787878
Generation 240 best fitness is 0.9763257575757576
Generation 260 best fitness is 0.978219696969697
Generation 280 best fitness is 0.978219696969697
Generation 300 best fitness is 0.978219696969697
Generation 320 best fitness is 0.978219696969697
Generation 340 best fitness is 0.978219696969697
Generation 360 best fitness is 0.978219696969697
Generation 380 best fitness is 0.9805871212121212
Generation 400 best fitn

In [None]:
print('Model converge at generation', converge_generation)

In [None]:
# 1. Re-write data pipeline + one-hot coding + flatten input  Checked
# 2. Change loss function / output, change to Linear layers   Checked
# 3. Transfer learning, save bestpath, freeze net, re-initialize net
# 4. Train on second task, record transfer learning performance -> time vs accuracy.

In [None]:
new_train_loader, new_test_loader = task.init(batch_size=16, noise_prob=0.5)

# Model initialization
new_task_layer = nn.Sequential(
                    nn.Linear(20, 2), 
                    nn.Softmax())

pathnet.initialize_new_task(new_task_layer)

# Optimizer / loss
optimizer_params = pathnet.get_optimizer_params()
optimizer = optim.SGD(optimizer_params, lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [None]:
new_evol_trainer = BinaryEvolutionTrainer(pathnet, optimizer, loss_func, 
                                          new_train_loader, new_test_loader, 
                                          convergence_threshold = 0.98)
new_best_task_pathway, new_converge_generation, new_fitnesses = new_evol_trainer.train()

In [None]:
print('Model converge at generation', new_converge_generation)

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(list(range(converge_generation)), fitnesses, c='r')
plt.plot(list(range(new_converge_generation)), new_fitnesses, c='b')
plt.show()