<h1> Imports

In [9]:
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch
from torch.autograd import Variable
import random
import torch.nn.functional as F
from torch.autograd import Variable as V

<h1> Data Loading and Generation

This Sine function generator is based on the repostory: https://github.com/AdrienLE/ANIML/blob/master/ANIML.ipynb

In [23]:
#This class and everything within this cell is copied directly from:
#https://github.com/AdrienLE/ANIML/blob/master/ANIML.ipynb

class ModifiableModule(nn.Module):
    def params(self):
        return [p for _, p in self.named_params()]
    
    def named_leaves(self):
        return []
    
    def named_submodules(self):
        return []
    
    def named_params(self):
        subparams = []
        for name, mod in self.named_submodules():
            for subname, param in mod.named_params():
                subparams.append((name + '.' + subname, param))
        return self.named_leaves() + subparams
    
    def set_param(self, name, param):
        if '.' in name:
            n = name.split('.')
            module_name = n[0]
            rest = '.'.join(n[1:])
            for name, mod in self.named_submodules():
                if module_name == name:
                    mod.set_param(rest, param)
                    break
        else:
            setattr(self, name, param)
            
    def copy(self, other, same_var=False):
        for name, param in other.named_params():
            if not same_var:
                param = V(param.data.clone(), requires_grad=True)
            self.set_param(name, param)

class GradLinear(ModifiableModule):
    def __init__(self, *args, **kwargs):
        super().__init__()
        ignore = nn.Linear(*args, **kwargs)
        self.weights = V(ignore.weight.data, requires_grad=True)
        self.bias = V(ignore.bias.data, requires_grad=True)
        
    def forward(self, x):
        return F.linear(x, self.weights, self.bias)
    
    def named_leaves(self):
        return [('weights', self.weights), ('bias', self.bias)]

class SineModel(ModifiableModule):
    def __init__(self):
        super().__init__()
        self.hidden1 = GradLinear(1, 40)
        self.hidden2 = GradLinear(40, 40)
        self.out = GradLinear(40, 1)
        
    def forward(self, x):
        x = F.relu(self.hidden1(x))
        x = F.relu(self.hidden2(x))
        return self.out(x)
    
    def named_submodules(self):
        return [('hidden1', self.hidden1), ('hidden2', self.hidden2), ('out', self.out)]

In [25]:
class SineWaveTask:
    def __init__(self):
        self.a = np.random.uniform(0.1, 5.0)
        self.b = np.random.uniform(0, 2*np.pi)
        self.train_x = None
        
    def f(self, x):
        return self.a * np.sin(x + self.b)
        
    def training_set(self, size=10, force_new=False):
        if self.train_x is None and not force_new:
            self.train_x = np.random.uniform(-5, 5, size)
            x = self.train_x
        elif not force_new:
            x = self.train_x
        else:
            x = np.random.uniform(-5, 5, size)
        y = self.f(x)
        return torch.Tensor(x), torch.Tensor(y)
    
    def test_set(self, size=50):
        x = np.linspace(-5, 5, size)
        y = self.f(x)
        return torch.Tensor(x), torch.Tensor(y)
    
    def plot(self, *args, **kwargs):
        x, y = self.test_set(size=100)
        return plt.plot(x.numpy(), y.numpy(), *args, **kwargs)
    
    def plot_model(self, new_model, *args, **kwargs):
        x, y_true = self.test_set(size=100)
        x = Variable(x[:, None])
        y_true = Variable(y_true[:, None])    

        y_pred = new_model(x)

        plt.plot(x.data.numpy().flatten(),
                 y_pred.data.numpy().flatten(),
                 *args, **kwargs)

TRAIN_SIZE = 20000
TEST_SIZE = 1000
SINE_TRAIN = [SineWaveTask() for _ in range(TRAIN_SIZE)]
SINE_TEST = [SineWaveTask() for _ in range(TEST_SIZE)]

<h1> Neural Network Model

<h1> Helper functions

In [26]:
# The Minimum Square Error is used to evaluate the difference between prediction and ground truth
criterion = nn.MSELoss()

def copy_existing_model(model):
    # Function to copy an existing model
    # We initialize a new model
    new_model = Neural_Network()
    # Copy the previous model's parameters into the new model
    new_model.load_state_dict(model.state_dict())
    return new_model

def get_samples_in_good_format(wave, num_samples=10, force_new=False):
  #This function is used to sample data from a wave
  x, y_true = wave.training_set(size=num_samples, force_new=force_new)
  # We add [:,None] to get the right dimensions to pass to the model: we want K x 1 (we have scalars inputs hence the x 1)
  # Note that we convert everything torch tensors
  x = torch.tensor(x[:,None])
  y_true = torch.tensor(y_true[:,None])
  return x,y_true

def initialization_to_store_meta_losses():
  # This function creates lists to store the meta losses
  global store_train_loss_meta; store_train_loss_meta = []
  global store_test_loss_meta; store_test_loss_meta = []

def test_set_validation(model,new_model,wave,lr_inner,k,store_test_loss_meta):
    # This functions does not actually affect the main algorithm, it is just used to evaluate the new model
    new_model = training(model, wave, lr_inner, k)
    # Obtain the loss
    loss = evaluation(new_model, wave)
    # Store loss
    store_test_loss_meta.append(loss)

def train_set_evaluation(new_model,wave,store_train_loss_meta):
    loss = evaluation(new_model, wave)
    store_train_loss_meta.append(loss) 

def print_losses(epoch,store_train_loss_meta,store_test_loss_meta,printing_step=1000):
  if epoch % printing_step == 0:
    print(f'Epochh : {epoch}, Average Train Meta Loss : {np.mean(store_train_loss_meta)}, Average Test Meta Loss : {np.mean(store_test_loss_meta)}')

#This is based on the paper update rule, we calculate the difference between parameters and then this is used by the optimizer, rather than doing the update by hand
def reptile_parameter_update(model,new_model):
  # Zip models for the loop
  zip_models = zip(model.parameters(), new_model.parameters())
  for parameter, new_parameter in zip_models:
    if parameter.grad is None:
      parameter.grad = torch.tensor(torch.zeros_like(parameter))
    # Here we are adding the gradient that will later be used by the optimizer
    parameter.grad.data.add_(parameter.data - new_parameter.data)

# Define commands in order needed for the metaupdate
# Note that if we change the order it doesn't behave the same
def metaoptimizer_update(metaoptimizer):
  # Take step
  metaoptimizer.step()
  # Reset gradients
  metaoptimizer.zero_grad()

def metaupdate(model,new_model,metaoptimizer):
  # Combine the two previous functions into a single metaupdate function
  # First we calculate the gradients
  reptile_parameter_update(model,new_model)
  # Use those gradients in the optimizer
  metaoptimizer_update(metaoptimizer)

def evaluation(new_model, wave, item = True, num_samples=10, force_new=False):
    # Get data
    x, label = get_samples_in_good_format(wave,num_samples=num_samples, force_new=force_new)
    # Make model prediction
    prediction = new_model(x)
    # Get loss
    if item == True: #Depending on whether we need to return the loss value for storing or for backprop
      loss = criterion(prediction,label).item()
    else:
      loss = criterion(prediction,label)
    return loss

def training(model, wave, lr_k, k):
    # Create new model which we will train on
    new_model = copy_existing_model(model)
    # Define new optimizer
    koptimizer = torch.optim.SGD(new_model.parameters(), lr=lr_k)
    # Update the model multiple times, note that k>1 (do not confuse k with K)
    for i in range(k):
        # Reset optimizer
        koptimizer.zero_grad()
        # Evaluate the model
        loss = evaluation(new_model, wave, item = False)
        # Backpropagate
        loss.backward()
        koptimizer.step()
    return new_model

# Second-Order MAML

In [36]:
#This uses their neural network configuration
# maml training
# note: uses comments and structure largely from code ocariz wrote! 
#alternative code
'''
Handling computation graphs and second-order backprop help and partial inspiration from: 
- https://discuss.pytorch.org/t/how-to-save-computation-graph-of-a-gradient/128286/2 
- https://discuss.pytorch.org/t/when-do-i-use-create-graph-in-autograd-grad/32853/3 
- https://lucainiaoge.github.io/download/PyTorch-create_graph-is-true_Tutorial_and_Example.pdf
- https://www.youtube.com/watch?v=IkDw22a8BDE
- https://discuss.pytorch.org/t/how-to-manually-update-network-parameters-while-keeping-track-of-its-computational-graph/131642/2
- https://discuss.pytorch.org/t/how-to-calculate-2nd-derivative-of-a-likelihood-function/15085/3
- https://pytorch.org/tutorials/recipes/recipes/zeroing_out_gradients.html

Note, different ways to refer to the task-specific vs. meta/aggregate updates to the parameters
Sometimes called "inner" and "outer" loop, respectively
Here, refered to as "task_specific" and "agg"/meta" (the latter, for consistency w/ ocariz code)
'''


T = 5 # num tasks
N = 1 # number of inner loop steps (notation from: https://www.bayeswatch.com/2018/11/30/HTYM/)
num_samples = 10 # number of samples to draw from the task
lr_task_specific = 0.01 # task specific learning rate
lr_meta = 0.001 # meta-update learning rate
waves = random.sample(SINE_TRAIN, T)
num_epochs = 10
printing_step = 2 # show log of loss every x epochs
# num_epochs = int(1e5) 
# printing_step = 1000 # show log of loss every x epochs

# Initializations
initialization_to_store_meta_losses()


#Instantiate the other model, from the prior class
model = SineModel()

#Use the different syntax of model.params()
meta_optimizer = torch.optim.Adam(model.params(), lr = lr_meta)


for epoch in range(num_epochs):
        
    # store loss over all tasks to then do a large meta-level update of initial params
    # idea/help from video: https://www.youtube.com/watch?v=IkDw22a8BDE
    meta_loss = None 
    

    # loop over tasks and fine-tune weights per task
    for i, T_i in enumerate(waves): 
        # copy model to use the same starting weights
        
        #Use the different copying function capacity
        new_model = SineModel()
        new_model.copy(model, same_var=True)
        
        
        # note, b/c of manual gradient updates, need to zero out the gradients at the start
        # https://pytorch.org/tutorials/recipes/recipes/zeroing_out_gradients.html
        new_model.zero_grad()

        # use model to predict on task-specific training set
        task_specific_loss = evaluation(new_model, T_i, item = False, num_samples=num_samples, force_new=True)
        # save the computation graph for later backprop
        # help from: https://discuss.pytorch.org/t/how-to-save-computation-graph-of-a-gradient/128286/2
        gradient_info = torch.autograd.grad(task_specific_loss, new_model.parameters(), 
                                        create_graph=True)
        # now, need to extract gradients for each param and get a new graph 
        # help from: https://discuss.pytorch.org/t/how-to-calculate-2nd-derivative-of-a-likelihood-function/15085/3
        # and: https://www.youtube.com/watch?v=IkDw22a8BDE
        model_param_data = new_model.state_dict()
        # note: order of computation is preserved and state_dict = ordered, so okay to loop
        # https://github.com/HIPS/autograd/blob/master/docs/tutorial.md
        for computation_idx, (param_name, param_obj) in enumerate(new_model.named_parameters()):
            task_specific_grad = gradient_info[computation_idx]
            model_param_data[param_name] = param_obj - lr_task_specific * task_specific_grad # manual update

        # load the updated task-specific params (w/ gradient info!) into the model 
        new_model.load_state_dict(model_param_data)
            
        # use new model to predict 
        # note: we want a new sample from T_i
        # e.g., sample new draws from task, feed forward (e.g., get preds), compute loss, sum loss to meta_loss for later gradient use
        held_out_task_specific_loss = evaluation(new_model, T_i, item = False, num_samples=num_samples, force_new=True)
        # save meta-loss per task help from: 
        # https://www.youtube.com/watch?v=IkDw22a8BDE
        # https://lucainiaoge.github.io/download/PyTorch-create_graph-is-true_Tutorial_and_Example.pdf
        if meta_loss is None: 
            meta_loss = held_out_task_specific_loss # b/c we can't add a tensor to None obj
        else: meta_loss += held_out_task_specific_loss
        

    # backpropogate thru all tasks
    # use adam optimizer here!! 
    meta_loss /= T # b/c we want the mean -- divide by the number of tasks (T)
    meta_loss.backward() 
    metaoptimizer_update(meta_optimizer)
    
    print(model.params()[0][5:10])
    
    #print("Check updated? ", model._modules['hidden1']._parameters["weight"][:1])



tensor([[-0.8151],
        [ 0.9552],
        [-0.2384],
        [-0.1510],
        [ 0.0550]], grad_fn=<SliceBackward>)
tensor([[-0.8150],
        [ 0.9560],
        [-0.2375],
        [-0.1501],
        [ 0.0555]], grad_fn=<SliceBackward>)
tensor([[-0.8148],
        [ 0.9569],
        [-0.2371],
        [-0.1491],
        [ 0.0555]], grad_fn=<SliceBackward>)
tensor([[-0.8149],
        [ 0.9578],
        [-0.2366],
        [-0.1481],
        [ 0.0552]], grad_fn=<SliceBackward>)
tensor([[-0.8154],
        [ 0.9586],
        [-0.2360],
        [-0.1472],
        [ 0.0548]], grad_fn=<SliceBackward>)
tensor([[-0.8159],
        [ 0.9592],
        [-0.2353],
        [-0.1464],
        [ 0.0544]], grad_fn=<SliceBackward>)
tensor([[-0.8162],
        [ 0.9599],
        [-0.2348],
        [-0.1456],
        [ 0.0540]], grad_fn=<SliceBackward>)
tensor([[-0.8163],
        [ 0.9606],
        [-0.2344],
        [-0.1448],
        [ 0.0538]], grad_fn=<SliceBackward>)
tensor([[-0.8166],
        [ 0.9

  x = torch.tensor(x[:,None])
  y_true = torch.tensor(y_true[:,None])


<h1> Reptile

<h1> Few Shot learning with new meta-model

The model performs good few shot learning