In [2]:
pip install torchmeta

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
!nvidia-smi

Fri Apr 17 01:12:36 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.00    Driver Version: 418.87.00    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:1A:00.0 Off |                  N/A |
| 28%   37C    P2    55W / 250W |    885MiB / 10989MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:1B:00.0 Off |                  N/A |
| 39%   70C    P2   257W / 250W |   5513MiB / 10989MiB |     98%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:3D:00.0 Off |                  N/A |
| 36%   

In [4]:
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F 
import numpy as np
from torchmeta.modules import (MetaModule, MetaConv2d, MetaBatchNorm2d,
                               MetaSequential, MetaLinear)
from torchmeta.modules.utils import get_subdict
from collections import OrderedDict
import gc

In [5]:
torch.cuda.set_device(3)

**SWRN Meta model**

In [6]:
class TemplateBank(nn.Module):
  def __init__(self,num_templates,input_channels,output_channels, kernel_size):
    super(TemplateBank,self).__init__()
    self.coefficients_shape=(num_templates,1,1,1,1)

    #the templates are convolutions windows, we n_templates of the same size in the bank
    templates=[torch.Tensor(input_channels,output_channels,kernel_size,kernel_size) for i in range(num_templates)]
    
    #stack the tensors, same form but now usable for pytorch
    self.templates=nn.Parameter(torch.stack(templates))
  def forward(self,coefficients):
    #print("Linear combination of the templates",(self.templates*coefficients).sum(0))
    #linear combination
    return (self.templates*coefficients).sum(0)

In [7]:
class SConv2d(MetaModule):
  def __init__(self,bank,stride=1,padding=1):
    super(SConv2d,self).__init__()
    self.stride , self.padding, self.bank= stride, padding, bank
    #soft parameter in front of the templates, determine by the shape of the bank
    self.coefficients=nn.Parameter(torch.zeros(bank.coefficients_shape))
  def forward(self,input,params=None):
    # these are the convolution parameters, we multiplied the linear coef to the templates
    #it's one tensor, create by the forward method of bank
    coeffs=OrderedDict(params)["coefficients"]
    
    parameters=self.bank(coeffs)
    #Performs a normal convolutions with the linear combination of the templates
    return F.conv2d(input,parameters,stride=self.stride,padding=self.padding)
 

In [8]:
class Block(MetaModule):
  def __init__(self,input_channels,output_channels,stride,bank=None):
    super(Block,self).__init__()
    self.bank=bank
    
    self.bn1=MetaBatchNorm2d(input_channels)
    if self.bank:
      #if we give a template bank to the bloc, the conv layers are the soft ones
      self.conv1=SConv2d(self.bank)
    else:
      self.conv1=MetaConv2d(input_channels,output_channels,kernel_size=3,stride=stride,padding=1,bias=False)
    
    self.bn2=MetaBatchNorm2d(output_channels)
    if self.bank:
      self.conv2=SConv2d(self.bank)
    else:
      #this conv does not increase number of channels
      self.conv2=MetaConv2d(output_channels,output_channels,kernel_size=3,stride=1,padding=1,bias=False)

    self.equalInOut=(input_channels==output_channels)
    self.convShortcut = (not self.equalInOut) and nn.Conv2d(input_channels, output_channels, kernel_size=1, stride=stride, padding=0, bias=False) or None
    #explain these two lines!

    self.relu=nn.ReLU(inplace=True)
  def forward(self,x,params=None):
    #print(x,"input, block")
    res=x
    out=self.relu(self.bn1(x,params=get_subdict(params,"bn1")))
    #print(out," after bn1, block")
    if not self.equalInOut:
      res=out
    out=self.conv1(out,params=get_subdict(params,"conv1"))
    #print(out,"after Sconv1, bn1")
    out=self.bn2(out,params=get_subdict(params,"bn2"))

    #print(out, "after Sconv1, bn1")
    out=self.conv2(self.relu(out),params=get_subdict(params,"conv2"))
    #print(out,"after sconv2")
    #explain next line!
    if self.convShortcut is not None: res = self.convShortcut(res)
    return out+res

In [9]:
class SWRN(MetaModule):
  def __init__(self,depth,width,num_templates,num_classes):
    super(SWRN,self).__init__()

    #successions of channels for each stage 0->1->2->3
    n_channels=[16,16*width,32*width,64*width]
    num_blocks=(depth-4)/6 #why this, find out]
    print(num_blocks)
    layers_per_bank=2*(num_blocks-1) # why is this
    print ('SWRN : Depth : {} , Widen Factor : {}, Templates per Group : {}'.format(depth, width, num_templates))

    self.num_classes=num_classes #for predictions output
    self.num_templates=num_templates #num of templates per group

    #layers
    #first argument is 3 for mini_image net (since input has 3 hannels, but omniglot it's 1)
    self.conv_3x3 = MetaConv2d(3, n_channels[0], kernel_size=3, stride=1, padding=1, bias=False)

    self.bank_1 = TemplateBank(self.num_templates, n_channels[1], n_channels[1], 3)
    self.stage_1 = self._make_layer(n_channels[0], n_channels[1], num_blocks, self.bank_1, 1)

    self.bank_2 = TemplateBank(self.num_templates, n_channels[2], n_channels[2], 3)
    self.stage_2 = self._make_layer(n_channels[1], n_channels[2], num_blocks, self.bank_2, 2)

    self.bank_3 = TemplateBank(self.num_templates, n_channels[3], n_channels[3], 3)
    self.stage_3 = self._make_layer(n_channels[2], n_channels[3], num_blocks, self.bank_3, 2)

    self.lastact = MetaSequential(MetaBatchNorm2d(n_channels[3]), nn.ReLU(inplace=True))
    self.avgpool = nn.AvgPool2d(21) #the kernel size of the average pooling is the size of that last layer
    #for mini-imagine net 21 does the job, for omniglot, 7
    self.classifier = MetaLinear(n_channels[3], num_classes)


#initialisations
    for i in range(1,4):
      coefficient_inits = torch.zeros((int(layers_per_bank),int(num_templates),1,1,1,1))
      nn.init.orthogonal_(coefficient_inits) # very important
      sconv_group=[]
      for name, module in self.named_modules():
            if isinstance(module,SConv2d) and "stage_%s"%i in name:
                sconv_group.append((name,module))
      for j,(name,module) in enumerate(sconv_group):
        module.coefficients.data=coefficient_inits[j]
            
    
    
    for m in self.modules():
      if isinstance(m, nn.Conv2d):
        init.kaiming_normal_(m.weight)
      elif isinstance(m, nn.BatchNorm2d):
        m.weight.data.fill_(1)
        m.bias.data.zero_()
      elif isinstance(m, nn.Linear):
        init.kaiming_normal_(m.weight)
        m.bias.data.zero_()
  def _make_layer(self,input_channels,output_channels,num_blocks,bank,stride=1):
    #creates num_blocks , first one is input_channels-->output_channels, the rest keep the same nb of channels
    #the blocks are defined previously
    blocks=[]
    blocks.append(Block(input_channels, output_channels, stride))
    dict_seq=OrderedDict()
    for i in range(1, int(num_blocks)): 
      blocks.append(Block(output_channels, output_channels, 1, bank))
    for i, block in enumerate(blocks):
      dict_seq[str(i)]=block
    #is this right? should the name just be i or everyhting that comes after?
    return MetaSequential(dict_seq) #creates a sequential of all the blocks in successions, that's one stage
  
  def forward(self,x,params=None):
    #print(x, "before conv_3x3, swrn")
    x = self.conv_3x3(x,params=get_subdict(params,"conv_3x3"))

    #print("after conv_3x3,swrn")
    #each stage is num_blocks blocks
    x = self.stage_1(x,params=get_subdict(params,"stage_1"))
    #print(x, "after stage_1, swrn")
    x = self.stage_2(x,params=get_subdict(params,"stage_2"))
    x = self.stage_3(x,params=get_subdict(params,"stage_3"))

    x = self.lastact(x,params=get_subdict(params,"lastact"))
    x = self.avgpool(x)
    x = x.view(x.size(0), -1) 
    #reshaping so that we have the appriopriate num of columns
    return self.classifier(x,params=get_subdict(params,"classifier"))


In [10]:
#quickly create a swrn model
def swrn(depth,width,num_templates,num_classes=10):
  model=SWRN(depth, width, num_templates, num_classes)
  return model

**Default Hyperparameters and variables**

In [11]:
#model
depth=28
width=3
bank_size=20
#optimization
learning_rate=0.001
momentum=0.9
schedule=[60,120,160]
gammas=[0.2,0.2,0.2]
#regularization
decay=0.0005
#acceleration
workers=2

**Training the model**

# **Utils**

In [12]:
def get_coefficients_params(model_named_parameters):
  coeffs_idx=[]
  i=0
  for name,_ in model_named_parameters:
    if "coefficients" in name:
      coeffs_idx.append(i)
    i+=1
  return coeffs_idx


# **Adapted MAML**

In [21]:
class MAML(object):
  def __init__(self,model,optimizer=None,step_size=0.1,loss_function=F.cross_entropy):
    #metamodel, the neural net for the tasks
    self.model=model

    self.optimizer=optimizer
    #the step size could be meta-learnable, but for now we put it fixed
    self.step_size=torch.tensor(step_size,dtype=torch.float32,requires_grad=False)
    self.loss_function=loss_function
    self.params_dict=OrderedDict(self.model.named_parameters())
  def accuracy(self,logits,targets):
    with torch.no_grad():
      _,predictions=torch.max(logits,dim=1)
      accuracy=torch.mean(predictions.eq(targets).float())

    return accuracy.item()

  def step(self,batch):
    outer_loss=0
    #average of accuracy accross tasks for query set
    outer_accuracy=0
    counter=0
    for task_id,task in enumerate(zip(*batch["train"],*batch["test"])):
      if counter>5:
            break
      counter+=1
      # the zip is now a array of 25 (one for each task) with 4 columns
      # train_inputs, train_target, test_inputs, test_target
      #each task in this zip is a batch for a specific task
      train_inputs,train_targets=task[0].cuda(),task[1].cuda() #support set
      test_inputs,test_targets=task[2].cuda(),task[3].cuda() #querry set
      
      #don't forget to pass it named_parameters, and shouldn't be an iterator
      train_logits=self.model(train_inputs,self.params_dict)#OrderedDict(self.model.named_parameters())
      inner_loss=self.loss_function(train_logits,train_targets)
      self.model.zero_grad()
      #the model will have parameters called meta_params
      grads=torch.autograd.grad(inner_loss,self.model.parameters())
      
      #Updating the parameters for that tast
      #this becomes a for loop if we do many training steps inside, default is 1
      params=OrderedDict()
      i=0
      '''for (name,param), grad in zip(self.model.named_parameters(),grads):
        #if name in ...:
        #find better way to do this
        if "coefficients" in name:
          params[name]=param-self.step_size*grad
        else:
          params[name]=param'''
    
      for (name,param), grad in zip(self.model.named_parameters(),grads):
            params[name]=param-self.step_size*grad
            
      
      
      #this step in the paper is outside the inner loop, we evaluate on query set
      #the query set of that task, using the newly learned params (theta i), and updtate the real theta with it
      #we can caluclate the loss for each i during each step, so we don't have to remember the theta i
      #assign theta i (params) to the model temporarly to evaluate
      test_logit=self.model(test_inputs,params=params)

      #do we really take the average of accuracy for each task in the batch?
      
      #!!!! We could add a dictionary to collect the task loss for a specific id.

      outer_loss+=self.loss_function(test_logit,test_targets)
      outer_accuracy+=self.accuracy(test_logit,test_targets)
    
    outer_accuracy=float(outer_accuracy)/counter  #float(len(batch["train"][0])) #len of a torch tensor?
    #computes gradient
    outer_loss.backward()
    #the optimizer should already be "loaded" with the model's params
    self.optimizer.step()

    return outer_loss.detach(),outer_accuracy

  def train(self,dataloader,max_batches=500):
    num_batches=0
    for batch in dataloader:
      if num_batches>=max_batches:
        break
      l,a=self.step(batch)
      print(l,a)
      num_batches+=1
  def step_evaluate(self,batch):
    outer_loss=0
    #average of accuracy accross tasks for query set
    outer_accuracy=0
    counter=0
    for task_id,task in enumerate(zip(*batch["train"],*batch["test"])):
      if counter>5:
            break
      counter+=1
      # the zip is now a array of 25 (one for each task) with 4 columns
      # train_inputs, train_target, test_inputs, test_target
      #each task in this zip is a batch for a specific task
      train_inputs,train_targets=task[0].cuda(),task[1].cuda() #support set
      test_inputs,test_targets=task[2].cuda(),task[3].cuda() #querry set
      
      #don't forget to pass it named_parameters, and shouldn't be an iterator
      train_logits=self.model(train_inputs,self.params_dict)#OrderedDict(self.model.named_parameters())
      inner_loss=self.loss_function(train_logits,train_targets)
      self.model.zero_grad()
      #the model will have parameters called meta_params
      grads=torch.autograd.grad(inner_loss,self.model.parameters())
      
      #Updating the parameters for that tast
      #this becomes a for loop if we do many training steps inside, default is 1
      params=OrderedDict()
      i=0
      '''for (name,param), grad in zip(self.model.named_parameters(),grads):
        #if name in ...:
        #find better way to do this
        if "coefficients" in name:
          params[name]=param-self.step_size*grad
        else:
          params[name]=param'''
    
      for (name,param), grad in zip(self.model.named_parameters(),grads):
            params[name]=param-self.step_size*grad
            
      
      
      #this step in the paper is outside the inner loop, we evaluate on query set
      #the query set of that task, using the newly learned params (theta i), and updtate the real theta with it
      #we can caluclate the loss for each i during each step, so we don't have to remember the theta i
      #assign theta i (params) to the model temporarly to evaluate
      test_logit=self.model(test_inputs,params=params)

      #do we really take the average of accuracy for each task in the batch?
      
      #!!!! We could add a dictionary to collect the task loss for a specific id.

      outer_loss+=self.loss_function(test_logit,test_targets)
      outer_accuracy+=self.accuracy(test_logit,test_targets)
    
    outer_accuracy=float(outer_accuracy)/counter  #float(len(batch["train"][0])) #len of a torch tensor?


    return outer_loss.detach(),outer_accuracy

  def evaluate(self,dataloader,max_batches=500):
    mean_outer_loss,mean_accuracy,count= 0., 0., 0

    for batch in dataloader:
      if num_batches>=max_batches:
        break
      outer_loss,outer_accuracy=self.step_evaluate(batch)
      print(outer_loss,outer_accuracy)

      mean_outer_loss+=outer_loss
      mean_accuracy+=outer_accuracy
      count+=1
    
    return float(mean_outer_loss)/float(count) , float(mean_accuracy)/float(count)

In [22]:
def conv_block(in_channels, out_channels, **kwargs):
  return MetaSequential(OrderedDict([
      ('conv', MetaConv2d(in_channels, out_channels, **kwargs)),
      ('norm', nn.BatchNorm2d(out_channels, momentum=1.,
          track_running_stats=False)),
      ('relu', nn.ReLU()),
      ('pool', nn.MaxPool2d(2))
  ]))


In [23]:
class MetaConvModel(MetaModule):
  def __init__(self,in_channels,out_features,hidden_size=64,feature_size=64):
    super(MetaConvModel,self).__init__()
    self.in_channels=in_channels
    self.out_features=out_features
    self.hidden_size=hidden_size
    self.feature_size=feature_size

    self.features = MetaSequential(OrderedDict([                                         
    ('layer1', conv_block(in_channels, hidden_size, kernel_size=3,
                          stride=1, padding=1, bias=True)),
    ('layer2', conv_block(hidden_size, hidden_size, kernel_size=3,
                          stride=1, padding=1, bias=True)),
    ('layer3', conv_block(hidden_size, hidden_size, kernel_size=3,
                          stride=1, padding=1, bias=True)),
    ('layer4', conv_block(hidden_size, hidden_size, kernel_size=4,
                          stride=1, padding=1, bias=True))
    ]))
    self.classifier = MetaLinear(feature_size, out_features, bias=True)
  def forward(self, inputs, params=None):
    features = self.features(inputs, params=get_subdict(params, 'features'))
    features = features.view((features.size(0), -1))
    logits = self.classifier(features, params=get_subdict(params, 'classifier'))
    return logits
    

In [24]:
def adjust_learning_rate(optimizer, epoch, gammas, schedule, loss):
  lr = args.learning_rate
  assert len(gammas) == len(schedule), "length of gammas and schedule should be equal"
  for (gamma, step) in zip(gammas, schedule):
    if (epoch >= step): lr = lr * gamma
    else: break
  for param_group in optimizer.param_groups: param_group['lr'] = lr
  return lr
def group_weight_decay(net, weight_decay, skip_list=()):
  decay, no_decay = [], []
  for name, param in net.named_parameters():
    if not param.requires_grad: continue
    if sum([pattern in name for pattern in skip_list]) > 0: no_decay.append(param)
    else: decay.append(param)
  return [{'params': no_decay, 'weight_decay': 0.}, {'params': decay, 'weight_decay': weight_decay}]

def accuracy(output, target, topk=(1,)):
  if len(target.shape) > 1: return torch.tensor(1), torch.tensor(1)
  
  with torch.no_grad():
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
      correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
      res.append(correct_k.mul_(100.0 / batch_size))
  return res

# **Getting the data**

In [17]:
from torchmeta.datasets import Omniglot, MiniImagenet
from torchmeta.transforms import ClassSplitter, Categorical, Rotation
from torchvision.transforms import ToTensor, Resize, Compose
from torchmeta.utils.data import BatchMetaDataLoader

In [18]:
#-------------HyperParameters----------------------
num_shots=5
num_ways=5
num_shots_test=5
batch_size=128
num_workers=1

In [19]:

dataset_transform = ClassSplitter(shuffle=True,
                                      num_train_per_class=num_shots,
                                      num_test_per_class=num_shots_test)
transform = Compose([Resize(84), ToTensor()])

meta_train_dataset = MiniImagenet("data",
                                  transform=transform,
                                  target_transform=Categorical(num_ways),
                                  num_classes_per_task=num_ways,
                                  meta_train=True,
                                  dataset_transform=dataset_transform,
                                  download=True)
meta_val_dataset = MiniImagenet("data",
                                transform=transform,
                                target_transform=Categorical(num_ways),
                                num_classes_per_task=num_ways,
                                meta_val=True,
                                dataset_transform=dataset_transform)
meta_test_dataset = MiniImagenet("data",
                                  transform=transform,
                                  target_transform=Categorical(num_ways),
                                  num_classes_per_task=num_ways,
                                  meta_test=True,
                                  dataset_transform=dataset_transform)


In [None]:
dataset_transform = ClassSplitter(shuffle=True,
                                      num_train_per_class=num_shots,
                                      num_test_per_class=num_shots_test)
class_augmentations = [Rotation([90, 180, 270])]
transform = Compose([Resize(28), ToTensor()])

meta_train_dataset = Omniglot("data",
                              transform=transform,
                              target_transform=Categorical(num_ways),
                              num_classes_per_task=num_ways,
                              meta_train=True,
                              class_augmentations=class_augmentations,
                              dataset_transform=dataset_transform,
                              download=True)
meta_val_dataset = Omniglot("data",
                            transform=transform,
                            target_transform=Categorical(num_ways),
                            num_classes_per_task=num_ways,
                            meta_val=True,
                            class_augmentations=class_augmentations,
                            dataset_transform=dataset_transform)
meta_test_dataset = Omniglot("data",
                             transform=transform,
                             target_transform=Categorical(num_ways),
                             num_classes_per_task=num_ways,
                             meta_test=True,
                             dataset_transform=dataset_transform)

In [29]:
meta_train_dataloader = BatchMetaDataLoader(meta_train_dataset,
                                                batch_size=batch_size,
                                                shuffle=True,
                                                num_workers=num_workers,
                                                pin_memory=True)

In [30]:
meta_test_dataloader=BatchMetaDataLoader(meta_test_dataset,
                                                batch_size=batch_size,
                                                shuffle=True,
                                                num_workers=num_workers,
                                                pin_memory=True)

In [31]:
torch.cuda.empty_cache()
gc.collect()

52

In [32]:
#exploring dataset
#train is the support set, test is the querry set
num_batches=0
for batch in meta_train_dataloader:
  if num_batches>=1:
    break
  num_batches+=1
  print(batch.keys())
  print(len(batch["train"]))
  print(len(batch["test"]))
  print(len(batch["train"][0]))
  print(len(batch["train"][0][0]))
  b=batch

  # one data point
  print(len(batch["train"][0][0][0]))
  #-------------
  
  print(len(batch["train"][0][0][0][0]))

  # batch["train"] contains (inputs,targets)
  # for batch["train"][0], it's a batch of 25 tasks, each containing a batch 
  #of data for the specific tast

dict_keys(['train', 'test'])
2
2
128
25
3
84


In [33]:
out_features=5
hidden_size=64
loss_function=torch.nn.CrossEntropyLoss().cuda()

del ModelConvMiniImagenet
torch.cuda.empty_cache()
gc.collect()

In [34]:
ModelConvMiniImagenet=swrn(depth,width,bank_size,out_features).cuda()
params = group_weight_decay(ModelConvMiniImagenet, decay, ['coefficients'])
optimizer=torch.optim.SGD(params, learning_rate, momentum=momentum, nesterov=(momentum > 0.0))
#optimizer=torch.optim.Adam(ModelConvMiniImagenet.parameters(), lr=0.001)

4.0
SWRN : Depth : 28 , Widen Factor : 3, Templates per Group : 20


What's left to do: 
- Figure out to make the net work when we don't specify params
- GPU Memory problem
- Make the avgpooling flexible so that it works for any task

In [35]:
p=OrderedDict(ModelConvMiniImagenet.named_parameters())
ModelConvMiniImagenet(b["train"][0][0].cuda(),p)

tensor([[-0.1018, -0.1010, -0.5224, -1.7084,  0.7644],
        [ 0.1508,  0.4815, -0.3205, -1.0312,  0.7340],
        [ 0.3114,  0.4095, -0.5738, -1.0861,  0.9009],
        [ 0.3265,  0.1411, -0.5086, -1.2018,  0.6469],
        [ 0.4937,  0.3533, -0.5104, -1.0988,  0.9076],
        [ 0.1622, -0.0220, -0.1499, -1.0974,  0.3214],
        [ 0.1444,  0.1908, -0.4073, -1.4917,  0.4803],
        [-0.0946,  0.5163, -0.3015, -0.5546,  0.2103],
        [ 0.3601,  0.0633, -0.0780, -1.2198,  0.6480],
        [ 0.3349,  0.2607, -0.1426, -1.1831,  0.5682],
        [ 0.4380,  0.6787, -0.5571, -1.0004,  0.6141],
        [ 0.0660,  0.0660, -0.1931, -1.1072,  1.0183],
        [ 0.4985,  0.3467, -0.3223, -1.0424,  0.9803],
        [ 0.2281,  0.8066,  0.0125, -1.4366,  1.1253],
        [ 0.5554,  0.5430,  0.5009, -1.4849,  0.9509],
        [-0.1531, -0.0030, -0.1139, -1.2432,  0.6067],
        [ 0.0454,  0.6807, -0.2266, -0.7038,  0.2354],
        [ 0.0964,  0.1785,  0.1232, -0.9070,  0.7796],
        [ 

In [53]:
metalearner=MAML(ModelConvMiniImagenet,optimizer,loss_function=loss_function)

In [54]:
epochs=50
for epoch in range(epochs):
  metalearner.train(meta_train_dataloader,100)

tensor(8.5626, device='cuda:7') 0.36666665474573773
tensor(9.2016, device='cuda:7') 0.3399999936421712
tensor(9.3776, device='cuda:7') 0.2733333234985669
tensor(9.1047, device='cuda:7') 0.3199999928474426
tensor(9.3582, device='cuda:7') 0.28666666398445767
tensor(9.2897, device='cuda:7') 0.3266666556398074
tensor(9.1872, device='cuda:7') 0.30666665236155194
tensor(8.9173, device='cuda:7') 0.37999999274810153
tensor(9.3240, device='cuda:7') 0.2799999912579854
tensor(8.8912, device='cuda:7') 0.30666665732860565
tensor(8.8623, device='cuda:7') 0.36666665474573773
tensor(8.9970, device='cuda:7') 0.3399999874333541
tensor(9.1316, device='cuda:7') 0.35333332419395447
tensor(9.3701, device='cuda:7') 0.2666666582226753
tensor(9.2653, device='cuda:7') 0.3333333233992259
tensor(9.0613, device='cuda:7') 0.3999999910593033
tensor(8.6044, device='cuda:7') 0.3666666621963183
tensor(8.7041, device='cuda:7') 0.41999998937050503
tensor(8.7803, device='cuda:7') 0.43999998768170673
tensor(9.3880, device=

In [55]:
          state={
  'epoch': epoch + 1,
  'state_dict': ModelConvMiniImagenet.state_dict(),
  'optimizer' : optimizer.state_dict(),
}
filename="model1_MiniImageNet_50epochs.pth.tar"
torch.save(state,filename)

In [56]:
epochs=50
for epoch in range(epochs):
  metalearner.train(meta_train_dataloader,100)

tensor(5.7172, device='cuda:7') 0.6666666467984518
tensor(6.6951, device='cuda:7') 0.5799999882777532
tensor(4.4539, device='cuda:7') 0.7466666400432587
tensor(4.8514, device='cuda:7') 0.6866666376590729
tensor(5.7232, device='cuda:7') 0.6599999815225601
tensor(5.8615, device='cuda:7') 0.6466666460037231
tensor(6.0363, device='cuda:7') 0.6133333146572113
tensor(5.7603, device='cuda:7') 0.6466666360696157
tensor(5.2790, device='cuda:7') 0.6733333071072897
tensor(6.4976, device='cuda:7') 0.5533333222071329
tensor(5.4717, device='cuda:7') 0.6533333112796148
tensor(4.8841, device='cuda:7') 0.6933333079020182
tensor(6.7071, device='cuda:7') 0.626666655143102
tensor(4.7951, device='cuda:7') 0.6733333021402359
tensor(6.0699, device='cuda:7') 0.619999979933103
tensor(5.3794, device='cuda:7') 0.6599999964237213
tensor(5.4084, device='cuda:7') 0.653333306312561
tensor(5.9171, device='cuda:7') 0.6066666593154272
tensor(5.6809, device='cuda:7') 0.6399999856948853
tensor(5.6691, device='cuda:7') 0.

In [57]:
          state={
  'epoch': epoch + 1,
  'state_dict': ModelConvMiniImagenet.state_dict(),
  'optimizer' : optimizer.state_dict(),
}
filename="model1_MiniImageNet_100epochs.pth.tar"
torch.save(state,filename)

In [58]:
epochs=100
for epoch in range(epochs):
  metalearner.train(meta_train_dataloader,100)

tensor(4.6974, device='cuda:7') 0.693333312869072
tensor(4.7345, device='cuda:7') 0.7133333086967468
tensor(4.2378, device='cuda:7') 0.7666666507720947
tensor(5.4049, device='cuda:7') 0.6799999674161276
tensor(4.8172, device='cuda:7') 0.6799999872843424
tensor(4.5697, device='cuda:7') 0.7133333086967468
tensor(5.2404, device='cuda:7') 0.699999988079071
tensor(3.9298, device='cuda:7') 0.7266666491826376
tensor(4.5104, device='cuda:7') 0.7199999789396921
tensor(4.3984, device='cuda:7') 0.7399999797344208
tensor(3.7048, device='cuda:7') 0.7866666316986084
tensor(4.9396, device='cuda:7') 0.6866666475931803
tensor(4.0075, device='cuda:7') 0.7933333218097687
tensor(5.1301, device='cuda:7') 0.6733333170413971
tensor(6.7276, device='cuda:7') 0.606666644414266
tensor(4.0672, device='cuda:7') 0.7399999797344208
tensor(4.6937, device='cuda:7') 0.699999988079071
tensor(4.9451, device='cuda:7') 0.7133333186308543
tensor(5.3588, device='cuda:7') 0.6933333029349645
tensor(3.7948, device='cuda:7') 0.7

In [59]:
          state={
  'epoch': epoch + 1,
  'state_dict': ModelConvMiniImagenet.state_dict(),
  'optimizer' : optimizer.state_dict(),
}
filename="model1_MiniImageNet_200epochs.pth.tar"
torch.save(state,filename)

In [60]:
epochs=100
for epoch in range(epochs):
  metalearner.train(meta_train_dataloader,100)

tensor(4.2840, device='cuda:7') 0.753333310286204
tensor(3.0059, device='cuda:7') 0.8733333150545756
tensor(3.7212, device='cuda:7') 0.7466666599114736
tensor(4.4153, device='cuda:7') 0.6999999781449636
tensor(4.8687, device='cuda:7') 0.6599999765555064
tensor(4.1044, device='cuda:7') 0.8066666424274445
tensor(3.8089, device='cuda:7') 0.7599999805291494
tensor(4.4312, device='cuda:7') 0.7799999713897705
tensor(3.6197, device='cuda:7') 0.7999999821186066
tensor(4.1208, device='cuda:7') 0.7266666491826376
tensor(4.6558, device='cuda:7') 0.77333332101504
tensor(3.5986, device='cuda:7') 0.7866666316986084
tensor(3.9380, device='cuda:7') 0.7666666507720947
tensor(3.5600, device='cuda:7') 0.7866666416327158
tensor(4.5166, device='cuda:7') 0.72666663924853
tensor(3.7924, device='cuda:7') 0.7199999690055847
tensor(3.8577, device='cuda:7') 0.7533333202203115
tensor(3.4873, device='cuda:7') 0.7999999821186066
tensor(4.1978, device='cuda:7') 0.7666666507720947
tensor(3.9155, device='cuda:7') 0.78

In [61]:
          state={
  'epoch': epoch + 1,
  'state_dict': ModelConvMiniImagenet.state_dict(),
  'optimizer' : optimizer.state_dict(),
}
filename="model1_MiniImageNet_300epochs.pth.tar"
torch.save(state,filename)

In [None]:
epochs=200
for epoch in range(epochs):
  metalearner.train(meta_train_dataloader,100)

tensor(3.2334, device='cuda:7') 0.8399999737739563
tensor(3.9488, device='cuda:7') 0.7933333118756613
tensor(4.5012, device='cuda:7') 0.7066666533549627
tensor(4.4929, device='cuda:7') 0.7399999797344208
tensor(2.9593, device='cuda:7') 0.8133333027362823
tensor(3.4234, device='cuda:7') 0.833333303531011
tensor(3.2281, device='cuda:7') 0.7799999813238779
tensor(4.4197, device='cuda:7') 0.6866666426261266
tensor(5.1194, device='cuda:7') 0.6866666475931803
tensor(3.6001, device='cuda:7') 0.7666666507720947
tensor(3.6191, device='cuda:7') 0.8133333027362823
tensor(3.8176, device='cuda:7') 0.753333310286204
tensor(3.5479, device='cuda:7') 0.7799999614556631
tensor(3.1381, device='cuda:7') 0.8133333027362823
tensor(2.6888, device='cuda:7') 0.8533333043257395
tensor(3.3668, device='cuda:7') 0.7933333019415537
tensor(3.0419, device='cuda:7') 0.8466666539510092
tensor(2.7460, device='cuda:7') 0.8533333241939545
tensor(2.3012, device='cuda:7') 0.879999985297521
tensor(1.7721, device='cuda:7') 0.

In [1]:
          state={
  'epoch': epoch + 1,
  'state_dict': ModelConvMiniImagenet.state_dict(),
  'optimizer' : optimizer.state_dict(),
}
filename="model1_MiniImageNet_500epochs.pth.tar"
torch.save(state,filename)

NameError: name 'epoch' is not defined

**Testing the model**

In [36]:
model=swrn(depth,width,bank_size,out_features).cuda()

4.0
SWRN : Depth : 28 , Widen Factor : 3, Templates per Group : 20


In [38]:
PATH = "model1_MiniImageNet_100epochs.pth.tar"
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint["state_dict"])

<All keys matched successfully>

In [39]:
optimizer_test=torch.optim.SGD(model.parameters(), learning_rate, momentum=momentum, nesterov=(momentum > 0.0))

In [40]:
metalearner_test=MAML(model,optimizer,loss_function=loss_function)

In [42]:
loss,accuracy=metalearner_test.evaluate(meta_test_dataloader,100)

tensor(6.1772, device='cuda:3') 0.6199999849001566
tensor(6.6615, device='cuda:3') 0.5466666469971339
tensor(5.9708, device='cuda:3') 0.6066666543483734
tensor(5.9989, device='cuda:3') 0.5866666485865911
tensor(5.9205, device='cuda:3') 0.6133333146572113
tensor(6.0814, device='cuda:3') 0.6199999898672104
tensor(7.1875, device='cuda:3') 0.4933333198229472
tensor(6.9369, device='cuda:3') 0.5266666561365128
tensor(7.1690, device='cuda:3') 0.5199999858935674
tensor(4.7703, device='cuda:3') 0.6466666460037231
tensor(6.1660, device='cuda:3') 0.626666655143102
tensor(6.0774, device='cuda:3') 0.5866666585206985
tensor(5.8960, device='cuda:3') 0.639999990661939
tensor(6.7315, device='cuda:3') 0.5533333222071329
tensor(7.7662, device='cuda:3') 0.5133333206176758
tensor(6.4036, device='cuda:3') 0.5733333081007004
tensor(5.2228, device='cuda:3') 0.6466666460037231
tensor(6.3508, device='cuda:3') 0.619999979933103
tensor(6.7447, device='cuda:3') 0.5533333271741867
tensor(6.0794, device='cuda:3') 0.

In [43]:
print("-------------------------------------------------------------")
print("average loss: ",loss, "  average accuracy:",accuracy)

-------------------------------------------------------------
average loss:  6.210397689068904   average accuracy: 0.5931147393551682


In [44]:
model=swrn(depth,width,bank_size,out_features).cuda()
PATH = "model1_MiniImageNet_200epochs.pth.tar"
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint["state_dict"])
optimizer_test=torch.optim.SGD(model.parameters(), learning_rate, momentum=momentum, nesterov=(momentum > 0.0))
metalearner_test=MAML(model,optimizer,loss_function=loss_function)
loss,accuracy=metalearner_test.evaluate(meta_test_dataloader,100)
print("-------------------------------------------------------------")
print("average loss: ",loss, "  average accuracy:",accuracy)

4.0
SWRN : Depth : 28 , Widen Factor : 3, Templates per Group : 20
tensor(6.7418, device='cuda:3') 0.5933333237965902
tensor(5.8602, device='cuda:3') 0.5933333188295364
tensor(6.9453, device='cuda:3') 0.5733333230018616
tensor(6.1128, device='cuda:3') 0.5933333188295364
tensor(7.5826, device='cuda:3') 0.5533333172400793
tensor(6.4395, device='cuda:3') 0.5799999833106995
tensor(6.8692, device='cuda:3') 0.5533333222071329
tensor(6.7853, device='cuda:3') 0.5999999791383743
tensor(6.1699, device='cuda:3') 0.5599999849994978
tensor(6.8157, device='cuda:3') 0.5533333122730255
tensor(6.7946, device='cuda:3') 0.5333333065112432
tensor(6.9981, device='cuda:3') 0.5066666603088379
tensor(7.0238, device='cuda:3') 0.5599999775489172
tensor(6.8933, device='cuda:3') 0.5666666577259699
tensor(6.9479, device='cuda:3') 0.5599999825159708
tensor(7.1957, device='cuda:3') 0.5666666527589163
tensor(5.4176, device='cuda:3') 0.6133333146572113
tensor(6.3261, device='cuda:3') 0.6133333096901575
tensor(5.1215, 

In [45]:
model=swrn(depth,width,bank_size,out_features).cuda()
PATH = "model1_MiniImageNet_300epochs.pth.tar"
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint["state_dict"])
optimizer_test=torch.optim.SGD(model.parameters(), learning_rate, momentum=momentum, nesterov=(momentum > 0.0))
metalearner_test=MAML(model,optimizer,loss_function=loss_function)
loss,accuracy=metalearner_test.evaluate(meta_test_dataloader,100)
print("-------------------------------------------------------------")
print("average loss: ",loss, "  average accuracy:",accuracy)

4.0
SWRN : Depth : 28 , Widen Factor : 3, Templates per Group : 20
tensor(6.6628, device='cuda:3') 0.5399999817212423
tensor(6.6511, device='cuda:3') 0.6066666543483734
tensor(7.7004, device='cuda:3') 0.5333333214124044
tensor(7.4703, device='cuda:3') 0.5599999924500784
tensor(7.8337, device='cuda:3') 0.4999999751647313
tensor(7.9258, device='cuda:3') 0.5666666527589163
tensor(6.2929, device='cuda:3') 0.5999999841054281
tensor(7.8769, device='cuda:3') 0.5333333214124044
tensor(7.9118, device='cuda:3') 0.5866666585206985
tensor(5.7220, device='cuda:3') 0.6133333047231039
tensor(8.7785, device='cuda:3') 0.5199999858935674
tensor(8.6911, device='cuda:3') 0.4266666571299235
tensor(8.5271, device='cuda:3') 0.46666665623585385
tensor(7.4772, device='cuda:3') 0.48666664958000183
tensor(7.7976, device='cuda:3') 0.5599999775489172
tensor(8.2715, device='cuda:3') 0.5199999759594599
tensor(8.4165, device='cuda:3') 0.47333332399527234
tensor(6.5131, device='cuda:3') 0.5733333230018616
tensor(6.480

In [46]:
model=swrn(depth,width,bank_size,out_features).cuda()
PATH = "model1_MiniImageNet_50epochs.pth.tar"
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint["state_dict"])
optimizer_test=torch.optim.SGD(model.parameters(), learning_rate, momentum=momentum, nesterov=(momentum > 0.0))
metalearner_test=MAML(model,optimizer,loss_function=loss_function)
loss,accuracy=metalearner_test.evaluate(meta_test_dataloader,100)
print("-------------------------------------------------------------")
print("average loss: ",loss, "  average accuracy:",accuracy)

4.0
SWRN : Depth : 28 , Widen Factor : 3, Templates per Group : 20
tensor(5.7607, device='cuda:3') 0.6466666509707769
tensor(7.4330, device='cuda:3') 0.49333331485589343
tensor(6.9412, device='cuda:3') 0.5999999890724818
tensor(6.3005, device='cuda:3') 0.5733333130677541
tensor(6.5444, device='cuda:3') 0.5599999775489172
tensor(6.3587, device='cuda:3') 0.5866666535536448
tensor(7.5122, device='cuda:3') 0.4933333198229472
tensor(6.9048, device='cuda:3') 0.5199999908606211
tensor(6.4025, device='cuda:3') 0.6399999707937241
tensor(6.4928, device='cuda:3') 0.5733333180348078
tensor(7.2055, device='cuda:3') 0.5133333206176758
tensor(7.0793, device='cuda:3') 0.5266666462024053
tensor(6.4458, device='cuda:3') 0.5933333188295364
tensor(7.1877, device='cuda:3') 0.5266666561365128
tensor(6.1445, device='cuda:3') 0.5599999924500784
tensor(6.0247, device='cuda:3') 0.6133333146572113
tensor(6.2842, device='cuda:3') 0.6199999849001566
tensor(5.7269, device='cuda:3') 0.6199999948342642
tensor(6.3291,