In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchsummary import summary
import numpy as np
import random
import math


In [2]:
# Configuration: knobs for the evolutionary architecture search

# Number of individuals created initially and kept after each generation's selection
population_size = 30 
# Controls how many generations the evolutionary loop runs
numbers_of_generation = 10

In [3]:
class Reshape(nn.Module):
  """Module that reshapes every sample of a batch to a fixed target shape.

  The batch dimension (dim 0) is preserved; the remaining dimensions are
  replaced by the sizes given to the constructor.
  """

  def __init__(self, *args) -> None:
    super().__init__()
    # Target per-sample shape, stored as a tuple of ints
    self.shape = args

  def forward(self, x):
    batch = x.size(0)
    return x.view(batch, *self.shape)


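# # The following two models share the same layers. Note that the Sequential
# # version has no ReLU between its Linear layers, so they are not strictly equal.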

# class Net(nn.Module):
# 	def __init__(self):
# 		super(Net, self).__init__()
# 		self.conv1 = nn.Conv2d(3, 6, 5)
# 		self.pool = nn.MaxPool2d(2, 2)
# 		self.conv2 = nn.Conv2d(6, 16, 5)
# 		self.fc1 = nn.Linear(16 * 5 * 5, 120)
# 		self.fc2 = nn.Linear(120, 84)
# 		self.fc3 = nn.Linear(84, 10)
	
# 	def forward(self, x):
# 		x = self.pool(F.relu(self.conv1(x)))
# 		x = self.pool(F.relu(self.conv2(x)))
# 		x = x.view(-1, 16 * 5 * 5)
# 		x = F.relu(self.fc1(x))
# 		x = F.relu(self.fc2(x))
# 		x = self.fc3(x)
# 		return x

# model = Net()



# model = nn.Sequential(
#         # Part1 consists of Conv and Pool 
#         nn.Conv2d(3, 6, 5),
#         nn.ReLU(),
#         nn.MaxPool2d(2, 2), 
#         nn.Conv2d(6, 16, 5),
#         nn.ReLU(),
#         nn.MaxPool2d(2, 2),
#         # Reshape(16 * 8 * 8),
#         nn.Flatten(),
#         # Part2 consists of Linear
#         nn.Linear(16 * 5 * 5, 120),
#         nn.Linear(120, 84),
#         nn.Linear(84, 10)
#         )


In [4]:
# Training hyper-parameters
batch_size = 64 # TODO
learning_rate = 1e-2 # TODO

# CIFAR-10 preprocessing: convert images to tensors, then shift/scale each
# RGB channel with mean 0.5 and std 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Train split first, then test split (both downloaded into ./data if missing)
train_dataset, test_dataset = (
    torchvision.datasets.CIFAR10(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Only the training data is shuffled; evaluation order stays fixed
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Prefer the GPU when one is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def _evaluate_accuracy(net):
  """Return the fraction of `test_loader` samples that `net` classifies correctly.

  The network is switched to eval mode for the measurement and its previous
  train/eval mode is restored afterwards. Returns 0.0 if the loader is empty.
  """
  correct = 0
  total = 0
  was_training = net.training
  net.eval()
  try:
    with torch.no_grad():
      for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = net(images)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
  finally:
    net.train(was_training)
  # Guard against an empty test loader instead of dividing by zero
  return correct / total if total else 0.0


def train_model(epochs, net):
  """Train `net` with SGD on `train_loader` and report test accuracy.

  Args:
    epochs: number of passes over the training set. With 0 epochs the
      untrained network is evaluated once.
    net: the `nn.Module` to train; it is moved to `device`.

  Returns:
    Test accuracy (fraction in [0, 1]) measured after the last epoch.
  """
  criterion = nn.CrossEntropyLoss()
  # Move the model first, then build the optimizer from the moved parameters
  net = net.to(device)
  optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)

  accuracy = None
  for epoch in range(epochs):
    # Make sure mode-dependent layers (dropout, batch norm) are in training mode
    net.train()
    for images, labels in train_loader:
      images, labels = images.to(device), labels.to(device)
      outputs = net(images)
      loss = criterion(outputs, labels)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    accuracy = _evaluate_accuracy(net)
    print('Epoch: {}/{}, Test Accuracy: {:.2f}'.format(epoch+1, epochs, accuracy))

  if accuracy is None:
    # No epoch ran (epochs <= 0): still return a well-defined accuracy
    accuracy = _evaluate_accuracy(net)

  print('Final Accuracy: {:.2f}'.format(accuracy))
  return accuracy


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [5]:
'''
0 - Conv2d
1 - ReLU
2 - MaxPool2d
3 - AvgPool2d
4 - Linear
'''

class Individual:
  """One candidate CNN architecture in the population.

  Attributes:
    part1: layer tuples placed before the Flatten layer.
    part2: layer tuples placed after the Flatten layer.
    score: dict with keys "acc" and "parameters", both initialised to 0.
  """

  def __init__(self, part1, part2):
    self.part1, self.part2 = part1, part2
    # Fresh dict per instance so individuals never share score state
    self.score = dict(acc=0, parameters=0)

def decodeCNN(part1, part2):
  """Build an `nn.Sequential` model from two encoded layer lists.

  Layers decoded from `part1` come first, followed by a Flatten layer and
  then the layers decoded from `part2`.
  """
  layers = [transferToLayer(spec) for spec in part1]
  layers.append(nn.Flatten())
  layers.extend(transferToLayer(spec) for spec in part2)
  return nn.Sequential(*layers)


def transferToLayer(args):
  """Decode one layer tuple into the corresponding `torch.nn` module.

  Layer codes (first tuple element):
    0 - Conv2d:  (0, in_channels, out_channels, kernel_size)
    1 - ReLU:    (1,)
    2 - MaxPool2d with a 2x2 window and stride 2: (2,)
    3 - AvgPool2d with a 2x2 window and stride 2: (3,)
    4 - Linear:  (4, in_features, out_features)

  Raises:
    ValueError: if the layer code is unknown, or a convolution kernel size is
      not a positive odd number.
  """
  layer_type = args[0]
  if layer_type == 0:
    kernel_size = args[3]
    if kernel_size < 1 or kernel_size % 2 == 0:
      raise ValueError('Conv2d kernel size must be a positive odd number, got {!r}'.format(kernel_size))
    # For an odd kernel with stride 1, padding = kernel_size // 2 keeps the
    # spatial size unchanged (3 -> 1, 5 -> 2, 7 -> 3, ...)
    return nn.Conv2d(in_channels=args[1], out_channels=args[2], kernel_size=kernel_size, padding=kernel_size // 2)
  elif layer_type == 1:
    return nn.ReLU()
  elif layer_type == 2:
    return nn.MaxPool2d(2, 2)
  elif layer_type == 3:
    return nn.AvgPool2d(2, 2)
  elif layer_type == 4:
    return nn.Linear(args[1], args[2])
  # Fail loudly here instead of returning None and breaking nn.Sequential later
  raise ValueError('Unknown layer type code: {!r}'.format(layer_type))


In [6]:
def fill_score(individual):
  """Populate `individual.score` with an accuracy and a parameter count.

  The model is decoded only to count its trainable parameters and is
  discarded afterwards; the CUDA cache is emptied before and after.
  """
  torch.cuda.empty_cache()
  net = decodeCNN(individual.part1, individual.part2)
  scores = individual.score
  # scores['acc'] = train_model(10, net) # TODO
  # NOTE(review): placeholder accuracy is an integer in 1..100, whereas
  # train_model returns a fraction -- confirm the intended scale before use.
  scores['acc'] = random.randint(1, 100)
  scores['parameters'] = count_parameters(net)
  # net = net.to(device)
  # summary(net, (3, 32, 32))
  del net
  torch.cuda.empty_cache()

In [7]:
def initialization(size):
  """Create and score `size` random individuals.

  Each individual starts with a Conv+ReLU pair on the 3-channel input and is
  then extended with randomly chosen Conv+ReLU pairs or 2x2 pooling layers.
  Pooling is skipped once the 32x32 input has shrunk to 2x2. The second part
  is a fixed 120-84-10 stack of Linear layers.

  Returns:
    The list of scored `Individual` objects.
  """
  min_depth, max_depth = 3, 10 # TODO
  max_channels = 128 # TODO
  kernel_choices = [3, 5, 7] # kernel sizes: 3x3, 5x5 or 7x7
  population = []

  for _ in range(size):
    spatial = 32 # CIFAR10: 32x32
    depth = random.randint(min_depth, max_depth)

    # The first layer must be a convolution on the 3-channel input
    channels = random.randint(5, 10) # TODO
    conv_part = [
        (0, 3, channels, random.choice(kernel_choices)), # conv: in=3, out=channels
        (1,), # ReLU
    ]

    # Remaining steps of part 1: one random choice per step
    for _step in range(depth - 1):
      wants_conv = random.random() < 0.5 # TODO 50% add conv, 50% add pool
      if wants_conv:
        # Never shrink the channel count
        widened = random.randint(channels, max_channels)
        conv_part += [
            (0, channels, widened, random.choice(kernel_choices)),
            (1,),
        ]
        channels = widened
      elif spatial != 2:
        # 32 -> 16 -> 8 -> 4 -> 2: stop pooling at 2x2
        # TODO 50% add maxPool, 50% add avgPool
        pool_code = 2 if random.random() < 0.5 else 3
        conv_part.append((pool_code,))
        spatial //= 2

    # Part 2: fixed fully connected head sized to the flattened feature map
    flat_features = channels * spatial * spatial
    fc_part = [
        (4, flat_features, 120),
        (4, 120, 84),
        (4, 84, 10),
    ]

    population.append(Individual(conv_part, fc_part))

  for member in population:
    fill_score(member)

  return population


def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted
        if param.requires_grad:
            total += param.numel()
    return total



# # test
# population = initialization(3)
# for individual in population:
#   torch.cuda.empty_cache()
#   model = decodeCNN(individual.part1, individual.part2)
#   individual.score['acc'] = train_model(3, model)
#   individual.score['parameters'] = count_parameters(model)
#   # model = model.to(device)
#   # summary(model, (3, 32, 32))
#   print(individual.score)
#   del model
#   torch.cuda.empty_cache()
  
  

In [18]:
from itertools import zip_longest 

def mutation(population, best=5):
  """Create offspring by applying the difference of two individuals to an elite one.

  For each offspring, one of the first `best` individuals is chosen as the
  base (`population` is expected to be sorted best-first), two other distinct
  individuals r1 and r2 are drawn, and `differentiate(r1, r2)` is merged
  into the base.

  Args:
    population: list of individuals, best first.
    best: how many leading individuals may serve as the base. # TODO tune

  Returns:
    A list of `len(population) // 2` newly scored individuals, or an empty
    list when fewer than three individuals are available (three distinct
    individuals are needed per offspring).
  """
  pop_size = len(population)
  if pop_size < 3:
    # Cannot pick a base plus two distinct partners
    return []

  # Never index past the end of a small population
  elite_count = max(1, min(best, pop_size))
  offspring_count = pop_size // 2 # TODO
  next_generation = []

  for _ in range(offspring_count):
    best_idx = random.randrange(elite_count)
    # Draw two distinct partners, both different from the base
    candidates = [idx for idx in range(pop_size) if idx != best_idx]
    r1, r2 = random.sample(candidates, 2)

    r1_minus_r2 = differentiate(population[r1], population[r2])
    next_generation.append(merge(population[best_idx], r1_minus_r2))

  for individual in next_generation:
    fill_score(individual)

  return next_generation
    
    
def differentiate(ind1, ind2):
  """Compute the positional difference between two individuals' first parts.

  ReLU entries (code 1) are ignored. For each position present in both
  sequences, the result holds `(-1,)` when the layer codes match, otherwise
  the layer from `ind1`. Leftover layers of the longer sequence are
  appended unchanged.
  """
  first = [layer for layer in ind1.part1 if layer[0] != 1]
  second = [layer for layer in ind2.part1 if layer[0] != 1]

  # (-1,) marks "remain the same"
  delta = [
      (-1,) if mine[0] == theirs[0] else mine
      for mine, theirs in zip(first, second)
  ]

  # At most one of the two tails is non-empty
  overlap = min(len(first), len(second))
  delta.extend(first[overlap:])
  delta.extend(second[overlap:])
  return delta



def merge(best_individual, r1_minus_r2):
  """Apply a difference sequence to an individual and return the new individual.

  Position by position over the ReLU-free layers of `best_individual.part1`,
  a `(-1,)` entry (or a missing entry, when the difference is shorter) keeps
  the original layer; any other entry replaces it. Entries of
  `r1_minus_r2` beyond the base length are ignored.

  Every resulting layer, including those past the end of the difference, is
  then normalised:
    * convolutions get their `in_channels` rewired to the previous output
      channels and are followed by a ReLU;
    * pooling layers beyond the fourth are dropped (32 -> 16 -> 8 -> 4 -> 2).

  Returns:
    A new `Individual` whose second part is the fixed 120-84-10 head, sized
    to the flattened output of the new first part.
  """
  base_layers = [layer for layer in best_individual.part1 if layer[0] != 1]
  new_part1 = []
  count_pool = 0 # maximum 4
  in_channels = 3 # network input has 3 channels

  for idx, layer in enumerate(base_layers):
    # Take the replacement when the difference has a non-"keep" entry here
    if idx < len(r1_minus_r2) and r1_minus_r2[idx][0] != -1:
      layer = r1_minus_r2[idx]

    layer_type = layer[0]
    if layer_type == 0:
      # Rewire the convolution to the channels actually produced so far
      new_part1.extend([(0, in_channels, layer[2], layer[3]), (1,)])
      in_channels = layer[2]
    elif layer_type in (2, 3):
      # Skip pooling once the feature map has reached 2x2
      if count_pool < 4:
        new_part1.append(layer)
        count_pool += 1
    else:
      new_part1.append(layer)

  side = 32 // (2 ** count_pool)
  features = in_channels * side * side
  new_part2 = [
      (4, features, 120),
      (4, 120, 84),
      (4, 84, 10),
  ]

  return Individual(new_part1, new_part2)



In [13]:
def get_score(err_max, err_min, para_max, para_min, err, para):
  """Combine normalised error and parameter count into one score (lower is better).

  Both quantities are min-max normalised with the given bounds and then
  weighted 0.7 (error) and 0.3 (parameters).

  Args:
    err_max, err_min: largest and smallest error in the population.
    para_max, para_min: largest and smallest parameter count in the population.
    err: error of the individual being scored.
    para: parameter count of the individual being scored.

  Returns:
    The weighted score. A quantity whose bounds coincide (all individuals
    equal) contributes 0 instead of raising ZeroDivisionError.
  """
  def _normalize(value, low, high):
    span = high - low
    # Degenerate range: every individual is identical on this axis
    return (value - low) / span if span else 0.0

  err = _normalize(err, err_min, err_max)
  para = _normalize(para, para_min, para_max)
  score = 0.7 * err + 0.3 * para # TODO weight
  return score

In [14]:

# Initialization: create and score the first generation, then rank it best-first
population = initialization(population_size)
# Normalisation bounds live in error space (error = 1 - accuracy)
err_max = 1 - min(x.score['acc'] for x in population)
err_min = 1 - max(x.score['acc'] for x in population)
para_max = max(x.score['parameters'] for x in population)
para_min = min(x.score['parameters'] for x in population)
# Pass the error (1 - acc), matching the error-space bounds above; passing the
# raw accuracy would rank the least accurate individuals first
population.sort(key=lambda x: get_score(err_max, err_min, para_max, para_min, 1 - x.score['acc'], x.score['parameters']))


In [19]:
# The initial population counts as generation 1, so run generations
# 2..numbers_of_generation inclusive
for g in range(2, numbers_of_generation + 1):
  # Mutation
  new_population = mutation(population)
  population.extend(new_population)
  # Normalisation bounds live in error space (error = 1 - accuracy)
  err_max = 1 - min(x.score['acc'] for x in population)
  err_min = 1 - max(x.score['acc'] for x in population)
  para_max = max(x.score['parameters'] for x in population)
  para_min = min(x.score['parameters'] for x in population)
  # Pass the error (1 - acc), matching the error-space bounds above; passing the
  # raw accuracy would rank the least accurate individuals first
  population.sort(key=lambda x: get_score(err_max, err_min, para_max, para_min, 1 - x.score['acc'], x.score['parameters']))
  # Selection: keep only the best-scoring individuals
  population = population[:population_size]


best_individual = population[0]
# fully training