Copyright 2021 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# Recursively Fertile Self-replicating Neural Agents

Code base for reproducing the results and the extra material of the paper "Recursively Fertile Self-replicating Neural Agents", by Ettore Randazzo, Luca Versari and Alexander Mordvintsev, published at ALIFE 2021.

In [None]:
#@title Imports and Notebook Utilities
%tensorflow_version 2.x

import numpy as np
import matplotlib.pylab as pl
import tensorflow as tf

import io
import math
import pathlib
import PIL
from IPython.display import Image, clear_output


def np2pil(a):
  """Convert an array to a PIL image.

  float32/float64 arrays are clipped to [0, 1] and rescaled to uint8; arrays
  of any other dtype are handed to PIL unchanged.
  """
  is_float = a.dtype == np.float32 or a.dtype == np.float64
  pixels = np.uint8(np.clip(a, 0, 1)*255) if is_float else a
  return PIL.Image.fromarray(pixels)

def imwrite(f, a, fmt=None):
  """Write image `a` to `f`.

  params:
  f: either a path (str) or a writable binary file-like object.
  a: image data; converted with np.asarray and then np2pil.
  fmt: image format passed to PIL. Ignored when `f` is a path: the format is
    then taken from the file extension ('jpg' is mapped to 'jpeg').
  """
  a = np.asarray(a)
  if isinstance(f, str):
    fmt = f.rsplit('.', 1)[-1].lower()
    if fmt == 'jpg':
      fmt = 'jpeg'
    # Use a context manager so the file we opened ourselves is always closed
    # (and flushed), even if encoding fails.
    with open(f, 'wb') as out_file:
      np2pil(a).save(out_file, fmt, quality=95)
    return
  # Caller-provided file objects stay open: the caller owns them.
  np2pil(a).save(f, fmt, quality=95)

def imencode(a, fmt='jpeg'):
  """Encode image `a` into bytes.

  The requested `fmt` is used unless the array is 3-dimensional with 4
  channels in its last axis, in which case 'png' is used instead.
  """
  pixels = np.asarray(a)
  has_four_channels = pixels.ndim == 3 and pixels.shape[-1] == 4
  buffer = io.BytesIO()
  imwrite(buffer, pixels, 'png' if has_four_channels else fmt)
  return buffer.getvalue()

def imshow(a, fmt='jpeg'):
  """Encode `a` with imencode and display the result inline in the notebook."""
  encoded = imencode(a, fmt)
  display(Image(data=encoded))

def zoom(img, scale=4):
  """Upscale `img` by repeating every element `scale` times along axes 0 and 1."""
  rows_repeated = np.repeat(img, scale, axis=0)
  return np.repeat(rows_repeated, scale, axis=1)


!nvidia-smi -L

# Prepare the target image

In [None]:
# Size (in pixels) the target image is resized to. The width is tied to the
# height, so the target is square.
IMG_HEIGHT = 128
IMG_WIDTH = IMG_HEIGHT

def show_single_im(im, figsize=(3,3), reshape=False):
  """Plot a single image without axes.

  params:
  im: image to show.
  figsize: matplotlib figure size.
  reshape: if True, `im` is first reshaped to [IMG_HEIGHT, IMG_WIDTH, 3].
  """
  image = im.reshape([IMG_HEIGHT, IMG_WIDTH, 3]) if reshape else im
  pl.figure(figsize=figsize)
  pl.imshow(image)
  pl.axis("off")
  pl.show()


def decode_img(img):
  """Decode a JPEG-encoded string into a resized float32 RGB image.

  params:
  img: the compressed JPEG contents (e.g. the output of tf.io.read_file).

  returns: a float32 tensor of shape [IMG_HEIGHT, IMG_WIDTH, 3] with values
    in the [0,1] range.
  """
  # convert the compressed string to a 3D uint8 tensor
  img = tf.image.decode_jpeg(img, channels=3)
  # Use `convert_image_dtype` to convert to floats in the [0,1] range.
  img = tf.image.convert_image_dtype(img, tf.float32)
  # resize the image to the desired size. tf.image.resize expects
  # [new_height, new_width]; the order only matters for non-square targets.
  return tf.image.resize(img, [IMG_HEIGHT, IMG_WIDTH])


In [None]:
# The target image is a rose taken from the TensorFlow flower photos dataset.
# The first few roses are displayed below in case you want to test this work
# with a different image.
data_dir = pathlib.Path(tf.keras.utils.get_file(
    fname='flower_photos',
    origin='https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar=True))

image_count = sum(1 for _ in data_dir.glob('*/*.jpg'))
print("Number of images in dataset:", image_count)
# Every top-level entry except the license file is a class directory.
CLASS_NAMES = np.array(
    [entry.name for entry in data_dir.glob('*') if entry.name != "LICENSE.txt"])
print("Classes:", CLASS_NAMES)

roses = sorted(data_dir.glob('roses/*'))
print("Example roses:")
for image_path in roses[:10]:
  imshow(PIL.Image.open(str(image_path)))


In [None]:
# The target we optimize for: one rose, flattened to one RGB row per pixel.
target_image = decode_img(tf.io.read_file(str(roses[9])))
target = tf.reshape(target_image, [-1, 3])
show_single_im(target.numpy(), figsize=(10, 10), reshape=True)

# Inputs for the image encoding task: one (x, y) coordinate pair per pixel,
# in the same flattened order as `target`.
r = 1.0 # change this if you want to have different ranges.
coord_range = tf.linspace(-r, r, IMG_HEIGHT)
y, x = tf.meshgrid(coord_range, coord_range, indexing='ij')
xy_grid = tf.stack([x, y], -1)
im_enc_input = tf.reshape(xy_grid, [-1, 2])
print("target shape:", target.shape)
print("im_enc_input shape:", im_enc_input.shape)

In [None]:
# @title plot utils

def show_weight_divergence(all_w):
  """Plot how the weights drift along a chain of replications.

  params:
  all_w: list of flattened weight tensors, one per generation, starting with
    the original network.

  Two curves are drawn on a log scale: the mean squared difference of each
  generation from its parent, and of each generation (except the last) from
  the last one.

  returns: [(steps, from_parent), (steps, from_last)], the plotted data.
  """
  sink_w = all_w[-1]
  from_parent = [
      tf.reduce_mean(tf.square(parent - child))
      for parent, child in zip(all_w[:-1], all_w[1:])]
  from_sink = [tf.reduce_mean(tf.square((sink_w - w))) for w in all_w[:-1]]

  steps = np.arange(len(from_parent) + 1, dtype=np.int32)
  child_steps = steps[1:]     # generations 1..n
  parent_steps = steps[:-1]   # generations 0..n-1

  pl.title("Weight divergence")
  pl.plot(child_steps, from_parent, label="From parent")
  pl.plot(parent_steps, from_sink, label="From last (sink)")
  pl.yscale("log")
  pl.xlabel("Number of replication steps")
  pl.legend()
  pl.show()

  return [(child_steps, from_parent), (parent_steps, from_sink)]


# Model architecture

In [None]:
from tensorflow.keras.layers import Dense

def getNumBits(n):
  """Return the number of bits needed to write the positive integer `n`.

  This equals floor(log2(n)) + 1. It is computed exactly with
  int.bit_length() instead of floating point logarithms, which can be off by
  one for large values.

  raises: ValueError if `n` is smaller than 1.
  """
  n_int = int(n)
  if n_int < 1:
    raise ValueError("getNumBits expects a positive integer, got {}".format(n))
  return n_int.bit_length()

class SelfReplicator():
  """MLP with sine activations that encodes an image and its own weights.

  The network has 4 outputs: `rgb` reads the first three, `synapses` reads the
  last one and is used by `generateNewWeights` to produce a new set of weights.
  Every input row is laid out as
    [aux inputs (input_aux_size) | coordinate bits (input_coord_size) |
     extra params (num_extra_params)].
  `self.layers` is a list of (matrix, bias) tuples, ending with a (matrix,)
  tuple for the bias-free output layer; `self.trainable_variables` holds the
  same tensors as a flat list.
  """
  def __init__(self, n_hidden, wo, size_hidden, standardize_weights,
               use_fixed_weights, switch_init_constant, num_extra_params):
    """
    params:
    n_hidden: number of hidden layers
    wo: sinusoidal period for the first layer. Taken from
      "Implicit Neural Representations with Periodic Activation Functions",
      Sitzmann et al.
    size_hidden: size of all hidden layers
    standardize_weights: whether to standardize weights during self-replication.
    use_fixed_weights: whether the network also uses fixed weights (exp 2 and 3)
    switch_init_constant: value of the W_switch (here self.is_fixed_logit) 
      for initialization.
    num_extra_params: used in exp3 to add replication with variation inputs.

    """
    self.n_hidden = n_hidden
    self.size_hidden = size_hidden
    self.standardize_weights = standardize_weights
    self.input_aux_size = 3
    self.extra_params = num_extra_params
    self.n_outputs = 4 # rgb + w
    self.input_coord_size = self._computeInputCoordSize() 
    self.layers_sizes = [self.input_aux_size + self.input_coord_size + self.extra_params] + [self.size_hidden]*self.n_hidden + [self.n_outputs]
    self.wo = wo
    self.use_fixed_weights = use_fixed_weights
    self.switch_init_constant = switch_init_constant
    self.trainable_variables = []
    layers = []
    n = self.input_aux_size + self.input_coord_size + self.extra_params
    # Hidden layers: uniform init in [-sqrt(6/n), sqrt(6/n)] with zero biases.
    for i in range(n_hidden):
      n2 = size_hidden
      k = math.sqrt(6/n)
      layer_init = np.random.uniform(-k, k, [n, n2]).astype(np.float32)
      if i == 0:
        # only the first layer is scaled by wo.
        layer_init *= wo
      layer_init = tf.Variable(layer_init)
      self.trainable_variables.append(layer_init)
      bias_init = tf.Variable(np.zeros(n2, dtype=np.float32))
      self.trainable_variables.append(bias_init)
      layers.append((layer_init, bias_init))
      n = n2
    
    # Output layer: same init scheme, but it has no bias.
    n2 = self.n_outputs
    k = math.sqrt(6/n)
    layer_init = np.random.uniform(-k, k, [n, n2]).astype(np.float32)
    layer_init = tf.Variable(layer_init)
    self.trainable_variables.append(layer_init)
    layers.append((layer_init,))

    self.layers = layers
    # One coordinate block per entry of trainable_variables, in the same order.
    self.layer_inputs = self._createBinaryCoords()
    self.all_li = tf.concat(self.layer_inputs, 0)

    if self.use_fixed_weights:
      # One switch logit and one fixed value per scalar weight, stored flat.
      # The fixed values start as a copy of the freshly initialized weights.
      all_weights = tf.concat([tf.reshape(v, [-1]) for v in self.trainable_variables], 0)
      switch_np = np.full(all_weights.shape, self.switch_init_constant).astype(np.float32)
      self.is_fixed_logit = tf.Variable(switch_np)
      self.fixed_w = tf.Variable(all_weights)
      self.trainable_fixed_variables = [self.is_fixed_logit, self.fixed_w]

  def _computeInputCoordSize(self):
    """Return the number of coordinate bits needed to address every weight.

    Each weight matrix and each bias gets getNumBits(size) bits. The first
    weight matrix is handled last because its size depends on the result.
    """
    # this is tricky because the first layer size depends on the input coord size.
    # therefore, we have to find a stable configuration.
    # We start from the second layers since these ones have fixed size.
    ls_except_first = [self.size_hidden]*self.n_hidden + [self.n_outputs]
    ics = 0
    for i in range(len(ls_except_first) -1):
      ls1, ls2 = ls_except_first[i], ls_except_first[i+1]
      ics += getNumBits(ls1*ls2)
      if i < len(ls_except_first) - 2:
        # has bias too
        ics += getNumBits(ls2)
    
    # the first layer bias is also fixed in size:
    ics += getNumBits(self.size_hidden)

    # now see what is the minimum number of bits that fits the weight matrix.
    bit_capacity = 2
    nb = 1
    # note we dont want to use a full zero encoding, so the bit_capacity will
    # have 1 subtracted to it. that is, with 1 bit you can only encode 1 value,
    # with 2 bits only 3 values and so on.
    all_other_inputs = self.input_aux_size + self.extra_params
    # grow nb until nb bits can index every entry of the first weight matrix,
    # whose number of rows itself includes nb.
    while bit_capacity - 1 < (nb + all_other_inputs + ics) * self.size_hidden:
      nb += 1
      bit_capacity *= 2
    ics += nb
    return ics

  def _createBinaryCoords(self):
    """Build the binary coordinates used to query every weight and bias.

    returns: a list of float32 tensors ordered like trainable_variables
      (matrix, bias, matrix, bias, ..., last matrix). Each tensor has one row
      per scalar parameter and input_coord_size columns.
    """
    input_coord_size = self.input_coord_size
    layers_sizes = self.layers_sizes

    def __genCoords(n):
      # Rows are the binary encodings of 1..n, least significant bit first.
      # Counting starts at 1 so that no coordinate is all zeros.
      nb = getNumBits(n)

      all_coords = []
      for i in range(1, n+1):
        bit_i = f"{i:b}"
        t_i = np.zeros([nb])
        for idx, c in enumerate(bit_i[::-1]):
          if c == '1':
            t_i[idx] = 1.
        all_coords.append(t_i)
      return np.stack(all_coords).astype(np.float32)

    layer_inputs = []
    # offset (in bits) at which the next block of coordinate bits is placed.
    left_ics = 0
    for i in range(len(layers_sizes) - 1):
      sl = layers_sizes[i]
      sr = layers_sizes[i+1]

      # weight
      current_ics = getNumBits(sl*sr)
      layer_input =  __genCoords(sl*sr)
      # zero-pad so that this block's bits sit at [left_ics, left_ics + current_ics).
      layer_input = tf.pad(layer_input, [(0,0), (left_ics, input_coord_size - left_ics - current_ics)])
      left_ics += current_ics
      layer_inputs.append(layer_input)

      # bias
      if i < len(layers_sizes) - 2:
        current_ics = getNumBits(sr)
        layer_input = __genCoords(sr)
        layer_input = tf.pad(layer_input, [(0,0), (left_ics, input_coord_size - left_ics - current_ics)])
        # NOTE(review): left_ics is not advanced after the bias block, so the
        # bias bits start at the same offset as the next layer's weight bits,
        # although _computeInputCoordSize reserves separate bits for biases.
        # Confirm whether this overlap is intended before changing it: a fix
        # alters the inputs that previously trained models were fitted on.
        layer_inputs.append(layer_input)
    
    return layer_inputs

  def unflattenWeights(self, weights):
    """Group a flat [W0, b0, W1, b1, ..., W_last] list into per-layer tuples."""
    layers = []
    for i in range((len(weights) -1)//2):
      layers.append(
          (weights[2*i], weights[2*i+1]))
    # the last layer has no bias.
    layers.append((weights[-1],))
    return layers

  def set_weights(self, new_weights):
    """Replace the weights with new variables created from the flat list `new_weights`."""
    self.trainable_variables = [tf.Variable(v) for v in new_weights]
    self.layers = self.unflattenWeights(self.trainable_variables)

  def set_fixed_weights(self, is_fixed_logit, fixed_w):
    """Replace the switch logits and the fixed weights with new variables."""
    self.is_fixed_logit = tf.Variable(is_fixed_logit)
    self.fixed_w = tf.Variable(fixed_w)
    self.trainable_fixed_variables = [self.is_fixed_logit, self.fixed_w]
  
  def generateNewWeights(self, variation_inputs=None, weights = None):
    """Query the network for a new set of weights.

    params:
    variation_inputs: optional extra inputs forwarded to `synapses`.
    weights: weights used for the query; defaults to self.layers.

    returns: a flat list [W0, b0, W1, b1, ..., W_last] of new tensors. With
      standardize_weights, each new tensor is shifted to the mean of the
      tensor it replaces, and weight matrices are also rescaled to its
      standard deviation.
    """
    if weights is None:
      weights = self.layers
    weights_flat = tf.nest.flatten(weights)
    layers_sizes = self.layers_sizes
    new_weights = []
    for i, coords in enumerate(self.layer_inputs):
      l1w = self.synapses(coords, variation_inputs, weights)

      # layer_inputs alternates weight/bias, so entries 2l and 2l+1 belong to
      # layer l.
      l = i // 2

      size_l = layers_sizes[l]
      size_r = layers_sizes[l+1]

      # compute mean of the two blocks and make sure the mean is the same.
      if self.standardize_weights:
        orig_m = tf.reduce_mean(weights_flat[i])
        new_m = tf.reduce_mean(l1w)
        l1w = l1w + (orig_m - new_m)

      if i % 2 == 0:
        # for weights, also preserve the standard deviation.
        orig_std = tf.math.reduce_std(weights_flat[i])
        new_std = tf.math.reduce_std(l1w)

        if self.standardize_weights:
          # NOTE(review): no guard against new_std being 0 here.
          l1w = (l1w - orig_m) * (orig_std / new_std) + orig_m
        l1w = tf.reshape(l1w, [size_l, size_r])
        new_weights.append(l1w)
      else:
        l1b = tf.reshape(l1w, [ size_r])
        new_weights.append(l1b)

    return new_weights

  def noisyCopyWeights(self, weights=None, tolerance_std=0.02):
    """Return a flat list with a noisy copy of `weights` (default self.layers).

    Gaussian noise is added to every tensor, scaled by that tensor's standard
    deviation times `tolerance_std`.
    """
    if weights is None:
      weights = self.layers
    weights_flat = tf.nest.flatten(weights)
    new_weights = []
    for v in weights_flat:
      orig_std = tf.math.reduce_std(v)
      # tensors with (almost) no spread, e.g. constant ones, use 1e-3 instead
      # so that they still receive some noise.
      orig_std = tf.cond(tf.abs(orig_std) < 1e-6,lambda: 1e-3,lambda: orig_std)
      new_v = v + tf.random.normal(shape=v.shape) * orig_std * tolerance_std
      new_weights.append(new_v)
    return new_weights

  def createNewNetwork(self, variation_inputs=None):
    """Create a child network whose weights are generated by this network.

    The child is built with the same constructor arguments. When fixed
    weights are used, the switch logits and fixed weights are copied to it.
    """
    new_net = SelfReplicator(self.n_hidden, self.wo, self.size_hidden, self.standardize_weights,
                    self.use_fixed_weights, self.switch_init_constant, self.extra_params)
    new_weights = self.generateNewWeights(variation_inputs=variation_inputs)
    new_net.set_weights(new_weights)
    if self.use_fixed_weights:
      new_net.set_fixed_weights(self.is_fixed_logit, self.fixed_w)
    return new_net

  def deserialize(self, serialized_weights):
    """Split a flat weight vector back into per-layer (matrix, bias) tuples.

    The vector must follow the order of trainable_variables; the result has
    the same structure as self.layers.
    """
    layers_sizes = self.layers_sizes
    weights_sizes = []
    weights_dims = []
    for i in range(len(layers_sizes)-1):
      weights_sizes.append(layers_sizes[i]*layers_sizes[i+1])
      weights_dims.append((layers_sizes[i], layers_sizes[i+1]))
      if i < len(layers_sizes) - 2:
        # every layer except the last one has a bias.
        weights_sizes.append(layers_sizes[i+1])
        weights_dims.append((layers_sizes[i+1],))
    all_weights_flat_split = tf.split(serialized_weights, weights_sizes)
    all_weights_flat = [tf.reshape(t, s) for t, s in zip(
        all_weights_flat_split, weights_dims)]
    all_weights = []
    for i in range((len(all_weights_flat) - 1)//2):
      all_weights.append((all_weights_flat[2*i], all_weights_flat[2*i+1]))
    all_weights.append((all_weights_flat[-1],))
    return all_weights

  def rgb(self, inputs, weights = None):
    """Return the first three outputs (the colour) for the given inputs.

    A column of ones is appended to `inputs`, followed by zeros for the
    coordinate bits and the extra params.
    """
    if weights is None:
      weights = self.layers
    # assumes `inputs` has input_aux_size - 1 columns (the x, y position), so
    # that the padded rows match the first layer size -- TODO confirm.
    inputs = tf.pad(inputs, [(0,0), (0, 1)], constant_values=1.)
    inputs = tf.pad(inputs, [(0,0), (0, self.input_coord_size+self.extra_params)])
    return self.call(inputs, weights)[:, :3]

  def synapses(self, inputs, variation_inputs=None, weights = None):
    """Return the last output (the generated weight value) for each row.

    `inputs` holds coordinate bits; the aux inputs are filled with zeros and
    `variation_inputs`, if given, is appended to every row.
    """
    if weights is None:
      weights = self.layers
    inputs = tf.pad(inputs, [(0,0), (self.input_aux_size, 0)])
    # This code breaks if the network was built with extra params and
    # synapses is called without variation_inputs.
    if variation_inputs is not None:
      # presumably variation_inputs is a 1-D tensor with one value per extra
      # param -- verify against callers.
      variation_inputs = tf.broadcast_to(
          variation_inputs, [inputs.shape[0], variation_inputs.shape[0]])
      inputs = tf.concat([inputs, variation_inputs], -1)
    return self.call(inputs, weights)[:, -1]

  def merge_with_fixed(self, weights):
    """Blend `weights` with the fixed weights.

    Each scalar becomes w * (1 - s) + fixed_w * s, with s = sigmoid(is_fixed_logit).
    The result is returned in the per-layer structure built by `deserialize`.
    """
    weights_flat = tf.nest.flatten(weights)
    wf = tf.concat([tf.reshape(v, [-1]) for v in weights_flat], 0)
    is_fixed = tf.sigmoid(self.is_fixed_logit)
    new_w = wf * (1. - is_fixed) + self.fixed_w * is_fixed
    return self.deserialize(new_w)

  def call(self, inputs, weights = None):
    """Run the forward pass on fully assembled input rows.

    Hidden layers compute sin(inputs @ W + b); the last layer is linear and
    has no bias. With use_fixed_weights, `weights` is first blended with the
    fixed weights.
    """
    if weights is None:
      weights = self.layers
    if self.use_fixed_weights:
      weights = self.merge_with_fixed(weights)
    # weights: list of (matrix, bias)
    for n, l in enumerate(weights):
      if n == len(weights)-1:
        return inputs @ l[0]
      inputs = inputs @ l[0] + l[1]
      inputs = tf.math.sin(inputs)



## Model from [Chang et al.](https://arxiv.org/abs/1803.05859) for benchmarking

In [None]:
# Benchmark quine
from tensorflow.keras.layers import Dense

class BenchmarkQuine():
  """Benchmark self-replicating network, after the quine of Chang et al.

  Inputs are mapped to `size_hidden` features by projections that are not part
  of the replicated weights: an auxiliary projection for image coordinates
  (first half of the features) and an embedding for weight coordinates
  (second half). They are followed by bias-free layers with SELU activations.
  The network has 4 outputs: `rgb` reads the first three, `synapses` the last.
  """
  def __init__(self, n_hidden, size_hidden,
               standardize_weights, proj_init=None):
    """
    params:
    n_hidden: number of size_hidden x size_hidden weight matrices.
    size_hidden: size of all hidden layers.
    standardize_weights: whether to standardize weights during self-replication.
    proj_init: optional (aux_proj, coord_proj) pair to reuse instead of
      creating new projections; used to share them with children.
    """
    self.n_hidden = n_hidden
    self.n_outputs = 4
    self.size_hidden = size_hidden
    self.standardize_weights = standardize_weights
    self.trainable_variables = []
    # one integer coordinate per scalar weight.
    self.input_coord_size = (size_hidden*size_hidden) * n_hidden + (size_hidden*self.n_outputs)
    layers = []
    # architecture has:
    # - random projection layer to size_hidden
    # n_hidden more hidden layers
    # n_outputs

    # random projections
    if proj_init is None:
      aux_size = 2 + 1
      self.aux_proj = np.random.uniform(-1., 1., [aux_size, size_hidden // 2])
      self.coord_proj = tf.keras.layers.Embedding(self.input_coord_size, size_hidden // 2)
    else:
      self.aux_proj = proj_init[0]
      self.coord_proj = proj_init[1]

    # weights
    ls_except_first = [self.size_hidden]*(self.n_hidden+1) + [self.n_outputs]
    for n, n2 in zip(ls_except_first[:-1], ls_except_first[1:]):
      k = math.sqrt(2/n)
      layer_init = np.random.normal(0, k, [n, n2]).astype(np.float32)

      layer_init = tf.Variable(layer_init)
      self.trainable_variables.append(layer_init)
      # NO BIAS.
      layers.append((layer_init,))

    self.layers = layers
    self.layer_inputs = self._createCoords()
    self.all_li = tf.concat(self.layer_inputs, 0)


  def _createCoords(self):
    """Return one tensor of integer coordinates per weight matrix."""
    all_coords = np.arange(self.input_coord_size, dtype=np.int32)
    split_n = [self.size_hidden * self.size_hidden] * self.n_hidden + [self.size_hidden * self.n_outputs]
    return tf.split(all_coords, split_n)

  def unflattenWeights(self, weights):
    """Wrap every matrix of a flat list into a 1-tuple, as stored in self.layers."""
    return [(w,) for w in weights]

  def set_weights(self, new_weights):
    """Replace the weights with new variables created from the flat list `new_weights`."""
    self.trainable_variables = [tf.Variable(v) for v in new_weights]
    self.layers = self.unflattenWeights(self.trainable_variables)

  def generateNewWeights(self, weights = None):
    """Query the network for a new set of weights.

    params:
    weights: weights used for the query; defaults to self.layers.

    returns: a flat list with one new matrix per layer. With
      standardize_weights, each new matrix is shifted and rescaled to the mean
      and standard deviation of the matrix it replaces.
    """
    if weights is None:
      weights = self.layers
    weights_flat = tf.nest.flatten(weights)
    layers_sizes_except_first = [self.size_hidden]*(self.n_hidden+1) + [self.n_outputs]
    new_weights = []
    for i, coords in enumerate(self.layer_inputs):
      if self.n_outputs == 3:
        # NOTE(review): never taken, since n_outputs is always set to 4.
        r,gr, b = tf.split(self.synapses(coords, weights), 3, -1)
        l1w = tf.reshape((r - gr) * b, [-1])
      else:
        l1w = self.synapses(coords, weights)

      size_l = layers_sizes_except_first[i]
      size_r = layers_sizes_except_first[i+1]

      # compute mean of the two blocks and make sure the mean is the same.
      if self.standardize_weights:
        orig_m = tf.reduce_mean(weights_flat[i])
        new_m = tf.reduce_mean(l1w)
        l1w = l1w + (orig_m - new_m)

      # also preserve the standard deviation.
      orig_std = tf.math.reduce_std(weights_flat[i])
      new_std = tf.math.reduce_std(l1w)
      if self.standardize_weights:
        l1w = (l1w - orig_m) * (orig_std / new_std) + orig_m
      l1w = tf.reshape(l1w, [size_l, size_r])
      new_weights.append(l1w)

    return new_weights

  def createNewNetwork(self):
    """Create a child network whose weights are generated by this network.

    The child shares this network's projections.
    """
    new_net = BenchmarkQuine(self.n_hidden, self.size_hidden,
                     self.standardize_weights, proj_init=(self.aux_proj, self.coord_proj))
    new_weights = self.generateNewWeights()
    new_net.set_weights(new_weights)
    return new_net

  def deserialize(self, serialized_weights):
    """Split a flat weight vector back into the per-layer structure of self.layers.

    The vector must hold the matrices in the order of trainable_variables.
    """
    layers_sizes = [self.size_hidden]*(self.n_hidden+1) + [self.n_outputs]
    weights_dims = list(zip(layers_sizes[:-1], layers_sizes[1:]))
    weights_sizes = [n_in * n_out for n_in, n_out in weights_dims]
    all_weights_flat_split = tf.split(serialized_weights, weights_sizes)
    all_weights_flat = [tf.reshape(t, s) for t, s in zip(
        all_weights_flat_split, weights_dims)]
    return self.unflattenWeights(all_weights_flat)

  def rgb(self, inputs, weights = None):
    """Return the first three outputs (the colour) for the given inputs.

    A column of ones is appended to `inputs` before the auxiliary projection,
    whose result fills the first size_hidden // 2 features; the rest are zero.
    """
    if weights is None:
      weights = self.layers
    inputs = tf.pad(inputs, [(0,0), (0, 1)], constant_values=1.)
    proj = inputs @ self.aux_proj
    proj = tf.pad(proj, [(0,0), (0, self.size_hidden // 2)])
    return self.call(proj, weights)[:, :3]

  def synapses(self, inputs, weights = None):
    """Return the last output (the generated weight value) per coordinate.

    The coordinate embedding is placed after size_hidden // 2 zero features.
    """
    if weights is None:
      weights = self.layers
    proj = self.coord_proj(inputs)
    proj = tf.pad(proj, [(0,0), (self.size_hidden // 2, 0)])
    return self.call(proj, weights)[:, -1]

  def call(self, inputs, weights = None):
    """Run the bias-free layers on already projected inputs.

    Hidden layers compute selu(inputs @ W); the last layer is linear.
    """
    # This part does NOT do the random projection.
    # call synapses or rgb for it.
    if weights is None:
      weights = self.layers
    # weights: list of (matrix,) tuples, there are no biases.
    for n, l in enumerate(weights):
      if n == len(weights)-1:
        return inputs @ l[0]
      inputs = inputs @ l[0]
      inputs = tf.nn.selu(inputs)



# Experiment 1: Self-replication with only variable weights

In [None]:
# Configuration used for experiment 1.
n_hidden = 3
wo = 30
size_hidden = 64
standardize_weights = True
use_chang_quine = False

network = SelfReplicator(
    n_hidden, wo, size_hidden, standardize_weights,
    use_fixed_weights=False,
    switch_init_constant=None,
    num_extra_params=0)

if use_chang_quine:
  # Swap in the benchmark model, with its own configuration.
  n_hidden = 2
  size_hidden = 100
  standardize_weights = True
  network = BenchmarkQuine(n_hidden, size_hidden, standardize_weights)

print("Showing parameters:")
from functools import reduce

def reduce_multiply(l):
  """Return the product of all the elements of `l`."""
  return reduce(lambda a, b: a*b, l)

# Print the shape and size of every variable, then the overall count.
tot = 0
for v in network.trainable_variables:
  n_params = reduce_multiply(v.shape)
  print(v.shape, n_params)
  tot += n_params
print("Total parameters: ", tot)

trainer = tf.keras.optimizers.Adamax(1e-3)

BATCH_SIZE = 256

def generate_new_dataset_iter(new_target):
  """Return an endless iterator of (inputs, targets) minibatches.

  Rows of `im_enc_input` are paired with rows of `new_target`, shuffled with
  a buffer of 1000 elements and grouped into batches of BATCH_SIZE.
  """
  pairs = tf.data.Dataset.from_tensor_slices((im_enc_input, new_target))
  shuffled = pairs.cache().shuffle(1000)
  batches = shuffled.batch(BATCH_SIZE).repeat()
  return iter(batches)

minib_ds_iter = generate_new_dataset_iter(target)


In [None]:
# Hyperparameters used during training. Every experiment is a tuple
# (pic_loss, child_loss, weight_loss, sink_loss, n_iters).

# Same sink loss training regime as in Experiment 2.
sink_exp = (10., 0., 0., 10000., 0)
# Minimize the weight divergence of the child from the parent.
wd_exp = (10., 0., 100., 0., 1)
# Also add a loss on the quality of the child's rgb image.
child_loss_exp = (10., 1., 1., 0., 1)
# Weight divergence loss only.
wd_min_exp = (0., 0., 1., 0., 1)

# Choose the experiment or create your own parameters.
hpars = wd_exp

# n_iters is the number of children that are recursively generated, with
# child_loss and weight_loss applied to each of them.
# Sink experiments do not need it and set it to 0.
pic_loss, child_loss, weight_loss, sink_loss, n_iters = hpars[:5]

In [None]:
# train and visualize

from functools import partial

def createStep(network, trainer, use_sink_loss):
  """Build the training step function for `network`.

  params:
  network: the model to train; its trainable_variables are updated in place.
  trainer: the optimizer used to apply the gradients.
  use_sink_loss: whether the sink loss is computed at all.

  returns: step(xt, yt, pic_loss, child_loss, weight_loss, sink_loss, n_iters),
    which performs one update and returns (lpic, lchild_pic, lwd, lsink).
  """
  @tf.function
  def step(xt, yt, pic_loss, child_loss, weight_loss, sink_loss, n_iters, use_sink_loss):
    with tf.GradientTape(persistent=True) as g:
      # image loss of the network itself.
      y = network.rgb(xt)
      lpic = tf.reduce_mean(tf.square(y - yt)) * pic_loss

      # current weights, flattened: the target for every generated copy.
      target_wy = tf.concat([tf.reshape(v, [-1]) for v in network.trainable_variables], 0)

      if use_sink_loss:
        # sink loss: generate a neighbor and enforce it converges to the original.
        neighbor_weights = network.unflattenWeights(network.noisyCopyWeights(tolerance_std=0.02))
        new_w = network.generateNewWeights(weights=neighbor_weights)
        wy = tf.concat([tf.reshape(v, [-1]) for v in new_w], 0)
        lsink = tf.reduce_mean(tf.square(wy - target_wy)) * sink_loss
      else:
        lsink = 0

      # Generate n_iters successive children, each from the previous one, and
      # accumulate their image and weight divergence losses. The weight of the
      # losses is halved at every generation.
      weights = network.layers
      lchild_pic = 0
      lwd = 0
      multiplier = 1.
      for i in range(1, n_iters+1):
        new_w = network.generateNewWeights(weights=weights)
        wy = tf.concat([tf.reshape(v, [-1]) for v in new_w], 0)
        weights = network.unflattenWeights(new_w)

        new_y = network.rgb(xt, weights=weights)
        lchild_pic += tf.reduce_mean(tf.square(new_y - yt)) * child_loss * multiplier
        lwd += tf.reduce_mean(tf.square(wy - target_wy)) * weight_loss * multiplier

        multiplier /= 2.

      l = lpic + lchild_pic + lwd + lsink 

    grads = g.gradient(l, network.trainable_variables)
    trainer.apply_gradients(zip(grads, network.trainable_variables))

    return lpic, lchild_pic, lwd, lsink
  # use_sink_loss is bound here so that callers do not have to pass it.
  return partial(step, use_sink_loss=use_sink_loss)

# The sink loss is only computed when its coefficient is not (almost) zero.
use_sink_loss = sink_loss > 1e-6
step = createStep(network, trainer, use_sink_loss=use_sink_loss)

for i in range(200000):
  xt, yt = next(minib_ds_iter)
  lpic, lchild_pic, lwd, lsink = step(xt, yt, pic_loss, child_loss, weight_loss, sink_loss, n_iters)

  if i % 5000 == 0:
    # Periodically show the image of the network followed by the images of
    # its successive descendants.
    # we need to compute the total y:
    y = network.rgb(im_enc_input)
    clear_output()
    print("step {}, losses: rgb:{} child_rgb:{} weight_divergence:{} sink:{}".format(i, lpic, lchild_pic, lwd, lsink))
    pl.figure(figsize=(15, 5))
    n_pl = 10
    pl.subplot(1, n_pl, 1)
    pl.imshow(np.clip(zoom(y.numpy().reshape([IMG_HEIGHT, IMG_WIDTH, 3])), 0,1))
    pl.axis("off")
    new_net = network
    child_target_father = True
    # A dedicated index is used for the panels so that the training step
    # counter `i` is not shadowed.
    for plot_idx in range(2, n_pl + 1):
      new_net = new_net.createNewNetwork()
      y = new_net.rgb(im_enc_input)
      pl.subplot(1,n_pl,plot_idx)
      pl.imshow(np.clip(zoom(y.numpy().reshape([IMG_HEIGHT, IMG_WIDTH, 3])), 0,1))
      pl.axis("off")
    pl.show()


## Evaluations

In [None]:

# Show the image of the trained network (first panel) followed by the images
# of its successive descendants.
n_pl = 10
pl.figure(figsize=(15, 5))
new_net = network
for i in range(1, n_pl + 1):
  if i > 1:
    new_net = new_net.createNewNetwork()
  y = new_net.rgb(im_enc_input)
  picture = y.numpy().reshape([IMG_HEIGHT, IMG_WIDTH, 3])
  pl.subplot(1, n_pl, i)
  pl.imshow(np.clip(zoom(picture), 0, 1))
  pl.axis("off")
pl.show()

In [None]:
# Collect the flattened weights of the network and of 10 successive
# descendants, then plot how they diverge.
x_l = np.arange(1, 11, dtype=np.int32)

wd_fi_l = []
new_net = network

initial_w = tf.concat([tf.reshape(v, [-1]) for v in new_net.trainable_variables], 0)
all_w = [initial_w]
while len(all_w) <= 10:
  new_net = new_net.createNewNetwork()
  flat_parts = [tf.reshape(v, [-1]) for v in new_net.trainable_variables]
  new_w = tf.concat(flat_parts, 0)
  all_w.append(new_w)

_ = show_weight_divergence(all_w)

# Experiment 2: Self-replication with some fixed weights

In [None]:
# Configuration used for experiment 2.
n_hidden = 3
wo = 30
size_hidden = 64
standardize_weights = True
switch_init_constant = 0. # 50% fixed weights at the beginning.

network = SelfReplicator(
    n_hidden, wo, size_hidden, standardize_weights,
    use_fixed_weights=True,
    switch_init_constant=switch_init_constant,
    num_extra_params=0)

from functools import reduce

def reduce_multiply(l):
  """Return the product of all the elements of `l`."""
  return reduce(lambda a, b: a*b, l)

# Print the shape and size of every variable of each group, followed by the
# total of the group.
for header, variables in (
    ("Showing parameters:", network.trainable_variables),
    ("Showing fixed parameters:", network.trainable_fixed_variables)):
  print(header)
  tot = 0
  for v in variables:
    n_params = reduce_multiply(v.shape)
    print(v.shape, n_params)
    tot += n_params
  print("Total parameters: ", tot)

trainer = tf.keras.optimizers.Adamax(1e-3)

BATCH_SIZE = 256

def generate_new_dataset_iter(new_target):
  """Return an endless iterator of (inputs, targets) minibatches.

  Rows of `im_enc_input` are paired with rows of `new_target`, shuffled with
  a buffer of 1000 elements and grouped into batches of BATCH_SIZE.
  """
  pairs = tf.data.Dataset.from_tensor_slices((im_enc_input, new_target))
  shuffled = pairs.cache().shuffle(1000)
  batches = shuffled.batch(BATCH_SIZE).repeat()
  return iter(batches)

minib_ds_iter = generate_new_dataset_iter(target)


In [None]:
# Hyperparameters used during training. Compared to Experiment 1 the tuple
# has an extra last entry, fixed_w_loss:
# (pic_loss, child_loss, weight_loss, sink_loss, n_iters, fixed_w_loss).
sink_exp = (10., 0., 0., 10000., 0, 1.)

# Choose the experiment or create your own parameters.
hpars = sink_exp

# n_iters is the number of children that are recursively generated, with
# child_loss and weight_loss applied to each of them.
# Sink experiments do not need it and set it to 0.
(pic_loss, child_loss, weight_loss,
 sink_loss, n_iters, fixed_w_loss) = hpars[:6]

In [None]:
# train and visualize!

from functools import partial

def createStep(network, trainer, use_sink_loss):
  """Build the training step function for a `network` with fixed weights.

  params:
  network: the model to train; its trainable_variables and
    trainable_fixed_variables are updated in place.
  trainer: the optimizer used to apply the gradients.
  use_sink_loss: whether the sink loss is computed at all.

  returns: step(xt, yt, pic_loss, child_loss, weight_loss, fixed_w_loss,
    sink_loss, n_iters), which performs one update and returns
    (lpic, lchild_pic, lwd, is_fixed_mean, lsink).
  """
  @tf.function
  def step(xt, yt, pic_loss, child_loss, weight_loss, fixed_w_loss, sink_loss, n_iters, use_sink_loss):
    with tf.GradientTape(persistent=True) as g:
      # image loss of the network itself.
      y = network.rgb(xt)
      lpic = tf.reduce_mean(tf.square(y - yt)) * pic_loss

      # current weights, flattened: the target for every generated copy.
      target_wy = tf.concat([tf.reshape(v, [-1]) for v in network.trainable_variables], 0)

      # per-weight switch value in (0, 1) and its mean over all weights.
      is_fixed = tf.sigmoid(network.is_fixed_logit)
      is_fixed_mean = tf.reduce_mean(is_fixed)

      if use_sink_loss:
        # sink loss: generate a neighbor and enforce it converges to the original.
        neighbor_weights = network.unflattenWeights(network.noisyCopyWeights(tolerance_std=0.02))
        new_w = network.generateNewWeights(weights=neighbor_weights)
        wy = tf.concat([tf.reshape(v, [-1]) for v in new_w], 0)
        # errors are weighted by (1 - is_fixed), so weights that are mostly
        # fixed contribute less.
        lsink = tf.reduce_mean(tf.square((wy - target_wy) * (1. - is_fixed))) * sink_loss
        # normalize such that it's independent from is-fixed value:
        # NOTE(review): no guard against is_fixed_mean reaching 1 here.
        lsink /= (1. - is_fixed_mean)
      else:
        lsink = 0


      # Generate n_iters successive children, each from the previous one, and
      # accumulate their image and weight divergence losses. The weight of the
      # losses is halved at every generation.
      weights = network.layers
      lchild_pic = 0
      lwd = 0
      multiplier = 1.
      for i in range(1, n_iters+1):
        new_w = network.generateNewWeights(weights=weights)
        wy = tf.concat([tf.reshape(v, [-1]) for v in new_w], 0)
        weights = network.unflattenWeights(new_w)

        new_y = network.rgb(xt, weights=weights)
        lchild_pic += tf.reduce_mean(tf.square(new_y - yt)) * child_loss * multiplier
        # the weight loss is different, because we ignore, partially, wrong classifications
        # for the ones that are fixed. This is likely redundant and can be removed.
        lw =  tf.reduce_mean(tf.square((wy - target_wy) * (1. - is_fixed))) * weight_loss * multiplier
        # normalize such that it's independent from is_fixed value:
        lw /= (1. - is_fixed_mean)
        lwd += lw


        multiplier /= 2.

      # penalty proportional to the mean switch value.
      lfixedw = is_fixed_mean * fixed_w_loss

      l = lpic + lchild_pic + lwd + lfixedw + lsink

    # the switch logits and the fixed weights are trained together with the
    # regular weights.
    all_vars = network.trainable_variables + network.trainable_fixed_variables
    grads = g.gradient(l, all_vars)

    trainer.apply_gradients(zip(grads, all_vars))

    return lpic, lchild_pic, lwd, is_fixed_mean, lsink
  # use_sink_loss is bound here so that callers do not have to pass it.
  return partial(step, use_sink_loss=use_sink_loss)

# The sink loss is only computed when its coefficient is not (almost) zero.
use_sink_loss = sink_loss > 1e-6
step = createStep(network, trainer, use_sink_loss=use_sink_loss)

for i in range(500000):
  # using test to train too, because we want to see the effect of learning
  # on the parameter space.
  xt, yt = next(minib_ds_iter)
  lpic, lchild_pic, lwd, is_fixed_mean, lsink = step(xt, yt, pic_loss, child_loss, weight_loss, fixed_w_loss, sink_loss, n_iters)

  if i % 5000 == 0:
    # Periodically show the image of the network followed by the images of
    # its successive descendants.
    # we need to compute the total y:
    y = network.rgb(im_enc_input)
    clear_output()
    print("step {}, losses: rgb:{} child_rgb:{} weight_divergence:{} is_fixed_mean:{} sink:{}".format(i, lpic, lchild_pic, lwd, is_fixed_mean, lsink))
    pl.figure(figsize=(15, 5))
    n_pl = 9
    pl.subplot(1, n_pl, 1)
    pl.imshow(np.clip(zoom(y.numpy().reshape([IMG_HEIGHT, IMG_WIDTH, 3])), 0,1))
    pl.axis("off")
    new_net = network
    child_target_father = True
    # A dedicated index is used for the panels so that the training step
    # counter `i` is not shadowed.
    for plot_idx in range(2, n_pl + 1):
      new_net = new_net.createNewNetwork()
      y = new_net.rgb(im_enc_input)
      pl.subplot(1,n_pl,plot_idx)
      pl.imshow(np.clip(zoom(y.numpy().reshape([IMG_HEIGHT, IMG_WIDTH, 3])), 0,1))
      pl.axis("off")
    pl.show()


## Alternatively load a saved model from Github

In [None]:
# Download and extract the saved models into the "saved_models" directory.
!wget -O saved_models.zip 'https://github.com/google-research/self-organising-systems/blob/master/self_replicating_nn/assets/saved_models.zip?raw=true'
!unzip -oq "saved_models.zip" -d "saved_models"
# Load the saved model:
# NOTE(review): `params` is not defined anywhere in the visible notebook; the
# step that reads it from the extracted files appears to be missing, so this
# cell raises a NameError as written. From its use below, `params` is expected
# to be a sequence holding the network weights followed by is_fixed_logit and
# fixed_w -- confirm the file name and format inside saved_models.
new_net = SelfReplicator(n_hidden, wo, size_hidden, standardize_weights, 
                         use_fixed_weights=True, switch_init_constant=switch_init_constant,
                         num_extra_params=0)
new_net.set_weights(params[:-2])
new_net.set_fixed_weights(params[-2], params[-1])

network = new_net

## Evaluations

In [None]:
# Show the image of the network followed by the images of its successive
# descendants, 10 per row.
n_pl = 20
sub_l = (n_pl + 9) // 10  # number of rows, rounded up
fh = int(sub_l * 1.5)
pl.figure(figsize=(15, fh))
new_net = network

for i in range(1, n_pl + 1):
  is_descendant = i > 1
  if is_descendant:
    new_net = new_net.createNewNetwork()
  y = new_net.rgb(im_enc_input)
  picture = y.numpy().reshape([IMG_HEIGHT, IMG_WIDTH, 3])
  pl.subplot(sub_l, 10, i)
  pl.imshow(np.clip(zoom(picture), 0, 1))
  pl.axis("off")

  if is_descendant:
    new_w = tf.concat([tf.reshape(v, [-1]) for v in new_net.trainable_variables], 0)

pl.show()

In [None]:
# Per-weight switch value, sigmoid(is_fixed_logit), and its mean over all the
# weights of the network.
is_fixed = tf.sigmoid(network.is_fixed_logit)
is_fixed_mean = tf.reduce_mean(is_fixed)
print("Fixed mean", is_fixed_mean)

In [None]:
print("Showing that the variable weights matter:")
# Evidence that the variance of the variables is important: every variable
# tensor is replaced by a constant tensor holding its mean.
# You can also try to fill with 0 (tf.fill(v.shape, 0.)), but it would be
# more unfair.
zero_w = [tf.fill(v.shape, tf.reduce_mean(v)) for v in tf.nest.flatten(network.layers)]
zero_w = network.unflattenWeights(zero_w)

print("If the self-replication were full-zero (it would then shift the output to the mean of the group):")
y = network.rgb(im_enc_input, weights=zero_w)
pl.imshow(np.clip(zoom(y.numpy().reshape([IMG_HEIGHT, IMG_WIDTH, 3])), 0,1))
pl.axis("off")
pl.show()

print("Showing self-replication with such initialization:")
n_pl = 20
sub_l = n_pl // 10 + (0 if n_pl % 10 == 0 else 1)
fh = int(sub_l * 1.5)
pl.figure(figsize=(15, fh))
pl.subplot(sub_l,10,1)
pl.imshow(np.clip(zoom(y.numpy().reshape([IMG_HEIGHT, IMG_WIDTH, 3])), 0,1))
pl.axis("off")
new_w = zero_w

for i in range(2, n_pl + 1):
  # generateNewWeights returns a flat list, while the forward pass expects
  # per-layer tuples. Regroup it so this also works for networks that do not
  # use fixed weights (with fixed weights the result is unchanged).
  new_w = network.unflattenWeights(network.generateNewWeights(weights=new_w))
  y = network.rgb(im_enc_input, weights=new_w)
  pl.subplot(sub_l,10,i)
  pl.imshow(np.clip(zoom(y.numpy().reshape([IMG_HEIGHT, IMG_WIDTH, 3])), 0,1))
  pl.axis("off")

pl.show()


In [None]:
# Weight divergence plot: replicate the network n_pl times and record the
# flattened trainable weights of every generation (the parent included).
n_pl = 300

new_net = network

initial_weights = tf.concat([tf.reshape(v, [-1]) for v in new_net.trainable_variables], 0)
all_w = [initial_weights]
for i in range(1, n_pl + 1):
  new_net = new_net.createNewNetwork()
  new_w = tf.concat([tf.reshape(v, [-1]) for v in new_net.trainable_variables], 0)
  all_w.append(new_w)

# Weights of the last generation; the following cells compare against them.
final_w = new_w

child_wd_fp, child_wd_fl = show_weight_divergence(all_w)


In [None]:
# We sample neighbors and see whether they converge to the final state of a
# normal run
n_pl = 300

num_runs = 40
sub_l = num_runs // 10 + (0 if num_runs % 10 == 0 else 1)
fh = int(sub_l * 1.5)
pl.figure(figsize=(15, fh))

# One list of divergences (from `final_w`) per run: entry 0 is measured on the
# sampled neighbor itself, entry k after k replication steps.
all_runs_wd_l = []

for r in range(num_runs):
  # Noisy copy of the trained weights: the starting point of this run.
  flat_nw = network.noisyCopyWeights(tolerance_std=0.02)
  neigh_wy = tf.concat([tf.reshape(v, [-1]) for v in flat_nw], 0)
  # Divergences are stored as Python floats so that numpy and matplotlib get
  # plain numbers instead of tensors.
  wd_l = [float(tf.reduce_mean(tf.square(neigh_wy - final_w)))]

  prev_layer = network.unflattenWeights(flat_nw)
  for i in range(1, n_pl + 1):
    new_w_l = network.generateNewWeights(weights=prev_layer)
    new_w = tf.concat([tf.reshape(v, [-1]) for v in new_w_l], 0)
    wd_l.append(float(tf.reduce_mean(tf.square(final_w - new_w))))

    prev_layer = network.unflattenWeights(new_w_l)

  # Image produced by the weights reached at the end of the run.
  y = network.rgb(im_enc_input, weights=prev_layer)
  pl.subplot(sub_l,10,r+1)
  pl.imshow(np.clip(zoom(y.numpy().reshape([IMG_HEIGHT, IMG_WIDTH, 3])), 0,1))
  pl.axis("off")
  all_runs_wd_l.append(wd_l)

# Show the final images
print("Final states")
pl.show()

# Plot graph
X = np.array(all_runs_wd_l)
mu = X.mean(axis=0)
sigma = X.std(axis=0)

x_l = np.arange(1, len(mu)+1, dtype=np.int32)

pl.title("Weight divergence of random neighbors from sink")
pl.xlabel("Number of replication steps")
pl.plot(x_l, mu)
pl.yscale("log")
pl.fill_between(x_l, mu+sigma, mu-sigma, alpha=0.5)
pl.show()

# Same data, one curve per run.
pl.title("Weight divergence of random neighbors from sink")
pl.xlabel("Number of replication steps")
for wd_l in all_runs_wd_l:
  x_l = np.arange(1, len(wd_l)+1, dtype=np.int32)
  pl.plot(x_l, wd_l)
pl.yscale("log")
pl.show()

In [None]:

# Join the two plots: the two curves returned by show_weight_divergence() and
# the mean/std band of the random-neighbor runs share one log-scaled figure.

pl.title("Weight divergence")
for curve, curve_label in ((child_wd_fp, "From parent"),
                           (child_wd_fl, "From last (sink)")):
  pl.plot(curve[0], curve[1], label=curve_label)

X = np.array(all_runs_wd_l)
mu, sigma = X.mean(axis=0), X.std(axis=0)
x_l = np.arange(1, len(mu)+1, dtype=np.int32)

band_color = "green"
pl.plot(x_l, mu, label="Random neighbor from sink", color=band_color)
pl.fill_between(x_l, mu+sigma, mu-sigma, color=band_color, alpha=0.4)

pl.yscale("log")
pl.xlabel("Number of replication steps")
pl.legend()
pl.show()


In [None]:
# Histogram (100 bins) of the sigmoid of `network.is_fixed_logit`.
is_fixed = tf.sigmoid(network.is_fixed_logit).numpy()
pl.title("Histogram for the proportion of fixed weights")
pl.hist(is_fixed, bins=100)
pl.show()



In [None]:
# Number of elements of every trainable variable, in order.
all_w_sizes = [tf.reshape(v, [-1]).shape[0] for v in network.trainable_variables]
print(all_w_sizes)

max_w_sw = 0.5
print("Showing all variable weights (where w_sw < {}) distributions.".format(max_w_sw))
print("note that the variable is multiplied by (1. - w_sw)")

# The sigmoid of `is_fixed_logit` is split into one chunk per trainable
# variable (chunk sizes = all_w_sizes) and paired with that variable.
for w_var, w_sw in zip(network.trainable_variables, tf.split(tf.sigmoid(network.is_fixed_logit), all_w_sizes)):
  print("Var shape:", w_var.shape)
  # Keep only the entries whose gate value is below max_w_sw, scaled by (1 - gate).
  variable_w = tf.gather(tf.reshape(w_var, [-1]) * (1. - w_sw), tf.where(w_sw < max_w_sw)).numpy()
  pl.hist(variable_w, bins=100)
  pl.show()

# Experiment 3: Self-replication with variation

In [None]:
# generate all possible targets.

# NumPy copy of `target`; the channel permutations below are applied to it.
target_np = target.numpy()

def right_shift_np(im):
  """Returns `im` with its last axis reordered to channels (2, 0, 1)."""
  return np.take(im, [2, 0, 1], axis=-1)
def left_shift_np(im):
  """Returns `im` with its last axis reordered to channels (1, 2, 0)."""
  return np.take(im, [1, 2, 0], axis=-1)
def swap_outer_np(im):
  """Returns `im` with the first and third channel of its last axis swapped."""
  return np.take(im, [2, 1, 0], axis=-1)

# Target images of the three variations, listed in the same order as
# `all_variation_inputs` below (right shift, left shift, outer-channel swap).
all_variation_targets = [
    right_shift_np(target_np),
    left_shift_np(target_np),
    swap_outer_np(target_np)]

# prepare the variation_inputs.
# `no_variation` is the all-zero vector; each variation sets a different entry to one.
no_variation = tf.constant([0., 0., 0.])
rs_variation = tf.constant([1., 0., 0.])
ls_variation = tf.constant([0., 1., 0.])
chsw_variation = tf.constant([0., 0., 1.])

# does not include no_variation.
all_variation_inputs = [rs_variation, ls_variation, chsw_variation]

# plot the images: the original target first, then the three variation targets,
# side by side in a single row.
pl.figure(figsize=(6, 12))
for panel, y in enumerate([target_np] + all_variation_targets, start=1):
  pl.subplot(1, 4, panel)
  pl.imshow(np.clip(zoom(y.reshape([IMG_HEIGHT, IMG_WIDTH, 3])), 0,1))
  pl.axis("off")

pl.show()

In [None]:
# Hyper-parameters handed to SelfReplicator for this experiment.
n_hidden = 3
wo = 30
size_hidden = 64
standardize_weights=True
switch_init_constant = 2. # Fixed weights are the majority at the beginning.

# presumably num_extra_params=3 matches the 3-element variation vectors -- TODO confirm
network = SelfReplicator(
    n_hidden, wo, size_hidden, standardize_weights, 
    use_fixed_weights=True, switch_init_constant=switch_init_constant, 
    num_extra_params=3)

print("Showing parameters:")
from functools import reduce
# Product of all entries of an iterable; used to count the elements of a shape.
reduce_multiply = lambda l: reduce(lambda a,b: a*b, l)

# Print shape and element count of every trainable variable, then the total.
tot = 0
for v in network.trainable_variables:
  print(v.shape, reduce_multiply(v.shape))
  tot += reduce_multiply(v.shape)
print("Total parameters: ", tot)
print("Showing fixed parameters:")
# Same report for the variables listed in `trainable_fixed_variables`.
tot = 0
for v in network.trainable_fixed_variables:
  print(v.shape, reduce_multiply(v.shape))
  tot += reduce_multiply(v.shape)
print("Total parameters: ", tot)

# Learning rate 1e-3 up to step 100000, 1e-4 afterwards.
lr_sched = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
  [100000], [1e-3, 1e-4])
trainer = tf.keras.optimizers.Adamax(lr_sched)

# Mini-batch size used by generate_new_dataset_iter().
BATCH_SIZE = 256

def generate_new_dataset_iter(new_target):
  """Returns an endless iterator over shuffled (input, target, index) batches.

  Each element pairs one row of `im_enc_input` with the matching row of
  `new_target` and with its row index (int32, shape [1]). The dataset is
  cached, shuffled with a buffer of 1000 elements, grouped into batches of
  BATCH_SIZE and repeated forever.
  """
  row_ids = np.arange(len(im_enc_input), dtype=np.int32)[..., np.newaxis]
  components = (im_enc_input, new_target, tf.constant(row_ids))
  dataset = tf.data.Dataset.from_tensor_slices(components)
  batches = dataset.cache().shuffle(1000).batch(BATCH_SIZE).repeat()
  return iter(batches)

# Mini-batch iterator over the original (non-varied) target.
minib_ds_iter = generate_new_dataset_iter(target)

# Running count of training steps; incremented by the training loop.
tot_steps = 0

In [None]:
# train and visualize.
import random

from functools import partial

@tf.function
def step(xt, yt, idxt, variation_inputs, t1, pic_loss, fixed_w_loss, sink_loss, switch_colors_loss, n_iters):
  """Runs one optimisation step and returns the individual loss terms.

  The optimised loss is the sum of: the image loss of the parent, the mean of
  the "is fixed" gates, a sink loss on the parent, the image loss of one child
  created with `variation_inputs`, and a sink loss on that child.

  Args:
    xt: batch of inputs fed to `network.rgb`.
    yt: target outputs of the parent for `xt`.
    idxt: indices used to gather from `t1` the rows that belong to `xt`.
    variation_inputs: variation vector used to create the child weights.
    t1: complete target of the child; indexed with `idxt`.
    pic_loss: multiplier of the parent image loss.
    fixed_w_loss: multiplier of the mean "is fixed" gate.
    sink_loss: multiplier of both sink losses.
    switch_colors_loss: multiplier of the child image loss.
    n_iters: unused; kept so that existing callers keep working.

  Returns:
    The tuple (lpic, is_fixed_mean, lsink, lsink_child, lchild_pic).
  """
  # The tape serves a single gradient() call, so it does not need to be
  # persistent.
  with tf.GradientTape() as g:
    y = network.rgb(xt)
    lpic = tf.reduce_mean(tf.square(y - yt)) * pic_loss

    target_wy = tf.concat([tf.reshape(v, [-1]) for v in network.trainable_variables], 0)

    is_fixed = tf.sigmoid(network.is_fixed_logit)
    is_fixed_mean = tf.reduce_mean(is_fixed)

    # sink loss: generate a neighbor and enforce it converges to the original.
    neighbor_weights = network.unflattenWeights(network.noisyCopyWeights(tolerance_std=0.02))
    new_w = network.generateNewWeights(no_variation, weights=neighbor_weights)
    wy = tf.concat([tf.reshape(v, [-1]) for v in new_w], 0)
    lsink = tf.reduce_mean(tf.square((wy - target_wy) * (1. - is_fixed))) * sink_loss
    # normalize such that it's independent from is-fixed value:
    lsink /= (1. - is_fixed_mean)

    # Now create a child with the variation chosen by the caller:
    # - right shift ch image
    # - left shift ch image
    # - swap outer channels
    # And then do a sink loss on that.
    child_w = network.generateNewWeights(variation_inputs, weights=network.layers)
    # Flattened child weights: the target of the child's sink loss below.
    child_wy = tf.concat([tf.reshape(v, [-1]) for v in child_w], 0)
    child_weights = network.unflattenWeights(child_w)
    new_y = network.rgb(xt, weights=child_weights)

    # generate the target y now:
    new_target = tf.gather(t1, idxt)
    lchild_pic = tf.reduce_mean(tf.square(new_y - new_target)) * switch_colors_loss
    lchild_pic *= 3 # there are 3 options here.

    # Now sink loss on child.
    neighbor_weights = network.unflattenWeights(network.noisyCopyWeights(weights=child_weights, tolerance_std=0.02))
    new_new_w = network.generateNewWeights(no_variation, weights=neighbor_weights)
    wy = tf.concat([tf.reshape(v, [-1]) for v in new_new_w], 0)
    lsink_child = tf.reduce_mean(tf.square((wy - child_wy) * (1. - is_fixed))) * sink_loss
    # normalize such that it's independent from is-fixed value:
    lsink_child /= (1. - is_fixed_mean)
    lsink_child *= 3 # there are 3 options here.

    lfixedw = is_fixed_mean * fixed_w_loss

    total_loss = lpic + lfixedw + lsink + lsink_child + lchild_pic

  all_vars = network.trainable_variables + network.trainable_fixed_variables
  grads = g.gradient(total_loss, all_vars)

  trainer.apply_gradients(zip(grads, all_vars))

  return lpic, is_fixed_mean, lsink, lsink_child, lchild_pic

def _plot_lineage(title, variations, figsize, include_parent=False, n_pl=9):
  """Prints `title` and plots one row of `n_pl` successive networks.

  Starting from the global `network`, every panel shows the output of the
  child of the previous panel's network. The first children are created with
  the entries of `variations` (in order), all later ones with `no_variation`.
  When `include_parent` is true the first panel shows `network` itself.
  """
  print(title)
  pl.figure(figsize=figsize)
  pending = list(variations)
  net = network
  for panel in range(1, n_pl + 1):
    if panel > 1 or not include_parent:
      net = net.createNewNetwork(pending.pop(0) if pending else no_variation)
    y = net.rgb(im_enc_input)
    pl.subplot(1, n_pl, panel)
    pl.imshow(np.clip(zoom(y.numpy().reshape([IMG_HEIGHT, IMG_WIDTH, 3])), 0,1))
    pl.axis("off")
  pl.show()


# Loss multipliers; they stay constant for the whole run.
pic_loss = 10.
fixed_w_loss = 1.
sink_loss = 10000 # 10000.
switch_colors_loss = 10. # All losses are also multiplied by the chance of it happening.
n_iters = 0

for i in range(500000):
  tot_steps += 1
  # using test to train too, because we want to see the effect of learning
  # on the parameter space.
  xt, yt, idxt = next(minib_ds_iter)
  idxt = tf.reshape(idxt, [-1])

  # generate the transformations: pick one of the variations uniformly.
  id1 = random.randrange(len(all_variation_inputs))
  variation_inputs = all_variation_inputs[id1]
  t1 = all_variation_targets[id1]

  lpic, is_fixed_mean, lsink, lsink_child, lchild_pic = step(
      xt, yt, idxt, variation_inputs, t1, pic_loss, fixed_w_loss, sink_loss, switch_colors_loss, n_iters)

  if i % 5000 == 0:
    # Periodic report: losses first, then the offspring of several lineages.
    clear_output()
    print("step {} (tot: {}), losses: rgb:{} is_fixed_mean:{} sink_loss:{}".format(i, tot_steps, lpic, is_fixed_mean, lsink))
    print("Child 1: rgb : {} (norm {}) sink_child_loss:{} (norm {})".format(lchild_pic, lchild_pic / 3 / switch_colors_loss, lsink_child, lsink_child / 3))

    _plot_lineage("Original generate 0 degrees offspring.", [], (15, 15), include_parent=True)

    for first_variation in (rs_variation, ls_variation, chsw_variation):
      _plot_lineage("colored child: generate new offspring.", [first_variation], (15, 5))

    _plot_lineage("2nd gen colored child: rs rs", [rs_variation, rs_variation], (15, 5))
    _plot_lineage("2nd gen colored child: rs ls", [rs_variation, ls_variation], (15, 5))



## Alternatively load a saved model from Github

In [None]:
# Load the saved model:
# The two shell commands download and unpack the archive of saved models.
!wget -O saved_models.zip 'https://github.com/google-research/self-organising-systems/blob/master/self_replicating_nn/assets/saved_models.zip?raw=true'
!unzip -oq "saved_models.zip" -d "saved_models"
# NOTE(review): `params` is not defined in this cell and nothing here reads the
# unzipped "saved_models" directory -- presumably the stored parameters have to
# be loaded into `params` first; confirm.
new_net = SelfReplicator(n_hidden, wo, size_hidden, standardize_weights, 
                use_fixed_weights=True, switch_init_constant=switch_init_constant, num_extra_params=3)
# All entries of `params` except the last two go to set_weights(); the last two
# are handed to set_fixed_weights().
new_net.set_weights(params[:-2])
new_net.set_fixed_weights(params[-2], params[-1])

# From here on the evaluation cells use the loaded model.
network = new_net

## Evaluations

In [None]:
def get_ops(ops_to_do, idx, default=no_variation):
  """Returns `ops_to_do[idx]`, or `default` once `idx` reaches the end of the list."""
  return ops_to_do[idx] if idx < len(ops_to_do) else default

def convert_y_to_im(y):
  """Turns a network output tensor into a zoomed image clipped to [0, 1]."""
  pixels = y.numpy().reshape([IMG_HEIGHT, IMG_WIDTH, 3])
  return np.clip(zoom(pixels), 0, 1)

def viz_sequence(all_im):
  """Shows the images of `all_im`, in order, on a grid with 10 columns."""
  n_cols = 10
  n_rows = int(math.ceil(len(all_im) / n_cols))
  pl.figure(figsize=(15, int(n_rows * 1.5)))
  for position, im in enumerate(all_im, start=1):
    pl.subplot(n_rows, n_cols, position)
    pl.imshow(im)
    pl.axis("off")

  pl.show()

def show_sequence(ops_to_do, default, n_pl, network=network, return_network=False):
  """Plots the images of `n_pl` successive generations starting at `network`.

  Generation k (0-based) is rendered first and then replicated with
  `ops_to_do[k]`, or with `default` once `ops_to_do` is exhausted.

  Returns:
    The list of images; with `return_network=True` the tuple
    (images, network obtained after the last replication).
  """
  current = network
  frames = []
  for generation in range(n_pl):
    frames.append(convert_y_to_im(current.rgb(im_enc_input)))
    variation = get_ops(ops_to_do, generation, default=default)
    current = current.createNewNetwork(variation)

  viz_sequence(frames)

  return (frames, current) if return_network else frames

def evaluate_identity(n_pl, network=network, variation=no_variation, verbose=False):
  """Replicates `network` `n_pl` times and plots the resulting weight divergence.

  Args:
    n_pl: number of successive replications.
    network: starting network.
    variation: variation vector passed to every `createNewNetwork` call.
    verbose: if true, the images of all generations are shown as one
      sequence; otherwise only the initial and the final image are shown.
  """
  new_net = network
  first_im = convert_y_to_im(new_net.rgb(im_enc_input))

  all_im = []
  if verbose:
    all_im.append(first_im)
  else:
    print("Initial image")
    pl.imshow(first_im)
    pl.axis("off")
    pl.show()

  # Flattened trainable weights of every generation, the starting one included.
  all_w = [tf.concat([tf.reshape(v, [-1]) for v in new_net.trainable_variables], 0)]
  for _ in range(n_pl):
    new_net = new_net.createNewNetwork(variation)
    all_w.append(tf.concat([tf.reshape(v, [-1]) for v in new_net.trainable_variables], 0))

    if verbose:
      all_im.append(convert_y_to_im(new_net.rgb(im_enc_input)))

  show_weight_divergence(all_w)

  if verbose:
    print("Sequence:")
    viz_sequence(all_im)
  else:
    print("Final image")
    pl.imshow(convert_y_to_im(new_net.rgb(im_enc_input)))
    pl.axis("off")
    pl.show()

In [None]:
# Examples
# The first two replications use `rs_variation`, all later ones `no_variation`;
# 40 images are produced in total.
ops_to_do = [rs_variation, rs_variation]

all_im = show_sequence(ops_to_do, no_variation, 40)
# NOTE(review): all_im[0] is the image of the starting network, rendered before
# any variation is applied, while the next example cell shows all_im[-1] --
# confirm that index 0 is intended here.
show_single_im(all_im[0])

In [None]:
# No explicit operations: every replication falls back to the default
# variation, which is `chsw_variation` here.
ops_to_do = []

all_im = show_sequence(ops_to_do, chsw_variation, 40)
# Show the last image of the sequence on its own.
show_single_im(all_im[-1])

These evaluations investigate the results of sink loss applied on our 4 in-training configurations.

We show the sequences obtained by repeatedly applying a "no variation" mutation, starting from a state trained with sink loss.

Note that:

1.   The network where we apply the sink loss during training is *NEVER* the sink. We find the sink after repeated applications of the replication function.
2.   We infer 1. from the plots, showing that there is a high divergence from the original network, but the divergence from the parent eventually converges to insignificant values.
3.   We also see that the "channel swap" network did not create a sink and it diverts to another sink (the left shift sink).
4.   We infer 3. from the visual change in the picture and a clear 2-mode behavior observed in the divergence from parent graph. Further evidence of the convergence to the "left shift sink" can be seen in other blocks down below.



In [None]:
# Showing if the states with sink loss are actual sinks

# If you set verbose to true, we will print the whole sequence of images.
verbose = True
n_pl_after_first = 299

# The trained network itself, replicated without variation.
print("net: id*{}".format(n_pl_after_first+1))
evaluate_identity(n_pl=1+n_pl_after_first, verbose=verbose)

# One child per variation, each then replicated without variation.
# The last one (chsw) IS NOT a sink!
first_steps = (
    ("net: ls id*{}", ls_variation),
    ("net: rs id*{}", rs_variation),
    ("net: chsw id*{}", chsw_variation),
)
for label, first_variation in first_steps:
  print(label.format(n_pl_after_first))
  new_net = network.createNewNetwork(first_variation)
  evaluate_identity(n_pl=n_pl_after_first, network=new_net, verbose=verbose)



Below, we investigate whether repeated applications of replication with variation converge to a sink.

We observe:

1.   All these replication with variation operations show a clear sink.
2.   This is undesired behavior and shows overfitting to training configurations.



In [None]:
# Showing if repeated applications of a variation cause a sink

# If you set this to true, we will print the whole sequence of images.
verbose = True
n_pl = 299

# evaluate_identity() is called without `network=`, so every run starts from
# its default network and applies the same variation n_pl times; no child has
# to be created beforehand.
repeated_variations = (
    ("net: ls*{}", ls_variation),
    ("net: rs*{}", rs_variation),
    ("net: chsw*{}", chsw_variation),
)
for label, variation in repeated_variations:
  print(label.format(n_pl))
  evaluate_identity(n_pl=n_pl, variation=variation, verbose=verbose)


Below, we analyze the sinks found in these out-of-training transitions.

We can conclude that all these sinks are equivalent from observing the negligible values of their weight divergences.


In [None]:
# Analysing the sinks to prove they are the same:

def _flat_trainable_weights(net):
  """Concatenates all trainable variables of `net` into one flat vector."""
  return tf.concat([tf.reshape(v, [-1]) for v in net.trainable_variables], 0)

def _mean_squared_difference(w_a, w_b):
  """Mean of the squared element-wise difference `w_a - w_b`."""
  return tf.reduce_mean(tf.square((w_a - w_b)))

print("net_1: ls ls id*98")
ops_to_do = [ls_variation] * 2
all_im1, net_1 = show_sequence(ops_to_do, default=no_variation, n_pl=100, return_network=True)

print("net_2: rs*5 id*95")
ops_to_do = [rs_variation] * 5
all_im2, net_2 = show_sequence(ops_to_do, default=no_variation, n_pl=100, return_network=True)

net1_w = _flat_trainable_weights(net_1)
net2_w = _flat_trainable_weights(net_2)

weight_divergence = _mean_squared_difference(net2_w, net1_w)
print("weight divergence of net_1 vs net_2:", weight_divergence)

print("trained sink ls_net: ls id*99")
ops_to_do = [ls_variation]
all_im3, ls_net = show_sequence(ops_to_do, default=no_variation, n_pl=100, return_network=True)
ls_net_w = _flat_trainable_weights(ls_net)
weight_divergence = _mean_squared_difference(ls_net_w, net1_w)
print("weight divergence of net_1 vs ls_net:", weight_divergence)
weight_divergence = _mean_squared_difference(ls_net_w, net2_w)
print("weight divergence of net_2 vs ls_net:", weight_divergence)

print("net_3: chsw*9 id*91")
ops_to_do = [chsw_variation] * 9

all_im, net_3 = show_sequence(ops_to_do, default=no_variation, n_pl=100, return_network=True)
net3_w = _flat_trainable_weights(net_3)

weight_divergence = _mean_squared_difference(ls_net_w, net3_w)
print("weight divergence of net_3 vs ls_net:", weight_divergence)
weight_divergence = _mean_squared_difference(net3_w, net2_w)
print("weight divergence of net_3 vs net_2:", weight_divergence)
