In [0]:
# 'Licensed under the Apache License, Version 2.0'
#@title Imports + Global Variables + Helpers

import copy
import random
import itertools
import numpy as np
import matplotlib.pyplot as plt
import math
import tensorflow as tf

# Problem size and discount factor shared by every cell below.
num_states = 2
num_actions = 2
gamma = 0.9

# Seed both random number generators used in this notebook (np.random.* for
# sampling policies, random.randint for picking a state) so that
# "Restart & Run All" reproduces the same figures.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Plot styling: classic matplotlib look; `color` is the light-green RGB triple
# used for the background cloud of sampled value functions.
plt.style.use('classic')
color = (152./255, 225./255, 152./255)

#https://stackoverflow.com/questions/42157781/block-diagonal-matrices-in-tensorflow
def block_diagonal(matrices, dtype=tf.float64):
  r"""Constructs block-diagonal matrices from a list of batched 2D tensors.

  Each input is zero-padded along its last axis up to the total column count
  (zeros before and after its own columns), and the padded inputs are then
  concatenated along the second-to-last axis.

  Args:
    matrices: A list of Tensors with shape [..., N_i, M_i] (i.e. a list of
      matrices with the same batch dimension).
    dtype: Data type to use. The Tensors in `matrices` must match this dtype.
  Returns:
    A matrix with the input matrices stacked along its main diagonal, having
    shape [..., \sum_i N_i, \sum_i M_i].

  """
  matrices = [tf.convert_to_tensor(matrix, dtype=dtype) for matrix in matrices]
  # Static (graph-construction-time) shape bookkeeping: accumulate the row and
  # column totals and merge the leading batch dimensions of all inputs.
  blocked_rows = tf.Dimension(0)
  blocked_cols = tf.Dimension(0)
  batch_shape = tf.TensorShape(None)
  for matrix in matrices:
    # Every input must have at least the two trailing matrix dimensions.
    full_matrix_shape = matrix.get_shape().with_rank_at_least(2)
    batch_shape = batch_shape.merge_with(full_matrix_shape[:-2])
    blocked_rows += full_matrix_shape[-2]
    blocked_cols += full_matrix_shape[-1]
  # Dynamic (run-time) total number of columns of the result.
  ret_columns_list = []
  for matrix in matrices:
    matrix_shape = tf.shape(matrix)
    ret_columns_list.append(matrix_shape[-1])
  ret_columns = tf.add_n(ret_columns_list)
  row_blocks = []
  # Running column offset: number of columns taken by the blocks placed so far.
  current_column = 0
  for matrix in matrices:
    matrix_shape = tf.shape(matrix)
    row_before_length = current_column
    current_column += matrix_shape[-1]
    row_after_length = ret_columns - current_column
    # Pad only the last axis: the first rank-1 rows of `paddings` are zeros
    # (no padding on those axes); the final row is (before, after) for columns.
    row_blocks.append(tf.pad(
        tensor=matrix,
        paddings=tf.concat(
            [tf.zeros([tf.rank(matrix) - 1, 2], dtype=tf.int32),
             [(row_before_length, row_after_length)]],
            axis=0)))
  # Stack the padded blocks along the row axis.
  blocked = tf.concat(row_blocks, -2)
  # Re-attach the static shape computed above to the result.
  blocked.set_shape(batch_shape.concatenate((blocked_rows, blocked_cols)))
  return blocked

In [0]:
#@title Define the MDP
# `r` and `P` share one layout: one entry/row per (state, action) pair, ordered
# state-major, i.e. position s*num_actions + a.

# Set to True to sample a random MDP instead of the fixed one from Section 5.
# (The random MDP is no longer sampled and then silently overwritten.)
use_random_mdp = False

if use_random_mdp:
  # random reward function, uniform in [-1, 1)
  r = np.random.uniform(-1, 1, num_states*num_actions)

  # random transition function: every row is a distribution over next states
  # drawn from a flat Dirichlet
  alphas = np.ones(num_states)
  P = np.zeros((num_states*num_actions, num_states))
  for sa in range(num_states*num_actions):
    P[sa] = np.random.dirichlet(alphas)
else:
  # MDP defined in Section 5
  # r = [r(a1, s1), r(a2, s1), r(a1, s2), r(a2, s2)]
  r = np.array([-0.45, -0.1,  0.5,  0.5])

  # P = [
  # ..[P(s1| a1, s1), P(s2| a1, s1)],
  # ..[P(s1| a2, s1), P(s2| a2, s1)],
  # ..[P(s1| a1, s2), P(s2| a1, s2)],
  # ..[P(s1| a2, s2), P(s2| a2, s2)]
  #]
  P = np.array([[ 0.7,  0.3],
                [ 0.99,  0.01],
                [ 0.2,  0.8],
                [ 0.99,  0.01]])

In [0]:
#@title Value Functions
# Sample random stochastic policies and evaluate each one exactly through
# V_pi = (I - gamma * P_pi)^-1 r_pi. The resulting cloud of points is the
# background of every figure below.
num_samples = 50000
alphas = np.ones(num_actions)
value_functions = []
for _ in range(num_samples):
  # Row s of Pi holds pi(.|s) in the columns belonging to state s.
  Pi = np.zeros((num_states, num_states*num_actions))
  for s in range(num_states):
    first_col = s*num_actions
    Pi[s, first_col:first_col + num_actions] = np.random.dirichlet(alphas)

  P_pi = np.matmul(Pi, P)
  r_pi = np.matmul(Pi, r)
  resolvent = np.linalg.inv(np.eye(num_states) - gamma*P_pi)
  V_pi = np.matmul(resolvent, r_pi)
  value_functions.append(V_pi)

# Bounding box of the sampled values; cfg_axes widens it by `eps` per side.
first_coords = [V[0] for V in value_functions]
second_coords = [V[1] for V in value_functions]
xmin, xmax = min(first_coords), max(first_coords)
ymin, ymax = min(second_coords), max(second_coords)
eps = 0.2

def cfg_axes(ax):
  """Strips decoration from `ax` and pins its limits to the sampled value range.

  Hides the top and right spines, turns off all ticks and tick labels, and sets
  the x/y limits to the notebook-level [xmin, xmax] / [ymin, ymax] ranges
  widened by `eps` on each side.

  Args:
    ax: matplotlib Axes, modified in place.
  """
  for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

  x_range = [xmin-eps, xmax+eps]
  y_range = [ymin-eps, ymax+eps]
  ax.set_xlim(x_range)
  ax.set_ylim(y_range)

  # No tick marks and no tick labels on any side.
  hidden = dict.fromkeys(
      ['bottom', 'top', 'left', 'right', 'labelleft', 'labelbottom'], False)
  ax.tick_params(axis='both', which='both', **hidden)
  
# Plot the sampled value functions. A single colour is passed through `color=`
# (as in every other cell) instead of `c=` + `edgecolors=`: an RGB tuple given
# as `c` is ambiguous with per-point scalar values, while `color` sets both the
# face and the edge colour.
plt.figure(figsize=(5,5))
plt.scatter(*zip(*value_functions), color=color)
ax = plt.gca()
cfg_axes(ax)
plt.show()


In [0]:
#@title Value Functions of Deterministic Policies
# Every one-hot distribution over actions (entry i selects action i).
set_deter_actions = [np.eye(num_actions)[i] for i in range(num_actions)]

# Enumerate all ways of assigning a one-hot action distribution to each state
# and evaluate the resulting deterministic policy exactly.
value_functions_deter = []
for policies in itertools.product(set_deter_actions, repeat=num_states):
  Pi = np.zeros((num_states, num_states*num_actions))
  for s, action_dist in enumerate(policies):
    first_col = s*num_actions
    Pi[s, first_col:first_col + num_actions] = action_dist

  P_pi = np.matmul(Pi, P)
  r_pi = np.matmul(Pi, r)
  resolvent = np.linalg.inv(np.eye(num_states) - gamma*P_pi)
  V_pi = np.matmul(resolvent, r_pi)
  value_functions_deter.append(V_pi)

# Background cloud first, deterministic policies on top as large red dots.
plt.figure(figsize=(5,5))
for points, style in ((value_functions, {'color': color}),
                      (value_functions_deter, {'color': 'red', 's': 100})):
  plt.scatter(*zip(*points), **style)
ax = plt.gca()
cfg_axes(ax)
plt.show()

In [0]:
#@title Value functions of semi-deterministic policies
# One (state, one-hot action distribution) entry per state-action pair.
params_sd_policies = [
    (s, np.eye(num_actions)[a])
    for s, a in itertools.product(range(num_states), range(num_actions))]

alphas = np.ones(num_actions)
value_functions_semi_deter = []
for state, state_policy in params_sd_policies:
  pinned_cols = slice(state*num_actions, (state+1)*num_actions)

  for _ in range(1000):
    # Draw a random policy for every state, then overwrite the pinned state
    # with its deterministic action choice.
    Pi = np.zeros((num_states, num_states*num_actions))
    for s in range(num_states):
      Pi[s, s*num_actions:(s+1)*num_actions] = np.random.dirichlet(alphas)
    Pi[state, pinned_cols] = state_policy

    P_pi = np.matmul(Pi, P)
    r_pi = np.matmul(Pi, r)
    resolvent = np.linalg.inv(np.eye(num_states) - gamma*P_pi)
    value_functions_semi_deter.append(np.matmul(resolvent, r_pi))


plt.figure(figsize=(5,5))
for points, point_color in ((value_functions, color),
                            (value_functions_semi_deter, 'orange')):
  plt.scatter(*zip(*points), color=point_color)
ax = plt.gca()
cfg_axes(ax)
plt.show()

In [0]:
#@title The line theorem
# Base policy: an independent random action distribution in every state.
alphas = np.ones(num_actions)
Pi = np.zeros((num_states, num_states*num_actions))
for s in range(num_states):
  Pi[s, s*num_actions:(s+1)*num_actions] = np.random.dirichlet(alphas)

# Pick one state and resample only its action distribution, keeping the base
# policy everywhere else.
state = random.randint(0, num_states-1)
state_cols = slice(state*num_actions, (state+1)*num_actions)
policies = []
for _ in range(1000):
  new_Pi = Pi.copy()
  new_Pi[state, state_cols] = np.random.dirichlet(alphas)
  policies.append(new_Pi)

# Exact value function of each perturbed policy.
value_functions_lines = []
for pi in policies:
  P_pi = np.matmul(pi, P)
  r_pi = np.matmul(pi, r)
  resolvent = np.linalg.inv(np.eye(num_states) - gamma*P_pi)
  value_functions_lines.append(np.matmul(resolvent, r_pi))

plt.figure(figsize=(5,5))
for points, point_color in ((value_functions, color),
                            (value_functions_lines, 'orange')):
  plt.scatter(*zip(*points), color=point_color)
ax = plt.gca()
cfg_axes(ax)
plt.show()

In [0]:
#@title Value functions of mixtures of 2 deterministic policies 
# generate deterministic policies
pol = []
for policies in itertools.product(set_deter_actions, repeat=num_states):
  Pi = np.zeros((num_states, num_states*num_actions))
  for s, action_dist in enumerate(policies):
    Pi[s, s*num_actions:(s+1)*num_actions] = action_dist
  pol.append(Pi)

# pick two policies and their mixtures: 500 convex combinations, from Pi_2
# (x = 0) to Pi_1 (x = 1)
Pi_1, Pi_2 = pol[0], pol[3]
policies = [x*Pi_1 + (1-x)*Pi_2 for x in np.linspace(0, 1, 500)]

# exact value function of every mixture
value_functions_mixtures = []
for pi in policies:
  P_pi = np.matmul(pi, P)
  r_pi = np.matmul(pi, r)
  resolvent = np.linalg.inv(np.eye(num_states) - gamma*P_pi)
  value_functions_mixtures.append(np.matmul(resolvent, r_pi))

plt.figure(figsize=(5,5))
for points, point_color in ((value_functions, color),
                            (value_functions_mixtures, 'orange')):
  plt.scatter(*zip(*points), color=point_color)
ax = plt.gca()
cfg_axes(ax)
plt.show()

In [0]:
#@title Define starting points for learning dynamics

def idx(s, a):
  """Returns the flat position of the (state `s`, action `a`) pair.

  Pairs are laid out state-major: all actions of state 0 first, then all
  actions of state 1, and so on.
  """
  state_offset = s*num_actions
  return state_offset + a

def _stable_sigmoid(x):
  """Logistic function 1/(1+exp(-x)), never calling exp on a positive number."""
  if x >= 0:
    return 1. / (1. + math.exp(-x))
  z = math.exp(x)
  return z / (1. + z)


def get_value_function_from_logits(thetas, transitions=None, rewards=None,
                                   discount=None):
  """Value function of the two-action policy parameterised by per-state logits.

  State s takes its first action with probability sigmoid(thetas[s]) and its
  second action with probability sigmoid(-thetas[s]) = 1 - sigmoid(thetas[s]).
  The sigmoid is evaluated in a form that cannot overflow, so arbitrarily large
  positive or negative logits are accepted.

  Args:
    thetas: sequence of logits, one per state.
    transitions: optional array with one row per (state, action) pair
      (state-major order) and one column per next state. Defaults to the
      notebook-level `P`.
    rewards: optional reward vector in the same (state, action) order.
      Defaults to the notebook-level `r`.
    discount: optional discount factor. Defaults to the notebook-level `gamma`.
  Returns:
    1-D array V, one entry per state, solving (I - discount * P_pi) V = r_pi.
  """
  transitions = np.asarray(P if transitions is None else transitions,
                           dtype=float)
  rewards = np.asarray(r if rewards is None else rewards, dtype=float)
  if discount is None:
    discount = gamma

  n_states = transitions.shape[1]
  n_actions = 2  # a single logit per state can only describe two actions

  # Row s of Pi holds pi(.|s) in the columns belonging to state s.
  Pi = np.zeros((n_states, n_states*n_actions))
  for s, theta in zip(range(n_states), thetas):
    Pi[s, s*n_actions:(s+1)*n_actions] = [_stable_sigmoid(theta),
                                          _stable_sigmoid(-theta)]

  P_pi = np.matmul(Pi, transitions)
  r_pi = np.matmul(Pi, rewards)

  # Solve the linear system directly instead of forming the explicit inverse.
  V_pi = np.linalg.solve(np.eye(n_states) - discount*P_pi, r_pi)
  return V_pi

# Three initial policies, each given as one logit per state; reused as the
# starting points of every learning-dynamics cell below.
logits = [np.array(pair) for pair in ([5., -5.], [-1., -5.], [-1., 0.])]
value_functions_starting_points = [
    get_value_function_from_logits(logit) for logit in logits]

# Background cloud first, starting points on top as large red dots.
plt.figure(figsize=(5,5))
for points, style in ((value_functions, {'color': color}),
                      (value_functions_starting_points,
                       {'color': 'red', 's': 100})):
  plt.scatter(*zip(*points), **style)
ax = plt.gca()
cfg_axes(ax)
plt.show()

In [0]:
#@title Value Iteration
fig, ax = plt.subplots(nrows=1, ncols=len(logits), figsize=(15, 5))
plt.subplots_adjust(wspace=0.1)
num_iterations = 50

for i, logit in enumerate(logits):
  # Start from the value function of the initial policy, then repeatedly apply
  # the Bellman optimality operator.
  V_prev = get_value_function_from_logits(logit)
  value_functions_vi = [V_prev]
  for _ in range(num_iterations):
    V_next = np.array([
        max(r[idx(s, a)] + gamma * np.dot(P[idx(s, a)], V_prev)
            for a in range(num_actions))
        for s in range(num_states)])
    value_functions_vi.append(V_next)
    V_prev = V_next

  # Background cloud, axis set-up, then the iterates coloured by iteration.
  panel = ax[i]
  panel.scatter(*zip(*value_functions), color=color)
  cfg_axes(panel)
  iteration_index = np.arange(len(value_functions_vi))
  panel.scatter(*zip(*value_functions_vi), c=iteration_index)

plt.show()

In [0]:
#@title Policy iteration
num_iterations = 2
fig, ax = plt.subplots(nrows=1, ncols=len(logits), figsize=(15, 5))
plt.subplots_adjust(wspace=0.1)

for i, logit in enumerate(logits):
  V_pi = get_value_function_from_logits(logit)
  value_functions_pi = [V_pi]
  for _ in range(num_iterations):
    # Improvement step: in every state put all mass on the greedy action
    # with respect to the current value function.
    Pi = np.zeros((num_states, num_states*num_actions))
    for s in range(num_states):
      q_values = [r[idx(s, a)] + gamma * np.dot(P[idx(s, a)], V_pi)
                  for a in range(num_actions)]
      Pi[s, idx(s, np.argmax(q_values))] = 1

    # Evaluation step: exact value function of the greedy policy.
    P_pi = np.matmul(Pi, P)
    r_pi = np.matmul(Pi, r)
    resolvent = np.linalg.inv(np.eye(num_states) - gamma*P_pi)
    V_pi = np.matmul(resolvent, r_pi)
    value_functions_pi.append(V_pi)

  panel = ax[i]
  panel.scatter(*zip(*value_functions), color=color)
  panel.scatter(*zip(*value_functions_pi), color='blue')
  cfg_axes(panel)
  panel.tick_params(axis='both', which='major', labelsize=16)

plt.show()

In [0]:
#@title Policy Gradient + Entropy Regularization + Natural Policy Gradient

learning_rate = 1.
num_iterations = 50
entropy = False
natural_gradients = True

fig, ax = plt.subplots(nrows=1, ncols=len(logits), figsize=(15, 5))
plt.subplots_adjust(wspace=0.1)
# A single parenthesised argument prints the same text on Python 2 and 3.
print("Entropy: %s" % entropy)
print("Natural Gradients: %s" % natural_gradients)

for i, logit in enumerate(logits):
  # Build each starting point in its own graph so that neither the next start
  # nor a re-run of this cell keeps adding ops and variables to the global
  # default graph.
  with tf.Graph().as_default():
    # One trainable logit per state; the policy in state s is
    # [sigmoid(theta_s), 1 - sigmoid(theta_s)].
    thetas = [tf.Variable(theta, dtype=tf.float64) for theta in logit]
    num_params = len(thetas)
    pi = [[tf.nn.sigmoid(theta), 1 - tf.nn.sigmoid(theta)] for theta in thetas]
    # Each pi(.|s) becomes a [1, num_actions] block on the diagonal.
    block_pi = block_diagonal([tf.expand_dims(p, 0) for p in pi])

    P_pi = tf.matmul(block_pi, tf.constant(P))
    r_pi = tf.matmul(block_pi, tf.expand_dims(tf.constant(r), 1))
    V_pi = tf.matmul(
        tf.linalg.inv(tf.eye(num_states, dtype=tf.float64) - gamma*P_pi), r_pi)

    # Objective: mean of the state values, optionally with a bonus that
    # subtracts mean(pi * log pi) for every state.
    avg_return = tf.reduce_mean(V_pi)
    if entropy:
      for s in range(num_states):
        avg_return -= 1.*tf.reduce_mean(
            tf.multiply(pi[s], tf.math.log(pi[s])))

    opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    grads_and_vars = opt.compute_gradients(-avg_return, thetas)

    # Gradient of log pi(a|s) w.r.t. every logit. tf.gradients returns None
    # for logits that do not influence pi(a|s); those are replaced by zero.
    log_pi_grad = []
    for k in range(num_states):
      gradients = [tf.gradients(tf.math.log(pi[k][j]), thetas)
                   for j in range(num_actions)]
      processed_gradients = []
      for grad_list in gradients:
        new_gradients = [
            grad if grad is not None else tf.constant(0., dtype=tf.float64)
            for grad in grad_list]
        processed_gradients.append(tf.stack(new_gradients))
      log_pi_grad.append(processed_gradients)

    # Fisher information matrix: sum over (state, action) of
    # pi(a|s) * grad log pi(a|s) * grad log pi(a|s)^T, sized by the number of
    # logits rather than a hard-coded 2.
    fisher = tf.zeros([num_params, num_params], dtype=tf.float64)
    for k, log_pi_state in enumerate(log_pi_grad):
      for j, log_pi_state_action in enumerate(log_pi_state):
        fisher += pi[k][j] * tf.matmul(log_pi_state_action[:, None],
                                       log_pi_state_action[None, :])
    # Small ridge term keeps the matrix invertible.
    fisher_inv = tf.linalg.inv(
        fisher + 0.0001*tf.eye(num_params, dtype=tf.float64))

    # Precondition the plain gradients with the inverse Fisher matrix.
    grads = [gv[0] for gv in grads_and_vars]
    grads_tensor = tf.stack(grads)
    conditioned_grads = tf.matmul(fisher_inv, grads_tensor[:, None])
    conditioned_grads = [conditioned_grads[k, 0] for k in range(num_params)]
    conditioned_grads_and_vars = [
        (grad, old_grad_vars[1])
        for grad, old_grad_vars in zip(conditioned_grads, grads_and_vars)]

    if natural_gradients:
      train_op = opt.apply_gradients(conditioned_grads_and_vars)
    else:
      train_op = opt.apply_gradients(grads_and_vars)

    # Record the value function before every update step.
    value_functions_pg = []
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      for _ in range(num_iterations):
        value_functions_pg.append(sess.run(V_pi))
        sess.run(train_op)

  ax[i].scatter(*zip(*value_functions), color=color)
  ax[i].scatter(*zip(*value_functions_pg), c=np.arange(len(value_functions_pg)))
  cfg_axes(ax[i])
  ax[i].tick_params(axis='both', which='major', labelsize=16)

plt.show()

In [0]:
#@title Cross Entropy Methods
# Candidates drawn per iteration, and how many of the best ones are kept to
# refit the sampling distribution.
num_samples = 500
num_best_samples = 50
# Initial covariance of the Gaussian over the two logits.
initial_cov = .5*np.eye(2)
num_iterations = 50
# When True, a small multiple of the identity is added to every refitted
# covariance.
noise = True
# Report the actual setting (not a hard-coded True); the single parenthesised
# argument prints the same text on Python 2 and 3.
print("Add noise: %s" % noise)

def get_cem_traj(mean, cov, num_iterations, noise):
  """Runs the cross-entropy method over policy logits and records its path.

  Every iteration draws `num_samples` logit vectors from a Gaussian with the
  current mean and covariance, scores each by the mean of its value function,
  and refits the mean and covariance to the `num_best_samples` highest-scoring
  draws. `num_samples` and `num_best_samples` are notebook-level settings.

  Args:
    mean: initial mean of the Gaussian over logits (one logit per state).
    cov: initial covariance matrix of that Gaussian.
    num_iterations: number of sample/refit rounds.
    noise: if true, 0.01 * identity is added to every refitted covariance.
  Returns:
    A list with the value function of the mean logits at the start of each
    iteration.
  """
  trajectory = []
  dim = len(mean)
  for _ in range(num_iterations):
    trajectory.append(get_value_function_from_logits(mean))

    candidates = np.random.multivariate_normal(mean, cov, num_samples)
    results = [(lgts, np.mean(get_value_function_from_logits(lgts)))
               for lgts in candidates]
    # Rank once, after all candidates are scored, rather than re-sorting the
    # list after every append. The sort is stable, so ties keep their
    # sampling order.
    results.sort(key=lambda x: -x[1])

    best_logits = np.array([lgts for lgts, _ in results[:num_best_samples]])
    mean = np.mean(best_logits, axis=0)
    cov = np.cov(best_logits.T)
    if noise:
      # Identity sized from the logit vector instead of a hard-coded 2.
      cov = cov + 0.01*np.eye(dim)
  return trajectory

# One panel per starting point: the sampled value functions as background, then
# the CEM trajectory from that start, coloured by iteration index.
fig, ax = plt.subplots(nrows=1, ncols=len(logits), figsize=(15, 5))
plt.subplots_adjust(wspace=0.1)
for i, logit in enumerate(logits):
  # The start logits serve as the initial mean of the sampling distribution.
  value_functions_cem = get_cem_traj(logit, initial_cov, num_iterations, noise)
  ax[i].scatter(*zip(*value_functions), color=color)
  ax[i].scatter(*zip(*value_functions_cem), c=np.arange(len(value_functions_cem)))
  cfg_axes(ax[i])