In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import gym

import imitation_learning

In [2]:
num_episodes  = 100
num_steps     = 5
batch_size    = 2
n_iterations  = 10

input_size    = 5
action_size   = 4
learning_rate = 0.01

In [3]:
def discrete_policy_loss(p, a, r):
    """
    p - array (n_episodes x n_steps x state_size)
    a - array (n_episodes x n_steps x num_actions) - binary array
    r - array (n_episodes x n_steps)
    """
    n_episodes = a.shape[0]
    n_steps    = a.shape[1]
    n_actions  = a.shape[2]
    
    log_pr     = tf.zeros(shape=[n_steps, n_actions])
    
    for i in range(n_episodes):
        r_cum = tf.cumsum(r[i], reverse=True)
        r_cum = tf.expand_dims(r_cum, axis=1)
        
        log_pr += -tf.math.log(p)*a[i]*r_cum
        
    return tf.reduce_sum(log_pr)/n_episodes

def gradient(model, s, a, r):
    with tf.GradientTape() as t:
        loss = discrete_policy_loss(model(s),a,r)
    return t.gradient(loss, model.trainable_variables)

In [4]:
model = keras.Sequential([
    keras.layers.Input(shape=[input_size]),
    keras.layers.Dense(action_size, activation=tf.nn.softmax)
])

In [5]:
env = gym.make('FrozenLake-v0').env.__class__(
    map_name='4x4', is_slippery=True)

In [6]:
agent = imitation_learning.agent.DiscreteActionAgent(model)

In [7]:
sim   = imitation_learning.simulator.Simulator(env, agent)

In [8]:
optimizer   = tf.optimizers.SGD(learning_rate=0.0001, momentum=0.9)

global_step = tf.Variable(0)

In [9]:
inds = np.arange(num_episodes)
for it in range(n_iterations):
    
    I = np.random.choice(inds, size=batch_size)
    
    sb = np.ones((num_episodes, num_steps, input_size), dtype=np.float32)[I]
    ab = np.ones((num_episodes, num_steps, action_size), dtype=np.float32)[I]
    rb = np.ones((num_episodes, num_steps), dtype=np.float32)[I]
    
    g = gradient(model,sb,ab,rb)
    
    optimizer.apply_gradients(zip(g,model.trainable_variables), global_step)
    
    l = discrete_policy_loss(model(sb),ab,rb)
    
    print("{}: {}".format(it,l))

0: 204.18272399902344
1: 202.64610290527344
2: 200.51953125
3: 197.93185424804688
4: 195.01260375976562
5: 191.8876953125
6: 188.6752166748047
7: 185.4822998046875
8: 182.40228271484375
9: 179.51268005371094


In [10]:
p = model.predict(sb[0])
print(p)

[[0.09393696 0.24695668 0.33182007 0.3272863 ]
 [0.09393696 0.24695668 0.33182007 0.3272863 ]
 [0.09393696 0.24695668 0.33182007 0.3272863 ]
 [0.09393696 0.24695668 0.33182007 0.3272863 ]
 [0.09393696 0.24695668 0.33182007 0.3272863 ]]


In [13]:
a = agent.act(sb[0,0])
print(a)

(1, 5)
(4,)
3
