In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import gym

import imitation_learning

In [2]:
# --- Rollout / episode-memory sizes ---------------------------------------
num_episodes = 100
num_steps = 20      # step cap handed to the simulator; also the memory's time axis
batch_size = 2
n_iterations = 10   # number of rollouts collected by the sampling loop
mem_size = 10       # number of episode slots in the memory arrays

# --- Policy network dimensions ---------------------------------------------
input_size = 1      # width of the model's input layer
action_size = 4     # width of the softmax output (one unit per action)

# NOTE(review): the visible optimizer cell hard-codes its own learning rate,
# so this value does not appear to be read there -- confirm which is intended.
learning_rate = 0.01

In [3]:
def discrete_policy_loss(p, a, r):
    """
    Policy-gradient (REINFORCE-style) loss for a discrete policy.

    For every step, the negative log-probability of the taken action is
    weighted by the undiscounted reward-to-go from that step onward. The
    weighted terms are summed over steps and episodes, then divided by the
    number of episodes.

    p - tensor (n_episodes x n_steps x num_actions) - action probabilities
    a - array (n_episodes x n_steps x num_actions) - binary array marking the
        action taken at each step
    r - array (n_episodes x n_steps) - per-step rewards

    Returns a scalar tensor.
    """
    n_episodes = a.shape[0]

    # Bring the targets to the dtype of the policy output so a float64 NumPy
    # buffer can be combined with a float32 network output.
    a = tf.cast(a, p.dtype)
    r = tf.cast(r, p.dtype)

    # Reward-to-go per episode: r_cum[i, t] = sum(r[i, t:]). The trailing
    # axis lets it broadcast across the action dimension.
    r_cum = tf.expand_dims(tf.cumsum(r, axis=1, reverse=True), axis=-1)

    # Element-wise over the full batch, so episode i's actions and returns
    # are paired with episode i's probabilities only (the previous loop
    # multiplied every episode's targets against the whole of `p`).
    log_pr = -tf.math.log(p) * a * r_cum

    return tf.reduce_sum(log_pr) / n_episodes

def gradient(model, s, a, r):
    """
    Gradients of the policy loss with respect to the model's trainable variables.

    Runs `model` on `s` under a gradient tape, scores the output with
    `discrete_policy_loss(output, a, r)`, and returns what the tape reports
    for `model.trainable_variables`.
    """
    with tf.GradientTape() as tape:
        action_probs = model(s)
        loss = discrete_policy_loss(action_probs, a, r)
    grads = tape.gradient(loss, model.trainable_variables)
    return grads

In [4]:
# Policy network: one dense layer mapping the `input_size` inputs straight to
# a softmax over `action_size` outputs.
model = keras.Sequential()
model.add(keras.layers.Input(shape=[input_size]))
model.add(keras.layers.Dense(action_size, activation=tf.nn.softmax))

In [5]:
# Instantiate the FrozenLake environment class directly with explicit
# settings. `gym.make` is used only to look up that class: `.unwrapped`
# reaches the base environment however many wrappers `gym.make` stacks on
# top of it, whereas `.env` would strip exactly one layer.
env = gym.make('FrozenLake-v0').unwrapped.__class__(
    map_name='4x4', is_slippery=True)

In [6]:
# Wrap the Keras policy model in the project's discrete-action agent.
# NOTE(review): presumably the agent picks actions from the model's softmax
# output -- confirm against imitation_learning.agent.
agent = imitation_learning.agent.DiscreteActionAgent(model)

In [7]:
# Pair the environment with the agent. The sampling loop later calls
# `sim.run(...)` and reads the collected transitions from `sim.tuples`.
sim   = imitation_learning.simulator.Simulator(env, agent)

In [8]:
# Integer step counter starting at zero (not read by the cells shown here).
global_step = tf.Variable(0)

# SGD with momentum.
# NOTE(review): the step size is hard-coded here rather than taken from the
# `learning_rate` constant in the config cell -- confirm which is intended.
optimizer = tf.optimizers.SGD(
    momentum=0.9,
    learning_rate=1e-4,
)

In [9]:
inds = np.arange(num_episodes)

# Fixed-size episode memory used as a ring buffer. Each slot holds one
# episode, zero-padded from the episode's last step up to `num_steps`.
S = np.zeros((mem_size, num_steps, input_size))   # t[0] of every transition
A = np.zeros((mem_size, num_steps, action_size))  # one-hot of t[1]
R = np.zeros((mem_size, num_steps))               # t[2] of every transition
count = 0                                         # slot written next
for it in range(n_iterations):

    sim.run(render=False, num_steps=num_steps)

    # NOTE(review): each tuple is presumably (state, action, reward, ...);
    # only indices 0..2 are read here -- confirm against the Simulator.
    T = sim.tuples
    n = len(T)
    print(it,n)

    # Wipe the slot before reuse. Without this, once `count` wraps around, a
    # shorter episode would leave the previous occupant's tail in S and R,
    # and A would accumulate extra 1s on top of the old one-hot rows.
    S[count] = 0
    A[count] = 0
    R[count] = 0

    s = np.array([t[0] for t in T])
    if s.ndim == 1: s = s[:,np.newaxis]   # scalar states -> column vector
    S[count,:n,:] = s

    a = np.array([t[1] for t in T])
    for i,a_ in enumerate(a): A[count,i,a_] = 1

    R[count,:n] = np.array([t[2] for t in T])

    # Advance the write position, wrapping back to slot 0 at the end.
    count = (count + 1) % mem_size

0 14
1 7
2 3
3 5
4 5
5 4
6 7
7 3
8 4
9 3
