In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import gym

import imitation_learning

In [2]:
# --- Rollout / training-loop settings ---------------------------------------
num_steps = 20           # passed to sim.run(...) and used as the time dimension of the training arrays
batch_size = 4           # not referenced by the cells visible in this notebook
n_iterations = 100_000   # number of rollout + train iterations in the training loop

# --- Model settings ----------------------------------------------------------
input_size = 16          # width of the one-hot state vector fed to both networks
action_size = 4          # number of policy outputs / width of the one-hot action vector
num_units = 5            # not referenced by the cells visible in this notebook
learning_rate = 0.001    # handed to DiscreteActorCritic

In [3]:
def preprocess_state(s):
    """One-hot encode a state index as a single-row float32 array.

    Args:
        s: index of the state to mark; used directly as a column index
            into a row of width ``input_size``.

    Returns:
        ``np.ndarray`` of shape ``(1, input_size)`` and dtype ``float32``
        that is zero everywhere except for a 1 at column ``s``.
    """
    one_hot = np.zeros((1, input_size), dtype=np.float32)
    one_hot[0, s] = 1
    return one_hot

In [4]:
# Actor: a single dense layer mapping the one-hot state to a softmax
# distribution over the `action_size` actions.
policy_layers = [
    keras.layers.Input(shape=[input_size]),
    keras.layers.Dense(action_size, activation=tf.nn.softmax),
]
policy = keras.Sequential(policy_layers)

# Critic: a single dense layer mapping the one-hot state to one scalar,
# with an identity (i.e. no squashing) activation.
value_layers = [
    keras.layers.Input(shape=[input_size]),
    keras.layers.Dense(1, activation=tf.identity),
]
value_function = keras.Sequential(value_layers)

In [5]:
# Bundle the two networks, the state encoder and the learning rate into the
# project's actor-critic agent (positional argument order is significant).
agent = imitation_learning.model.DiscreteActorCritic(
    policy,
    value_function,
    preprocess_state,
    learning_rate,
)

In [6]:
# gym.make() is used only to reach the environment class behind the
# registered 'FrozenLake-v0' id (via the returned object's `.env`); a fresh
# instance of that class is then built directly so that the 4x4 map and
# is_slippery=False can be passed to its constructor.
registered_env = gym.make('FrozenLake-v0')
frozen_lake_cls = registered_env.env.__class__
env = frozen_lake_cls(map_name='4x4', is_slippery=False)

In [7]:
# Simulator pairing the environment with the agent; the training loop calls
# sim.run(...) and then reads the collected transitions from sim.tuples.
sim = imitation_learning.simulator.Simulator(env, agent)

In [8]:
# Hand-written trajectory of 3-tuples. Presumably (state, action, reward),
# matching how the training loop indexes the entries of sim.tuples
# (t[0] -> state, t[1] -> action, t[2] -> reward) -- confirm against Simulator.
# Only the final entry carries a non-zero third component.
t_good = [
    (0, 1, 0),
    (4, 1, 0),
    (8, 2, 0),
    (9, 2, 0),
    (10, 1, 0),
    (14, 2, 1),
]

In [9]:
# Training loop: roll out one episode, turn it into padded (1, num_steps, ...)
# arrays, and take one actor-critic training step on it.
count = 0      # not used inside this cell; kept in case other cells rely on it
avg_r = 0      # exponential moving average of the value returned by sim.run
rewards = []   # history of avg_r, one entry per iteration (plotted later)
for it in range(n_iterations):

    # NOTE(review): r is presumably the episode's total reward -- confirm in Simulator.run.
    r = sim.run(render=False, num_steps=num_steps)

    avg_r = 0.9*avg_r + 0.1*r
    rewards.append(avg_r)
    T = sim.tuples   # transitions of the episode just run; indexed as t[0], t[1], t[2] below

    # Zero-padded batch of size 1: one-hot states, one-hot actions,
    # per-step returns and a mask marking which time steps are real.
    S = np.zeros((1, num_steps, input_size), np.float32)
    A = np.zeros((1, num_steps, action_size), np.float32)
    R = np.zeros((1, num_steps, 1), np.float32)
    I = np.zeros((1, num_steps, 1), np.float32)

    n = len(T)
    for k, t in enumerate(T):
        S[0, k, t[0]] = 1
        A[0, k, t[1]] = 1
        I[0, k, 0]    = 1

    # Reward-to-go: R[k] = sum of rewards from step k to the end of the episode.
    # The rewards must be reversed BEFORE the cumulative sum and the result
    # reversed back. Reversing only after the cumsum yields prefix sums in
    # reverse order, which for a sparse terminal reward [0, ..., 0, 1] gives
    # [1, 0, ..., 0] instead of [1, 1, ..., 1].
    step_rewards = [t[2] for t in T]
    R[0, :n, 0] = np.cumsum(step_rewards[::-1])[::-1]

    loss = agent.train(S, A, R, I)

    # Periodic progress report.
    if it % 2000 == 0:
        print("{} r={} loss={:.2f}".format(it, r, loss))

0 r=0.0 loss=1.71
2000 r=0.0 loss=-0.02
4000 r=0.0 loss=0.14
6000 r=0.0 loss=0.00
8000 r=0.0 loss=0.02
10000 r=0.0 loss=0.00
12000 r=0.0 loss=0.00
14000 r=0.0 loss=-0.34
16000 r=0.0 loss=-0.02
18000 r=0.0 loss=-0.13
20000 r=0.0 loss=-0.07
22000 r=0.0 loss=-0.02
24000 r=0.0 loss=-0.01
26000 r=0.0 loss=0.03
28000 r=0.0 loss=-0.02
30000 r=0.0 loss=0.00
32000 r=0.0 loss=-0.00
34000 r=0.0 loss=-0.03
36000 r=0.0 loss=0.00
38000 r=0.0 loss=-0.00
40000 r=0.0 loss=-0.03
42000 r=0.0 loss=0.08


E0528 08:02:42.222838 139644269979456 ultratb.py:149] Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/gabriel/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-01b6ea8b2776>", line 25, in <module>
    loss = agent.train(S, A, R, I)
  File "/home/gabriel/projects/imitation_learning/imitation_learning/model.py", line 27, in train
    self.opt.apply_gradients(zip(gp,self.policy.trainable_variables))
  File "/home/gabriel/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py", line 406, in apply_gradients
    self._distributed_apply, args=(grads_and_vars,), kwargs={"name": name})
  File "/home/gabriel/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py", line 1375, in merge_call
    return self._merge_call(merge_fn, args, kwargs)
  File "/home/gabriel/anaconda3/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py", line 1384, in

KeyboardInterrupt: 

In [None]:
# Learning curve: `rewards` holds the exponential moving average
# (avg_r = 0.9*avg_r + 0.1*r) recorded once per training iteration.
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(rewards)
# Label the axes so the figure is readable on its own.
ax.set_xlabel("Training iteration")
ax.set_ylabel("Smoothed episode reward (EMA, decay 0.9)")
ax.set_title("Actor-critic learning curve")
plt.show()
plt.close(fig)

In [None]:
# Run one more episode of at most `num_steps` steps with rendering switched on;
# the call's return value is shown as the cell output.
sim.run(num_steps=num_steps, render=True)

In [None]:
# Print the policy's output row for the states that appear in `t_good`
# (0, 4, 8, 9, 10, 14) plus state 1.
for state_id in (0, 4, 8, 9, 10, 14, 1):
    action_probs = policy.predict(preprocess_state(state_id))[0]
    print(state_id, list(action_probs))

In [None]:
# Print the critic's output for the states that appear in `t_good`
# (0, 4, 8, 9, 10, 14) plus state 1.
for state_id in (0, 4, 8, 9, 10, 14, 1):
    state_value = value_function.predict(preprocess_state(state_id))[0]
    print(state_id, list(state_value))

In [None]:
# Display the policy network's trainable variables as the cell output.
policy.trainable_variables

In [None]:
# Display the value network's trainable variables as the cell output.
value_function.trainable_variables