In [23]:
import datetime
import random

import numpy as np
import tensorflow as tf

from snake_ladder import SnakeLadderWorld
from solver import value_iteration, stochastic_policy_from_value_expectation
from trajectory import Trajectory, generate_trajectory, generate_trajectories, stochastic_policy_adapter

In [24]:
# Experiment constants
world_size = 20                  # board size handed to SnakeLadderWorld
shortcut_density = 0.1           # shortcut_density handed to SnakeLadderWorld
success_prob = 0.9               # chance that a fixed policy plays its designated action
n_trajectories_per_policy = 100  # number of trajectories sampled from each policy

In [25]:
# Build the snakes-and-ladders environment from the configured constants.
world = SnakeLadderWorld(shortcut_density=shortcut_density, size=world_size)

# Episodes begin in state 0 and end in the last state of the board.
start = [0]
terminal = [world.size - 1]

# Reward vector: zero everywhere except a reward of 1 in the final state.
reward = np.zeros(world.n_states)
reward[-1] = 1.0

# Display the generated board.
world.game_board

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  3, 15, 16,
       17,  5, 19])

In [26]:
# Build three "fixed" policies, one per action (0, 1, 2).
# With probability p (success_prob) a policy plays its own action; otherwise it
# samples uniformly from all three actions. Overall the designated action is
# executed with probability p + (1 - p) / 3 and each other action with (1 - p) / 3.
policies_fixed = []

for i in range(3):
    def policy(state, action=i):
        """Return `action` with probability success_prob, else a uniformly random action.

        `state` is ignored. `action` is bound as a default argument so every
        policy keeps its own action rather than the loop variable's final value.
        """
        stick_to_action = success_prob >= np.random.uniform()
        return action if stick_to_action else np.random.choice(3)
    policies_fixed.append(policy)

In [27]:
# Sanity-check the fixed policies: sample each one many times and look at the
# mean action; each mean should land near that policy's own action index.
samples = 10000
action_totals = np.zeros(3)
for _ in range(samples):
    for j, fixed_policy in enumerate(policies_fixed):
        action_totals[j] += fixed_policy(None)
means = action_totals / samples
means

array([0.1031, 1.0006, 1.9025])

In [28]:
# Derive an "optimal" stochastic policy via value iteration on the world's
# transition model and the reward vector.
discount = 0.7


def weighting(x):
    """Identity weighting (not referenced by the calls in this cell)."""
    return x


value = value_iteration(world.p_transition, reward, discount)

# NOTE: `policy` is rebound here from the fixed-policy function to a table of
# per-state action probabilities (displayed below).
policy = stochastic_policy_from_value_expectation(world, value)

# Presumably wraps the probability table in a callable usable by
# generate_trajectories -- confirm against trajectory.stochastic_policy_adapter.
policy_exec = stochastic_policy_adapter(policy)

policy

array([[0.28792551, 0.32447227, 0.38760223],
       [0.28651586, 0.32199244, 0.39149171],
       [0.28713419, 0.3220313 , 0.39083451],
       [0.28727589, 0.32231397, 0.39041014],
       [0.28673386, 0.32326914, 0.389997  ],
       [0.28146156, 0.32716457, 0.39137387],
       [0.2790831 , 0.33572618, 0.38519072],
       [0.28218724, 0.34137534, 0.37643742],
       [0.29155258, 0.31970539, 0.38874204],
       [0.29146777, 0.3198841 , 0.38864812],
       [0.29221949, 0.31814977, 0.38963074],
       [0.29891561, 0.3120493 , 0.38903509],
       [0.30830371, 0.2823047 , 0.40939159],
       [0.18215328, 0.34065469, 0.47719203],
       [0.28739151, 0.31675507, 0.39585342],
       [0.28211351, 0.32536305, 0.39252344],
       [0.2768956 , 0.33436183, 0.38874257],
       [0.16357349, 0.39506418, 0.44136232],
       [0.33333333, 0.33333333, 0.33333333],
       [0.33333333, 0.33333333, 0.33333333]])

In [29]:
# Roll out the value-iteration policy. Each trajectory is kept as its raw list
# of (s, a, s') transitions, read from the trajectory object's `_t` attribute.
trajectories_optimal = [
    rollout._t
    for rollout in generate_trajectories(
        n_trajectories_per_policy, world, policy_exec, start, terminal
    )
]
trajectories_optimal[0]

[(0, 1, 6), (6, 2, 12), (12, 1, 15), (15, 1, 19)]

In [30]:
# Roll out each fixed policy in turn, again keeping only the raw (s, a, s')
# transition lists. Entry k of trajectories_fixed_list belongs to policies_fixed[k].
trajectories_fixed_list = []
for fixed_policy in policies_fixed:
    trajectories = [
        rollout._t
        for rollout in generate_trajectories(
            n_trajectories_per_policy, world, fixed_policy, start, terminal
        )
    ]
    trajectories_fixed_list.append(trajectories)

# To inspect a sample: trajectories_fixed_list[0][0], trajectories_fixed_list[1][0]

In [31]:
# Re-display the first trajectory generated with the value-iteration policy.
trajectories_optimal[0]

[(0, 1, 6), (6, 2, 12), (12, 1, 15), (15, 1, 19)]

In [32]:
# Collect every trajectory as a plain 2-D nested list (one [s, a, s'] row per step).
# Order matters for labelling: value-iteration trajectories first, then the
# fixed policies for actions 0, 1, 2.
# np.asarray is used instead of np.matrix, which NumPy no longer recommends;
# the nested-list result is the same for these non-empty trajectories.
x_data = [np.asarray(t).tolist() for t in trajectories_optimal]
for i in range(3):
    x_data.extend(np.asarray(t).tolist() for t in trajectories_fixed_list[i])

In [33]:
# Class labels, in the same order the trajectories were collected:
# 3 = value-iteration ("expert") policy, 0/1/2 = fixed policy for that action.
y_data = [
    label
    for label in (3, 0, 1, 2)
    for _ in range(n_trajectories_per_policy)
]

In [34]:
# Shuffle trajectories and labels in unison so each trajectory keeps its label.
# After unzipping, x_data and y_data are tuples.
# NOTE: the shuffle is unseeded, so the ordering (and therefore the later
# train/test split) differs on every run.
paired = list(zip(x_data, y_data))
random.shuffle(paired)
x_data, y_data = zip(*paired)

In [35]:
# Pack the variable-length trajectories into one ragged tensor and record the
# longest trajectory length (second-to-last entry of the bounding shape).
x_data = tf.ragged.constant(x_data)
bounding = x_data.bounding_shape()
max_seq = int(bounding[-2])
print(max_seq)
bounding

621


<tf.Tensor: shape=(3,), dtype=int64, numpy=array([400, 621,   3])>

In [36]:
# Turn the labels into a NumPy array so they can be sliced for the split below.
y_data = np.asarray(y_data)
y_data

array([2, 0, 3, 2, 0, 3, 3, 3, 2, 3, 0, 2, 2, 0, 2, 3, 2, 2, 1, 0, 1, 0,
       0, 0, 1, 3, 1, 2, 2, 2, 0, 3, 0, 3, 3, 1, 3, 3, 1, 2, 0, 1, 2, 2,
       1, 2, 1, 3, 3, 3, 2, 0, 0, 2, 3, 2, 0, 0, 1, 1, 0, 0, 1, 2, 0, 0,
       1, 2, 3, 3, 3, 0, 3, 1, 2, 1, 3, 1, 0, 3, 0, 3, 3, 2, 1, 2, 3, 0,
       2, 1, 3, 0, 1, 0, 2, 0, 3, 1, 3, 0, 1, 0, 0, 1, 2, 3, 3, 1, 2, 0,
       3, 1, 2, 1, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 0, 1, 3, 3, 2, 2, 1, 1,
       0, 2, 2, 0, 2, 1, 0, 1, 0, 3, 0, 0, 3, 3, 3, 3, 0, 3, 1, 3, 1, 0,
       2, 3, 0, 3, 0, 1, 0, 2, 1, 1, 0, 2, 0, 2, 1, 0, 0, 2, 0, 3, 2, 0,
       2, 0, 1, 3, 2, 1, 2, 1, 1, 1, 3, 1, 3, 1, 1, 3, 3, 0, 0, 2, 3, 1,
       1, 3, 0, 3, 3, 0, 3, 1, 0, 0, 1, 0, 1, 0, 3, 3, 1, 1, 3, 2, 1, 2,
       3, 3, 2, 1, 2, 1, 1, 2, 2, 0, 2, 0, 2, 1, 0, 1, 2, 3, 0, 3, 0, 2,
       3, 3, 1, 3, 1, 1, 1, 0, 1, 3, 0, 2, 3, 3, 2, 3, 0, 1, 1, 0, 2, 2,
       3, 1, 1, 3, 3, 2, 1, 3, 2, 2, 3, 0, 0, 0, 3, 3, 3, 1, 2, 3, 3, 0,
       1, 3, 0, 1, 3, 0, 0, 2, 1, 3, 1, 1, 0, 1, 2,

In [37]:
# Hold out the first 20% of the (already shuffled) examples for testing;
# everything after that is used for training.
test_prop = 0.20
test_n = int(test_prop * len(y_data))
x_test, y_test = x_data[:test_n, :, :], y_data[:test_n]
x_train, y_train = x_data[test_n:, :, :], y_data[test_n:]
x_test.shape, x_train.shape, y_test.shape, y_train.shape

(TensorShape([80, None, None]), TensorShape([320, None, None]), (80,), (320,))

In [38]:
# LSTM classifier over ragged (variable-length) trajectories with 3 values per
# step, ending in a 4-way softmax (one output per policy class).
# NOTE: the last layer emits probabilities, so any loss used with this model
# should be configured for probabilities rather than logits.
lstm_model = tf.keras.Sequential()
lstm_model.add(tf.keras.layers.Input(shape=(max_seq, 3), dtype=tf.float32, ragged=True))
lstm_model.add(tf.keras.layers.LSTM(64))
lstm_model.add(tf.keras.layers.Dropout(0.5))
lstm_model.add(tf.keras.layers.Dense(64, activation='relu'))
lstm_model.add(tf.keras.layers.Dense(4, activation='softmax'))

In [39]:
# Configure training. The network's last layer already applies softmax, so the
# loss must treat predictions as probabilities (from_logits=False); declaring
# them as logits would apply softmax a second time inside the loss.
lstm_model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=['accuracy'],
)
lstm_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 64)                17408     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 260       
Total params: 21,828
Trainable params: 21,828
Non-trainable params: 0
_________________________________________________________________


In [40]:
# TensorBoard logs go to a directory whose name includes the current timestamp.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Train for 50 epochs, validating on the held-out split.
lstm_model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=int(n_trajectories_per_policy / 10),
    validation_data=(x_test, y_test),
    callbacks=[tensorboard_callback],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f61bc2e1d90>