In [93]:
import numpy as np
from trajectory import Trajectory, generate_trajectory, generate_trajectories, stochastic_policy_adapter
from solver import value_iteration, stochastic_policy_from_value_expectation
from snake_ladder import SnakeLadderWorld
import tensorflow as tf
import datetime

In [94]:
# define some consants
world_size = 20
shortcut_density = 0.1
success_prob = .9
n_trajectories_per_policy = 5000

In [95]:
# create our world
world = SnakeLadderWorld(size=world_size, shortcut_density=shortcut_density)

# set up the reward function
reward = np.zeros(world.n_states)
reward[-1] = 1.0
# start state
start = [0]

# set up terminal states
terminal = [world.size - 1]

world.game_board

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  6, 12,  2, 14, 15, 16,
       17, 18, 19])

In [96]:
# create some policies
# create "fixed" policies which each execeute one of the three actions w/ prob p (success_prob)
# randomly sample from all actions w/ prob 1 - p
# so excute one action with prob p + 1/3(1 - p) and others with 1/3(1 -  p)
policies_fixed = []

for i in range(3):
    def policy(state, action = i):
        if success_prob >= np.random.uniform():
            return action
        else:
            return np.random.choice(3)
    policies_fixed.append(policy)

In [97]:
# verify our fixed policies
samples = 10000
means = [0,0,0]
for i in range(samples):
    for j in range(3):
        means[j] += policies_fixed[j](None)
means = np.array(means) / samples
means

array([0.0964, 0.9995, 1.8931])

In [98]:
# generate an "optimal" policy w/ value iteration
discount = .7
weighting = lambda x: x

value = value_iteration(world.p_transition, reward, discount)
policy = stochastic_policy_from_value_expectation(world, value)
policy_exec = stochastic_policy_adapter(policy)

policy

array([[0.28606134, 0.32108073, 0.39285793],
       [0.28559563, 0.32355058, 0.39085379],
       [0.28422682, 0.32699581, 0.38877737],
       [0.28169483, 0.33197396, 0.38633121],
       [0.28178412, 0.33727614, 0.38093973],
       [0.28440452, 0.32730325, 0.38829222],
       [0.28814351, 0.33203736, 0.37981912],
       [0.29879926, 0.30703066, 0.39417009],
       [0.29912652, 0.30588175, 0.39499173],
       [0.30046042, 0.3025449 , 0.39699468],
       [0.22616294, 0.32946972, 0.44436734],
       [0.30041501, 0.30093068, 0.39865431],
       [0.17516519, 0.34473778, 0.48009703],
       [0.28643147, 0.31892896, 0.39463957],
       [0.28100747, 0.32782317, 0.39116936],
       [0.27562207, 0.3371599 , 0.38721803],
       [0.27005555, 0.3472759 , 0.38266855],
       [0.26410717, 0.35850648, 0.37738635],
       [0.33333333, 0.33333333, 0.33333333],
       [0.33333333, 0.33333333, 0.33333333]])

In [100]:
# create list of policies
policies = policies_fixed
policies.append(policy_exec) # add expert policy to list

In [102]:
# generate trajectories with policies
trajectories_list = []
for i, policy in enumerate(policies):
    trajectories = list(generate_trajectories(n_trajectories_per_policy, world, policies[i], start, terminal))
    trajectories = [t._t for t in trajectories]
    trajectories_list.append(trajectories)

In [105]:
# a trajectory from policy 4
trajectories_list[3][0]

[(0, 1, 3), (3, 1, 6), (6, 0, 7), (7, 2, 15), (15, 2, 19)]

In [107]:
# create list of all trajectories, each is a 2d list

x_data = []
for i, policy in enumerate(policies):
    x_data.extend([np.matrix(t).tolist() for t in trajectories_list[i]])

In [108]:
# label trajectories

y_data = []
for i in range(len(policies)):
    y_data.extend([i] * n_trajectories_per_policy)

In [109]:
# shuffle x,y data together
import random
temp = list(zip(x_data,y_data))
random.shuffle(temp)
x_data, y_data = zip(*temp)

In [110]:
# convert trajectories to ragged tensors
x_data = tf.ragged.constant(x_data)
max_seq = int(x_data.bounding_shape()[-2])
print(max_seq)
x_data.bounding_shape()

397


<tf.Tensor: shape=(3,), dtype=int64, numpy=array([20000,   397,     3])>

In [111]:
# convert y to numpy array
y_data = np.array(y_data)
y_data

array([0, 3, 2, ..., 2, 0, 0])

In [112]:
# split into train and test
test_prop = .20
test_n = int(len(y_data) * test_prop)
x_test = x_data[:test_n, :, :]
y_test = y_data[:test_n]
x_train = x_data[test_n:,:,:]
y_train = y_data[test_n:]
x_test.shape, x_train.shape, y_test.shape, y_train.shape

(TensorShape([4000, None, None]),
 TensorShape([16000, None, None]),
 (4000,),
 (16000,))

In [113]:
# create LSTM model
lstm_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(max_seq,3), dtype=tf.float32, ragged=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dropout(.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(4, activation='softmax')
])

In [114]:
metrics = ['accuracy']

lstm_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=metrics)
lstm_model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 64)                17408     
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_11 (Dense)             (None, 4)                 260       
Total params: 21,828
Trainable params: 21,828
Non-trainable params: 0
_________________________________________________________________


In [115]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)


lstm_model.fit(x_train,y_train, epochs=100, batch_size=int(n_trajectories_per_policy / 10), validation_data=(x_test,y_test), callbacks=[tensorboard_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f2a6d2c8760>

In [117]:
from sklearn.metrics import confusion_matrix

y_predicted = lstm_model.predict_classes(x_test)

# get confusion matrix
cm = confusion_matrix(y_test, y_predicted)

# normalize diagonal entries
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

# accuracy by class (indexes 0-2 refres to fixed policies)
# index 3 refres to expert policy
cm.diagonal()

array([0.98480243, 0.92562814, 0.91557789, 0.76637341])