# Testing actor functionality

In [1]:
%matplotlib inline

import sys
import os
from pathlib import Path

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# "__file__" is a plain string here (a notebook defines no __file__), so
# realpath resolves it against the kernel's current working directory and
# its parent is that working directory.
SCRIPT_DIR = Path(os.path.realpath("__file__")).parent
# Make the directory one level above importable (presumably where the
# `actor` module imported below lives - confirm against the repo layout).
sys.path.append(str(SCRIPT_DIR.parent))


from actor import ActorConfig, feed_forward_discrete_policy_constructor

In [2]:
# Tensorflow debug settings
# Run tf.function-decorated code eagerly instead of as a traced graph, so it
# can be stepped through and inspected like ordinary Python (at a speed cost).
tf.config.run_functions_eagerly(True)
# Fix TensorFlow's global random seed so random ops are repeatable across runs.
tf.random.set_seed(0)

## Create actor model

In [3]:
# Settings forwarded verbatim to ActorConfig as keyword arguments.
sample_config = dict(
    layer_sizes=[40, 40],
    learning_rate=0.001,
    hidden_activation="sigmoid",
)
model_config = ActorConfig(**sample_config)

# NOTE(review): 3 and 2 are presumably the state dimension and the number of
# discrete actions - confirm against feed_forward_discrete_policy_constructor.
actor_constructor = feed_forward_discrete_policy_constructor(3, 2)
model = actor_constructor(model_config)

## Test

We will train with 3 states that yield 3 different discounted rewards.
If the model learns to predict them, its training functionality is working.

In [4]:
def reward(states, actions, target_actions=(0, 1, 1)):
    """Score each chosen action against the action that is rewarded at its position.

    Args:
        states: Batch of states. Not used by the computation; the reward
            depends only on the position of each action in the batch.
        actions: Array-like of chosen actions, one per state, shaped
            ``(n,)`` or ``(n, 1)``.
        target_actions: The rewarded action for each position. Defaults to
            ``(0, 1, 1)``, the values that were previously hard-coded.

    Returns:
        A float array of shape ``(len(target_actions), 1)`` containing
        ``1.0`` where the chosen action equals the target and ``-1.0``
        otherwise.

    Raises:
        IndexError: If fewer actions than targets are supplied.
    """
    expected = np.asarray(target_actions).reshape(-1)
    # Flatten so both (n,) and (n, 1) action batches are handled uniformly.
    chosen = np.asarray(actions).reshape(-1)
    if chosen.size < expected.size:
        raise IndexError(
            f"expected at least {expected.size} actions, got {chosen.size}"
        )

    # Only the leading len(target_actions) entries are scored; any extra
    # trailing actions are ignored, as in the original element-wise version.
    matches = chosen[:expected.size] == expected
    return np.where(matches, 1.0, -1.0).reshape(-1, 1)

# Quick sanity check of `reward`: only the third action matches the one that
# earns +1, so the displayed result should be [[-1.], [-1.], [1.]].
actions = np.asarray([1, 0, 1])
states = np.eye(3)  # three one-hot states, float64
reward(states, actions)

array([[-1.],
       [-1.],
       [ 1.]])

In [5]:
# Three one-hot states, i.e. the 3x3 float identity matrix.
states = np.identity(3)
# The action that `reward` scores +1 at each of the three positions.
true_actions = np.asarray([0, 1, 1])

In [None]:
losses = []
for i in range(10000):
    # Ask the model for actions on the fixed states, score them, and run one
    # training step on the resulting (states, actions, rewards) batch.
    actions = model.produce_actions(states).numpy()
    rewards = reward(states, actions)
    values, loss, gradients = model.train_step(states, actions, rewards)
    losses.append(loss)

    # Progress report on every 1000th iteration, starting with iteration 0.
    if i % 1000 == 0:
        print(
            "",
            f"Episode {i}",
            f"actions = {actions}",
            f"rewards = {rewards}",
            f"loss = {loss.numpy()}",
            sep="\n",
        )

    
        


Episode 0
actions = [[0]
 [1]
 [1]]
rewards = [[1.]
 [1.]
 [1.]]
loss = 0.7379022836685181

Episode 1000
actions = [[1]
 [1]
 [1]]
rewards = [[-1.]
 [ 1.]
 [ 1.]]
loss = 0.000599520280957222

Episode 2000
actions = [[1]
 [1]
 [1]]
rewards = [[-1.]
 [ 1.]
 [ 1.]]
loss = 0.00014777350588701665


In [None]:
# Derive the x-axis from the losses actually recorded rather than a
# hard-coded 10000, so the plot still works when the training loop was
# interrupted or its iteration count was changed.
steps = list(range(len(losses)))

plt.plot(steps, losses)
plt.xlabel("Training step")
plt.ylabel("Loss")
plt.title("Actor training loss");

## Final model predictions

In [None]:
# Call the model on the three one-hot states; the bare name on the last line
# makes the notebook display the result.
# NOTE(review): despite the name, an actor's output is presumably action
# probabilities or logits rather than state values - confirm against actor.py.
final_values = model(states)
final_values