<a href="https://colab.research.google.com/github/enakai00/colab_rlbook/blob/master/Chapter05/01_Neural_Network_Policy_Estimation_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Colab line magic: ask the runtime for TensorFlow 2.x before anything imports it.
%tensorflow_version 2.x 

In [0]:
import numpy as np
from tensorflow.keras import layers, models

In [0]:
class Gridworld:
  """One-dimensional grid world whose states are the integers 0 .. size-1.

  An action is an integer offset that is added to the current state.
  Goal states are terminal: moving from a goal leaves the state unchanged.
  """

  def __init__(self, size=8, goals=None):
    """Create the world.

    Args:
      size: Number of cells in the grid.
      goals: Collection of terminal states. Defaults to [7].
    """
    self.size = size
    # Build the default per instance. A literal `[7]` in the signature would
    # be one list object shared (and mutable) across every Gridworld.
    self.goals = [7] if goals is None else goals
    self.states = range(size)

  def move(self, s, a):
    """Apply action `a` in state `s`.

    Args:
      s: Current state.
      a: Offset added to the state (e.g. 1 moves right).

    Returns:
      A (reward, next_state) tuple:
        * (0, s) when `s` is already a goal,
        * (1, s + a) when the move lands on a goal,
        * (-1, s) when the move would leave the grid (the agent stays put),
        * (-1, s + a) for any other move.
    """
    if s in self.goals:
      return 0, s       # Terminal state: nothing happens.

    s_new = s + a

    if s_new in self.goals:
      return 1, s_new   # Reached a goal.

    if s_new not in self.states:
      return -1, s      # Bumped into the edge: penalized, position unchanged.

    return -1, s_new    # Ordinary step.

In [0]:
class StateValue:
  """State-value function approximated by a single-unit linear Keras model."""

  def __init__(self, goals):
    """Store the terminal states and build the underlying model.

    Args:
      goals: Collection of terminal states; their value is fixed to 0.
    """
    self.goals = goals
    self.model = self.build_model()

  def build_model(self):
    """Build and compile the model: one scalar input -> Dense(1) -> one scalar.

    Returns:
      The compiled Keras model, trained with mean-squared-error loss.
    """
    state = layers.Input(shape=(1,))
    value = layers.Dense(1)(state)
    model = models.Model(inputs=[state], outputs=[value])
    # No optimizer is given, so compile() uses its own default.
    model.compile(loss='mse')
    return model

  def get_value(self, s):
    """Return the estimated value of state `s`.

    Args:
      s: State to evaluate.

    Returns:
      0 for a goal state, otherwise the model's prediction for `s`.
    """
    if s in self.goals:
      return 0
    # Feed a proper (batch=1, features=1) array instead of nested lists, and
    # silence predict() so progress output cannot interleave with the caller's
    # own printing.
    batch = np.array([[s]], dtype=np.float32)
    output_values = self.model.predict(batch, verbose=0)
    return output_values[0][0]

In [0]:
def show_values(world, state_value):
  """Print the value of every state of `world` on one bracketed line.

  Each value is looked up through `state_value.get_value` and rendered with
  one decimal in a five-character field, followed by a space.
  """
  print('[', end='')
  for state in world.states:
    estimate = state_value.get_value(state)
    print(f'{estimate:5.1f}', end=' ')
  print(']')

In [0]:
def get_episode(world):
  """Roll out one episode under the fixed "always move right" policy.

  The start state is drawn uniformly from 0 .. world.size-2. The rollout
  stops as soon as a transition ends in one of `world.goals`.

  Returns:
    List of (state, reward, next_state) tuples in the order they occurred.
  """
  step_right = 1
  state = np.random.randint(world.size - 1)
  transitions = []
  reached_goal = False
  while not reached_goal:
    reward, next_state = world.move(state, step_right)
    transitions.append((state, reward, next_state))
    reached_goal = next_state in world.goals
    state = next_state
  return transitions

In [0]:
def train(world, state_value, num):
  """Fit `state_value` on bootstrapped one-step targets for `num` iterations.

  Every iteration:
    1. samples 100 episodes from `world` and shuffles their transitions,
    2. labels each transition (s, r, s_new) with r + V(s_new), where V is the
       current `state_value` estimate,
    3. fits `state_value.model` on those (state, label) pairs,
    4. prints the iteration number followed by the updated state values.

  Args:
    world: Environment passed to `get_episode` and `show_values`.
    state_value: Object exposing `get_value(s)` and a Keras `model`.
    num: Number of training iterations.
  """
  for c in range(num):
    print('Iteration {:2d}: '.format(c+1), end='')

    examples = []
    for _ in range(100):
      # Generate each episode exactly once and keep it.
      examples += get_episode(world)
    np.random.shuffle(examples)

    # The model is not updated while the labels are being built, so a single
    # lookup per distinct next state gives the same labels with far fewer
    # predict() calls.
    next_values = {}
    states = []
    labels = []
    for s, r, s_new in examples:
      if s_new not in next_values:
        next_values[s_new] = state_value.get_value(s_new)
      states.append(np.array([s]))
      labels.append(np.array(r + next_values[s_new]))

    state_value.model.fit(np.array(states), np.array(labels),
                          batch_size=50, epochs=100, verbose=0)
    show_values(world, state_value)

In [17]:
# Build the environment with its default settings and a value model that
# treats the same goal states as terminal.
world = Gridworld()
state_value = StateValue(goals=world.goals)
# Show the layers and parameter counts of the underlying Keras model.
state_value.model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 2         
Total params: 2
Trainable params: 2
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Run 20 training iterations; the state values are printed after each one.
train(world, state_value, num=20)

Iteration  1: [  0.1   0.8   1.4   2.1   2.8   3.4   4.1   0.0 ]
Iteration  2: [  0.2   0.6   0.9   1.3   1.6   2.0   2.3   0.0 ]
Iteration  3: [ -0.1   0.1   0.4   0.6   0.8   1.0   1.3   0.0 ]
Iteration  4: [ -0.9  -0.6  -0.3  -0.1   0.2   0.5   0.7   0.0 ]
Iteration  5: [ -1.5  -1.2  -0.9  -0.5  -0.2   0.1   0.4   0.0 ]
Iteration  6: [ -2.2  -1.8  -1.3  -0.9  -0.5  -0.1   0.3   0.0 ]
Iteration  7: [ -2.8  -2.3  -1.8  -1.2  -0.7  -0.2   0.3   0.0 ]
Iteration  8: [ -3.5  -2.8  -2.2  -1.5  -0.9  -0.2   0.4   0.0 ]
Iteration  9: [ -4.1  -3.3  -2.5  -1.7  -1.0  -0.2   0.6   0.0 ]
Iteration 10: [ -4.5  -3.6  -2.8  -1.9  -1.0  -0.1   0.7   0.0 ]
Iteration 11: [ -4.8  -3.9  -2.9  -2.0  -1.0  -0.1   0.8   0.0 ]
Iteration 12: [ -4.9  -4.0  -3.0  -2.0  -1.0  -0.1   0.9   0.0 ]
Iteration 13: [ -5.0  -4.0  -3.0  -2.0  -1.0  -0.0   1.0   0.0 ]
Iteration 14: [ -5.0  -4.0  -3.0  -2.0  -1.0  -0.0   1.0   0.0 ]
Iteration 15: [ -5.0  -4.0  -3.0  -2.0  -1.0  -0.0   1.0   0.0 ]
Iteration 16: [ -5.0  -4.