## Q-Learning simple example
Based on the Valohai tutorial [Reinforcement Learning Tutorial Part 1: Q-Learning](https://valohai.com/blog/reinforcement-learning-tutorial-part-1-q-learning/).

In [19]:
import numpy as np
from dungeon import Dungeon, Action
# Seed NumPy's global RNG so the random draws made by choose_action and
# choose_q_action repeat from run to run.
np.random.seed(42)

In [20]:
def choose_action(d, table):
  """Pick the action whose table entry is larger for the current state.

  Args:
    d: object exposing the current state as ``d.state``.
    table: value table indexed as ``table[action, state]``.

  Returns:
    ``Action.FORWARD`` or ``Action.BACKWARD``, whichever has the strictly
    larger entry for ``d.state``. When neither entry is strictly larger,
    returns ``Action(v)`` with ``v`` drawn at random from {0, 1}.
  """
  forward_value = table[Action.FORWARD, d.state]
  backward_value = table[Action.BACKWARD, d.state]
  if forward_value > backward_value:
    return Action.FORWARD
  if backward_value > forward_value:
    return Action.BACKWARD
  # Neither entry dominates: break the tie with a coin flip.
  return Action(np.random.randint(0, 2))

In [21]:
# Create the dungeon the agent acts in. `d` is a shared global: the cells
# below read it and rebind it to each successor dungeon as the agent moves.
d = Dungeon()

In [22]:
# Greedy baseline: for 2000 steps, take the action whose accumulated reward
# is higher for the current state (random on ties, see choose_action) and add
# the observed reward to that entry. The table is indexed as [action, state].
greedy_table = np.zeros((2, 5), dtype = np.int64)
for _ in range(2000):
  action = choose_action(d, greedy_table)
  # Build the successor dungeon from the state returned for `action`
  # (presumably the state reached by taking it; see dungeon.py to confirm).
  d_new = Dungeon(d.get_new_state(action))
  reward = d_new.get_reward()
  # Credit the reward to the (action, state) pair that was just taken.
  greedy_table[action, d.state] += reward
  d = d_new

In [23]:
# Display the accumulated rewards, indexed as [action, state].
greedy_table

array([[   0,    0,    0,    0,    0],
       [3174,  336,   46,    4,    0]])

In [24]:
def choose_q_action(d, q_table, gambling_rate):
  """Choose an action epsilon-greedily.

  Args:
    d: object exposing the current state as ``d.state``.
    q_table: value table indexed as ``q_table[action, state]``.
    gambling_rate: threshold compared against a uniform draw from [0, 1);
      the draw falling at or below it triggers a random action.

  Returns:
    ``Action(v)`` with ``v`` drawn at random from {0, 1} when exploring,
    otherwise the result of ``choose_action(d, q_table)``.
  """
  explore = np.random.random() <= gambling_rate
  if not explore:
    # Exploit: defer to the table-driven choice.
    return choose_action(d, q_table)
  return Action(np.random.randint(0, 2))

def update_q_table(eta, discount, q_table, d, d_new, action):
  """Compute the updated Q-value for taking ``action`` in ``d.state``.

  The table is only read here; the caller is responsible for storing the
  returned value.

  Args:
    eta: step size applied to the correction term.
    discount: weight applied to the best value available in the next state.
    q_table: value table indexed as ``q_table[action, state]``.
    d: object exposing the state before the move as ``d.state``.
    d_new: object exposing the state after the move as ``d_new.state`` and
      the reward for reaching it via ``d_new.get_reward()``.
    action: the action that was taken (row index into ``q_table``).

  Returns:
    ``current + eta * (reward + discount * best_next - current)``.
  """
  reward = d_new.get_reward()
  next_state = d_new.state
  best_next = max(q_table[Action.FORWARD, next_state],
                  q_table[Action.BACKWARD, next_state])
  current = q_table[action, d.state]
  # Gap between the bootstrapped target and the current estimate.
  correction = reward + discount * best_next - current
  return current + eta * correction


In [30]:
# Hyperparameters for the Q-learning run.
eta = 0.1              # step size used by update_q_table
discount = 0.95        # weight of the best next-state value in the update
gambling_rate = 1      # exploration threshold; starts fully random
nb_iterations = 4000   # number of training steps

# Start from a fresh dungeon so the result does not depend on whatever state
# `d` was left in by previously executed cells (the greedy run, or an
# out-of-order evaluation cell).
d = Dungeon()

# One row per action, one column per state.
q_table = np.zeros((2, 5))
for _ in range(nb_iterations):
  action = choose_q_action(d, q_table, gambling_rate)
  # Linearly decay exploration towards 0 over the whole run.
  gambling_rate -= 1/nb_iterations
  d_new = Dungeon(d.get_new_state(action))
  # update_q_table only computes the new value; store it here.
  q_table[action, d.state] = update_q_table(eta, discount, q_table, d, d_new, action)
  d = d_new

In [31]:
# Display the learned Q-values, indexed as [action, state].
q_table

array([[100.93168683, 107.75981273, 116.35587962, 124.61942965,
        121.97453412],
       [ 84.78026194,  84.02664969,  75.83539758,  72.22210351,
         90.35137767]])

In [27]:
# Ask the purely greedy policy (no exploration) which action the learned
# table prefers for a dungeon constructed with state 4.
# NOTE(review): this rebinds the shared global `d`, and the cell's execution
# count (27) is lower than the training cell's (30), so the output shown may
# predate the final q_table. Re-run the notebook top to bottom to confirm.
d = Dungeon(4)
choose_action(d, q_table)

<Action.FORWARD: 0>