# Lower, Higher or Same

The agent learns to guess whether the next die roll will be lower than, higher than, or the same as the current one.

Adapted from https://towardsdatascience.com/simple-reinforcement-learning-q-learning-fcddc4b6fe56


In [None]:
import numpy as np
import random

In [None]:
# A six-sided die: each face 1-6 is represented by a zero-based state 0-5.
possible_states = list(range(6))
state_size = len(possible_states)

# Action codes: 0 = lower, 1 = higher, 2 = same.
possible_actions = list(range(3))
action_size = len(possible_actions)

# Q-table with one row per state and one column per action, all zeros at start.
Q = np.zeros(shape=(state_size, action_size))

In [None]:
# Epsilon-greedy action selection: explore with probability epsilon, otherwise exploit
def pick_action(state, actions, Q, epsilon):
    """Choose an action index for ``state`` with an epsilon-greedy policy.

    Args:
        state: Row index of the current state in ``Q``.
        actions: Sequence of available actions. Only its length is used; the
            returned value is an index into this sequence.
        Q: 2-D table of action values, indexed as ``Q[state][action]``.
        epsilon: Probability of exploring (expected in [0, 1]). With 0 the
            greedy action is always taken.

    Returns:
        int: Index of the selected action.
    """
    if random.uniform(0, 1) < epsilon:
        # Explore: pick any action index uniformly at random.
        return random.randint(0, len(actions) - 1)

    # Exploit: take the best-valued action for this state. Only the row for
    # `state` is scanned instead of computing the argmax of every row, and the
    # result is converted so both branches return a plain Python int.
    return int(np.argmax(Q[state]))

In [None]:
# Update Q values
def update_q(Q, cur_state, next_state, action, reward, gamma, lr):
    """Apply one Q-learning update to ``Q[cur_state, action]`` in place.

    Args:
        Q: 2-D NumPy array of action values, modified in place.
        cur_state: Row index of the state the action was taken in.
        next_state: Row index of the state reached afterwards.
        action: Column index of the action that was taken.
        reward: Reward received for the transition.
        gamma: Discount factor applied to the best value of ``next_state``.
        lr: Learning rate scaling the correction.

    Returns:
        None. The table ``Q`` is updated as a side effect.
    """
    old_value = Q[cur_state, action]
    best_next = np.max(Q[next_state, :])

    # Temporal-difference error: discounted target minus the current estimate.
    td_error = reward + gamma * best_next - old_value
    Q[cur_state, action] = old_value + lr * td_error

In [None]:
# Apply action to environment for reward
def update(action, cur_state, special_rule=False):
    """Roll the die once and score the agent's guess.

    Args:
        action: The guess: 0 = lower, 1 = higher, 2 = same.
        cur_state: Zero-based state of the current die (face value minus 1).
        special_rule: Accepted but not used by this function.

    Returns:
        tuple: ``(next_state, reward)`` where ``next_state`` is the zero-based
        state of the newly rolled die and ``reward`` is 1 for a correct guess
        and -1 otherwise.
    """
    # States are zero-based while die faces are one-based.
    cur_die = cur_state + 1
    next_die = random.randint(1, state_size)

    # Encode what actually happened using the same codes as the actions.
    if next_die < cur_die:
        outcome = 0
    elif next_die > cur_die:
        outcome = 1
    else:
        outcome = 2

    reward = 1 if action == outcome else -1
    return next_die - 1, reward


In [None]:
# Hyperparameters
episode_max = 10000  # number of training steps (one die roll per step)
gamma = 0.9  # discount factor for future reward
lr = 0.1  # learning rate
epsilon = 0.5  # initial exploration probability, decayed after every step

cur_state = 0  # start from die face 1
cur_episode = 0

# The loop variable counts completed steps, so it ends at episode_max.
for cur_episode in range(1, episode_max + 1):
    # Act, observe the roll and its reward, then learn from the transition.
    action = pick_action(cur_state, possible_actions, Q, epsilon)
    next_state, reward = update(action, cur_state)
    update_q(Q, cur_state, next_state, action, reward, gamma, lr)

    # Explore a little less on each step and continue from the new die.
    epsilon *= 0.99
    cur_state = next_state


In [None]:
# Display the learned Q-table: one row per current die face (1-6), with the
# columns holding the values of guessing lower, higher and same.
Q

array([[-0.1       ,  3.51083501, -0.23890985],
       [-0.181171  ,  3.20086943, -0.14245432],
       [-0.19405635,  2.89823776, -0.34692611],
       [ 2.99464376, -0.28519206, -0.2181431 ],
       [ 3.1378177 , -0.25716624, -0.22953163],
       [ 3.80684366, -0.23370692,  0.06632637]])

In [None]:
# test 1: one prediction compared with an independently drawn next die
possible_actions_txt = ['lower', 'higher', 'same']

# Draw the current die first, then the next one (both as zero-based states).
cur_state, next_state = (random.randint(0, state_size - 1) for _ in range(2))

# Ask the policy for its guess, using whatever value epsilon currently holds.
action = pick_action(cur_state, possible_actions, Q, epsilon)

print('current number', cur_state + 1)
print('predict next:', possible_actions_txt[action])
print('actual next number:', next_state + 1)

current number 3
predict next: higher
actual next number: 3


In [None]:
# test batch: measure how often the greedy policy guesses correctly
test_num_max = 1000
epsilon = 0  # always exploit
correct_num = 0

for i in range(test_num_max):
    # Start every trial from a uniformly random die face.
    cur_state = random.randint(0, state_size - 1)
    action = pick_action(cur_state, possible_actions, Q, epsilon)
    _, reward = update(action, cur_state)
    # A reward of 1 marks a correct guess; the boolean adds as 0 or 1.
    correct_num += (reward == 1)

print('accuracy:', correct_num/test_num_max)

accuracy: 0.685
