In [112]:
import numpy as np
import pandas as pd
import time

In [113]:
np.random.seed(2)

N_STATE = 6
ACTIONS = ['left', 'right']
EPSILON = 0.9
ALPHA = 0.1
LAMBDA = 0.9
MAX_EPISODES = 13
FRESH_TIME = 0.2

In [114]:
def build_q_table(n_state, actions):
    table = pd.DataFrame(
        np.zeros((n_state, len(actions))),
        columns=actions
    )
    return table

In [115]:
build_q_table(N_STATE, ACTIONS)

Unnamed: 0,left,right
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0


In [116]:
def choose_action(state, q_table: pd.DataFrame):
    state_action = q_table.iloc[state, :]
    if np.random.uniform() > EPSILON or not state_action.all():
        return np.random.choice(ACTIONS)
    else:
        return ACTIONS[state_action.argmax()]

In [117]:
def get_env_feedback(state, action):
    new_state = 0
    reward = 0
    if action == 'right':
        if state == N_STATE - 2:
            new_state = 'terminal'
            reward = 1
        else:
            new_state = state + 1
    else:
        if  state:
            new_state = state - 1
    return new_state, reward

In [118]:
def update_env(state, episode, step_counter):
    env_list  = ['-'] * (N_STATE - 1) + ['T']
    if state == 'terminal':
        print(f'\rEpisode {episode + 1}: total step = {step_counter}', end='')
        time.sleep(2)
        print('\r                           ', end='')
    else:
        env_list[state] = 'o'
        print('\r' + ''.join(env_list), end='')
        time.sleep(FRESH_TIME)

In [119]:
def rl():
    q_table = build_q_table(N_STATE, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        state = 0
        is_tetminaled = False
        update_env(state, episode, step_counter)
        while not is_tetminaled:
            action = choose_action(state, q_table)
            new_state, reward = get_env_feedback(state, action)
            q_predict = q_table[action][state]
            if new_state != 'terminal':
                q_target = reward + LAMBDA * q_table.iloc[new_state, :].max()
            else:
                q_target = reward
                is_tetminaled = True

            q_table[action][state] += ALPHA * (q_target - q_predict)
            state = new_state
            step_counter += 1
            update_env(state, episode, step_counter)
    return q_table

In [120]:
Q_table = rl()
print('\r\nQ-table:\n')
Q_table

                           
Q-table:



Unnamed: 0,left,right
0,1e-06,0.005728
1,0.000271,0.032612
2,0.002454,0.111724
3,7.3e-05,0.343331
4,0.00081,0.745813
5,0.0,0.0
