## Import libraries

In [1]:
import numpy as np
import pandas as pd
import time

## Set variables

In [2]:
# Seed NumPy's global random stream so every run draws the same numbers.
np.random.seed(2)

# Environment layout
actions = ['left', 'right']  # moves available to the agent; also the Q-table column labels
states = 6                   # number of cells on the track; the last cell is the goal 'T'

# Training schedule
episodes = 3                 # how many episodes the training loop runs

# Learning hyper-parameters
alpha = 0.1                  # learning rate: fraction of the error applied per Q update
gamma = 0.9                  # discount applied to the next state's best Q value
epsilon = 0.9                # greedy threshold: a uniform draw above it triggers a random action

## Choose action

In [3]:
def choose_action(state, q_table):
    """Pick the next action for ``state`` using an epsilon-greedy rule.

    Reads the module-level ``epsilon`` and ``actions``. A uniform draw from
    [0, 1) is compared with ``epsilon``: a draw above it means "explore"
    (random action), otherwise the action with the largest Q value is taken.
    A state whose Q values are all still zero is always explored, because
    there is nothing learned to exploit yet.

    Parameters
    ----------
    state : int
        Positional row of ``q_table`` for the current state.
    q_table : pandas.DataFrame
        One row per state, one column per action label.

    Returns
    -------
    str
        A label drawn from ``actions`` when exploring, or the column label
        holding the row's maximum when exploiting.
    """
    q_values = q_table.iloc[state]

    # Check every entry instead of the row sum: Q values of opposite sign can
    # add up to zero even though the state has already been learned.
    untouched = (q_values == 0).all()

    # The uniform draw stays first in the `or`, so it is consumed on every
    # call and the random sequence matches the sum-based version.
    if np.random.uniform() > epsilon or untouched:
        return np.random.choice(actions)  # explore - random

    return q_values.idxmax()  # exploit - max Q value for this state

## Get new state and reward from environment

In [4]:
def get_env_feedback(state, action):
    """Apply ``action`` to ``state`` and report the outcome.

    Reads the module-level ``states`` (number of cells on the track).

    Parameters
    ----------
    state : int
        Current position on the track.
    action : str
        ``'right'`` or ``'left'``; any other value leaves the state unchanged.

    Returns
    -------
    tuple
        ``(new_state, reward)``. The reward is 1 only for a move to the right
        that lands on the last cell, otherwise 0.
    """
    if action == 'right':
        moved = state + 1
        # Only the step that arrives on the final cell is rewarded.
        return moved, (1 if moved + 1 == states else 0)

    if action == 'left':
        # The leftmost cell is a wall: stay put instead of going negative.
        return (state - 1 if state != 0 else state), 0

    return state, 0

## Display environment

In [5]:
def display_environment(state):
    """Draw the track as one line of text and pause briefly.

    Reads the module-level ``states``. The last cell is always drawn as
    ``'T'`` (taking precedence over the agent), the agent's cell as ``'o'``
    and every other cell as ``'-'``. The line ends with a carriage return
    instead of a newline, so the next frame is written over this one.

    Parameters
    ----------
    state : int
        Current position of the agent.
    """
    goal = states - 1
    cells = [
        'T' if idx == goal else 'o' if idx == state else '-'
        for idx in range(states)
    ]

    print(''.join(cells), end="\r")
    time.sleep(0.25)  # hold the frame long enough to be seen

## Main loop: train the agent with Q-learning

In [6]:
# Q table: one row per state, one column per action; every estimate starts at zero.
q_table = pd.DataFrame(np.zeros((states, len(actions))), columns=actions)

for episode in range(episodes):

    terminated = False
    state_0 = 0   # each episode starts from the leftmost cell
    counter = 0   # number of steps taken in this episode

    while not terminated:

        action = choose_action(state_0, q_table)

        state_1, reward = get_env_feedback(state_0, action)

        if (reward == 1):
            # The rewarded step ends the episode, so there is no next-state
            # value to add to the target.
            q_target = reward
            terminated = True

        else:
            # Bootstrapped target: immediate reward plus the discounted best
            # estimate available from the next state.
            q_target = reward + (gamma * q_table.iloc[state_1].max())

        # update Q table: move the estimate a fraction alpha toward the target
        q_predict = q_table.loc[state_0, action]
        q_table.loc[state_0, action] += alpha * (q_target - q_predict)

        state_0 = state_1
        display_environment(state_0)
        counter += 1

    # display_environment ends every frame with "\r", so the cursor sits at the
    # start of the last frame. Left-pad the step count to the track width so it
    # overwrites the whole frame instead of leaving its tail behind the number.
    print(f"\r{counter:<{states}}")

print(q_table)

38---T
22---T
9----T
   left    right
0   0.0  0.00000
1   0.0  0.00000
2   0.0  0.00081
3   0.0  0.02520
4   0.0  0.27100
5   0.0  0.00000
