In [29]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled "seaborn" style to "seaborn-v0_8" and later
# releases dropped the old name, so pick whichever one this installation has.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

# ---- Cliff-walking configuration -------------------------------------------

# Zero-filled array; only its shape (rows, columns) is used as the board extent.
GRID_SIZE = np.zeros(shape=(4, 12))

# (row, column) displacement of every move, indexed by action id.
ACTIONS = np.array([
    (-1, 0),   # 0: up
    (1, 0),    # 1: down
    (0, -1),   # 2: left
    (0, 1),    # 3: right
])

# Action ids, i.e. the valid indices into ACTIONS.
actions = np.arange(len(ACTIONS))

# Defaults picked up by q_learning(): episode count, step size (alpha) and
# discount factor (gamma).
EPOCHS = 250
STEP_SIZE = 0.5
DISCOUNT_RATE = 1

# (row, column) of the cell every episode starts from, and of the goal cell.
START = np.array((3, 0))
TARGET = np.array((3, 11))

class Environment:
    """Deterministic cliff-walking grid world.

    The class keeps no state of its own: the caller owns the agent position
    and passes it to :meth:`step` on every move.
    """

    # Cells that count as the cliff: a single row and an inclusive column range.
    CLIFF_ROW = 3
    CLIFF_COLS = (1, 10)

    def step(self, State, Action):
        """Apply ``Action`` to ``State`` and report the outcome of the move.

        Parameters
        ----------
        State : array-like of two ints
            Current (row, column) position. Not modified.
        Action : array-like of two ints
            (row, column) displacement, e.g. one row of ``ACTIONS``.

        Returns
        -------
        pos : numpy.ndarray
            Position after the move, kept inside the board. When the move
            lands on a cliff cell this is a fresh copy of ``START`` instead.
        reward : int
            ``-100`` for a cliff cell, ``-1`` for any other move.
        done : bool
            ``True`` when the move lands on ``TARGET``.
        """
        # Moves that would leave the board are cut back to the nearest edge cell.
        upper = np.array(GRID_SIZE.shape) - 1
        pos = np.clip(np.asarray(State) + np.asarray(Action), 0, upper)

        done = bool(np.array_equal(pos, TARGET))

        first_col, last_col = self.CLIFF_COLS
        if pos[0] == self.CLIFF_ROW and first_col <= pos[1] <= last_col:
            # Fell off the cliff: send the agent back to the start. A copy is
            # returned so that a caller mutating its state in place can never
            # corrupt the shared module-level START array.
            return np.array(START), -100, done
        return pos, -1, done

In [30]:
def ε_greedy_policy(t,pos,Q_table):
    """Choose an action id for position ``pos`` with a decaying ε-greedy rule.

    The exploration probability is ``max(0.1, 1/t)``: it starts at 1 for
    ``t = 1`` and shrinks with ``t`` down to a floor of 0.1.

    Parameters
    ----------
    t : int
        Positive counter driving the decay (``t = 0`` divides by zero).
    pos : sequence of two ints
        (row, column) of the current cell.
    Q_table : numpy.ndarray
        Action values indexed as ``[action, row, column]``.

    Returns
    -------
    Action id: a uniformly random entry of ``actions`` when exploring,
    otherwise the first index of the largest value stored for ``pos``.
    """
    draw = np.random.rand(1)
    epsilon = max(0.1, 1/t)

    # Exploration: the uniform draw fell inside the ε band.
    if draw <= epsilon:
        return np.random.choice(actions)

    # Exploitation: greedy with respect to the current estimates.
    row, col = pos[0], pos[1]
    return np.argmax(Q_table[:, row, col])

In [31]:
def q_learning(q_table, epochs = EPOCHS, alpha = STEP_SIZE, gamma = DISCOUNT_RATE):
    """Train ``q_table`` in place with tabular Q-learning.

    Each of the ``epochs`` episodes starts at ``START`` and runs until the
    environment reports ``done``. Actions come from ``ε_greedy_policy``, which
    receives the 1-based episode number as its decay counter.

    Parameters
    ----------
    q_table : numpy.ndarray
        Action values indexed as ``[action, row, column]``; updated in place.
    epochs : int
        Number of episodes to run.
    alpha : float
        Step size applied to the temporal-difference error.
    gamma : float
        Discount applied to the best value of the next state.

    Returns
    -------
    None
    """
    for episode in range(1, epochs + 1):
        env = Environment()
        state = START
        done = False
        while not done:
            action = ε_greedy_policy(episode, state, q_table)
            next_state, reward, done = env.step(state, ACTIONS[action])

            # Off-policy target: bootstrap from the best action in the next state.
            row, col = state[0], state[1]
            best_next = max(q_table[:, next_state[0], next_state[1]])
            td_target = reward + gamma * best_next
            q_table[action, row, col] += alpha * (td_target - q_table[action, row, col])

            state = next_state

In [32]:
# ε_greedy_policy draws from NumPy's global RNG; seeding it here makes the
# learned table identical on every Restart & Run All.
np.random.seed(0)

# One value per (action, row, column), all starting at zero.
q_table = np.zeros((len(ACTIONS),) + GRID_SIZE.shape)
q_learning(q_table)   # updates q_table in place

In [33]:
def print_optimal_policy(Q_table):
    """Print the greedy policy stored in ``Q_table`` as a grid of letters.

    One list is printed per grid row. Each cell shows the action with the
    largest value (``'U'``, ``'D'``, ``'L'`` or ``'R'`` for action ids 0-3;
    ties go to the lowest id), except the ``TARGET`` cell, which shows ``'G'``.

    Parameters
    ----------
    Q_table : numpy.ndarray
        Action values indexed as ``[action, row, column]``. The grid extent is
        taken from the array itself, so any board size works.

    Returns
    -------
    None
    """
    symbols = {0: 'U', 1: 'D', 2: 'L', 3: 'R'}
    _, n_rows, n_cols = Q_table.shape

    optimal_policy = []
    for i in range(n_rows):
        row = []
        for j in range(n_cols):
            if np.array_equal([i, j], TARGET):
                row.append('G')
                continue
            best_action = int(np.argmax(Q_table[:, i, j]))
            # Action ids without a letter are left out of the row.
            if best_action in symbols:
                row.append(symbols[best_action])
        optimal_policy.append(row)

    for row in optimal_policy:
        print(row)

In [34]:
# Show the greedy action for every cell of the trained table.
# Cliff cells are never occupied during training (landing on one sends the
# agent back to START), so their values stay at the initial zeros and argmax
# reports action 0, which is printed as 'U'.
print('Q-Learning Optimal Policy:')
print_optimal_policy(q_table)

Q-Learning Optimal Policy:
['D', 'R', 'U', 'R', 'R', 'R', 'R', 'R', 'D', 'R', 'R', 'D']
['D', 'U', 'R', 'R', 'R', 'R', 'D', 'R', 'R', 'D', 'D', 'D']
['R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'D']
['U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'U', 'G']
