In [1]:
# All settings are declared `const`: non-constant globals are untyped, which
# makes every read of them inside `step` and `episode` type-unstable.

# grid dimensions (rows, columns)
const WORLD_HEIGHT = 7
const WORLD_WIDTH = 10

# wind strength for each column, and the probability that the wind acts on a move
const WIND = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]
const WIND_PROBABILITY = 0.9

# action identifiers; they are also used as indices into the last axis of the
# action-value array
const ACTION_UP = 1
const ACTION_DOWN = 2
const ACTION_LEFT = 3
const ACTION_RIGHT = 4

# probability of picking a random action instead of a greedy one
const EPSILON = 0.1
# Sarsa step size
const ALPHA = 0.5
# reward received on every step
const REWARD = -1
# start and goal cells as [row, column]
const START = [4, 1]
const GOAL = [4, 8]
const ACTIONS = [ACTION_UP, ACTION_DOWN, ACTION_LEFT, ACTION_RIGHT]

4-element Array{Int64,1}:
 1
 2
 3
 4

In [2]:
"""
    step(state, action)

Return the cell reached from `state` (a `[row, column]` pair) by taking `action`.

With probability `WIND_PROBABILITY` the wind of the current column, `WIND[column]`,
is subtracted from the row index on top of the move itself; otherwise the move
is made without wind. The result is clamped to rows `1:WORLD_HEIGHT` and columns
`1:WORLD_WIDTH`.

# Throws
- `ArgumentError` if `action` is not one of `ACTION_UP`, `ACTION_DOWN`,
  `ACTION_LEFT` or `ACTION_RIGHT`.
"""
function step(state, action)
    i, j = state

    # The wind only acts on this move with probability WIND_PROBABILITY.
    wind = rand() < WIND_PROBABILITY ? WIND[j] : 0

    if action == ACTION_UP
        return [max(i - 1 - wind, 1), j]
    elseif action == ACTION_DOWN
        return [max(min(i + 1 - wind, WORLD_HEIGHT), 1), j]
    elseif action == ACTION_LEFT
        return [max(i - wind, 1), max(j - 1, 1)]
    elseif action == ACTION_RIGHT
        return [max(i - wind, 1), min(j + 1, WORLD_WIDTH)]
    end

    # Fail loudly instead of silently returning `nothing` for an unknown action.
    throw(ArgumentError("unknown action: $action"))
end

step (generic function with 1 method)

In [93]:
# Sanity check: moving down (action 2) from the bottom-left cell, where the wind is 0, stays in place.
step([7,1],2)

2-element Array{Int64,1}:
 7
 1

In [3]:
"""
    episode(q_value)

Run one Sarsa episode from `START` until `GOAL` is reached, updating the
action-value array `q_value` (indexed as `q_value[row, column, action]`) in
place, and return the number of time steps the episode took.
"""
function episode(q_value)
    # track the total time steps in this episode
    time = 0

    # initialize state and pick the first action
    state = START
    action = _choose_action(q_value, state)

    # keep going until get to the goal state
    while state != GOAL
        next_state = step(state, action)
        next_action = _choose_action(q_value, next_state)

        # Sarsa update
        target = REWARD + q_value[next_state[1], next_state[2], next_action]
        q_value[state[1], state[2], action] +=
            ALPHA * (target - q_value[state[1], state[2], action])

        state = next_state
        action = next_action
        time += 1
    end

    return time
end

# Choose an action for `state` epsilon-greedily with respect to `q_value`:
# a uniformly random action with probability EPSILON, otherwise a uniformly
# random pick among the actions whose value is maximal.
function _choose_action(q_value, state)
    if rand() < EPSILON
        return rand(ACTIONS)
    end
    action_values = @view q_value[state[1], state[2], :]
    best_value = maximum(action_values)
    return rand(findall(==(best_value), action_values))
end

episode (generic function with 1 method)

In [4]:
"""
    figure_6_3()

Run 300 Sarsa episodes on a single action-value array that starts at zero and
is shared across episodes, print the number of time steps each episode took,
and return those counts as a vector.
"""
function figure_6_3()
    q_value = zeros(WORLD_HEIGHT, WORLD_WIDTH, length(ACTIONS))
    episode_limit = 300

    # The comprehension yields a concretely typed vector instead of `Any[]`.
    steps = [episode(q_value) for _ in 1:episode_limit]

    print(steps)
    return steps
end


figure_6_3 (generic function with 1 method)

In [5]:
# Run the experiment; figure_6_3 prints the number of time steps of each episode.
a = figure_6_3()

Any[833, 257, 164, 299, 155, 198, 46, 49, 178, 154, 153, 39, 22, 113, 32, 44, 50, 43, 123, 35, 53, 57, 89, 34, 83, 25, 74, 160, 25, 39, 55, 43, 99, 35, 143, 149, 64, 115, 42, 96, 17, 55, 74, 49, 33, 24, 91, 29, 41, 25, 19, 54, 61, 47, 20, 20, 26, 67, 57, 28, 31, 40, 26, 20, 40, 24, 22, 47, 27, 47, 23, 20, 20, 38, 46, 51, 20, 31, 97, 40, 29, 30, 35, 43, 29, 30, 20, 19, 24, 32, 47, 41, 23, 31, 40, 39, 58, 52, 45, 50, 27, 21, 27, 21, 23, 32, 33, 21, 26, 30, 32, 23, 24, 26, 29, 22, 18, 31, 21, 31, 30, 51, 26, 33, 22, 21, 19, 21, 26, 34, 20, 19, 20, 18, 21, 21, 21, 19, 31, 33, 37, 24, 23, 24, 24, 15, 18, 30, 28, 17, 28, 20, 36, 19, 27, 19, 24, 54, 41, 26, 27, 16, 66, 44, 20, 18, 22, 24, 19, 23, 21, 27, 26, 27, 30, 24, 21, 31, 17, 38, 23, 25, 16, 34, 25, 26, 19, 21, 29, 12, 23, 25, 21, 19, 21, 19, 19, 15, 24, 18, 19, 33, 29, 30, 24, 23, 23, 21, 19, 21, 30, 19, 20, 21, 40, 21, 25, 45, 31, 29, 18, 35, 21, 24, 19, 22, 19, 19, 26, 15, 16, 16, 18, 15, 15, 22, 19, 26, 46, 20, 23, 18, 25, 28, 22, 2

```python
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# grid dimensions: number of rows, number of columns
WORLD_HEIGHT, WORLD_WIDTH = 7, 10

# strength of the wind in every column
WIND = [0, 0, 0, 1, 1, 1, 2, 2, 1, 0]

# the four possible actions, numbered 0..3
ACTION_UP, ACTION_DOWN, ACTION_LEFT, ACTION_RIGHT = range(4)
ACTIONS = [ACTION_UP, ACTION_DOWN, ACTION_LEFT, ACTION_RIGHT]

# exploration probability
EPSILON = 0.1

# step size of the Sarsa update
ALPHA = 0.5

# reward received on every step
REWARD = -1.0

# start and goal cells as [row, column]
START = [3, 0]
GOAL = [3, 7]

def step(state, action):
    """Return the cell reached from ``state`` by taking ``action``.

    ``state`` is a ``[row, column]`` pair. The wind strength of the current
    column is subtracted from the row on every move, and the result is kept
    inside the grid. An unknown action trips an assertion.
    """
    row, col = state

    if action == ACTION_UP:
        blown_row = row - 1 - WIND[col]
        return [max(blown_row, 0), col]

    if action == ACTION_DOWN:
        blown_row = min(row + 1 - WIND[col], WORLD_HEIGHT - 1)
        return [max(blown_row, 0), col]

    if action == ACTION_LEFT:
        return [max(row - WIND[col], 0), max(col - 1, 0)]

    if action == ACTION_RIGHT:
        return [max(row - WIND[col], 0), min(col + 1, WORLD_WIDTH - 1)]

    assert False

def _choose_action(q_value, state):
    """Pick an action for ``state`` epsilon-greedily with respect to ``q_value``.

    With probability ``EPSILON`` a random action is drawn from ``ACTIONS``;
    otherwise one of the actions with the maximal value is chosen at random.
    """
    if np.random.binomial(1, EPSILON) == 1:
        return np.random.choice(ACTIONS)
    values_ = q_value[state[0], state[1], :]
    # compute the maximum once instead of once per candidate action
    best_value = np.max(values_)
    return np.random.choice(
        [action_ for action_, value_ in enumerate(values_) if value_ == best_value])


# play for an episode
def episode(q_value):
    """Run one Sarsa episode from ``START`` until ``GOAL`` is reached.

    ``q_value`` is indexed as ``q_value[row, column, action]`` and is updated
    in place. Returns the number of time steps the episode took.
    """
    # track the total time steps in this episode
    time = 0

    # initialize state
    state = START

    # choose an action based on epsilon-greedy algorithm
    action = _choose_action(q_value, state)

    # keep going until get to the goal state
    while state != GOAL:
        next_state = step(state, action)
        next_action = _choose_action(q_value, next_state)

        # Sarsa update
        q_value[state[0], state[1], action] += \
            ALPHA * (REWARD + q_value[next_state[0], next_state[1], next_action] -
                     q_value[state[0], state[1], action])
        state = next_state
        action = next_action
        time += 1
    return time

def figure_6_3():
    """Run 500 Sarsa episodes, save the learning curve and print the greedy policy.

    The plot of episode count against cumulative time steps is written to
    '../images/figure_6_3.png'. The greedy action of every cell and the wind
    strengths are printed afterwards.
    """
    q_value = np.zeros((WORLD_HEIGHT, WORLD_WIDTH, 4))
    episode_limit = 500

    # time steps taken by each episode, in order; q_value is shared across episodes
    steps = [episode(q_value) for _ in range(episode_limit)]

    # cumulative time steps on the x axis, episode count on the y axis
    steps = np.add.accumulate(steps)
    plt.plot(steps, np.arange(1, len(steps) + 1))
    plt.xlabel('Time steps')
    plt.ylabel('Episodes')

    plt.savefig('../images/figure_6_3.png')
    plt.close()

    # display the optimal policy
    symbols = {ACTION_UP: 'U', ACTION_DOWN: 'D', ACTION_LEFT: 'L', ACTION_RIGHT: 'R'}
    optimal_policy = []
    for i in range(WORLD_HEIGHT):
        row = []
        for j in range(WORLD_WIDTH):
            if [i, j] == GOAL:
                row.append('G')
            else:
                symbol = symbols.get(np.argmax(q_value[i, j, :]))
                if symbol is not None:
                    row.append(symbol)
        optimal_policy.append(row)

    print('Optimal policy is:')
    for row in optimal_policy:
        print(row)
    print('Wind strength for each column:\n{}'.format(list(map(str, WIND))))

# Run the experiment only when this file is executed as a script.
if __name__ == '__main__':
    figure_6_3()
```