In [0]:
import numpy as np
import matplotlib.pyplot as plt


p_h = 0.55
p_l = 0.45
c_l = 10.0
c_h = 50.0
prize = 1000.0

high = 0
low = 1

actions = [high, low]

# number of actions
n_actions = 2

# two outcomes
win = 0
lose = 1

n_outcomes = 2

# number of rounds
d = 3

# states encoded as 0, 1, ..., 2d
states = np.arange(0,2*d+1) 
n_states = len(states)


class Game:
    def __init__(self, init_state):
        self.initial_state = init_state
        self.state = self.initial_state
        self.reward = 0.0
        self.is_terminal = False

    def outcome(self, action):
        if action == high:
            return np.random.binomial(1, p_h)
        else:
            return np.random.binomial(1, p_l)

    def step(self, action):
        if action == high:
            self.reward = -c_h
        else:
            self.reward = -c_l    
      
        is_won = self.outcome(action)
        
        if self.state == 2*d-1 and is_won == 1:
            self.state += 1
            self.reward += prize
            self.is_terminal = True
        elif self.state == 1 and is_won != 1:
            self.state -= 1
            self.is_terminal = True
        else:
            if is_won:
                self.state += 1
            else:
                self.state -= 1
            self.is_terminal = False
        return self.state, self.reward, self.is_terminal
      
    def reset(self):
        self.state = self.initial_state
        self.reward = 0.0
        self.is_terminal = False
        return self.state


def eps_greedy_policy(qsa, epsilon=0.1):
    if np.random.binomial(1, epsilon) == 1:
        return np.random.choice(actions)
    else:
        return np.random.choice([action_ for action_, value_ in enumerate(qsa) if value_ == np.max(qsa)])


def random_policy():
    if np.random.binomial(1, 0.5) == 1:
      return high
    else:
      return low

In [0]:
init_state = d
env = Game(init_state)
episodes = 500000

values = np.zeros(n_states)
eligibility = np.zeros_like(values)

#
# Hyperparameter setting
#
lam = 0.9 # param for sarsa(lambda)
gamma = 1.0 # discount factor 
alpha = 0.0001 # step size

In [0]:
for e in range(episodes):
    state = env.reset()
    done = False

    while not done:
        a = random_policy()
        next_state, r, done = env.step(a)

        delta = r + gamma * values[next_state] - values[state]
        eligibility[state] += 1

        for s in range(1,2*d):
            values[s] += alpha * delta * eligibility[s]
            eligibility[s] = lam * gamma * eligibility[s]

        state = next_state

In [0]:
print('Value function found by on-line TD(lambda):\n', values[1:2*d])

Value function found by on-line TD(lambda):
 [ 17.22770893  92.30665632 230.14413728 425.89006091 678.91628238]
