In [0]:
import numpy as np

# Define global variables for the problem
p_h = 0.55
p_l = 0.45
prize = 1000

c_h = 1
c_l = 0
energy_full = 10
energy_inc = 2
energy_prob = 0.2

high = 0
low = 1

# number of actions
n_actions = 2
actions = [high, low]

# two outcomes
win = 0
lose = 1

n_outcomes = 2

# number of rounds
d = 3


# state consists of round win difference and energy level; we abuse this by referring to round win difference as state

# states encoded as 0, 1, ..., 2d
states = np.arange(0,2*d+1) 
n_states = len(states)

In [0]:
def check_outcome(action):
    if action == high:
        is_won = np.random.binomial(1, p_h)
    else:
        is_won = np.random.binomial(1, p_l)
    return is_won


class Game:
    def __init__(self, init_state, full_energy, energy_prob, energy_added):
        self.initial_state = init_state
        self.state = self.initial_state
        self.reward = 0.0
        self.is_terminal = False
        self.full_energy = energy_full
        self.current_energy = self.full_energy
        self.energy_prob = energy_prob
        self.energy_added = energy_inc

    def energy_recovery(self):
        if np.random.binomial(1, self.energy_prob) == 1:
            return min(self.full_energy, self.current_energy + self.energy_added)
        else:
            return self.current_energy

    def step(self, action):
        is_won = check_outcome(action)
        if self.state == 2*d-1 and is_won:
            self.state += 1
            self.reward = prize
            self.is_terminal = True
        elif self.state == 1 and not is_won:
            self.state -= 1
            self.reward = 0.0
            self.is_terminal = True
        else:
            if is_won and action == high:
                self.state += 1
                self.is_terminal = False
            elif is_won and action == low:
                self.state += 1
                self.is_terminal = False
            elif not is_won and action == high:
                self.state -= 1
                self.is_terminal = False
            else:
                self.state -= 1
                self.is_terminal = False
            self.reward = 0.0

        if action == high:
            cost = c_h
        else:
            cost = c_l
          
        self.current_energy = max(self.current_energy - cost, 0)
        self.current_energy = self.energy_recovery()

        return self.state, self.current_energy, self.reward, self.is_terminal

    def reset(self):
        self.state = self.initial_state
        self.reward = 0.0
        self.is_terminal = False
        self.current_energy = self.full_energy
        return self.state, self.current_energy


def eps_greedy_policy(qsa, energy_level, epsilon=0.1):
    if energy_level < c_h:
        return low
    else:
        if np.random.binomial(1, epsilon) == 1:
            return np.random.choice(actions)
        else:
            return np.random.choice([action_ for action_, value_ in enumerate(qsa) if value_ == np.max(qsa)])

In [0]:
init_state = d
episodes = 500000

q_values = np.zeros((n_states, energy_full+1, n_actions))
eligibility = np.zeros_like(q_values)

#
# Hyperparameter setting
#
lam = 0.9 # param for sarsa(lambda)
gamma = 0.9 # discount factor 
alpha = 0.001 # step size


In [0]:
env = Game(init_state, energy_full, energy_prob, energy_inc)

for e in range(episodes):
    state, energy = env.reset()
    done = False
    action = eps_greedy_policy(q_values[state, energy, :], energy)
        
    while not done:
        next_state, next_energy, r, done = env.step(action)
        next_a = eps_greedy_policy(q_values[next_state, next_energy, :], next_energy)

        delta = r + gamma * q_values[next_state, energy, next_a] - q_values[state, energy, action]

        eligibility[state, energy, action] += 1

        for s in range(1,2*d):
          for en in range(energy_full+1):
            for a in range(n_actions):
                q_values[s, en, a] += alpha * delta * eligibility[s, en, a]
                eligibility[s, en, a] = gamma * lam * eligibility[s, en, a]

        state = next_state
        energy = next_energy
        action = next_a

In [0]:
print('Q values:\n', q_values[1:2*d])

optimal_policy = np.zeros((n_states,energy_full+1))

for state in range(0,2*d+1):
   for energy in range(0,energy_full+1):
      optimal_policy[state,energy] = np.argmax(q_values[state,energy,:])

print('Optimal policy:\n', optimal_policy[1:2*d])

Q values:
 [[[  0.          17.02427388]
  [ 22.43449882   1.49634712]
  [ 29.31680089   6.54855325]
  [ 17.54035569   5.67410212]
  [ 28.2833465   16.21981854]
  [ 25.60300972  49.03133304]
  [ 64.55225125  38.30094595]
  [ 73.27014168  53.02827195]
  [ 99.51828826  69.79246498]
  [ 87.79254081  60.03583213]
  [ 82.39985448  64.8114107 ]]

 [[  0.          48.21269964]
  [ 15.9118208   35.43575774]
  [ 42.79264786  69.54584533]
  [ 13.40610077  34.96736973]
  [113.8895724   48.60971978]
  [113.65028703 117.06424662]
  [103.6602832  125.50495232]
  [163.44958207 144.15734499]
  [207.8795699  168.7800485 ]
  [182.44420517 164.1674594 ]
  [187.70972634 148.88083235]]

 [[  0.         107.29006038]
  [  8.34219453  57.91490528]
  [ 86.86110908 150.85153687]
  [ 67.00435415  31.81690423]
  [218.03410257 110.38556314]
  [245.76946617 185.91898031]
  [215.59824661 204.34024208]
  [273.26875038 251.54451986]
  [337.22713435 307.16759145]
  [318.61892791 248.72948572]
  [307.71954257 279.38078