In [0]:
import numpy as np

# Problem definition: module-level constants shared by every cell below.

# Probability that a single round is won under each action.
p_h, p_l = 0.55, 0.45

# Payoff added when the top state is reached, and the per-round cost of each action.
prize = 1000
c_h, c_l = 50, 10

# Action encoding; the integer is also the column index used for the Q-table.
high, low = 0, 1
actions = [high, low]
n_actions = len(actions)

# Labels for the two possible round outcomes.
# NOTE(review): Game.outcome() returns 1 for a won round, so these labels are
# not the values it returns - confirm which encoding is intended.
win, lose = 0, 1
n_outcomes = 2

# Half-width of the state space: the start state is d, the end states are 0 and 2*d.
d = 3

# States are the integers 0, 1, ..., 2*d.
states = np.arange(2 * d + 1)
n_states = states.size

In [0]:
class Game:
    """Episodic random-walk game played on integer states.

    Every call to :meth:`step` plays one round: the cost of the chosen action
    is charged, a Bernoulli draw decides whether the round is won, and the
    state moves up by one (won) or down by one (lost). The episode ends when
    a win is scored from state ``2*d - 1`` or a loss is suffered in state ``1``.
    """

    def __init__(self, init_state):
        """Create a game whose episodes always start from ``init_state``."""
        self.initial_state = init_state
        self.reset()

    def outcome(self, action):
        """Draw one round; returns 1 if the round is won and 0 otherwise.

        The win probability is ``p_h`` for the ``high`` action and ``p_l``
        for any other action.
        """
        win_probability = p_h if action == high else p_l
        return np.random.binomial(1, win_probability)

    def step(self, action):
        """Play one round with ``action``.

        Returns:
            tuple: ``(state, reward, is_terminal)`` after the round. The
            reward is minus the cost of the action, plus ``prize`` when the
            round is won from state ``2*d - 1``.
        """
        # The action is paid for whether or not the round is won.
        self.reward = -c_h if action == high else -c_l

        round_won = bool(self.outcome(action))

        # Termination is decided from the state the round was played in.
        came_from = self.state
        reached_top = round_won and came_from == 2 * d - 1
        reached_bottom = (not round_won) and came_from == 1

        self.state = came_from + 1 if round_won else came_from - 1
        if reached_top:
            self.reward += prize
        self.is_terminal = bool(reached_top or reached_bottom)

        return self.state, self.reward, self.is_terminal

    def reset(self):
        """Return to the initial state, clear reward and terminal flag; returns the state."""
        self.state, self.reward, self.is_terminal = self.initial_state, 0.0, False
        return self.state


def behavior_policy():
    """Sample the exploratory action: 0 or 1, each with probability 0.5."""
    coin_flip = np.random.binomial(n=1, p=0.5)
    return coin_flip


In [0]:
# Seed NumPy's global generator before the first random draw in the notebook so
# that "Restart Kernel -> Run All" reproduces the same learned values.
seed = 0
np.random.seed(seed)

# Every episode starts from the middle state d.
init_state = d
env = Game(init_state)
episodes = 500000

# Action-value estimates Q(s, a) and the cumulative importance weights C(s, a)
# that serve as denominators in the weighted importance-sampling update.
q_values = np.zeros((n_states, n_actions))
c_values = np.zeros_like(q_values)

# Deterministic target policy: one action index per state. Kept as integers so
# an entry can be used directly to index the action axis of q_values.
target_policy = np.zeros(n_states, dtype=int)

# Start greedy with respect to the initial Q-table, breaking ties at random.
for state in states:
    best_value = np.max(q_values[state, :])
    greedy_actions = [action_ for action_, value_ in enumerate(q_values[state, :]) if value_ == best_value]
    target_policy[state] = np.random.choice(greedy_actions)

# No discount
gamma = 1.0

In [0]:
# Off-policy Monte Carlo control with weighted importance sampling.

# Probability with which behavior_policy() selects any single action. It is the
# denominator of the importance-sampling ratio and must be kept in sync with
# behavior_policy().
behavior_prob = 0.5

for _ in range(episodes):
    # Generate one episode by following the behavior policy.
    state = env.reset()
    history = []
    done = False

    while not done:
        a = behavior_policy()
        next_state, r, done = env.step(a)

        # Record the state the action was taken in, not the state it led to.
        history.append((state, a, r))
        state = next_state

    # Replay the episode backwards, accumulating the return g and the
    # importance weight w of the tail that has been processed so far.
    g = 0.0
    w = 1.0
    for state, action, reward in reversed(history):
        g = gamma * g + reward
        c_values[state, action] += w
        q_values[state, action] += w * (g - q_values[state, action]) / c_values[state, action]

        # Policy improvement: act greedily on the updated estimates.
        # NOTE(review): ties are broken at random here, whereas the textbook
        # pseudocode asks for consistent tie-breaking - confirm this is intended.
        best_value = np.max(q_values[state, :])
        greedy_actions = [action_ for action_, value_ in enumerate(q_values[state, :]) if value_ == best_value]
        target_policy[state] = np.random.choice(greedy_actions)

        # Once the behavior action differs from the target action, the
        # importance ratio for all earlier steps is zero, so stop here.
        if target_policy[state] != action:
            break

        # The target policy is deterministic, so pi(a|s) = 1 and the ratio is 1 / b(a|s).
        w /= behavior_prob
     


In [0]:
# Action-value estimates for states 1 .. 2*d-1, one row per state; states 0 and
# 2*d only occur as episode end points and are left out.
# Column 0 is the `high` action, column 1 the `low` action.
print(q_values[1:2*d,:])

[[ 36.67129386  60.41652975]
 [137.8118823  165.30151148]
 [317.45841813 310.08430213]
 [552.28495921 545.06155618]
 [773.04028586 785.258022  ]]


In [0]:
# Action stored by the target policy for each of the states 1 .. 2*d-1
# (0 = high, 1 = low).
print(target_policy[1:2*d])

[1. 1. 0. 0. 1.]
