In [None]:
import numpy as np
import time
import matplotlib.pyplot as plt

# S&B 7.10

We are going to use the very simple example of a random walk on which the reward is $-1$ at each time step. The environment is deterministic and the policy is simply the probability of taking a right. This is similar to the Markov Reward Process described in earlier chapters, but we treat the transition probability as part of the policy rather than as a result of the environment. This way we can calculate the value function under one probability choice by simulating episodes using a different probability choice.

In [None]:
class RandomWalkEpisode():
    """
    One-dimensional random walk used to generate episodes.

    States are integers starting at 0. A step is +1 ("right") with
    probability `p_right` and -1 ("left") otherwise; the walk cannot move
    below state 0 and terminates once the state exceeds `width`.

    NOTE(review): the notebook text describes a reward of -1 per time step,
    but `reward` below returns +1 for a right move and -1 for a left move.
    Confirm which of the two is intended.
    """

    def __init__(self, p_right, width):
        # Probability of choosing the "right" action at every step
        self.p_right = p_right
        # Largest non-terminal state index
        self.width = width

    def initial_state(self):
        """Every episode starts in state 0."""
        return 0

    def check_terminal(self, s):
        """Return True when state `s` lies beyond the right edge."""
        return s > self.width

    def reward(self, s, a):
        """Reward for taking action `a`; the state `s` is not used."""
        if a == 1:
            return 1
        return -1

    def next_action_state_reward(self, s):
        """
        Sample one transition from non-terminal state `s`.

        Returns the tuple (action, next state, reward). Raises ValueError
        when `s` is already terminal.
        """
        if self.check_terminal(s):
            raise ValueError("Episode hasd already terminated.")

        # A single Bernoulli draw decides the direction of the step
        a = 1 if np.random.binomial(1, self.p_right) else -1

        # The walk is clamped at the left wall (state 0)
        s_prime = max(s + a, 0)
        r = self.reward(s, a)

        return a, s_prime, r
    
    
class NStepOffPolicyPrediction():

    """
    Tabular n-step off-policy TD prediction of state values, weighted by
    a single importance sampling ratio per update.

    This class maintains a queue which contains the
    last seen N + 1 states, actions, and rewards. When a
    state, action, and reward are outside of the n-step
    window they are removed from the queue. The queue will
    be shorter than length n if:

    1) The episode has less than n-steps
    2) The episode has just begun
    3) The episode is ending

    Index 0 of every queue corresponds to time step `tau`, the step whose
    state value is currently being updated.

    NOTE(review): once `T` is set, `add_observation` pops on every call even
    when the queue holds fewer than n + 1 entries, so episodes that end
    before the queue has filled look unsupported -- confirm before relying
    on case 1) above.
    """

    def __init__(self, n, alpha, gamma, width, b, pi):
        """
        n: number of steps in the return
        alpha: step size
        gamma: discount factor
        width: largest non-terminal state index (V has width + 1 entries)
        b: probability of "right" under the behaviour (data) policy
        pi: probability of "right" under the target policy
        """
        self.n = n
        self.alpha = alpha
        self.gamma = gamma
        self.states = [] # Store states since this is a tabular case
        self.rewards = []
        self.actions = []
        self.V = np.zeros(width + 1)

        # Probability of taking a right under each policy
        self.b = b
        self.pi = pi

        self.t = 0
        self.tau = self.t - self.n
        self.T = np.inf

        # For performance: gamma**0, ..., gamma**(n-1)
        self.gamma_factors = np.logspace(0, self.n - 1, self.n, base=self.gamma)

    def add_observation(self, observation_list, observation):
        """Slide one queue forward by a single time step (mutates the list)."""
        # We pop elements if the list is sufficiently long or if the episode has terminated
        if len(observation_list) > self.n or self.T != np.inf:
            observation_list.pop(0)

        # When None is passed as an observation it is not appended (handles post episode termination case)
        if observation is not None:
            observation_list.append(observation)

    def increment_time(self):
        """Advance both the environment clock `t` and the update clock `tau`."""
        self.t += 1
        self.tau += 1

    def add_timestep(self, state_t_1, action_t_1, reward_t_2):
        """
        Push one (state, action, reward) triple onto the three queues.

        Pass None for all three values once the episode has terminated; the
        queues then only shrink.
        """
        state_none = state_t_1 is None
        action_none = action_t_1 is None
        reward_none = reward_t_2 is None

        if state_none or action_none or reward_none:
            assert state_none and action_none and reward_none, \
                "If one state variable is `None` all should be."

        self.add_observation(self.states, state_t_1)
        self.add_observation(self.actions, action_t_1)
        self.add_observation(self.rewards, reward_t_2)

    def G(self):
        """
        n-step return for time `tau`: the discounted sum of the queued
        rewards, plus the discounted value of the state n steps ahead when
        that state is still inside the episode.
        """
        n = min(self.n, len(self.rewards)) # If less than n rewards left we truncate sum

        G = np.dot(self.gamma_factors[:n], self.rewards[:n])

        # Bootstrap only while the n-step horizon ends before termination
        if self.tau + self.n < self.T:
            s_tau_n = self.states[self.n]
            V = self.V[s_tau_n]
            G += self.gamma**n * V

        return G

    def rho(self):
        """
        Importance sampling ratio for the update at time `tau`.

        The ratio is the product of pi(A_k) / b(A_k) over every action inside
        the n-step window, A_tau ... A_{tau+n-1}, truncated at the end of the
        episode. All n actions are included (not n - 1): each of them
        influences the return G, and this matches the number of ratios used
        by the per-decision variant.
        """
        # Slicing truncates automatically when fewer than n actions remain
        took_right = np.array(self.actions[:self.n]) == 1
        pi_values = np.where(took_right, self.pi, 1 - self.pi)
        b_values = np.where(took_right, self.b, 1 - self.b)

        assert len(pi_values.shape) == 1 and len(b_values.shape) == 1, "pi and b tensors have wrong rank, {}, {}".format(pi_values, b_values)

        rho = np.prod(pi_values/b_values)

        return rho


    def V_update(self):
        """Return the increment to apply to V at the state stored at queue index 0."""
        G = self.G()
        rho = self.rho()
        s_tau = self.states[0]

        delta_V = self.alpha * rho * (G - self.V[s_tau])

        return delta_V

    def reset(self):
        """
        Resets the state of the off policy predictor

        All three queues and the clocks are cleared; the learned values in
        `V` are kept so that learning continues across episodes.
        """

        # The state queue must be emptied too, otherwise the last state of
        # the finished episode is carried into the next one.
        self.states = []
        self.rewards = []
        self.actions = []

        self.t = 0
        self.tau = self.t - self.n
        self.T = np.inf

    def check_done(self):
        """
        Return True (and reset the predictor) once the final update of the
        episode has been applied.
        """
        # Note that on the last iteration we pop first, then calculate the last G
        # THEN measure the length. This means we need to check the length equals 1
        # at the final timestep
        states_empty = len(self.states) == 1
        actions_empty = len(self.actions) == 1
        rewards_empty = len(self.rewards) == 1

        lists_empty = states_empty and actions_empty and rewards_empty
        time_limit = (self.tau == self.T - 1)

        # Just a consistency check that our implementation agrees with the pseudo-code
        # Note we expect to be one step ahead of pseudo-code when lists are empty
        assert (lists_empty == time_limit), "Time condition and queue being empty should match."

        if lists_empty:
            self.reset()

        return lists_empty

    def set_T_terminal(self):
        """Record the terminal time: the state reached by the current step is terminal."""
        self.T = self.t + 1


class RunSimulator():
    """
    Drives a value predictor with episodes produced by an episode simulator
    and records the value table after each episode.
    """

    def __init__(self, episodes, episode_simulator, value_predictor):
        self.episodes = episodes
        self.episode_simulator = episode_simulator
        self.value_predictor = value_predictor

    def simulate(self):
        """
        Run `self.episodes` episodes back to back.

        Returns an array with one row per episode holding the predictor's
        value table as of the last update applied during that episode.
        """
        env = self.episode_simulator
        predictor = self.value_predictor

        V_array = np.empty((self.episodes, env.width + 1))

        for episode in range(self.episodes):
            # The first transition is logged before entering the main loop
            s = env.initial_state()
            a, s_prime, r_prime = env.next_action_state_reward(s)
            predictor.add_timestep(s, a, r_prime)
            predictor.increment_time()
            s = s_prime

            episode_terminated = False

            while True:
                if episode_terminated:
                    # Nothing new to observe: feed None so the queues only drain
                    s = None
                    a = None
                    r_prime = None
                else:
                    # Take another step and find out whether it ended the episode
                    a, s_prime, r_prime = env.next_action_state_reward(s)
                    episode_terminated = env.check_terminal(s_prime)

                    if episode_terminated:
                        predictor.set_T_terminal()

                # Log the latest state, action and reward
                predictor.add_timestep(s, a, r_prime)

                # Updates start once tau has reached a valid time step
                if predictor.tau >= 0:
                    s_tau = predictor.states[0]
                    predictor.V[s_tau] += predictor.V_update()
                    V_array[episode] = predictor.V

                s = s_prime

                # Stop once every state of the episode has been updated;
                # otherwise advance t and tau for the next pass
                if predictor.check_done():
                    break
                predictor.increment_time()

        return V_array
                

In [None]:
def run(simulations, episodes, n, gamma, pi, b, width, alpha, per_decision=False):
    """
    Run `simulations` independent learning runs of `episodes` episodes each.

    Episodes are generated with the behaviour policy `b` (probability of a
    right move) while values are learned for the target policy `pi`. With
    `per_decision=True` the per-decision predictor is used instead of the
    standard one.

    Returns an array with axes (simulation, episode, state).
    """
    banner = "".join([
        "\nRUN SIMULATION WITH PARAMS:\n",
        "episodes: {}\n".format(episodes),
        "n: {}\n".format(n),
        "gamma: {}\n".format(gamma),
        "pi: {}\n".format(pi),
        "b: {}\n".format(b),
        "width: {}\n".format(width),
        "alpha: {}\n".format(alpha),
    ])
    print(banner)

    results = []

    batch_start = time.time()
    for simulation in range(simulations):
        # Data always comes from the behaviour policy b
        simulator = RandomWalkEpisode(b, width)

        if per_decision:
            predictor = NStepOffPolicyPerDecisionPrediction(n, alpha, gamma, width, b, pi)
        else:
            predictor = NStepOffPolicyPrediction(n, alpha, gamma, width, b, pi)

        results.append(RunSimulator(episodes, simulator, predictor).simulate())

        # Report wall-clock time after every tenth simulation
        finished = simulation + 1
        if finished % 10 == 0:
            elapsed = round(time.time() - batch_start, 2)
            print("Simulations #{}-{} Completed: {}s".format(finished - 10,
                                                             finished,
                                                             elapsed))
            batch_start = time.time()

    # Axes are simulation, episode, state
    return np.array(results)

def avg_mean_squared_error(V_array, V_ref):
    """
    Mean squared error against `V_ref` per episode, averaged over simulations.

    `V_array` has axes (simulation, episode, state); the result has one
    entry per episode.
    """
    squared_deviation = np.power(V_ref - V_array, 2)
    # Collapse the state axis first, leaving axes (simulation, episode)
    per_simulation_mse = squared_deviation.sum(axis=2) / len(V_ref)
    return per_simulation_mse.mean(axis=0)

def mean_squared_error(V_array, V_ref):
    """
    Mean squared error against `V_ref` for every episode of one simulation.

    `V_array` has axes (episode, state); the result has one entry per episode.
    """
    squared_deviation = np.power(V_ref - V_array, 2)
    return squared_deviation.sum(axis=1) / len(V_ref)

## I) Fixed Target Policy: Greedy Target Policy, Exploratory Data Policy

In [None]:
# Section I, standard formula: target policy pi(Right)=0.9, behaviour policy
# b(Right) swept over 0.9 (on-policy), 0.8, 0.7 and 0.6.
# Positional arguments of run: simulations, episodes, n, gamma, pi, b, width, alpha
V_on_policy, V_b_80, V_b_70, V_b_60 = (
    run(100, 1000, 4, 1, 0.9, b_right, 50, 0.05)
    for b_right in (0.9, 0.8, 0.7, 0.6)
)

# Reference values: final-episode on-policy estimate averaged over simulations
V_ref = np.mean(V_on_policy[:, -1, :], axis=0)

# Error curves averaged over all simulations
avg_MSE_on_policy, avg_MSE_b_80, avg_MSE_b_70, avg_MSE_b_60 = (
    avg_mean_squared_error(V_runs, V_ref)
    for V_runs in (V_on_policy, V_b_80, V_b_70, V_b_60)
)

# Error curves of the first simulation only
MSE_b_80, MSE_b_70, MSE_b_60 = (
    mean_squared_error(V_runs[0], V_ref)
    for V_runs in (V_b_80, V_b_70, V_b_60)
)

In [None]:
# Section I, standard formula: on-policy vs. off-policy error curves.
# Left column: MSE averaged over the simulations; right column: first simulation.
fig = plt.figure(figsize=(20, 10))

# One row per behaviour policy: (b(Right) in percent, averaged MSE, single-simulation MSE).
# The percentage in each label is taken from this table so that it always
# matches the plotted run (the b=0.7 row used to be labelled 50%).
panel_rows = (
    (80, avg_MSE_b_80, MSE_b_80),
    (70, avg_MSE_b_70, MSE_b_70),
    (60, avg_MSE_b_60, MSE_b_60),
)

for row, (b_percent, avg_curve, single_curve) in enumerate(panel_rows):
    fig.add_subplot(3, 2, 2 * row + 1)
    if row == 0:
        plt.title("Average: 100 Simulations", size=20)
    plt.plot(avg_MSE_on_policy, label="On-Policy")
    plt.plot(avg_curve, label="Off-Policy")
    plt.legend(prop={'size': 10})
    # Raw string: "\%" is not a valid Python escape sequence
    plt.ylabel(r"$b(Right)={}\%$".format(b_percent), size=20)

    fig.add_subplot(3, 2, 2 * row + 2)
    if row == 0:
        plt.title("Single Simulation", size=20)
    plt.plot(avg_MSE_on_policy, label="On-Policy")
    plt.plot(single_curve, label="Off-Policy")
    plt.legend(prop={'size': 10})

plt.tight_layout()

Our solution was diverging for a while, until we realized we were generating episodes using the target policy instead of the sample policy. Once we addressed that we did not have exponential divergence (as anticipated, since the expectation of G was then correct) but now our solutions converge to a high error for smaller $b$. We're not sure why this new problem is occurring. 

It turns out that replacing $G$ with $V(S_t)$ when $\rho_t$ equals zero is NOT equivalent to the control variate formula in the book (even though it seems conceptually like this would be the case). When we tried the latter our solution had a lot more variance, even in the $b=80\%$ case, which is the simplest and works with this formula...

It turns out the convergent but high error solutions were due to bugs in the implementation of the policy prediction algorithm. In essence, we were not logging the time or states correctly:
+ The time step $\tau$ was $+1$ ahead of where it should have been. This makes sense for Sarsa because of the fact we have $A_{t+1}$ at time $t$. 
+ We did not increment the time step after logging the initial state and action, so we were one time step behind from the get go.
+ Needed to fix how we handle termination since policy prediction does not follow the exact same termination sequence as Sarsa.

Those fixes were sufficient to get this working!

In [None]:
class NStepOffPolicyPerDecisionPrediction(NStepOffPolicyPrediction):
    """
    Variant of the n-step predictor that applies the importance sampling
    ratio step by step inside the return, with a control variate, rather
    than as one factor on the whole update.
    """

    def G(self):
        """Per-decision n-step return for the state at queue index 0."""
        horizon = min(self.n, len(self.rewards)) # Truncated when fewer than n rewards remain

        # The recursion starts from the bootstrapped value when the horizon
        # is still inside the episode and from zero otherwise
        if self.tau + self.n < self.T:
            G = self.V[self.states[self.n]]
        else:
            G = 0

        # Work backwards from the end of the window to its first step
        for i in reversed(range(horizon)):
            if self.actions[i] == 1:
                rho = self.pi / self.b
            else:
                rho = (1 - self.pi) / (1 - self.b)

            # The control variate uses the value of the state at this same step
            corrected_target = rho * (self.rewards[i] + self.gamma * G)
            G = corrected_target + (1 - rho) * self.V[self.states[i]]

        return G

    def V_update(self):
        """Return the increment for V; no outer ratio, it is already inside G."""
        return self.alpha * (self.G() - self.V[self.states[0]])

In [None]:
# Section I, per-decision formula: target policy pi(Right)=0.9, behaviour
# policy b(Right) swept over 0.9 (on-policy), 0.8, 0.7 and 0.6.
# Positional arguments of run: simulations, episodes, n, gamma, pi, b, width, alpha
V_on_policy_pd = run(100, 1000, 4, 1, 0.9, 0.9, 50, 0.05, per_decision=True)
V_b_80_pd = run(100, 1000, 4, 1, 0.9, 0.8, 50, 0.05, per_decision=True)
V_b_70_pd = run(100, 1000, 4, 1, 0.9, 0.7, 50, 0.05, per_decision=True)
V_b_60_pd = run(100, 1000, 4, 1, 0.9, 0.6, 50, 0.05, per_decision=True)

# Reference values come from the per-decision on-policy runs computed in this
# cell (previously the standard-formula runs were used here), mirroring how
# the per-decision references of sections II and III are built.
V_ref_pd = np.mean(V_on_policy_pd[:, -1, :], axis=0)

# Error curves averaged over all simulations
avg_MSE_on_policy_pd = avg_mean_squared_error(V_on_policy_pd, V_ref_pd)
avg_MSE_b_80_pd = avg_mean_squared_error(V_b_80_pd, V_ref_pd)
avg_MSE_b_70_pd = avg_mean_squared_error(V_b_70_pd, V_ref_pd)
avg_MSE_b_60_pd = avg_mean_squared_error(V_b_60_pd, V_ref_pd)

# Error curves of the first simulation only
MSE_b_80_pd = mean_squared_error(V_b_80_pd[0], V_ref_pd)
MSE_b_70_pd = mean_squared_error(V_b_70_pd[0], V_ref_pd)
MSE_b_60_pd = mean_squared_error(V_b_60_pd[0], V_ref_pd)

In [None]:
# Section I, per-decision formula: on-policy vs. off-policy error curves.
# Left column: MSE averaged over the simulations; right column: first simulation.
fig = plt.figure(figsize=(20, 10))

# One row per behaviour policy: (b(Right) in percent, averaged MSE, single-simulation MSE)
pd_panel_rows = (
    (80, avg_MSE_b_80_pd, MSE_b_80_pd),
    (70, avg_MSE_b_70_pd, MSE_b_70_pd),
    (60, avg_MSE_b_60_pd, MSE_b_60_pd),
)

for row, (b_percent, avg_curve, single_curve) in enumerate(pd_panel_rows):
    fig.add_subplot(3, 2, 2 * row + 1)
    if row == 0:
        plt.title("Average: 100 Simulations", size=20)
    plt.plot(avg_MSE_on_policy_pd, label="On-Policy")
    plt.plot(avg_curve, label="Off-Policy")
    plt.legend(prop={'size': 10})
    plt.ylabel(r"$b(Right)={}\%$".format(b_percent), size=20)

    fig.add_subplot(3, 2, 2 * row + 2)
    if row == 0:
        plt.title("Single Simulation", size=20)
    plt.plot(avg_MSE_on_policy_pd, label="On-Policy")
    plt.plot(single_curve, label="Off-Policy")
    plt.legend(prop={'size': 10})

plt.tight_layout()

Now we compare the per-decision formula's performance to the standard formula's performance. As we can see below, the per-decision formula performs better, as expected! The effect is extremely slight, however. We attribute this to the fact that the data-generating policy is much more exploratory, so it tends to collect more data than the target policy would have **per episode**. This in turn makes the performance per episode from both approaches likely near optimal.

It could be worth plotting performance of both methods against cumulative timestep rather than episode. The authors specified the per-decision methodology specifically to be more efficient per data sample. It may be that the gap is larger in per-decision space.

In [None]:
# Section I: standard vs. per-decision formula on a 4x4 grid.
# Columns: averaged MSE, first simulation, every standard run, every per-decision run.
fig = plt.figure(figsize=(20, 20))

mse_ylim = (10**(-9), 10**(4))

# --- Row 1: on-policy baseline ---
fig.add_subplot(4, 4, 1)
plt.title("Average: 100 Simulations", size=20)
plt.plot(avg_MSE_on_policy, label="On-Policy - Standard")
plt.plot(avg_MSE_on_policy_pd, label="On-Policy - Per Decision")
plt.legend(prop={'size': 10})
plt.ylabel(r"On-Policy: $b(Right)=90\%$", size=20)

fig.add_subplot(4, 4, 2)
plt.title("Single Simulation", size=20)
plt.plot(mean_squared_error(V_on_policy[0], V_ref_pd), label="On-Policy - Standard")
plt.plot(mean_squared_error(V_on_policy_pd[0], V_ref_pd), label="On-Policy - Per Decision")
plt.legend(prop={'size': 10})

on_policy_panels = (
    ("All Simulations: Standard", V_on_policy),
    ("All Simulations: Per Decision", V_on_policy_pd),
)
for panel, (heading, V_runs) in enumerate(on_policy_panels, start=3):
    fig.add_subplot(4, 4, panel)
    plt.title(heading, size=20)
    for index in range(100):
        plt.plot(mean_squared_error(V_runs[index], V_ref_pd))
    plt.yscale('log')
    plt.ylim(*mse_ylim)

# --- Rows 2-4: one row per off-policy behaviour policy ---
# (b(Right) in percent, avg standard, avg per-decision,
#  single standard, single per-decision, all standard runs, all per-decision runs)
off_policy_rows = (
    (80, avg_MSE_b_80, avg_MSE_b_80_pd, MSE_b_80, MSE_b_80_pd, V_b_80, V_b_80_pd),
    (70, avg_MSE_b_70, avg_MSE_b_70_pd, MSE_b_70, MSE_b_70_pd, V_b_70, V_b_70_pd),
    (60, avg_MSE_b_60, avg_MSE_b_60_pd, MSE_b_60, MSE_b_60_pd, V_b_60, V_b_60_pd),
)

for row, (b_percent, avg_std, avg_pd, single_std, single_pd, V_std, V_pd) in enumerate(off_policy_rows, start=1):
    first_panel = 4 * row + 1

    fig.add_subplot(4, 4, first_panel)
    plt.plot(avg_MSE_on_policy_pd, label="On-Policy")
    plt.plot(avg_std, label="Off-Policy - Standard")
    plt.plot(avg_pd, label="Off-Policy - Per-Decision")
    plt.legend(prop={'size': 10})
    plt.ylabel(r"$b(Right)={}\%$".format(b_percent), size=20)

    fig.add_subplot(4, 4, first_panel + 1)
    plt.plot(avg_MSE_on_policy_pd, label="On-Policy")
    plt.plot(single_std, label="Off-Policy - Standard")
    plt.plot(single_pd, label="Off-Policy - Per-Decision")
    plt.legend(prop={'size': 10})

    for offset, V_runs in enumerate((V_std, V_pd), start=2):
        fig.add_subplot(4, 4, first_panel + offset)
        for index in range(100):
            plt.plot(mean_squared_error(V_runs[index], V_ref_pd))
        plt.yscale('log')
        plt.ylim(*mse_ylim)

plt.tight_layout()

## II) Fixed Target Policy: Exploratory Target Policy, Greedy Data Policy

In [None]:
# Section II, standard formula: target policy pi(Right)=0.6, behaviour policy
# b(Right) swept over 0.9, 0.8, 0.7 and 0.6 (on-policy).
# Positional arguments of run: simulations, episodes, n, gamma, pi, b, width, alpha
ii_V_b_90, ii_V_b_80, ii_V_b_70, ii_V_on_policy = (
    run(100, 1000, 4, 1, 0.6, b_right, 50, 0.05)
    for b_right in (0.9, 0.8, 0.7, 0.6)
)

# Reference values: final-episode on-policy estimate averaged over simulations
ii_V_ref = np.mean(ii_V_on_policy[:, -1, :], axis=0)

# Error curves averaged over all simulations
ii_avg_MSE_b_90, ii_avg_MSE_b_80, ii_avg_MSE_b_70, ii_avg_MSE_on_policy = (
    avg_mean_squared_error(V_runs, ii_V_ref)
    for V_runs in (ii_V_b_90, ii_V_b_80, ii_V_b_70, ii_V_on_policy)
)

# Error curves of the first simulation only
ii_MSE_b_90, ii_MSE_b_80, ii_MSE_b_70 = (
    mean_squared_error(V_runs[0], ii_V_ref)
    for V_runs in (ii_V_b_90, ii_V_b_80, ii_V_b_70)
)

In [None]:
# Section II, per-decision formula: target policy pi(Right)=0.6, behaviour
# policy b(Right) swept over 0.9, 0.8, 0.7 and 0.6 (on-policy).
ii_V_b_90_pd, ii_V_b_80_pd, ii_V_b_70_pd, ii_V_on_policy_pd = (
    run(100, 1000, 4, 1, 0.6, b_right, 50, 0.05, per_decision=True)
    for b_right in (0.9, 0.8, 0.7, 0.6)
)

# Reference values: final-episode on-policy estimate averaged over simulations
ii_V_ref_pd = np.mean(ii_V_on_policy_pd[:, -1, :], axis=0)

# Error curves averaged over all simulations
ii_avg_MSE_b_90_pd, ii_avg_MSE_b_80_pd, ii_avg_MSE_b_70_pd, ii_avg_MSE_on_policy_pd = (
    avg_mean_squared_error(V_runs, ii_V_ref_pd)
    for V_runs in (ii_V_b_90_pd, ii_V_b_80_pd, ii_V_b_70_pd, ii_V_on_policy_pd)
)

# Error curves of the first simulation only
ii_MSE_b_90_pd, ii_MSE_b_80_pd, ii_MSE_b_70_pd = (
    mean_squared_error(V_runs[0], ii_V_ref_pd)
    for V_runs in (ii_V_b_90_pd, ii_V_b_80_pd, ii_V_b_70_pd)
)

Now we compare the per-decision formula's performance to the standard formula's performance. As we can see below, the per-decision formula performs worse... This is surprising given what the authors anticipated in the book. It is interesting that when we flip around the nature of the target and data-generating policies (exploratory vs. greedy) it changes the performance of the different approaches so much.

However, given the fact the results in the previous section make so much sense we feel confident about the results here.

In [None]:
# Section II: standard vs. per-decision formula on a 4x4 grid.
# Columns: averaged MSE, first simulation, every standard run, every per-decision run.
fig = plt.figure(figsize=(20, 20))

ii_mse_ylim = (10**(-5), 10**(8))
ii_column_titles = (
    "Average: 100 Simulations",
    "Single Simulation",
    "All Simulations: Standard",
    "All Simulations: Per Decision",
)

# --- Rows 1-3: one row per off-policy behaviour policy ---
# (b(Right) in percent, avg standard, avg per-decision,
#  single standard, single per-decision, all standard runs, all per-decision runs)
ii_off_policy_rows = (
    (90, ii_avg_MSE_b_90, ii_avg_MSE_b_90_pd, ii_MSE_b_90, ii_MSE_b_90_pd, ii_V_b_90, ii_V_b_90_pd),
    (80, ii_avg_MSE_b_80, ii_avg_MSE_b_80_pd, ii_MSE_b_80, ii_MSE_b_80_pd, ii_V_b_80, ii_V_b_80_pd),
    (70, ii_avg_MSE_b_70, ii_avg_MSE_b_70_pd, ii_MSE_b_70, ii_MSE_b_70_pd, ii_V_b_70, ii_V_b_70_pd),
)

for row, (b_percent, avg_std, avg_pd, single_std, single_pd, V_std, V_pd) in enumerate(ii_off_policy_rows):
    first_panel = 4 * row + 1
    titled = (row == 0)  # Only the top row carries the column headings

    fig.add_subplot(4, 4, first_panel)
    if titled:
        plt.title(ii_column_titles[0], size=20)
    plt.plot(ii_avg_MSE_on_policy_pd, label="On-Policy")
    plt.plot(avg_std, label="Off-Policy - Standard")
    plt.plot(avg_pd, label="Off-Policy - Per-Decision")
    plt.legend(prop={'size': 10})
    plt.ylabel(r"$b(Right)={}\%$".format(b_percent), size=20)

    fig.add_subplot(4, 4, first_panel + 1)
    if titled:
        plt.title(ii_column_titles[1], size=20)
    plt.plot(ii_avg_MSE_on_policy_pd, label="On-Policy")
    plt.plot(single_std, label="Off-Policy - Standard")
    plt.plot(single_pd, label="Off-Policy - Per-Decision")
    plt.legend(prop={'size': 10})

    for offset, V_runs in enumerate((V_std, V_pd), start=2):
        fig.add_subplot(4, 4, first_panel + offset)
        if titled:
            plt.title(ii_column_titles[offset], size=20)
        for index in range(100):
            plt.plot(mean_squared_error(V_runs[index], ii_V_ref_pd))
        plt.yscale('log')
        plt.ylim(*ii_mse_ylim)

# --- Row 4: on-policy baseline ---
fig.add_subplot(4, 4, 13)
plt.plot(ii_avg_MSE_on_policy, label="On-Policy - Standard")
plt.plot(ii_avg_MSE_on_policy_pd, label="On-Policy - Per Decision")
plt.legend(prop={'size': 10})
plt.ylabel(r"On-Policy: $b(Right)=60\%$", size=20)

fig.add_subplot(4, 4, 14)
plt.plot(mean_squared_error(ii_V_on_policy[0], ii_V_ref_pd), label="On-Policy - Standard")
plt.plot(mean_squared_error(ii_V_on_policy_pd[0], ii_V_ref_pd), label="On-Policy - Per Decision")
plt.legend(prop={'size': 10})

for panel, V_runs in enumerate((ii_V_on_policy, ii_V_on_policy_pd), start=15):
    fig.add_subplot(4, 4, panel)
    for index in range(100):
        plt.plot(mean_squared_error(V_runs[index], ii_V_ref_pd))
    plt.yscale('log')
    plt.ylim(*ii_mse_ylim)

plt.tight_layout()

## III) Fixed Data Generating Policy: Exponentially Small Target Policies 

We expect that the small sampling ratios occurring for most actions will bias the model towards previously seen values, since it will tend to bootstrap more. This should induce a lot of variance in the standard formula case, and should perform better in the per-decision case because of the control variate term. There will be a large amplifying effect for unlikely actions like taking a left, but we expect this to be overwhelmed by the sheer number of "right" actions taken.

In [None]:
# Section III, standard formula: behaviour policy fixed at b(Right)=0.9, target
# policy pi(Right) swept over 1, 0.1, 0.01, 0.001 and 0.9 (on-policy).
# Positional arguments of run: simulations, episodes, n, gamma, pi, b, width, alpha
iii_V_pi_0, iii_V_pi_1, iii_V_pi_2, iii_V_pi_3, iii_V_on_policy = (
    run(100, 1000, 4, 1, pi_right, 0.9, 50, 0.05)
    for pi_right in (1, 0.1, 0.01, 0.001, 0.9)
)

# Reference values: final-episode on-policy estimate averaged over simulations
iii_V_ref = np.mean(iii_V_on_policy[:, -1, :], axis=0)

# Error curves averaged over all simulations
(iii_avg_MSE_pi_0, iii_avg_MSE_pi_1, iii_avg_MSE_pi_2,
 iii_avg_MSE_pi_3, iii_avg_MSE_on_policy) = (
    avg_mean_squared_error(V_runs, iii_V_ref)
    for V_runs in (iii_V_pi_0, iii_V_pi_1, iii_V_pi_2, iii_V_pi_3, iii_V_on_policy)
)

# Error curves of the first simulation only
(iii_MSE_pi_0, iii_MSE_pi_1, iii_MSE_pi_2,
 iii_MSE_pi_3, iii_MSE_on_policy) = (
    mean_squared_error(V_runs[0], iii_V_ref)
    for V_runs in (iii_V_pi_0, iii_V_pi_1, iii_V_pi_2, iii_V_pi_3, iii_V_on_policy)
)

In [None]:
# Section III, per-decision formula: behaviour policy fixed at b(Right)=0.9,
# target policy pi(Right) swept over 1, 0.1, 0.01, 0.001 and 0.9 (on-policy).
iii_V_pi_0_pd, iii_V_pi_1_pd, iii_V_pi_2_pd, iii_V_pi_3_pd, iii_V_on_policy_pd = (
    run(100, 1000, 4, 1, pi_right, 0.9, 50, 0.05, per_decision=True)
    for pi_right in (1, 0.1, 0.01, 0.001, 0.9)
)

# Reference values: final-episode on-policy estimate averaged over simulations
iii_V_ref_pd = np.mean(iii_V_on_policy_pd[:, -1, :], axis=0)

# Error curves averaged over all simulations
(iii_avg_MSE_pi_0_pd, iii_avg_MSE_pi_1_pd, iii_avg_MSE_pi_2_pd,
 iii_avg_MSE_pi_3_pd, iii_avg_MSE_on_policy_pd) = (
    avg_mean_squared_error(V_runs, iii_V_ref_pd)
    for V_runs in (iii_V_pi_0_pd, iii_V_pi_1_pd, iii_V_pi_2_pd,
                   iii_V_pi_3_pd, iii_V_on_policy_pd)
)

# Error curves of the first simulation only
(iii_MSE_pi_0_pd, iii_MSE_pi_1_pd, iii_MSE_pi_2_pd,
 iii_MSE_pi_3_pd, iii_MSE_pi_on_policy_pd) = (
    mean_squared_error(V_runs[0], iii_V_ref_pd)
    for V_runs in (iii_V_pi_0_pd, iii_V_pi_1_pd, iii_V_pi_2_pd,
                   iii_V_pi_3_pd, iii_V_on_policy_pd)
)

In [None]:
# Section III: standard vs. per-decision formula on a 5x4 grid.
# Columns: averaged MSE, first simulation, every standard run, every per-decision run.
# The behaviour policy is fixed at b(Right)=0.9 in every row; rows 2-5 vary
# the target policy pi(Right).
fig = plt.figure(figsize=(20, 20))

# --- Row 1: on-policy baseline ---
fig.add_subplot(5, 4, 1)
plt.title("Average: 100 Simulations", size=20)
plt.plot(iii_avg_MSE_on_policy, label="On-Policy - Standard")
plt.plot(iii_avg_MSE_on_policy_pd, label="On-Policy - Per Decision")
plt.legend(prop={'size': 10})
plt.ylabel(r"On-Policy: $b(Right)=90\%$", size=20)

fig.add_subplot(5, 4, 2)
plt.title("Single Simulation", size=20)
# The "Standard" curve is the standard-formula run; the per-decision curve
# used to be plotted twice here.
plt.plot(iii_MSE_on_policy, label="On-Policy - Standard")
plt.plot(iii_MSE_pi_on_policy_pd, label="On-Policy - Per Decision")
plt.legend(prop={'size': 10})

# The "Standard" panel shows the standard-formula runs (it used to repeat the
# per-decision runs).
iii_on_policy_panels = (
    ("All Simulations: Standard", iii_V_on_policy),
    ("All Simulations: Per Decision", iii_V_on_policy_pd),
)
for panel, (heading, V_runs) in enumerate(iii_on_policy_panels, start=3):
    fig.add_subplot(5, 4, panel)
    plt.title(heading, size=20)
    for index in range(100):
        plt.plot(mean_squared_error(V_runs[index], iii_V_ref_pd))
    plt.yscale('log')

# --- Rows 2-5: one row per target policy ---
# (pi(Right) in percent, log-scale the first two panels?, avg standard,
#  avg per-decision, single standard, single per-decision,
#  all standard runs, all per-decision runs)
iii_target_rows = (
    ("100", False, iii_avg_MSE_pi_0, iii_avg_MSE_pi_0_pd,
     iii_MSE_pi_0, iii_MSE_pi_0_pd, iii_V_pi_0, iii_V_pi_0_pd),
    ("10", True, iii_avg_MSE_pi_1, iii_avg_MSE_pi_1_pd,
     iii_MSE_pi_1, iii_MSE_pi_1_pd, iii_V_pi_1, iii_V_pi_1_pd),
    ("1", True, iii_avg_MSE_pi_2, iii_avg_MSE_pi_2_pd,
     iii_MSE_pi_2, iii_MSE_pi_2_pd, iii_V_pi_2, iii_V_pi_2_pd),
    ("0.1", True, iii_avg_MSE_pi_3, iii_avg_MSE_pi_3_pd,
     iii_MSE_pi_3, iii_MSE_pi_3_pd, iii_V_pi_3, iii_V_pi_3_pd),
)

for row, (pi_percent, log_scale, avg_std, avg_pd, single_std, single_pd, V_std, V_pd) in enumerate(iii_target_rows, start=1):
    first_panel = 4 * row + 1

    fig.add_subplot(5, 4, first_panel)
    plt.plot(iii_avg_MSE_on_policy_pd, label="On-Policy")
    plt.plot(avg_std, label="Off-Policy - Standard")
    plt.plot(avg_pd, label="Off-Policy - Per-Decision")
    plt.legend(prop={'size': 10})
    # The swept quantity in this section is the target policy pi, not b
    plt.ylabel(r"$\pi(Right)={}\%$".format(pi_percent), size=20)
    if log_scale:
        plt.yscale('log')

    fig.add_subplot(5, 4, first_panel + 1)
    plt.plot(iii_avg_MSE_on_policy_pd, label="On-Policy")
    plt.plot(single_std, label="Off-Policy - Standard")
    plt.plot(single_pd, label="Off-Policy - Per-Decision")
    plt.legend(prop={'size': 10})
    if log_scale:
        plt.yscale('log')

    for offset, V_runs in enumerate((V_std, V_pd), start=2):
        fig.add_subplot(5, 4, first_panel + offset)
        for index in range(100):
            plt.plot(mean_squared_error(V_runs[index], iii_V_ref_pd))
        plt.yscale('log')

plt.tight_layout()

## IV) Fixed Target Policy: Exponentially Small Data Generating Policies

Tackling this case would require changing the MDP under consideration (to have termination on the left), which in turn would require a significant investment of time. We believe we have explored the problem sufficiently for our purposes thus far.

We would expect that in exploring this case we would end up with high-variance estimated value functions, given the amplifying effect of the importance sampling ratio for the most likely action "right". This should overwhelm the dampening effect of the importance sampling ratio on the "left" action, which is much less likely. The control variate should help stabilize the estimated results for left actions. Thus we expect that the per-decision formula would perform better in this case as well.

We defer exploration of this to another time -- maybe after refactoring the code above to make it way less repetitive.