In [None]:
import copy
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# S&B 8.4

While running Dyna-Q, Dyna-Q+, and Dyna-Q+ Modified on the maze example, we noticed several issues:
+ When the episode is terminated at an arbitrary timestep, the policy implied by the Q function in the Dyna-Q+ and Dyna-Q+ Modified cases is often not the optimal policy; it is a noisy, exploratory policy. The question is: how can we consistently retrieve the optimal policy at an arbitrary termination point? **Maybe we can just save the most recent policy from the episode with the highest reward!**
+ There is a lot of variance in the number of timesteps required to complete the first episode, which makes comparison between the methods difficult. **We are trying to address this by averaging over many runs, but the variance may still be too large.**
+ In the case of Dyna-Q+, the addition of the $\tau$ bonus is cumulative across simulated timesteps, which seems like it could quickly drown out any actual simulated signal. **We need to add the $\tau$ bonus to Q and then remove it again somehow. Maybe we subtract the previous $\tau$ bonus? I think this is breaking Dyna-Q+.**

I think I have gotten enough out of this to pause for now; if I feel the need to come back to this exercise later, I will. The main point is that Dyna-Q+ can handle non-stationary situations. I don't think we got the exercise clean enough to understand the difference between Dyna-Q+ and Dyna-Q+ Modified, but we can come back to that if it makes sense.

In [None]:
# Build the 9x6 maze, indexed as [x, y]: non-zero cells are free, 0 marks a wall.
gridworld = np.full((9, 6), 1.0)
gridworld[1:, 2] = 0  # wall along y = 2; only the cell at x = 0 stays open

# Start and goal are kept as index tuples so they can be used directly for
# fancy-indexed assignment. The fractional values only give the two cells
# distinct colours in the heatmap.
start, end = ([3], [0]), ([8], [5])
gridworld[start] = 0.25
gridworld[end] = 0.75

# Transpose so that x runs horizontally, then flip so that y = 0 is at the bottom.
ax = sns.heatmap(gridworld.T)
ax.invert_yaxis()

In [None]:
# Layout after the environment change: same maze, but wall cell (8, 2) is opened.
gridworld_after = gridworld.copy()
gridworld_after[8, 2] = 1

# Same orientation as the first heatmap: x horizontal, y = 0 at the bottom.
ax = sns.heatmap(gridworld_after.T)
ax.invert_yaxis()

In [None]:
# This would be better as a class, but I'm just saving time
def dyna_q(gridworld, dyna_mode = "dyna", epsilon = 0.1, gamma = 0.95,
           alpha = 0.1, n = 50, timesteps = 6000, kappa = 0.1, verbose=True):
    """Run tabular Dyna-Q, Dyna-Q+ or the modified Dyna-Q+ on the 9x6 maze.

    Every episode starts at (3, 0). The transition that enters the goal cell
    (8, 5) earns a reward of 1 and ends the episode; every other transition
    earns 0. Moves that would leave the grid or enter a wall leave the agent
    where it is. Halfway through the run the wall cell (8, 2) is opened.

    Parameters
    ----------
    gridworld : np.ndarray, shape (9, 6)
        Maze indexed as [x, y]; cells equal to 0 are walls. The array is
        copied and never modified.
    dyna_mode : {"dyna", "dyna_plus", "dyna_plus_mod"}
        "dyna" is plain Dyna-Q. "dyna_plus" adds ``kappa * sqrt(tau)`` to the
        reward of simulated transitions. "dyna_plus_mod" instead adds that
        bonus to the action values during greedy action selection.
    epsilon : float
        Probability of taking a uniformly random action.
    gamma : float
        Discount factor.
    alpha : float
        Step size of the Q-learning updates.
    n : int
        Number of simulated (planning) updates per real step.
    timesteps : int
        Number of real environment steps.
    kappa : float
        Scale of the exploration bonus (ignored in "dyna" mode).
    verbose : bool
        If True, print timing after every tenth of the run.

    Returns
    -------
    Q : np.ndarray, shape (9, 6, 4)
        Learned action values, indexed [x, y, action].
    rewards : np.ndarray, shape (timesteps,)
        Reward received at each real step.
    model_s_prime : np.ndarray, shape (9, 6, 4, 2)
        Modelled next state per state-action; -1 where nothing is modelled.
    model_reward : np.ndarray, shape (9, 6, 4)
        Modelled reward per state-action; -1 where nothing is modelled.
    """
    assert dyna_mode in ["dyna", "dyna_plus", "dyna_plus_mod"], "dyna_mode must take allowed value"

    map_instance = copy.deepcopy(gridworld)

    n_x, n_y = 9, 6
    start_state = np.array([3, 0])
    goal_state = np.array([8, 5])
    # Action index -> displacement: 0 left, 1 right, 2 up, 3 down
    action_array = np.array([[-1, 0], [1, 0], [0, 1], [0, -1]])
    num_actions = len(action_array)

    rewards = []
    # x, y, action
    Q = np.zeros((n_x, n_y, num_actions), dtype=float)
    tau = np.zeros((n_x, n_y, num_actions), dtype=int)
    # -1 marks "not modelled yet": the problem has no negative coordinates or rewards
    model_s_prime = np.full((n_x, n_y, num_actions, 2), -1, dtype=int)
    model_reward = np.full((n_x, n_y, num_actions), -1, dtype=int)

    # Guard against a zero reporting interval when timesteps < 10
    report_every = max(1, int(timesteps / 10))

    episode_start = True
    t_i = time.time()
    for step in range(timesteps):
        if episode_start:
            s = start_state
            episode_start = False
        else:
            s = s_prime

        if step == int(timesteps/2) - 1:
            # Open the right side of the map
            map_instance[8, 2] = 1

        if verbose and (step + 1) % report_every == 0:
            t_f = time.time()
            print("Steps #{}-{} Completed: {}s".format(step + 1 - report_every,
                                                       step + 1, round(t_f - t_i, 2)))
            t_i = t_f

        # Epsilon-greedy action selection over ALL four actions
        if np.random.binomial(1, epsilon):
            action = np.random.randint(0, num_actions)
        else:
            action_vals = Q[s[0], s[1]]
            if dyna_mode == "dyna_plus_mod":
                action_vals = action_vals + kappa * np.sqrt(tau[s[0], s[1]])
            # Break ties between equally good actions at random
            action = np.random.choice(np.flatnonzero(action_vals == action_vals.max()))

        s_prime = s + action_array[action]
        reward = 0

        off_grid = not (0 <= s_prime[0] < n_x and 0 <= s_prime[1] < n_y)
        # `or` short-circuits, so the map is never indexed with an off-grid position
        if off_grid or map_instance[s_prime[0], s_prime[1]] == 0:
            # No movement if we leave the grid or hit a wall
            s_prime = s
        elif np.array_equal(s_prime, goal_state):
            # Entering the goal pays the reward and terminates the episode.
            # Q at the goal is never updated, so it stays 0 and the terminal
            # transition bootstraps from zero, as it should.
            episode_start = True
            reward = 1

        Q_max = np.max(Q[s_prime[0], s_prime[1]])
        Q[s[0], s[1], action] += alpha * (reward + gamma * Q_max - Q[s[0], s[1], action])

        # Keep track of rewards
        rewards.append(reward)

        # Increment tau everywhere, reset to zero at latest state-action and end state
        tau += 1
        tau[goal_state[0], goal_state[1], :] = 0
        tau[s[0], s[1], action] = 0

        # Update the model with the observed transition ...
        model_s_prime[s[0], s[1], action] = s_prime
        model_reward[s[0], s[1], action] = reward
        # ... and give actions that have never been tried from this state the
        # default model "stay in place, reward 0" so planning can consider them.
        # Only untried actions are filled in, so real observations are never lost.
        untried = model_s_prime[s[0], s[1], :, 0] < 0
        model_s_prime[s[0], s[1], untried] = s
        model_reward[s[0], s[1], untried] = 0

        # Planning: replay n uniformly sampled modelled state-actions
        observed_state_actions = np.argwhere(model_s_prime[..., 0] >= 0)
        sampled = np.random.randint(0, len(observed_state_actions), n)

        for x, y, a in observed_state_actions[sampled]:
            sim_s_prime = model_s_prime[x, y, a]
            sim_reward = model_reward[x, y, a]

            if dyna_mode == "dyna_plus":
                # Bonus for state-actions that have not been tried for a long time;
                # it is applied to the local copy only, never written into the model.
                sim_reward = sim_reward + kappa * np.sqrt(tau[x, y, a])

            Q_max_sim = np.max(Q[sim_s_prime[0], sim_s_prime[1]])
            Q[x, y, a] += alpha * (sim_reward + gamma * Q_max_sim - Q[x, y, a])

    return Q, np.array(rewards), model_s_prime, model_reward


In [None]:
def plot_policy(Q, ax=None):
    """Draw the greedy action of every free cell as an arrow on the maze heatmap.

    Parameters
    ----------
    Q : np.ndarray
        Action values indexed [x, y, action]; the greedy action is the argmax
        over the last axis.
    ax : matplotlib Axes, optional
        Axes to draw on. If omitted, the axes created by ``sns.heatmap`` are used.

    The maze layout is read from the notebook-level ``gridworld_after``.
    """
    if ax is None:
        ax = sns.heatmap(gridworld_after.T)
    else:
        sns.heatmap(gridworld_after.T, ax=ax)

    greedy_actions = np.argmax(Q, axis=2)
    # Action index -> displacement: 0 left, 1 right, 2 up, 3 down
    action_array = np.array([[-1, 0], [1, 0], [0, 1], [0, -1]])

    # Wall cells (value 0) get no arrow
    for i, j in np.argwhere(gridworld_after != 0):
        dx, dy = 0.25 * action_array[greedy_actions[i, j]]
        # Draw on the requested axes rather than on whatever pyplot considers current
        ax.arrow(i + 0.5, j + 0.5, dx, dy,
                 width=0.05, head_width=0.2,
                 color="blue", alpha=0.9)

    ax.invert_yaxis()
    
def simulate_policy(Q, ax=None, max_steps=20):
    """Follow the greedy policy from the start cell and draw the path as arrows.

    The rollout starts at (3, 0) and stops when it reaches (8, 5), leaves the
    grid, runs into a wall, or has made more than ``max_steps`` valid moves.
    The arrow for a move is drawn before the move is validated, so a final
    arrow may point into a wall or off the grid.

    Parameters
    ----------
    Q : np.ndarray
        Action values indexed [x, y, action].
    ax : matplotlib Axes, optional
        Axes to draw on. If omitted, the axes created by ``sns.heatmap`` are used.
    max_steps : int, optional
        Cap on the number of valid moves, protecting against policies that loop.

    The maze layout is read from the notebook-level ``gridworld_after``.
    """
    if ax is None:
        ax = sns.heatmap(gridworld_after.T)
    else:
        sns.heatmap(gridworld_after.T, ax=ax)

    pi = np.argmax(Q, axis=2)
    grid_shape = np.array(gridworld_after.shape)
    # Action index -> displacement: 0 left, 1 right, 2 up, 3 down
    action_array = np.array([[-1, 0], [1, 0], [0, 1], [0, -1]])

    s = np.array([3, 0])
    end = np.array([8, 5])

    steps_taken = 0
    while np.any(s != end):
        move = action_array[pi[s[0], s[1]]]

        # Draw on the requested axes rather than on whatever pyplot considers current
        ax.arrow(s[0] + 0.5, s[1] + 0.5,
                 0.25 * move[0],
                 0.25 * move[1],
                 width=0.05, head_width=0.2,
                 color="blue", alpha=0.9)

        s = s + move
        # Check bounds explicitly and first: a negative index would silently wrap around
        if np.any(s < 0) or np.any(s >= grid_shape):
            break
        if gridworld_after[s[0], s[1]] == 0:
            break

        steps_taken += 1
        if steps_taken > max_steps:
            break

    ax.invert_yaxis()

In [None]:
# Single run of plain Dyna-Q (kappa is set to 0) with 10 planning updates per real step.
dyna_run = dyna_q(gridworld, dyna_mode="dyna", kappa=0, n=10)
Q_dyna, R_dyna, model_s_prime_dyna, model_reward_dyna = dyna_run

In [None]:
# Single run of Dyna-Q+ with kappa = 0.01 and 50 planning updates per real step.
plus_run = dyna_q(gridworld, dyna_mode="dyna_plus", kappa=10**(-2), n=50)
Q_plus, R_plus, model_s_prime_plus, model_reward_plus = plus_run

In [None]:
# Single run of the modified Dyna-Q+ with kappa = 0.1 and 50 planning updates per real step.
plus_mod_run = dyna_q(gridworld, dyna_mode="dyna_plus_mod", kappa=10**(-1), n=50)
Q_plus_mod, R_plus_mod, model_s_prime_plus_mod, model_reward_plus_mod = plus_mod_run

In [None]:
# 3x2 grid: one row per method; left column is the full greedy policy map,
# right column is the greedy trajectory from the start cell.
fig = plt.figure(figsize=(20, 20))

method_panels = [(Q_dyna, 'Dyna-Q'),
                 (Q_plus, 'Dyna-Q+'),
                 (Q_plus_mod, 'Dyna-Q+ Modified')]

for row, (method_Q, method_label) in enumerate(method_panels):
    is_top_row = (row == 0)

    ax = fig.add_subplot(3, 2, 2 * row + 1)
    plot_policy(method_Q, ax=ax)
    plt.ylabel(method_label, size=30)
    if is_top_row:
        # Column titles appear only above the first row
        plt.title('Policy Map', size=30)

    ax = fig.add_subplot(3, 2, 2 * row + 2)
    simulate_policy(method_Q, ax=ax)
    if is_top_row:
        plt.title('Policy Trajectory', size=30)

plt.tight_layout()

In [None]:
# Cumulative reward over time for the three single runs above.
fig = plt.figure(figsize=(20,8))

single_runs = [(R_dyna, "Dyna-Q"),
               (R_plus, "Dyna-Q+"),
               (R_plus_mod, "Dyna-Q+ Modified")]
for step_rewards, run_label in single_runs:
    plt.plot(np.cumsum(step_rewards), label=run_label)

plt.legend(prop={'size': 20})
plt.xlabel("Time steps", size=20)
plt.ylabel("Cumulative reward", size=20)
plt.xticks(size=20)
plt.yticks(size=20)
plt.tight_layout()

In [None]:
# There's too much variance in the length of the first episode -- need to take an average
num_sim = 20
timesteps = 6000
kappas = [10**(-1), 10**(-2), 10**(-3)]

# One row per simulation. The Dyna-Q+ arrays stack the runs of each kappa in
# consecutive blocks of num_sim rows (block k_index holds kappas[k_index]).
R_dyna = np.empty((num_sim, timesteps))
R_plus = np.empty((len(kappas) * num_sim, timesteps))
R_plus_mod = np.empty((len(kappas) * num_sim, timesteps))
R_map = {"dyna": R_dyna, "dyna_plus": R_plus, "dyna_plus_mod": R_plus_mod}

for mode in ["dyna", "dyna_plus", "dyna_plus_mod"]:
    for k_index, k in enumerate(kappas):
        if mode == "dyna" and k_index != 0:
            # Plain Dyna-Q ignores kappa, so a single block of runs is enough
            continue

        for sim in range(num_sim):
            # For "dyna" k_index is always 0, so this is simply `sim`
            row = num_sim * k_index + sim

            t_i = time.time()
            # Use the same `timesteps` that sized the result arrays above
            Q, R, model_s_prime, model_reward = dyna_q(gridworld,
                                                       dyna_mode=mode,
                                                       verbose=False,
                                                       timesteps=timesteps,
                                                       n=50,
                                                       kappa=k)
            R_map[mode][row] = R
            t_f = time.time()

            print("Mode: {}, Simulation: {}, k-index: {}, Array Index {}, Completed. Time: {}s".format(mode,
                                                                                       sim,
                                                                                       k_index,
                                                                                       row,
                                                                                       round(t_f - t_i, 2)))

In [None]:
def _mean_cumulative_reward(reward_rows):
    """Average, over simulations (rows), of the per-run cumulative reward curve."""
    return np.mean(np.cumsum(reward_rows, axis=1), axis=0)


R_dyna_avg = _mean_cumulative_reward(R_dyna)

# Rows are stored in consecutive blocks of num_sim runs, one block per kappa
R_plus_avg_0, R_plus_avg_1, R_plus_avg_2 = (
    _mean_cumulative_reward(R_plus[block * num_sim:(block + 1) * num_sim])
    for block in range(3)
)
R_plus_mod_avg_0, R_plus_mod_avg_1, R_plus_mod_avg_2 = (
    _mean_cumulative_reward(R_plus_mod[block * num_sim:(block + 1) * num_sim])
    for block in range(3)
)

In [None]:
# Averaged cumulative reward for every method / kappa combination.
fig = plt.figure(figsize=(20,8))
plt.plot(R_dyna_avg, label="Dyna-Q", color="grey")
# Raw strings keep the LaTeX backslashes from being read as Python escape sequences
plt.plot(R_plus_avg_0, label=r"Dyna-Q+, $\kappa=10^{-1}$", color="blue", linestyle="dashed")
plt.plot(R_plus_avg_1, label=r"Dyna-Q+, $\kappa=10^{-2}$", color="blue", linestyle="dotted")
plt.plot(R_plus_avg_2, label=r"Dyna-Q+, $\kappa=10^{-3}$", color="blue", linestyle="dashdot")
plt.plot(R_plus_mod_avg_0, label=r"Dyna-Q+ Modified, $\kappa=10^{-1}$", color="orange", linestyle="dashed")
plt.plot(R_plus_mod_avg_1, label=r"Dyna-Q+ Modified, $\kappa=10^{-2}$", color="orange", linestyle="dotted")
plt.plot(R_plus_mod_avg_2, label=r"Dyna-Q+ Modified, $\kappa=10^{-3}$", color="orange", linestyle="dashdot")
plt.legend(prop={'size': 20})
plt.ylabel("Cumulative reward", size=20)
plt.yticks(size=20)
plt.xlabel("Time steps", size=20)
plt.xticks(size=20)

# Vertical marker at the step where the maze changes, as tall as the highest curve.
# The per-kappa averages are the arrays that actually exist.
averaged_curves = [R_dyna_avg,
                   R_plus_avg_0, R_plus_avg_1, R_plus_avg_2,
                   R_plus_mod_avg_0, R_plus_mod_avg_1, R_plus_mod_avg_2]
plt.vlines(timesteps/2, 0,
           max(np.max(curve) for curve in averaged_curves),
           linestyle='dashed')
plt.tight_layout()

# S&B 8.8

In [None]:
class TrajectorySamplingDemo():
    """Random-MDP experiment comparing two ways of distributing expected updates.

    Each state-action pair has ``b`` equally likely successor states, each with
    its own normally distributed reward, and additionally terminates with
    probability ``term_prob`` (paying a separate terminal reward). Q is improved
    with expected updates applied either to uniformly random state-action pairs
    (``sample_traj=False``) or along epsilon-greedy trajectories from the start
    state (``sample_traj=True``).

    The terminal state is represented by the index ``num_states``, which lies
    outside the Q table.
    """
    # Note: if we can draw the state samples ahead of time as an array
    # and then vectorize the updates that would probably make this a lot faster.

    def __init__(self, sample_traj=False, epsilon = 0.1, num_states = 1000,
                 term_prob=0.1, num_actions=2, b=1, verbose=True):
        """Draw a random task and initialise Q to zero.

        Parameters
        ----------
        sample_traj : bool
            True to update along on-policy trajectories, False for uniform sampling.
        epsilon : float
            Exploration probability of the epsilon-greedy policy.
        num_states : int
            Number of non-terminal states.
        term_prob : float
            Probability that any transition ends the episode.
        num_actions : int
            Number of actions available in every state.
        b : int
            Branching factor: number of possible successors per state-action.
        verbose : bool
            If True, ``Q_updates`` prints timing information.
        """
        ## PARAMETERS
        self.sample_traj = sample_traj
        self.epsilon = epsilon
        self.num_states = num_states
        self.term_prob = term_prob
        self.num_actions = num_actions
        self.b = b
        self.verbose = verbose

        ## TENSORS DEFINING RL PROBLEM
        # Randomly select the start state
        self.s_0 = np.random.randint(0, high=num_states, size=1)[0]
        self.s_final = self.num_states
        # Axes: S, A, b new States. Value: S'
        self.state_action_transitions = self.init_state_actions(num_states, num_actions, b)
        # Axes: S, A. Value: R for (S, A) -> Terminal state transition
        self.terminal_rewards = np.random.normal(size=(num_states, num_actions))
        # Axes: S, A, b new States. Value: R for (S,A) -> S' transition
        self.state_action_rewards = np.random.normal(size=(num_states, num_actions, b))

        ## Q TENSOR
        # S, A
        self.Q = np.zeros((num_states, num_actions), dtype=float)

        ## STATE VARIABLES FOR EPISODES
        self.episode_start = True

        ## SET STATE AND UPDATE FUNCTIONS
        if sample_traj:
            self.next_action = self.trajectory_action
            self.next_state = self.trajectory_state
        else:
            self.next_action = self.random_action
            self.next_state = self.random_state

        ## RETURN ARRAY
        self.start_state_values = []

    def init_state_actions(self, num_states, num_actions, b):
        """Return an int array (num_states, num_actions, b) of distinct successor states."""
        state_action_transitions = np.empty((num_states, num_actions, b), dtype=int)

        for s in range(num_states):
            for a in range(num_actions):
                # Successors of one state-action are sampled without replacement
                state_action_transitions[s, a] = np.random.choice(num_states,
                                                                  size=b,
                                                                  replace=False)

        return state_action_transitions

    def Q_updates(self, updates, Q_update_freq, v_est_update, v_est_episodes, v_est_step_bound=10**6):
        """Apply ``updates`` expected updates to Q, periodically estimating v(s_0).

        Parameters
        ----------
        updates : int
            Number of loop iterations (expected updates).
        Q_update_freq : int
            Timing is printed every this many updates (only if ``verbose``).
        v_est_update : int
            v(s_0) is estimated every this many updates.
        v_est_episodes : int or float
            Maximum number of Monte Carlo episodes per estimate.
        v_est_step_bound : int
            Maximum cumulative number of simulated steps per estimate.

        Returns
        -------
        (np.ndarray, np.ndarray)
            Means and standard deviations of the v(s_0) estimates, one entry
            per estimate (``updates // v_est_update`` entries).
        """
        s = self.s_0
        value_estimates = []
        value_devs = []
        t_updates = []

        for update in range(updates):
            t_i_update = time.time()

            if (update + 1) % Q_update_freq == 0 and self.verbose:
                # No timings are available when nothing has been recorded since the last report
                avg_time = sum(t_updates) / len(t_updates) if t_updates else 0.0
                delta_t_updates = round(avg_time, 2)
                t_updates = []
                print("Updates #{}-{} Completed, Avg. Time: {}s".format(update + 1 - Q_update_freq,
                                                                        update + 1,
                                                                        delta_t_updates))

            if (update + 1) % v_est_update == 0:
                t_i_estimate = time.time()
                v_est, v_std = self.calc_start_value(v_est_episodes, step_bound=v_est_step_bound)
                value_estimates.append(v_est)
                value_devs.append(v_std)
                t_f_estimate = time.time()

                if self.verbose:
                    delta_t_estimate = round(t_f_estimate - t_i_estimate, 2)
                    print("V(S_0) Estimate At Update #{}, Time: {}s".format(update + 1,
                                                                            delta_t_estimate))

            if s == self.s_final:
                # The trajectory ended: restart from the start state (uses up this iteration)
                s = self.s_0
                continue

            a = self.next_action(s)

            s_primes = self.state_action_transitions[s, a] # Axes: b next states
            rewards = self.state_action_rewards[s, a] # Axes: b next states
            t_reward = self.terminal_rewards[s, a] # Scalar

            # Expected update: with probability term_prob the episode ends with the
            # terminal reward; otherwise each of the b successors is equally likely
            # and contributes its own reward plus its OWN best action value.
            next_state_values = np.max(self.Q[s_primes], axis=1) # Axes: b next states
            self.Q[s, a] = ((1 - self.term_prob) * np.mean(rewards + next_state_values)
                            + self.term_prob * t_reward)

            s = self.next_state(s, a)

            t_f_update = time.time()
            t_updates.append(t_f_update-t_i_update)

        return np.array(value_estimates), np.array(value_devs)

    def simulate_episode(self):
        """Roll out one episode from s_0 under the greedy policy.

        Returns
        -------
        (float, int)
            Undiscounted return of the episode and its number of steps.
        """
        s = self.s_0
        cum_reward = 0
        steps = 0

        while s != self.s_final:
            # The start-state value is evaluated for the greedy policy, so
            # exploration is switched off during evaluation rollouts.
            a = self.trajectory_action(s, force_greedy=True)
            s_prime = self.trajectory_state(s, a)

            # Transition reward
            if s_prime != self.s_final:
                s_prime_index = np.where(self.state_action_transitions[s, a] == s_prime)[0][0]
                reward = self.state_action_rewards[s, a, s_prime_index]
            else:
                reward = self.terminal_rewards[s, a]

            # Set next state to current state and add reward
            cum_reward += reward
            steps += 1
            s = s_prime

        return cum_reward, steps

    def calc_start_value(self, num_episodes, step_bound=10**6):
        """Monte Carlo estimate of v(s_0).

        Episodes are simulated until either ``num_episodes`` episodes have been
        run or ``step_bound`` cumulative steps have been taken.

        Returns
        -------
        (float, float)
            Mean and (population) standard deviation of the episode returns.
        """
        # Monte Carlo is more efficient for calculating the value of
        # a single state
        assert num_episodes > 0, "The number of episodes must be positive"

        cum_steps = 0
        episodes = 0
        v_list = []

        while episodes < num_episodes and cum_steps < step_bound:
            v_est, steps = self.simulate_episode()
            v_list.append(v_est)
            cum_steps += steps
            episodes += 1

        v = np.array(v_list)
        return np.mean(v), np.std(v)

    def trajectory_state(self, s, a):
        """Sample the next state: terminal with prob. term_prob, else a uniform successor."""
        is_terminal = np.random.binomial(1, self.term_prob)

        if is_terminal:
            s_prime = self.s_final
        else:
            s_prime = np.random.choice(self.state_action_transitions[s, a])

        return s_prime

    def trajectory_action(self, s, force_greedy=False):
        """Epsilon-greedy action in state s (purely greedy if ``force_greedy``)."""
        is_not_greedy = np.random.binomial(1, self.epsilon)

        if is_not_greedy and not force_greedy:
            a = np.random.randint(0, self.num_actions)
        else:
            action_vals = self.Q[s]
            # Break ties between equally good actions at random
            a = np.random.choice(np.where(action_vals == action_vals.max())[0])

        return a

    def random_state(self, s, a):
        """Uniformly random non-terminal state (the arguments are ignored)."""
        return np.random.randint(0, high=self.num_states, size=1)[0]

    def random_action(self, s):
        """Uniformly random action (the argument is ignored)."""
        return np.random.randint(0, self.num_actions)

In [None]:
def multiple_runs(sample_traj, num_sim, updates, Q_update_freq, 
                  num_updates, v_est_update, sim_update=10, **params):
    """Run the trajectory-sampling demo on ``num_sim`` freshly drawn tasks.

    Each simulation builds a new ``TrajectorySamplingDemo`` (with ``**params``)
    and runs ``Q_updates``; value estimates use an unbounded episode count but
    are capped at 10**4 simulated steps. Progress is printed every
    ``sim_update`` simulations.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Arrays of shape (num_sim, num_updates): the v(s_0) estimates and their
        standard deviations. ``num_updates`` must match the number of
        estimates that ``Q_updates`` returns.
    """
    result_shape = (num_sim, num_updates)
    mean_curves = np.empty(result_shape)
    error_bars = np.empty(result_shape)

    batch_started = time.time()
    for run_index in range(num_sim):
        task = TrajectorySamplingDemo(sample_traj=sample_traj, verbose=False, **params)
        estimates, deviations = task.Q_updates(updates,
                                               Q_update_freq,
                                               v_est_update,
                                               np.inf,
                                               v_est_step_bound=10**4)

        mean_curves[run_index] = estimates
        error_bars[run_index] = deviations

        runs_done = run_index + 1
        if runs_done % sim_update != 0:
            continue

        # Report the wall time of the batch that just finished, then restart the clock
        batch_finished = time.time()
        print("Simulations #{}-{} Completed, Time: {}s".format(runs_done - sim_update,
                                                               runs_done,
                                                               round(batch_finished - batch_started, 2)))
        batch_started = time.time()

    return mean_curves, error_bars

## Replicate $\lvert\mathscr{S}\rvert=1{,}000$ Case, with $b \in \{1, 3, 10\}$

In [None]:
num_sim = 200
updates = 20000
Q_update_freq = 1000 # Only sets how often timing is printed; no effect when verbose is False
v_est_update = 1000 # v(s_0) is re-estimated every v_est_update updates
# One value estimate is produced per v_est_update updates, so this is the
# length of every value curve returned by Q_updates
num_updates = updates // v_est_update

# Seems like scaling is independent of b, but 10x number of states increases run time by 3x
# I find the latter surprising since there isn't any looping that depends on the number of states
base_params = {"epsilon": 0.1, 
               "num_states": 1000, 
               "term_prob": 0.1,
               "num_actions": 2}

params_b1 = {**base_params, "b": 1}
params_b3 = {**base_params, "b": 3}
params_b10 = {**base_params, "b": 10}

params_list_1000 = [params_b1, params_b3, params_b10]

In [None]:
# Create the output directory up front so hours of simulation are not lost
# to a missing folder when the results are written.
output_dir = "ex_88_s1000_outputs/"
os.makedirs(output_dir, exist_ok=True)

for sample_traj in [False, True]:
    for params in params_list_1000:
        
        traj_string = "trajectory" if sample_traj else "random"
        b_string = str(params["b"])

        print("### SAMPLING TYPE: {}. B={} ###".format(traj_string, b_string))
        mean_curves, error_bars = multiple_runs(sample_traj, 
                                                num_sim, 
                                                updates, 
                                                Q_update_freq, 
                                                num_updates, 
                                                v_est_update,
                                                sim_update=20,
                                                **params)

        # Mean and (population) standard deviation across simulations, per
        # evaluation point; this works for any number of evaluation points.
        means = np.mean(mean_curves, axis=0)
        stds = np.std(mean_curves, axis=0)

        # ndarray.dump writes a pickle; the plotting cell reads it back with
        # np.load(..., allow_pickle=True)
        means.dump(output_dir + traj_string + "_b" + b_string + "_" + "avg" + ".pkl")
        stds.dump(output_dir + traj_string + "_b" + b_string + "_" + "std" + ".pkl")

In [None]:
# Plot the saved v(s_0) curves: colour encodes b, line style encodes the sampling scheme.
fig = plt.figure(figsize=(20, 10))
color_map = {"1": "blue", "3": "green", "10": "red"}
linestyle_map = {"trajectory": "dashed", "random": "solid"}

# Number of Q updates at which each value estimate was taken
update_counts = np.arange(1000, 21000, 1000)

for params in params_list_1000:
    b_string = str(params["b"])
    color = color_map[b_string]

    for sample_traj in [False, True]:
        traj_string = "trajectory" if sample_traj else "random"
        linestyle = linestyle_map[traj_string]

        # The files are pickled arrays written by ndarray.dump in the cell above
        file_stem = "ex_88_s1000_outputs/" + traj_string + "_b" + b_string + "_"
        means = np.load(file_stem + "avg" + ".pkl", allow_pickle=True)
        stds = np.load(file_stem + "std" + ".pkl", allow_pickle=True)

        curve_label = "sampling: {}, b={}".format(traj_string, b_string)
        plt.errorbar(update_counts, means, color=color,
                     linestyle=linestyle, label=curve_label)
        # Shaded band: one standard deviation across simulations
        plt.fill_between(update_counts, means - stds, means + stds,
                         color=color, alpha=0.1)

plt.legend(prop={'size': 15})
plt.xlabel("Computation Steps", size=30)
plt.xticks(size=20)
plt.ylabel("$v(s_0)$", size=30)
plt.yticks(size=20)
plt.title("Initial State Value vs. Number of $Q$-Function Updates", size=30)
plt.tight_layout()

## Simulate $\lvert\mathscr{S}\rvert=10{,}000$ Case, with $b=3$

In [None]:
num_sim = 100
updates = 200000
Q_update_freq = 10000 # Only sets how often timing is printed; no effect when verbose is False
v_est_update = 10000 # v(s_0) is re-estimated every v_est_update updates
# One value estimate is produced per v_est_update updates, so this is the
# length of every value curve returned by Q_updates
num_updates = updates // v_est_update

# Seems like scaling is independent of b, but 10x number of states increases run time by 3x
# I find the latter surprising since there isn't any looping that depends on the number of states
base_params = {"epsilon": 0.1, 
               "num_states": 10000, 
               "term_prob": 0.1,
               "num_actions": 2}

params_b1 = {**base_params, "b": 1}
params_b3 = {**base_params, "b": 3}

# NOTE: the name is reused from the 1,000-state section because the cells below
# read it; here it holds the 10,000-state configurations.
params_list_1000 = [params_b1, params_b3]

In [None]:
# Create the output directory up front so hours of simulation are not lost
# to a missing folder when the results are written.
output_dir = "ex_88_s10000_outputs/"
os.makedirs(output_dir, exist_ok=True)

for sample_traj in [False, True]:
    for params in params_list_1000:
        
        traj_string = "trajectory" if sample_traj else "random"
        b_string = str(params["b"])

        print("### SAMPLING TYPE: {}. B={} ###".format(traj_string, b_string))
        mean_curves, error_bars = multiple_runs(sample_traj, 
                                                num_sim, 
                                                updates, 
                                                Q_update_freq, 
                                                num_updates, 
                                                v_est_update,
                                                sim_update=10,
                                                **params)

        # Mean and (population) standard deviation across simulations, per
        # evaluation point; this works for any number of evaluation points.
        means = np.mean(mean_curves, axis=0)
        stds = np.std(mean_curves, axis=0)

        # ndarray.dump writes a pickle; the plotting cell reads it back with
        # np.load(..., allow_pickle=True)
        means.dump(output_dir + traj_string + "_b" + b_string + "_" + "avg" + ".pkl")
        stds.dump(output_dir + traj_string + "_b" + b_string + "_" + "std" + ".pkl")

In [None]:
# Plot the saved v(s_0) curves: colour encodes b, line style encodes the sampling scheme.
fig = plt.figure(figsize=(20, 10))
color_map = {"1": "blue", "3": "green"}
linestyle_map = {"trajectory": "dashed", "random": "solid"}

# Number of Q updates at which each value estimate was taken
update_counts = np.arange(10000, 210000, 10000)

for params in params_list_1000:
    b_string = str(params["b"])
    color = color_map[b_string]

    for sample_traj in [False, True]:
        traj_string = "trajectory" if sample_traj else "random"
        linestyle = linestyle_map[traj_string]

        # The files are pickled arrays written by ndarray.dump in the cell above
        file_stem = "ex_88_s10000_outputs/" + traj_string + "_b" + b_string + "_"
        means = np.load(file_stem + "avg" + ".pkl", allow_pickle=True)
        stds = np.load(file_stem + "std" + ".pkl", allow_pickle=True)

        curve_label = "sampling: {}, b={}".format(traj_string, b_string)
        plt.errorbar(update_counts, means, color=color,
                     linestyle=linestyle, label=curve_label)
        # Shaded band: one standard deviation across simulations
        plt.fill_between(update_counts, means - stds, means + stds,
                         color=color, alpha=0.1)

plt.legend(prop={'size': 15})
plt.xlabel("Computation Steps", size=30)
plt.xticks(size=20)
plt.ylabel("$v(s_0)$", size=30)
plt.yticks(size=20)
plt.title("Initial State Value vs. Number of $Q$-Function Updates", size=30)
plt.tight_layout()