# Reinforcement learning applied to Gym environments

This notebook should be run on a local runtime (most tests take a very long time to complete).

List of Python dependencies:
```
pip install 'imageio==2.4.0'
pip install pyvirtualdisplay
pip install numpy
pip install tensorflow
pip install tf_agents
pip install pillow
pip install matplotlib
pip install cv2
pip install conda
pip install opencv-python
pip install gym[box2d]
```

To start a local jupyter notebook:
```
jupyter notebook --NotebookApp.allow_origin='https://colab.research.google.com' --port=8888 --NotebookApp.port_retries=0

```

In [None]:
import os
import time
import gym
import random
import pickle
import numpy as np
from tensorflow.keras.models import load_model as tf_load_model
from tensorflow.keras import Sequential
from tensorflow.keras import backend as k_backend
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import relu, linear
from tensorflow.keras.losses import mean_squared_error, Huber
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.models import clone_model
from tensorflow import where as tf_where
from collections import deque
from scipy.stats import multivariate_normal
import matplotlib
import matplotlib.pyplot as plt
import imageio
import base64
from PIL import Image, ImageDraw
import IPython

# Figure size (width, height) passed to `plt.figure` by several plotting helpers
FIGSIZE = (20, 10)
# Font size used for axis labels, ticks and legends
FONTSIZE = 14

# Base seed for the Python and NumPy generators and for the Gym environments
SEED = 42

# Seed the global generators once when this cell runs.
# NOTE(review): no TensorFlow seed is set in this cell, so network weight
# initialisation is presumably still non-deterministic — confirm.
random.seed(SEED)
np.random.seed(SEED)

# "Classic" DQN, DQN with Target network, Double DQN

In [None]:
def masked_huber_loss(mask_value, clip_delta):
  """Build a Huber loss that ignores target entries equal to `mask_value`.

  Args:
    mask_value: target value marking entries that must not contribute to the loss.
    clip_delta: absolute error below which the loss is quadratic; at or above it
      the loss is linear.

  Returns:
    A function `f(y_true, y_pred)` named 'masked_huber_loss' that returns the sum
    of the per-entry losses divided by the number of unmasked entries.
  """
  def f(y_true, y_pred):
    residual = y_true - y_pred
    # 1.0 where the target carries a real value, 0.0 where it is masked out
    keep = k_backend.cast(k_backend.not_equal(y_true, mask_value), k_backend.floatx())
    quadratic_part = 0.5 * k_backend.square(keep * (y_true - y_pred))
    linear_part = keep * (clip_delta * k_backend.abs(residual) - 0.5 * (clip_delta ** 2))
    is_small_error = k_backend.abs(residual) < clip_delta
    per_entry_loss = tf_where(is_small_error, quadratic_part, linear_part)
    return k_backend.sum(per_entry_loss) / k_backend.sum(keep)
  f.__name__ = 'masked_huber_loss'
  return f

def masked_mean_squared_error(mask_value, clip_delta):
  """Build a mean squared error that ignores target entries equal to `mask_value`.

  Args:
    mask_value: target value marking entries that must not contribute to the loss.
    clip_delta: when greater than zero, each squared error is clipped to
      `[-clip_delta, clip_delta]`; otherwise no clipping is applied.

  Returns:
    A function `f(y_true, y_pred)` named 'masked_mean_squared_error' that returns
    the sum of the (optionally clipped) squared errors divided by the number of
    unmasked entries.
  """
  def f(y_true, y_pred):
    # 1.0 where the target carries a real value, 0.0 where it is masked out
    keep = k_backend.cast(k_backend.not_equal(y_true, mask_value), k_backend.floatx())
    squared = k_backend.square(keep * (y_true - y_pred))
    if clip_delta > 0:
      squared = k_backend.clip(squared, (-1) * clip_delta, clip_delta)
    return k_backend.sum(squared) / k_backend.sum(keep)
  f.__name__ = 'masked_mean_squared_error'
  return f

# This object must be passed as parameter `custom_objects` when loading a saved model
# NOTE(review): the values are the loss *factories* (they take `mask_value, clip_delta`),
# not the inner `f(y_true, y_pred)` closures that `create_dqn_network` passes to
# `model.compile` — confirm that loading a saved model with this mapping behaves as intended.
CUSTOM_OBJECTS = {'masked_huber_loss': masked_huber_loss, 'masked_mean_squared_error': masked_mean_squared_error}

def get_output_input_dim_from_env(env):
  """Return `(number of actions, observation length)` for `env`.

  Reads `env.action_space.n` and the first entry of `env.observation_space.shape`.
  """
  n_features = env.observation_space.shape[0]
  n_actions = env.action_space.n
  return n_actions, n_features

def create_dqn_network(hyperparams, env):
  """Build and compile the fully connected Q-network for `env`.

  The network has one ReLU `Dense` layer per entry of `hyperparams['layer_params']`
  followed by a linear output layer with one unit per action.

  Args:
    hyperparams: dict providing `layer_params`, `optimizer`, `learning_rate`,
      `loss` and, for the 'MSE' loss, `error_clipping`.
    env: environment whose action and observation spaces size the network.

  Returns:
    The compiled Keras `Sequential` model.

  Raises:
    ValueError: if `hyperparams['optimizer']` is not 'Adam' or
      `hyperparams['loss']` is neither 'MSE' nor 'Huber'.
  """
  output_dim, input_dim = get_output_input_dim_from_env(env)

  # Fail with a clear message instead of handing `None` to `model.compile`
  if hyperparams['optimizer'] != 'Adam':
    raise ValueError("Unsupported optimizer: {0!r} (expected 'Adam')".format(hyperparams['optimizer']))
  if hyperparams['loss'] not in ('MSE', 'Huber'):
    raise ValueError("Unsupported loss: {0!r} (expected 'MSE' or 'Huber')".format(hyperparams['loss']))

  layer_sizes = hyperparams['layer_params']
  model = Sequential()
  # Input layer -> first layer
  model.add(Dense(layer_sizes[0], input_dim=input_dim, activation=relu))
  # Rest of hidden layers
  for units in layer_sizes[1:]:
    model.add(Dense(units, activation=relu))
  # Output layer
  model.add(Dense(output_dim, activation=linear))

  optimizer = Adam(learning_rate=hyperparams['learning_rate'])

  # Both losses mask targets equal to 0.0 (see how `learn` builds the target matrix)
  if hyperparams['loss'] == 'MSE':
    # A clip value of 0 disables error clipping in masked_mean_squared_error
    clip_delta = 1.0 if hyperparams['error_clipping'] else 0
    loss = masked_mean_squared_error(0.0, clip_delta)
  else:
    loss = masked_huber_loss(0.0, 1.0)

  model.compile(loss=loss, optimizer=optimizer)
  return model

def create_replay_buffer(hyperparams):
  """Create an empty replay buffer bounded by `hyperparams['max_buffer_size']`."""
  return deque(maxlen=hyperparams['max_buffer_size'])

def add_sample_to_buffer(hyperparams, buffer, sample):
  """Append one transition to the replay buffer.

  `sample` should be a tuple `(state, action, reward, next_state, done)`.
  `hyperparams` is accepted for signature uniformity and is not read here.
  """
  # A deque created with `maxlen` drops its oldest element by itself once it is full
  buffer.append(sample)

def get_batch_from_buffer(buffer, batch_size):
  """Draw `batch_size` distinct samples from `buffer` using Python's `random` module."""
  return random.sample(buffer, batch_size)

def get_action(hyperparams, episode, state, epsilon, model, max_action):
  """Choose an action epsilon-greedily.

  A uniformly random action in `range(max_action)` is returned while
  `episode < hyperparams['explore_only_episodes']` or with probability `epsilon`;
  otherwise the argmax of `model.predict(state)[0]` is returned.
  """
  # Short-circuit order matters: no random number is drawn during explore-only episodes
  should_explore = episode < hyperparams['explore_only_episodes'] or np.random.rand() < epsilon
  if not should_explore:
    # Exploit: best action according to the current Q-value estimates
    q_values = model.predict(state)
    return np.argmax(q_values[0])
  # Explore: uniformly random action
  return random.randrange(max_action)

def extract_data_from_batch(batch):
  """Split a list of `(state, action, reward, next_state, done)` samples into arrays.

  Returns:
    `(states, actions, rewards, next_states, dones)` as NumPy arrays; axes of
    length 1 are squeezed out of `states` and `next_states`.
  """
  def column(position):
    # Gather the `position`-th field of every sample into one array
    return np.array([sample[position] for sample in batch])

  states, actions, rewards, next_states, dones = (column(k) for k in range(5))
  return np.squeeze(states), actions, rewards, np.squeeze(next_states), dones

def learn(hyperparams, output_dim, rewards, buffer, model, target_model):
  """Run one gradient step of (double) Q-learning on a random replay batch.

  Args:
    hyperparams: dict providing `batch_size`, `gamma`, `double_q_learning`
      and `use_target_network`.
    output_dim: number of actions (width of the Q-value vector).
    rewards: not read; the name is rebound to the rewards of the sampled batch.
    buffer: replay buffer to sample from.
    model: online Q-network, fitted in place.
    target_model: target Q-network; required when `double_q_learning` or
      `use_target_network` is set.
  """
  batch_size = hyperparams['batch_size']
  gamma = hyperparams['gamma']
  sample_batch = get_batch_from_buffer(buffer, batch_size)
  states, actions, rewards, next_states, dones = extract_data_from_batch(sample_batch)

  rows = np.arange(batch_size)
  # 0 for terminal transitions, so their target reduces to the reward alone
  not_terminal = 1 - dones

  if hyperparams['double_q_learning']:
    # Double Q-learning: the online network picks a', the target network scores it
    online_next_q = model.predict_on_batch(next_states)
    target_next_q = target_model.predict_on_batch(next_states)
    best_next_actions = np.argmax(online_next_q, axis=1)
    targets = rewards + gamma * target_next_q[rows, best_next_actions] * not_terminal
    # Zero entries are ignored by the masked losses
    target_vec = np.zeros((batch_size, output_dim))
  elif hyperparams['use_target_network']:
    # yj = rj + gamma * (max Q value on next state using target model)
    targets = rewards + gamma * (np.amax(target_model.predict_on_batch(next_states), axis=1)) * not_terminal
    target_vec = np.zeros((batch_size, output_dim))
  else:
    targets = rewards + gamma * (np.amax(model.predict_on_batch(next_states), axis=1)) * not_terminal
    # Start from the current predictions so only the taken action has a non-zero error
    target_vec = model.predict_on_batch(states)

  # Substitute yj as expected value for the action actually taken in each sample
  target_vec[rows, actions] = targets

  model.fit(states, target_vec, epochs=1, batch_size=batch_size, verbose=0)

def train_done(hyperparams, episode, iteration):
  """Tell whether training should stop.

  When `max_train_iterations` is present in `hyperparams` it takes precedence and
  is compared with `iteration`; otherwise `episode` is compared with `train_episodes`.
  """
  if 'max_train_iterations' not in hyperparams:
    return episode >= hyperparams['train_episodes']
  return bool(iteration >= hyperparams['max_train_iterations'])

def explore_only(hyperparams, episode, buffer):
  """Tell whether the agent is still in its explore-only (no learning) phase.

  With a non-negative `explore_only_episodes` the phase lasts for that many
  episodes; with a negative value it lasts until the buffer holds at least
  `2 * batch_size` samples.
  """
  explicit_limit = hyperparams['explore_only_episodes']
  if explicit_limit >= 0:
    # Set an explicit number of explore only episodes
    return episode < explicit_limit
  # Or at least wait until buffer is full enough to train
  return len(buffer) < hyperparams['batch_size'] * 2

def train(hyperparams, env, eval_env, model, target_model, buffer):
  """Train `model` on `env` with epsilon-greedy DQN and experience replay.

  Args:
    hyperparams: dict of training settings (network, exploration, stopping,
      saving and logging options).
    env: Gym environment used to collect experience.
    eval_env: environment passed to `eval_model`; only used when
      `hyperparams['eval_interval'] > 0`.
    model: online Q-network, updated in place.
    target_model: target Q-network, synced with `model` every
      `target_update_steps` iterations when `use_target_network` is set.
    buffer: replay buffer; receives one
      `(state, action, reward, next_state, done)` sample per step.

  Returns:
    Tuple `(episode_rewards, mean_rewards, eval_returns, visited_states)`.
  """
  start_time = current_ms()
  # Global iteration (environment step) counter
  iteration = 0
  episode = -1
  # List of episode rewards
  episode_rewards = []
  mean_rewards = []
  visited_states = []
  eval_returns = []
  epsilon = hyperparams['epsilon_start']
  output_dim, input_dim = get_output_input_dim_from_env(env)

  # `episode` is incremented at the top of the body, so test the index of the
  # episode about to start; testing the previous index would run one extra episode.
  while not train_done(hyperparams, episode + 1, iteration):
    episode += 1
    episode_reward = 0
    state = env.reset()
    visited_states.append(state)
    state = np.reshape(state, [1, input_dim])
    for _ in range(hyperparams['max_episode_steps']):
      action = get_action(hyperparams, episode, state, epsilon, model, output_dim)
      next_state, reward, done, _ = env.step(action)
      if hyperparams['reward_clipping']:
        if reward < -1:
          reward = -1
        elif reward > 1:
          reward = 1
      visited_states.append(next_state)
      episode_reward += reward
      next_state = np.reshape(next_state, [1, input_dim])
      sample = (state, action, reward, next_state, done)
      add_sample_to_buffer(hyperparams, buffer, sample)
      state = next_state
      iteration += 1

      # Neither the episode nor the buffer changes below, so evaluate this once per step
      exploring_only = explore_only(hyperparams, episode, buffer)

      if not exploring_only and iteration % hyperparams['update_frequency'] == 0:
        learn(hyperparams, output_dim, episode_rewards, buffer, model, target_model)

      # Every C steps set Q' = Q (the `== 0` is essential: without it the sync
      # would happen on every step that is NOT a multiple of C)
      if hyperparams['use_target_network'] and not exploring_only and iteration % hyperparams['target_update_steps'] == 0:
        target_model.set_weights(model.get_weights())

      if hyperparams['eval_interval'] > 0 and iteration % hyperparams['eval_interval'] == 0:
        avg_returns, _ = eval_model(hyperparams, model, eval_env)
        eval_returns.append(avg_returns)
        print(">>> Iteration {0}, Episode {1}, Avg. returns {2}".format(iteration, episode, avg_returns))

      # Episode done
      if done:
        break
    episode_rewards.append(episode_reward)

    # Save model to fs
    if hyperparams['save_model_every_n_episodes'] > 0 and episode % hyperparams['save_model_every_n_episodes'] == 0:
      model_name = hyperparams['model_name'] + '_ep' + str(episode)
      if episode_reward > hyperparams['episode_solved_score']:
        model_name += '_SOLVED'
      save_model_to_fs(model, hyperparams['env_name'], model_name)
    elif hyperparams['always_save_optimal_models'] and episode_reward > hyperparams['episode_solved_score']:
      model_name = hyperparams['model_name'] + '_ep' + str(episode) + '_SOLVED'
      save_model_to_fs(model, hyperparams['env_name'], model_name)

    # Epsilon decay
    if not explore_only(hyperparams, episode, buffer) and hyperparams['epsilon_decay'] > 0 and epsilon > hyperparams['epsilon_end']:
      epsilon *= hyperparams['epsilon_decay']

    # Early stopping: `rewards_mean_episode_limit` is a (negative) slice start,
    # so the mean covers the most recent episodes only
    last_rewards_mean = np.mean(episode_rewards[hyperparams['rewards_mean_episode_limit']:])
    mean_rewards.append(last_rewards_mean)
    if hyperparams['early_stopping'] and last_rewards_mean > hyperparams['episode_solved_score']:
      print("Training complete with early stopping")
      break

    if hyperparams['log_episodes']:
      print("Iteration {0}, Episode {1}, Reward {2}, Avg. reward {3}, epsilon {4}".format(iteration, episode, episode_reward, last_rewards_mean, epsilon))
  end_time = current_ms()
  print("Training done in", print_minutes(start_time, end_time))
  return episode_rewards, mean_rewards, eval_returns, visited_states

def eval_random_agent(hyperparams, env, n_tests=100):
  """Measure the return of a uniformly random policy on `env`.

  Runs `n_tests` episodes of at most `hyperparams['max_episode_steps']` steps.

  Returns:
    `(mean return, list of per-episode returns)`.
  """
  t_begin = current_ms()
  n_actions, n_features = get_output_input_dim_from_env(env)
  episode_returns = []
  for _ in range(n_tests):
    # The observation is reshaped but never consulted: actions are random
    observation = np.reshape(env.reset(), [1, n_features])
    total = 0
    for _ in range(hyperparams['max_episode_steps']):
      random_action = random.randrange(n_actions)
      raw_next, reward, done, _ = env.step(random_action)
      observation = np.reshape(raw_next, [1, n_features])
      total += reward
      if done:
        break
    episode_returns.append(total)
  t_end = current_ms()
  print("Evaluation done in", print_minutes(t_begin, t_end))
  return np.mean(episode_returns), episode_returns

def eval_model(hyperparams, model, env, n_tests=100):
  """Measure the return of the greedy policy induced by `model` on `env`.

  Runs `n_tests` episodes of at most `hyperparams['max_episode_steps']` steps,
  always taking the argmax of `model.predict`.

  Returns:
    `(mean return, list of per-episode returns)`.
  """
  t_begin = current_ms()
  _, n_features = get_output_input_dim_from_env(env)
  episode_returns = []
  for _ in range(n_tests):
    observation = np.reshape(env.reset(), [1, n_features])
    total = 0
    for _ in range(hyperparams['max_episode_steps']):
      greedy_action = np.argmax(model.predict(observation)[0])
      raw_next, reward, done, _ = env.step(greedy_action)
      observation = np.reshape(raw_next, [1, n_features])
      total += reward
      if done:
        break
    episode_returns.append(total)
  t_end = current_ms()
  print("Evaluation done in", print_minutes(t_begin, t_end))
  return np.mean(episode_returns), episode_returns

def create_model_eval_graph(model, env, epsilon, states_to_plot, n_steps):
  """Plot selected state components and the chosen actions over one episode.

  The episode is played epsilon-greedily with `model` for at most `n_steps`
  steps. Each entry of `states_to_plot` is a dict with the index of the state
  component (`'idx'`) and its legend label (`'label'`).
  """
  plt.figure(figsize=FIGSIZE)
  n_actions, n_features = get_output_input_dim_from_env(env)

  raw_state = env.reset()
  trajectory = [raw_state]
  taken_actions = []
  net_input = np.reshape(raw_state, [1, n_features])
  for _ in range(n_steps):
    if np.random.rand() < epsilon:
      chosen = random.randrange(n_actions)
    else:
      chosen = np.argmax(model.predict(net_input)[0])
    taken_actions.append(chosen)
    raw_state, _reward, done, _ = env.step(chosen)
    trajectory.append(raw_state)
    net_input = np.reshape(raw_state, [1, n_features])
    if done:
      break

  # One line per requested state component
  for spec in states_to_plot:
    component = [observation[spec['idx']] for observation in trajectory]
    plt.plot(list(range(len(trajectory))), component, label=spec['label'])

  plt.scatter(list(range(len(taken_actions))), list(taken_actions), label='Azione')
  plt.yticks(fontsize=FONTSIZE)
  plt.xticks(fontsize=FONTSIZE)
  plt.legend(prop={'size':FONTSIZE})
  return

def plot_model_evaluation(avg_reward, rewards):
  """Plot per-episode `rewards` together with a horizontal line at `avg_reward`."""
  n_episodes = len(rewards)
  episode_axis = list(range(n_episodes))
  plt.figure(figsize=FIGSIZE)
  plt.plot(episode_axis, rewards)
  plt.plot(episode_axis, [avg_reward] * n_episodes)
  plt.xlabel('Episodi', fontsize=FONTSIZE)
  plt.ylabel('Ricompense', fontsize=FONTSIZE)
  plt.yticks(fontsize=FONTSIZE)
  plt.xticks(fontsize=FONTSIZE)

def run_single_test(hyperparams, env):
  """Create fresh networks and a fresh replay buffer, then train once on `env`.

  A target network (initialised with the online weights) is only created when
  `hyperparams['use_target_network']` is set. No evaluation environment is
  passed to `train`.

  Returns:
    `(rewards, mean_rewards, eval_returns, visited_states)` as returned by `train`.
  """
  online_net = create_dqn_network(hyperparams, env)
  if hyperparams['use_target_network']:
    frozen_net = create_dqn_network(hyperparams, env)
    frozen_net.set_weights(online_net.get_weights())
  else:
    frozen_net = None
  replay = create_replay_buffer(hyperparams)
  run_rewards, run_means, run_evals, run_states = train(hyperparams, env, None, online_net, frozen_net, replay)
  return run_rewards, run_means, run_evals, run_states

def run_multiple_tests(hyperparams, env, seed, n_tests=3):
  """Train `n_tests` times in a row with the same hyperparameters.

  The environment, `random` and NumPy are seeded once with `seed` before the
  first run (not before every run).

  Returns:
    Three lists with one entry per run: episode rewards, running mean rewards
    and visited states.
  """
  # Reset seeds for reproducibility
  env.seed(seed)
  random.seed(seed)
  np.random.seed(seed)

  all_rewards, all_mean_rewards, all_states = [], [], []
  for _ in range(n_tests):
    run_rewards, run_means, _run_evals, run_states = run_single_test(hyperparams, env)
    all_rewards.append(run_rewards)
    all_mean_rewards.append(run_means)
    all_states.append(run_states)
  return all_rewards, all_mean_rewards, all_states

def generate_hyperparams_for_tests(base_hyperparams, varying_vals):
  """Create one hyperparameter dict per alternative value, varying one key at a time.

  Args:
    base_hyperparams: dict copied (shallowly) for every generated configuration.
    varying_vals: dict mapping a hyperparameter name to the list of values to try.

  Returns:
    List of dicts. Each is a copy of `base_hyperparams` with a single key
    replaced and `model_name` set to `<base model_name>_<key>_<value index>`.
  """
  result = []
  for key, values in varying_vals.items():
    for i, value in enumerate(values):
      new_hyper = base_hyperparams.copy()
      # The index is an int and must be converted before string concatenation
      new_hyper['model_name'] = base_hyperparams['model_name'] + '_' + key + '_' + str(i)
      new_hyper[key] = value
      result.append(new_hyper)
  return result

# Test different hyperparameters on the same environment (seed reset for each new test)
# Note: if models should be saved change model_name for each hyperparameters dictionary in the list
def run_different_hyperparams(hyper_list, env, seed, test_name):
  """Train once per hyperparameter dict and persist the reward curves.

  Args:
    hyper_list: list of hyperparameter dicts, one per test.
    env: environment used for every test; re-seeded with `seed` before each one.
    seed: seed applied to `env`, `random` and NumPy before each test.
    test_name: directory component used by `save_python_data_to_fs`.

  Returns:
    Three lists with one entry per test: episode rewards, running mean rewards
    and visited states.
  """
  global_rewards = []
  global_mean_rewards = []
  global_states = []
  for test, hyperparams in enumerate(hyper_list, start=1):
    # Reset seeds for reproducibility
    env.seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    rewards, mean_rewards, _eval_returns, visited_states = run_single_test(hyperparams, env)

    save_python_data_to_fs(rewards, hyperparams['env_name'], test_name, hyperparams['model_name'], 'rewards')
    save_python_data_to_fs(mean_rewards, hyperparams['env_name'], test_name, hyperparams['model_name'], 'mean_rewards')

    # Collect the results so the return statement has something to return
    global_rewards.append(rewards)
    global_mean_rewards.append(mean_rewards)
    global_states.append(visited_states)
    print('##################### Test ' + str(test) + ' done')
  return global_rewards, global_mean_rewards, global_states

def save_plot_multiple_test_results_from_file(file_path, env_name, test_name, model_name, plot_name):
  """Plot every series stored in a pickled file and save the figure to disk.

  Args:
    file_path: path of a pickle file containing a list of series (one per test).
    env_name: first directory component under `plots/`.
    test_name: second directory component under `plots/`.
    model_name: prefix of the saved plot file name.
    plot_name: suffix of the saved plot file name.
  """
  # NOTE: pickle.load can execute arbitrary code; only open files written by this notebook
  with open(file_path, 'rb') as fp:
    data = pickle.load(fp)
  plt.figure(figsize=(20,10))
  for test_i, mean_rewards in enumerate(data, start=1):
    # The x axis must have one point per value of the series being plotted
    plt.plot(list(range(len(mean_rewards))), mean_rewards, label='Test ' + str(test_i))
  plt.legend()

  plot_dir = os.path.join('plots', env_name, test_name)
  os.makedirs(plot_dir, exist_ok=True)
  plot_path = os.path.join(plot_dir, model_name + '_' + plot_name)
  plt.savefig(plot_path)
  print("Plots saved in: " + plot_path)

def plot_multiple_test_results_from_file(file_path, labels):
  """Plot every series stored in a pickled file, one line per test.

  `labels` may contain the keys `'x'` and `'y'` with the axis labels to use.
  """
  with open(file_path, 'rb') as fp:
    series_list = pickle.load(fp)
  plt.figure(figsize=(20,10))
  for number, series in enumerate(series_list, start=1):
    plt.plot(list(range(len(series))), series, label='Test ' + str(number))
  # Axis labels are optional
  for axis_key, set_label in (('x', plt.xlabel), ('y', plt.ylabel)):
    if axis_key in labels:
      set_label(labels[axis_key], fontsize=FONTSIZE)
  plt.yticks(fontsize=FONTSIZE)
  plt.xticks(fontsize=FONTSIZE)
  plt.legend(prop={'size':FONTSIZE})

def plot_multiple_files(file_paths, labels, legend_labels):
  """Plot one pickled series per file on a single figure.

  `legend_labels[k]` is the legend entry of the series loaded from
  `file_paths[k]`; `labels` may contain the keys `'x'` and `'y'` with the axis
  labels to use.
  """
  loaded = []
  for path in file_paths:
    with open(path, 'rb') as fp:
      loaded.append(pickle.load(fp))
  plt.figure(figsize=(20,10))
  for position, series in enumerate(loaded):
    plt.plot(list(range(len(series))), series, label=legend_labels[position])
  # Axis labels are optional
  for axis_key, set_label in (('x', plt.xlabel), ('y', plt.ylabel)):
    if axis_key in labels:
      set_label(labels[axis_key], fontsize=FONTSIZE)
  plt.yticks(fontsize=FONTSIZE)
  plt.xticks(fontsize=FONTSIZE)
  plt.legend(prop={'size':FONTSIZE})

def save_python_data_to_fs(data, env_name, test_name, model_name, data_name):
  """Pickle `data` to `saved_data/<env_name>/<test_name>/<model_name>/<data_name>`.

  Missing directories are created; the destination path is printed afterwards.
  """
  target_dir = os.path.join('saved_data', env_name, test_name, model_name)
  directory_missing = not os.path.exists(target_dir)
  if directory_missing:
    os.makedirs(target_dir)
  destination = os.path.join(target_dir, data_name)
  with open(destination, 'wb') as out_file:
    pickle.dump(data, out_file)
  print('Data saved in: ' + destination)

def save_model_to_fs(model, env_name, model_name):
  """Save `model` to `keras_models/<env_name>/<model_name>` via `model.save`.

  Missing directories are created; the destination path is printed afterwards.
  """
  target_dir = os.path.join('keras_models', env_name)
  directory_missing = not os.path.exists(target_dir)
  if directory_missing:
    os.makedirs(target_dir)
  destination = os.path.join(target_dir, model_name)
  model.save(destination)
  print('Model saved in: ' + destination)

def print_minutes(start_ms, finish_ms):
  """Format the time between two millisecond timestamps as `hours:minutes:seconds`.

  The fields are not zero padded and the hours wrap around after 24.
  """
  elapsed_ms = finish_ms - start_ms
  # (milliseconds per unit, wrap-around of the unit) for hours, minutes, seconds
  units = ((1000 * 60 * 60, 24), (1000 * 60, 60), (1000, 60))
  fields = tuple((elapsed_ms / per_unit) % wrap for per_unit, wrap in units)
  return "%d:%d:%d" % fields

def current_ms():
  """Return the current wall-clock time in whole milliseconds."""
  now_seconds = time.time()
  return round(now_seconds * 1000)

def embed_mp4(filename):
  """Embeds an mp4 file in the notebook.

  The file is read fully and inlined as a base64 data URI in a `<video>` tag.

  Args:
    filename: path of the mp4 file to embed.

  Returns:
    An `IPython.display.HTML` object wrapping the `<video>` tag.
  """
  # Use a context manager so the file handle is closed even if reading fails
  with open(filename, 'rb') as video_file:
    video = video_file.read()
  b64 = base64.b64encode(video)
  tag = '''
  <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
  Your browser does not support the video tag.
  </video>'''.format(b64.decode())

  return IPython.display.HTML(tag)

def create_video_from_saved_model(model_path, filename, env_name, num_episodes=5, fps=20):
  """Load a saved Keras model and record a video of it acting in `env_name`.

  Args:
    model_path: path of the saved model.
    filename: name of the video file (without directory or extension).
    env_name: Gym environment id used for the recording.
    num_episodes: number of episodes to record.
    fps: frames per second of the video.

  Returns:
    The embedded video returned by `create_model_eval_video`.
  """
  # The model is only used for prediction, so skip restoring the compile state;
  # this avoids having to deserialize the custom masked losses.
  saved_model = tf_load_model(model_path, compile=False)
  return create_model_eval_video(saved_model, filename, env_name, num_episodes, fps)

def create_model_eval_video(model, filename, env_name, num_episodes=5, fps=20):
  """Record `num_episodes` greedy episodes of `model` into `videos/<filename>.mp4`.

  Every frame is annotated with the episode index, the step counter and the
  cumulative reward.

  Args:
    model: trained Q-network; the argmax of `model.predict` selects the action.
    filename: name of the video file (without directory or extension).
    env_name: Gym environment id to instantiate.
    num_episodes: number of episodes to record.
    fps: frames per second of the video.

  Returns:
    The embedded video returned by `embed_mp4`.
  """
  env = gym.make(env_name)
  output_dim, input_dim = get_output_input_dim_from_env(env)

  # Make sure the output directory exists before opening the writer
  os.makedirs('videos', exist_ok=True)
  filename = 'videos/' + filename + ".mp4"
  try:
    with imageio.get_writer(filename, fps=fps) as video:
      for episode in range(num_episodes):
        step = 0
        episode_reward = 0
        state = env.reset()
        state = np.reshape(state, [1, input_dim])
        video.append_data(env.render(mode='rgb_array'))
        episode_done = False
        while not episode_done:
          step += 1
          exploit_actions = model.predict(state)
          action = np.argmax(exploit_actions[0])
          next_state, reward, done, _ = env.step(action)
          episode_reward += reward
          next_state = np.reshape(next_state, [1, input_dim])
          state = next_state
          episode_done = done
          frame = Image.fromarray(env.render(mode='rgb_array'))
          frame_draw = ImageDraw.Draw(frame)
          frame_draw.text((10, 10), "Episode: " + str(episode) , fill =(255, 0, 0))
          frame_draw.text((10, 30), "Step: " + str(step) , fill =(255, 0, 0))
          frame_draw.text((10, 50), "Reward: " + str(episode_reward) , fill =(255, 0, 0))
          video.append_data(np.asarray(frame))
  finally:
    # Close the window once, after the last episode: the environment is still
    # needed for the following episodes, so it must not be closed inside the loop
    env.close()
  return embed_mp4(filename)

def run_variance_convergence_tests(tests_to_run, hyperparams, env, seed):
  """Run the selected algorithm variants several times each and save their rewards.

  Every variant works on a copy of `hyperparams` with the algorithm-specific
  keys overridden, so the outcome does not depend on the algorithm flags of the
  base dictionary.

  Args:
    tests_to_run: dict with boolean entries `'dqn'`, `'target_dqn_hl'`,
      `'target_dqn_mse_noclip'` and `'ddqn'` selecting the variants to run.
    hyperparams: base hyperparameter dict (left unmodified).
    env: training environment.
    seed: seed forwarded to `run_multiple_tests`.
  """
  env_name = hyperparams['env_name']
  test_name = 'same_parameters_variance'

  if tests_to_run['dqn']:
    print ("### Start DQN")
    # Single Q network (standard DQN)
    _run_variance_variant(hyperparams, env, seed, env_name, test_name, 's', {
      'model_name': 'dqn_p1',
      'loss': 'MSE',
      'use_target_network': False,
      'double_q_learning': False,
    })

  if tests_to_run['target_dqn_hl']:
    print ("### Start DQN with Target Network and Huber loss")
    # Huber loss; double Q-learning is switched off explicitly so that a base
    # dictionary enabling it does not silently turn this run into DDQN
    _run_variance_variant(hyperparams, env, seed, env_name, test_name, 'hl', {
      'model_name': 'nature_dqn_p1_hl',
      'use_target_network': True,
      'double_q_learning': False,
      'loss': 'Huber',
    })

  if tests_to_run['target_dqn_mse_noclip']:
    print ("### Start DQN with Target Network and MSE")
    # Mean squared error without error clipping
    _run_variance_variant(hyperparams, env, seed, env_name, test_name, 'mse_noclip', {
      'model_name': 'nature_dqn_p1_mse_noclip',
      'loss': 'MSE',
      'use_target_network': True,
      'double_q_learning': False,
      'error_clipping': False,
    })

  if tests_to_run['ddqn']:
    print ("### Start DDQN with Huber loss")
    # Huber loss; the key read by `learn` is 'double_q_learning'
    _run_variance_variant(hyperparams, env, seed, env_name, test_name, 'hl_ddqn', {
      'model_name': 'ddqn_p1_hl',
      'loss': 'Huber',
      'use_target_network': True,
      'double_q_learning': True,
    })

  print('Done')


def _run_variance_variant(hyperparams, env, seed, env_name, test_name, prefix, overrides):
  """Run one variant with `overrides` applied to a copy of `hyperparams` and save its rewards."""
  variant = hyperparams.copy()
  variant.update(overrides)
  rewards, mean_rewards, _states = run_multiple_tests(variant, env, seed)
  save_python_data_to_fs(rewards, env_name, test_name, variant['model_name'], prefix + '_rewards')
  save_python_data_to_fs(mean_rewards, env_name, test_name, variant['model_name'], prefix + '_mean_rewards')

## CartPole

In [None]:
# https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

cart_env_name = 'CartPole-v1'

cart_env = gym.make(cart_env_name)
cart_train_py_env = gym.make(cart_env_name)
cart_test_py_env = gym.make(cart_env_name)
# Training and test environments get different seeds; the second call must
# target the test environment, otherwise it just overwrites the training seed
cart_train_py_env.seed(SEED)
cart_test_py_env.seed(SEED * 2)

[84]

In [None]:
# Base hyperparameters for the CartPole experiments. The test runners work on
# copies of this dictionary and override individual keys (e.g. `model_name`, `loss`).
cart_hyper = {
    'env_name': cart_env_name,
    'model_name': '',
    'layer_params': (128,),
    'learning_rate': 0.001,
    'gamma': 0.99,
    'epsilon_start': 1.0,
    'epsilon_end': 0.01,
    'epsilon_decay': 0.995,
    'use_target_network': True,
    'double_q_learning': False,
    'target_update_steps': 1000,
    'early_stopping': False,
    # Do not train if rewards for latest `early_stopping_batch` steps are higher than `early_stopping_limit` on average
    # NOTE(review): neither `early_stopping_batch` nor `early_stopping_limit` is read by the
    # DQN training code visible in this part of the notebook — confirm they are still used.
    'early_stopping_batch': -10,
    'early_stopping_limit': 190,
    # Stop training altogether if rewards for latest `rewards_mean_episode_limit` are higher than `episode_solved_score` on average
    # (`rewards_mean_episode_limit` is used as a slice start, hence the negative value)
    'rewards_mean_episode_limit': -100,
    'episode_solved_score': 200,
    'max_episode_steps': 200,
    'train_episodes': 1000,
    'update_frequency': 4,
    #'max_train_iterations': 500000, # overrides train_episodes
    'batch_size': 64,
    'max_buffer_size': 250000,
    # A negative value means: explore only until the buffer holds 2 * batch_size samples
    'explore_only_episodes': -1, #50,
    'reward_clipping': False,
    'error_clipping': False,
    'loss': 'MSE',
    'optimizer': 'Adam',
    # Periodic evaluation during training only happens for values greater than 0
    'eval_interval': -1,
    'log_episodes': True,
    'save_model_every_n_episodes': 400,
    'always_save_optimal_models': True
}

### Training performance variance / convergence

In [None]:
# Select which algorithm variants to run on CartPole (True = run it)
tests_to_run = {
    'dqn': True,
    'target_dqn_hl': True,
    'target_dqn_mse_noclip': True,
    'ddqn': True
}
# Each selected variant is trained several times and its rewards are saved to disk
run_variance_convergence_tests(tests_to_run, cart_hyper, cart_train_py_env, SEED)

### Model selection

In [None]:
# Directory name under which the results of this experiment are saved
ms_test_name = 'model_selection'
# Values to try for each hyperparameter; one key is varied at a time while the
# others keep their value from `cart_hyper` (this is not a full grid search)
varying_vals = {
    'learning_rate': [0.01, 0.001, 0.0001],
    'epsilon_decay': [0.993, 0.995, 0.997],
    'layer_params': [(32,), (32, 32), (64,), (64, 64), (128,), (128, 128)]
}
hyper_list = generate_hyperparams_for_tests(cart_hyper, varying_vals)
run_different_hyperparams(hyper_list, cart_train_py_env, SEED, ms_test_name)

## MountainCar

In [None]:
# https://github.com/openai/gym/blob/master/gym/envs/classic_control/mountain_car.py

mount_env_name = 'MountainCar-v0'

mount_env = gym.make(mount_env_name)
mount_train_py_env = gym.make(mount_env_name)
mount_test_py_env = gym.make(mount_env_name)
# Training and test environments get different seeds; the second call must
# target the test environment, otherwise it just overwrites the training seed
mount_train_py_env.seed(SEED)
mount_test_py_env.seed(SEED * 2)

[84]

In [None]:
# Base hyperparameters for the MountainCar experiments. The test runners work on
# copies of this dictionary and override individual keys (e.g. `model_name`, `loss`).
mount_hyper = {
    'env_name': mount_env_name,
    'model_name': 'dqn_target_p1_2000ts',
    'layer_params': (64,),
    'learning_rate': 0.001,
    'gamma': 0.99,
    'epsilon_start': 1.0,
    'epsilon_end': 0.1,
    'epsilon_decay': 0.995,
    'use_target_network': True,
    'double_q_learning': False,
    'target_update_steps': 1000,
    'early_stopping': True,
    # Do not train if rewards for latest `early_stopping_batch` steps are higher than `early_stopping_limit` on average
    # NOTE(review): neither `early_stopping_batch` nor `early_stopping_limit` is read by the
    # DQN training code visible in this part of the notebook — confirm they are still used.
    'early_stopping_batch': -10,
    'early_stopping_limit': -100,
    # Stop training altogether if rewards for latest `rewards_mean_episode_limit` are higher than `episode_solved_score` on average
    # (`rewards_mean_episode_limit` is used as a slice start, hence the negative value)
    'rewards_mean_episode_limit': -100,
    'episode_solved_score': -110,
    'max_episode_steps': 200,
    'train_episodes': 2000,
    'update_frequency': 4,
    #'max_train_iterations': 500000, # overrides train_episodes
    'batch_size': 64,
    'max_buffer_size': 250000,
    # During the first 100 episodes actions are random and no learning step is run
    'explore_only_episodes': 100, #50,
    'reward_clipping': False,
    'error_clipping': True,
    'loss': 'Huber',
    'optimizer': 'Adam',
    # Periodic evaluation during training only happens for values greater than 0
    'eval_interval': -1,
    'log_episodes': True,
    'save_model_every_n_episodes': 400,
    'always_save_optimal_models': False
}

### Training performance variance / convergence

In [None]:
# Select which algorithm variants to run on MountainCar (True = run it)
tests_to_run = {
    'dqn': True,
    'target_dqn_hl': True,
    'target_dqn_mse_noclip': True,
    'ddqn': False
}
# Each selected variant is trained several times and its rewards are saved to disk
run_variance_convergence_tests(tests_to_run, mount_hyper, mount_train_py_env, SEED)

## LunarLander

In [None]:
# https://github.com/openai/gym/blob/master/gym/envs/box2d/lunar_lander.py

land_env_name = 'LunarLander-v2'

# Separate environment instances for training and testing, seeded differently
land_train_py_env = gym.make(land_env_name)
land_test_py_env = gym.make(land_env_name)
land_train_py_env.seed(SEED)
land_test_py_env.seed(SEED * 2)

[84]

In [None]:
# Base hyperparameters for the LunarLander experiments. The test runners work on
# copies of this dictionary and override individual keys (e.g. `model_name`, `loss`).
land_hyper = {
    'env_name': land_env_name,
    'model_name': '',
    'layer_params': (64,64),
    'learning_rate': 0.001,
    'gamma': 0.99,
    'epsilon_start': 1.0,
    'epsilon_end': 0.01,
    'epsilon_decay': 0.995,
    # Double Q-learning also needs the target network enabled below
    'double_q_learning': True,
    'use_target_network': True,
    'target_update_steps': 1000,
    'early_stopping': True,
    # Do not train if rewards for latest `early_stopping_batch` steps are higher than `early_stopping_limit` on average
    # NOTE(review): neither `early_stopping_batch` nor `early_stopping_limit` is read by the
    # DQN training code visible in this part of the notebook — confirm they are still used.
    'early_stopping_batch': -10,
    'early_stopping_limit': 180,
    # Stop training altogether if rewards for latest `rewards_mean_episode_limit` are higher than `episode_solved_score` on average
    # (`rewards_mean_episode_limit` is used as a slice start, hence the negative value)
    'rewards_mean_episode_limit': -100,
    'episode_solved_score': 200,
    'max_episode_steps': 1000,
    'train_episodes': 700,
    'update_frequency': 4,
    #'max_train_iterations': 500000, # overrides train_episodes
    'batch_size': 64,
    'max_buffer_size': 250000,
    # A negative value means: explore only until the buffer holds 2 * batch_size samples
    'explore_only_episodes': -1, #50,
    'reward_clipping': False,
    'error_clipping': True,
    'loss': 'Huber',
    'optimizer': 'Adam',
    # Periodic evaluation during training only happens for values greater than 0
    'eval_interval': -1,
    'log_episodes': True,
    # Periodic model saving only happens for values greater than 0
    'save_model_every_n_episodes': -1,
    'always_save_optimal_models': False
}

### Training performance variance / convergence

The next cell tests the performance variance (and convergence) of a model by training it three times in a row with the same hyperparameters.

In [None]:
# Select which algorithm variants to run on LunarLander (True = run it)
tests_to_run = {
    'dqn': True, 
    'target_dqn_hl': True,
    'target_dqn_mse_noclip': True,
    'ddqn': True
}
# Each selected variant is trained several times and its rewards are saved to disk
run_variance_convergence_tests(tests_to_run, land_hyper, land_train_py_env, SEED)

### Model selection

In [None]:
# Directory name under which the results of this experiment are saved
ms_test_name = 'model_selection'
# Values to try for each hyperparameter; one key is varied at a time while the
# others keep their value from `land_hyper` (this is not a full grid search)
varying_vals = {
    'learning_rate': [0.01, 0.001, 0.0001],
    'epsilon_decay': [0.993, 0.995, 0.997],
    'layer_params': [(32,), (32, 32), (64,), (64, 64), (128,), (128, 128)]
}
hyper_list = generate_hyperparams_for_tests(land_hyper, varying_vals)
run_different_hyperparams(hyper_list, land_train_py_env, SEED, ms_test_name)

# Model based DQN

In [None]:
def create_dynamics_predictor_net(hyperparams, env):
  """Build and compile the network predicting the next state from (state, action).

  The input has one more unit than the observation (the action is appended) and
  the output has the size of the observation. Layer widths and learning rate are
  read from `hyperparams['dynamics_network']`.
  """
  _, state_dim = get_output_input_dim_from_env(env)
  net_config = hyperparams['dynamics_network']
  widths = net_config['layer_params']

  predictor = Sequential()
  # First hidden layer also declares the input size: state plus one action value
  predictor.add(Dense(widths[0], input_dim=state_dim + 1, activation=relu, kernel_initializer=glorot_uniform))
  for width in widths[1:]:
    predictor.add(Dense(width, activation=relu, kernel_initializer=glorot_uniform))
  predictor.add(Dense(state_dim, activation = linear))

  optimizer = Adam(learning_rate=net_config['learning_rate'])
  predictor.compile(loss=mean_squared_error, optimizer=optimizer)
  return predictor

def compute_mean_covariance(obs_set):
  """Return the per-feature mean and the feature covariance matrix of a set of states.

  Args:
    obs_set: array of states Sf = [S1, S2, ...], one state per row.

  Returns:
    `(mean, covariance)`: the mean of every feature and the covariance matrix
    computed by `np.cov` with one feature per row.
  """
  # After transposition every row holds all the observed values of one feature
  features_by_row = np.array(obs_set).T
  return np.mean(features_by_row, axis=1), np.cov(features_by_row)

def predict_state(model, state, action):
  """Predict the next state for `action` taken in `state` with the dynamics `model`.

  `state` has one row; the action is appended to that row before the result
  of `model.predict` is returned.
  """
  n_inputs = state.shape[1] + 1
  features = np.append(state[0], action)
  return model.predict(np.reshape(features, (1, n_inputs)))

def multivariate_gaussian(state, mean, covariance):
  """Evaluate the multivariate normal density N(mean, covariance) at `state`."""
  return multivariate_normal.pdf(np.array(state), mean=mean, cov=covariance)

def get_explore_action(model, state, action_max, mean, covariance):
  """Return the action whose predicted next state is the least likely.

  For every action in `range(action_max)` the dynamics `model` predicts the next
  state, which is scored with the Gaussian density N(mean, covariance); the
  action with the lowest density is returned.
  """
  densities = [
    multivariate_gaussian(predict_state(model, state, candidate), mean, covariance)
    for candidate in range(action_max)
  ]
  return np.argmin(densities)

def learn_dynamics_network(hyperparams, buffer, dynamics_network):
  """Run one training epoch of the dynamics network on a sampled batch.

  Does nothing until the buffer holds at least
  hyperparams['dynamics_network']['batch_size'] samples. The network input is
  each state with its action appended as an extra column; the target is the
  corresponding next state.
  """
  batch_size = hyperparams['dynamics_network']['batch_size']
  # Skip learning until we have enough data in the buffer
  if len(buffer) < batch_size:
    return

  batch = get_batch_from_buffer(buffer, batch_size)
  states, actions, _, next_states, _ = extract_data_from_batch(batch)
  action_column = np.reshape(actions, (batch_size, 1))
  network_input = np.append(states, action_column, axis=1)
  dynamics_network.fit(network_input, next_states, epochs=1, verbose=0)

def extract_states_from_buffer_samples(samples):
  """Return the state (first element) of every sample, with length-1 axes removed."""
  return [np.squeeze(entry[0]) for entry in samples]

def get_action_model_based(hyperparams, episode, state, epsilon, model, dynamics_network, max_action, buffer):
  """Choose an action, exploring with the help of the dynamics network.

  Exploration happens during the first hyperparams['explore_only_episodes']
  episodes and afterwards with probability `epsilon`. When exploring, the
  action whose predicted next state is least likely under a Gaussian fitted to
  the most recent |F| buffered states is returned (F is
  hyperparams['dynamics_network']['F']). While the buffer holds fewer samples
  than the dynamics batch size a uniformly random action is returned instead.
  Otherwise the greedy action of the Q-network `model` is returned.

  Args:
    hyperparams: configuration dict (see the keys named above).
    episode: index of the current episode.
    state: current observation, in the shape expected by model.predict.
    epsilon: exploration probability once the explore-only phase is over.
    model: Q-network used for exploitation.
    dynamics_network: next-state predictor used for exploration.
    max_action: number of discrete actions.
    buffer: replay buffer supporting len() and negative indexing.

  Returns:
    The index of the selected action.
  """
  # The short-circuit keeps the RNG untouched during explore-only episodes.
  explore = episode < hyperparams['explore_only_episodes'] or np.random.rand() < epsilon

  # Exploit: select best predicted action
  if not explore:
    return np.argmax(model.predict(state)[0])

  dyn_params = hyperparams['dynamics_network']
  # For the very first steps we must return a purely random action
  if len(buffer) < dyn_params['batch_size']:
    return random.randrange(max_action)

  # Explore: use dynamics network to select action which will lead to least
  # likely next state. The window is the last |F| samples (fewer if the buffer
  # is still short); taking abs() makes a positive F behave like a negative one
  # instead of silently producing an empty window.
  window = min(abs(dyn_params['F']), len(buffer))
  recent_samples = [buffer[i] for i in range(-window, 0)]
  recent_states = extract_states_from_buffer_samples(recent_samples)
  mean, covariance = compute_mean_covariance(recent_states)
  # Use the network passed in by the caller, not a notebook-level global.
  return get_explore_action(dynamics_network, state, max_action, mean, covariance)

def train_model_based(hyperparams, env, eval_env, model, target_model, dynamics_network, buffer):
  """Train a DQN agent whose exploration is driven by a learned dynamics model.

  Each step selects an action with get_action_model_based, stores the
  transition in `buffer`, trains the Q-network every
  hyperparams['update_frequency'] iterations (outside the explore-only phase),
  syncs the target network every hyperparams['target_update_steps'] iterations
  and trains the dynamics network. After each episode the model may be saved,
  epsilon is decayed and the early-stopping criterion is checked.

  Args:
    hyperparams: configuration dict.
    env: training environment (gym-style reset/step).
    eval_env: environment passed to eval_model for periodic evaluation.
    model: Q-network being trained.
    target_model: target Q-network; only used when
      hyperparams['use_target_network'] is truthy.
    dynamics_network: next-state predictor used for exploration.
    buffer: replay buffer.

  Returns:
    (episode_rewards, mean_rewards, eval_returns, visited_states).
  """
  start_time = current_ms()
  # Global iteration counter
  iteration = 0
  episode = -1
  # List of episode rewards
  episode_rewards = []
  mean_rewards = []
  visited_states = []
  eval_returns = []
  epsilon = hyperparams['epsilon_start']
  output_dim, input_dim = get_output_input_dim_from_env(env)

  while not train_done(hyperparams, episode, iteration):
    episode += 1
    episode_reward = 0
    state = env.reset()
    visited_states.append(state)
    state = np.reshape(state, [1, input_dim])
    for _step in range(hyperparams['max_episode_steps']):
      action = get_action_model_based(hyperparams, episode, state, epsilon, model, dynamics_network, output_dim, buffer)
      next_state, reward, done, _ = env.step(action)
      if hyperparams['reward_clipping']:
        if reward < -1:
          reward = -1
        elif reward > 1:
          reward = 1
      visited_states.append(next_state)
      episode_reward += reward
      next_state = np.reshape(next_state, [1, input_dim])
      sample = (state, action, reward, next_state, done)
      add_sample_to_buffer(hyperparams, buffer, sample)
      state = next_state
      iteration += 1

      if not explore_only(hyperparams, episode, buffer) and iteration % hyperparams['update_frequency'] == 0:
        learn(hyperparams, output_dim, episode_rewards, buffer, model, target_model)

      # Every C steps set Q' = Q. The explicit `== 0` matters: without it the
      # weights were copied on every step *except* multiples of C.
      if (hyperparams['use_target_network']
          and not explore_only(hyperparams, episode, buffer)
          and iteration % hyperparams['target_update_steps'] == 0):
        target_model.set_weights(model.get_weights())

      # Train dynamics predictor
      learn_dynamics_network(hyperparams, buffer, dynamics_network)

      if hyperparams['eval_interval'] > 0 and iteration % hyperparams['eval_interval'] == 0:
        avg_returns, _ = eval_model(hyperparams, model, eval_env)
        eval_returns.append(avg_returns)
        print(">>> Iteration {0}, Episode {1}, Avg. returns {2}".format(iteration, episode, avg_returns))

      # Episode done
      if done:
        break
    episode_rewards.append(episode_reward)

    # Save model to fs
    if hyperparams['save_model_every_n_episodes'] > 0 and episode % hyperparams['save_model_every_n_episodes'] == 0:
      model_name = hyperparams['model_name'] + '_ep' + str(episode)
      if episode_reward > hyperparams['episode_solved_score']:
        model_name += '_SOLVED'
      save_model_to_fs(model, hyperparams['env_name'], model_name)
    elif hyperparams['always_save_optimal_models'] and episode_reward > hyperparams['episode_solved_score']:
      model_name = hyperparams['model_name'] + '_ep' + str(episode) + '_SOLVED'
      save_model_to_fs(model, hyperparams['env_name'], model_name)

    # Epsilon decay
    if not explore_only(hyperparams, episode, buffer) and hyperparams['epsilon_decay'] > 0 and epsilon > hyperparams['epsilon_end']:
      epsilon *= hyperparams['epsilon_decay']

    # Early stopping: mean over the episode rewards selected by the
    # (negative) slice start `rewards_mean_episode_limit`.
    last_rewards_mean = np.mean(episode_rewards[hyperparams['rewards_mean_episode_limit']:])
    mean_rewards.append(last_rewards_mean)
    if hyperparams['early_stopping'] and last_rewards_mean > hyperparams['episode_solved_score']:
      print("Training complete with early stopping")
      break

    if hyperparams['log_episodes']:
      print("Iteration {0}, Episode {1}, Reward {2}, Avg. reward {3}, epsilon {4}".format(iteration, episode, episode_reward, last_rewards_mean, epsilon))

  end_time = current_ms()
  print("Evaluation done in", print_minutes(start_time, end_time))
  return episode_rewards, mean_rewards, eval_returns, visited_states

def plot_visited_states(states, shape):
  """Scatter-plot two components of every visited state.

  shape[0] and shape[1] are the indexes of the state components drawn on the
  x and y axes respectively.
  """
  plt.figure(figsize=(20,10))
  xs = [visited[shape[0]] for visited in states]
  ys = [visited[shape[1]] for visited in states]
  plt.scatter(xs, ys)

def plot_visited_states_from_file(file_path, shape, limit=(50*200)):
  """Scatter-plot two components of the first `limit` states pickled in a file.

  shape[0] and shape[1] select the state components drawn on the x and y
  axes. The axis labels are Italian for "Position" and "Velocity".
  """
  # pickle can execute arbitrary code while loading: only open trusted files.
  with open(file_path, 'rb') as fp:
    loaded_states = pickle.load(fp)

  plt.figure(figsize=(20,10))
  xs = [visited[shape[0]] for visited in loaded_states[:limit]]
  ys = [visited[shape[1]] for visited in loaded_states[:limit]]
  plt.scatter(xs, ys)
  plt.xlabel('Posizione', fontsize=FONTSIZE)
  plt.ylabel('Velocità', fontsize=FONTSIZE)

## MountainCar

In [None]:
# https://github.com/openai/gym/blob/master/gym/envs/classic_control/mountain_car.py

mount_env_name = 'MountainCar-v0'

mount_env = gym.make(mount_env_name)
mount_train_py_env = gym.make(mount_env_name)
mount_test_py_env = gym.make(mount_env_name)
# Seed the training and the test environment with different seeds. The second
# call used to re-seed the training environment, leaving the test one unseeded.
mount_train_py_env.seed(SEED)
mount_test_py_env.seed(SEED * 2)

[84]

In [None]:
# Hyperparameters for the model-based DQN experiments on MountainCar.
mount_hyper = {
    'env_name': mount_env_name,
    'model_name': 'dqn_model_based_p1_2000ts',
    'layer_params': (64,),
    'learning_rate': 0.001,
    'gamma': 0.99,
    'epsilon_start': 1.0,
    'epsilon_end': 0.01,
    # Multiplicative decay applied to epsilon once per episode by the training loop
    'epsilon_decay': 0.995,
    'use_target_network': True,
    'double_q_learning': False,
    # Period, in environment steps, of the target-network sync in the training loop
    'target_update_steps': 1000,
    'early_stopping': True,
    # Do not train if the mean of the latest rewards selected by the slice start
    # `early_stopping_batch` is higher than `early_stopping_limit`
    # (NOTE(review): the rewards list passed in is per-episode, so the original
    # wording "steps" presumably meant episodes -- confirm in `learn`)
    'early_stopping_batch': -10,
    'early_stopping_limit': -100,
    # Stop training altogether if the mean episode reward over the slice starting at
    # `rewards_mean_episode_limit` (negative = latest episodes) is higher than `episode_solved_score`
    'rewards_mean_episode_limit': -100,
    'episode_solved_score': -110,
    'max_episode_steps': 200,
    'train_episodes': 2000,
    'update_frequency': 4,
    #'max_train_iterations': 500000, # overrides train_episodes
    'batch_size': 64,
    'max_buffer_size': 250000,
    # Episodes with an index below this value always explore in get_action_model_based
    'explore_only_episodes': 100, #50,
    'reward_clipping': False,
    'error_clipping': True,
    'loss': 'Huber',
    'optimizer': 'Adam',
    # Values <= 0 disable the periodic evaluation in the training loop
    'eval_interval': -1,
    'log_episodes': True,
    'save_model_every_n_episodes': 400,
    'always_save_optimal_models': False,
    # Settings of the next-state predictor used for exploration
    'dynamics_network': {
        'layer_params': (24, 24),
        'batch_size': 64,
        'learning_rate': 0.02,
        # Start index into the replay buffer (negative: the latest |F| samples)
        # of the states used to fit the exploration Gaussian
        'F': -50
    }
}

In [None]:
# Seed Python's, NumPy's and the training environment's RNGs once, before the runs.
random.seed(SEED)
np.random.seed(SEED)
mount_train_py_env.seed(SEED)

test_name = 'same_parameter_variance'

# Train three agents with the same hyperparameters and store each run's mean rewards.
for i in range(3):
  model = create_dqn_network(mount_hyper, mount_train_py_env)
  target_model = None
  if mount_hyper['use_target_network']:
    # The target network starts as an exact copy of the online network
    target_model = create_dqn_network(mount_hyper, mount_train_py_env)
    target_model.set_weights(model.get_weights())
  buffer = create_replay_buffer(mount_hyper)
  # NOTE(review): this cell sits in the model-based section and uses a
  # 'dqn_model_based_*' model name, yet it calls `train` and never builds a
  # dynamics network -- confirm whether `train_model_based` was intended.
  rewards, mean_rewards, eval_returns, visited_states = train(mount_hyper, mount_train_py_env, mount_test_py_env, model, target_model, buffer)
  save_python_data_to_fs(mean_rewards, mount_env_name, test_name, mount_hyper['model_name'], 'mean_rewards_' + str(i))
  # NOTE(review): the model name does not include `i`, so every run presumably
  # writes to the same location -- verify against save_model_to_fs.
  save_model_to_fs(model, mount_env_name, mount_hyper['model_name'] + '_1000')

# Prioritized Experience Replay

In [None]:
# Sum-tree functions
# The samples array stores the experiences.
# Each p_tree node is a pair (sum of the priorities of the leaves in its
# sub-tree, max priority among those leaves).
def st_get_tree(hyperparams):
  """Create an empty sum-tree sized for hyperparams['max_buffer_size'] samples.

  Returns a dict with 'p_tree' (object array of 2 * capacity - 1 nodes, all
  initialised to (0, 0)), 'samples' (object array of `capacity` slots) and
  'current_idx' (next slot to write, starting at 0).
  """
  capacity = hyperparams['max_buffer_size']
  n_nodes = capacity * 2 - 1

  # Binary tree whose leaves hold the priority of each sample
  priority_tree = np.zeros(n_nodes, dtype=object)
  for node in range(n_nodes):
    priority_tree[node] = (0, 0)

  return {
      'p_tree': priority_tree,
      # Array containing actual samples
      'samples': np.zeros(capacity, dtype=object),
      'current_idx': 0,
  }

def st_propagate_change(st, idx, probability_change, max_priority):
  """Propagate a leaf update from node `idx` up to the root of the sum-tree.

  Every ancestor of `idx` has `probability_change` added to its priority sum
  and its stored maximum raised to the running maximum of `max_priority` and
  the maxima met on the way up.

  Args:
    st: sum-tree dict; st['p_tree'] holds (sum, max) pairs.
    idx: index in st['p_tree'] of the node that changed.
    probability_change: amount to add to each ancestor's sum.
    max_priority: candidate maximum priority to propagate upwards.

  Calling it on the root (idx <= 0) is a no-op; the recursive version walked
  to index -1 in that case and never terminated.
  """
  node = idx
  running_max = max_priority
  while node > 0:
    node = (node - 1) // 2
    subtree_sum, subtree_max = st['p_tree'][node]
    running_max = max(subtree_max, running_max)
    st['p_tree'][node] = (subtree_sum + probability_change, running_max)

def st_add(st, priority, sample):
  """Store `sample` in the next slot of the sum-tree with the given priority.

  The slot st['current_idx'] is overwritten, its leaf becomes
  (priority, priority), the difference with the leaf's previous priority is
  propagated to the ancestors and the write index advances, wrapping to 0
  once the end of the samples array is reached.
  """
  slot = st['current_idx']
  capacity = len(st['samples'])

  # Add sample to array of samples
  st['samples'][slot] = sample

  # Leaves occupy the last `capacity` positions of p_tree
  leaf_idx = slot + capacity - 1
  previous_priority = st['p_tree'][leaf_idx][0]
  st['p_tree'][leaf_idx] = (priority, priority)

  # Propagate the priority change up the tree
  st_propagate_change(st, leaf_idx, priority - previous_priority, priority)

  # Update index of written samples
  next_slot = slot + 1
  st['current_idx'] = 0 if next_slot >= capacity else next_slot

def st_update(st, idx, error):
  """Replace the priority stored in tree node `idx` and update its ancestors.

  `error` is written as the node's new priority value; the node's stored
  maximum only ever grows. The change is then propagated up the tree.
  """
  old_priority, old_max = st['p_tree'][idx]
  new_max = max(error, old_max)
  st['p_tree'][idx] = (error, new_max)
  st_propagate_change(st, idx, error - old_priority, new_max)

def st_recursive_get(st, idx, p):
  """Descend from node `idx` to the node selected by the priority mass `p`.

  At each level the search goes left when the remaining mass fits in the left
  child's sum; otherwise it goes right and the left sum is subtracted. The
  walk stops at the first node whose left child would fall outside the tree,
  and that node's index is returned.
  """
  tree = st['p_tree']
  node = idx
  remaining = p
  while True:
    # Calc index of the left child (the right one is left + 1)
    left = 2 * node + 1

    # Termination condition
    if left >= len(tree):
      return node

    # Keep looking for correct priority index down the tree
    left_sum = tree[left][0]
    if remaining <= left_sum:
      node = left
    else:
      node = left + 1
      remaining = remaining - left_sum

def st_get(st, p):
  """Look up the sample selected by the priority mass `p`.

  Returns a tuple (tree index, priority stored at that tree index, sample).
  """
  tree_idx = st_recursive_get(st, 0, p)
  # Translate the tree position into a position in the samples array
  slot = tree_idx - len(st['samples']) + 1
  return (tree_idx, st['p_tree'][tree_idx][0], st['samples'][slot])

def st_get_samples(st, batch_size):
  """Draw `batch_size` entries from the sum-tree, one per priority segment.

  The total priority (sum stored in the root) is split into `batch_size`
  equal segments; a value is drawn uniformly inside each one and resolved
  with st_get. Returns a list of (tree index, priority, sample) tuples.
  """
  segment_length = st['p_tree'][0][0] / batch_size
  return [
      st_get(st, random.uniform(segment_length * k, segment_length * (k + 1)))
      for k in range(batch_size)
  ]

def st_get_priority(hyperparams, error):
  """Convert an error into a replay priority: (|error| + e) ** a.

  `e` and `a` are read from hyperparams['PER']. The magnitude of the error is
  used so that a negative input cannot produce a complex number when `a` is
  fractional; non-negative inputs give the same result as before.
  """
  per_params = hyperparams['PER']
  return (abs(error) + per_params['e']) ** per_params['a']

def st_add_sample(hyperparams, st, error, sample):
  """Insert `sample` into the sum-tree with the priority derived from `error`."""
  st_add(st, st_get_priority(hyperparams, error), sample)

def st_update_sample(hyperparams, st, error, sample_idx):
  """Set the priority of tree node `sample_idx` to the one derived from `error`."""
  st_update(st, sample_idx, st_get_priority(hyperparams, error))

# PER DQN functions

def extract_data_from_st_batch(st_batch):
  """Split a batch of sum-tree entries into one array per field.

  Every entry is (tree index, priority, sample) with
  sample = (state, action, reward, next_state, done).

  Returns (states, actions, rewards, next_states, dones, idxs, priorities);
  axis 1 (of length 1) is removed from states and next_states.
  """
  def transition_field(position):
    return np.array([entry[2][position] for entry in st_batch])

  states, actions, rewards, next_states, dones = (
      transition_field(position) for position in range(5)
  )
  idxs = np.array([entry[0] for entry in st_batch])
  priorities = np.array([entry[1] for entry in st_batch])

  # Squeeze: remove axes of length 1 from array
  return (
      np.squeeze(states, axis=1),
      actions,
      rewards,
      np.squeeze(next_states, axis=1),
      dones,
      idxs,
      priorities,
  )

def learn_per(hyperparams, output_dim, rewards, buffer, model, target_model):
  """Run one training step of the Q-network on a prioritized batch.

  Training is skipped when early stopping is enabled and the mean of
  rewards[early_stopping_batch:] exceeds early_stopping_limit. Otherwise a
  batch is drawn from the sum-tree `buffer`, a target is built for the action
  taken in each transition (the online network picks the next action, the
  target network scores it), the priority of every sampled entry is refreshed
  from its absolute TD error and the model is fitted for one epoch.

  NOTE(review): hyperparams['double_q_learning'] and hyperparams['PER']['b']
  are not read here -- confirm this is intended.
  """
  # Early stopping: do not train agent if it is performing well
  if hyperparams['early_stopping']:
    recent_mean = np.mean(rewards[hyperparams['early_stopping_batch']:])
    if recent_mean > hyperparams['early_stopping_limit']:
      return

  st_batch = st_get_samples(buffer, hyperparams['batch_size'])
  states, actions, batch_rewards, next_states, dones, idxs, _ = extract_data_from_st_batch(st_batch)

  # Q(s, .) and Q(s', .) from the online network, Q'(s', .) from the target one
  q_current = model.predict_on_batch(states)
  q_next_online = model.predict_on_batch(next_states)
  q_next_target = target_model.predict_on_batch(next_states)

  errors = np.zeros(hyperparams['batch_size'])
  target_vec = np.zeros((hyperparams['batch_size'], output_dim))

  # Calc TD errors
  for row in range(hyperparams['batch_size']):
    taken = actions[row]
    if dones[row]:
      td_target = batch_rewards[row]
    else:
      best_next = np.argmax(q_next_online[row])
      td_target = batch_rewards[row] + hyperparams['gamma'] * q_next_target[row][best_next]

    # Only the entry of the action actually taken is non-zero in the target row
    target_vec[row, taken] = td_target
    errors[row] = abs(q_current[row][taken] - target_vec[row, taken])

    # Update sample weights
    st_update_sample(hyperparams, buffer, errors[row], idxs[row])

  model.fit(states, target_vec, epochs=1, batch_size=hyperparams['batch_size'], verbose=0)

def train_per(hyperparams, env, eval_env, model, target_model, buffer):
  """Train a DQN agent using a sum-tree prioritized replay buffer.

  Each step selects an action with get_action, stores the transition in the
  sum-tree `buffer`, trains with learn_per every
  hyperparams['update_frequency'] iterations once the episode index exceeds
  hyperparams['start_training_after'], and syncs the target network every
  hyperparams['target_update_steps'] iterations. After each episode the model
  may be saved, epsilon is decayed and the early-stopping criterion is
  checked.

  Args:
    hyperparams: configuration dict.
    env: training environment (gym-style reset/step).
    eval_env: environment passed to eval_model for periodic evaluation.
    model: Q-network being trained.
    target_model: target Q-network, used by learn_per and synced here when
      hyperparams['use_target_network'] is truthy.
    buffer: sum-tree dict as created by st_get_tree.

  Returns:
    (episode_rewards, mean_rewards, eval_returns, visited_states).
  """
  start_time = current_ms()
  # Global iteration counter
  iteration = 0
  episode = -1
  # List of episode rewards
  episode_rewards = []
  mean_rewards = []
  visited_states = []
  eval_returns = []
  epsilon = hyperparams['epsilon_start']
  output_dim, input_dim = get_output_input_dim_from_env(env)

  while not train_done(hyperparams, episode, iteration):
    episode += 1
    episode_reward = 0
    state = env.reset()
    visited_states.append(state)
    state = np.reshape(state, [1, input_dim])
    for _step in range(hyperparams['max_episode_steps']):
      action = get_action(hyperparams, episode, state, epsilon, model, output_dim)
      next_state, reward, done, _ = env.step(action)
      if hyperparams['reward_clipping']:
        if reward < -1:
          reward = -1
        elif reward > 1:
          reward = 1
      visited_states.append(next_state)
      episode_reward += reward
      next_state = np.reshape(next_state, [1, input_dim])
      sample = (state, action, reward, next_state, done)
      # Add sample with max priority (stored in the first element of the buffer sum-tree)
      # NOTE(review): the stored maximum is already a priority, yet
      # st_add_sample converts its argument with st_get_priority again --
      # confirm the double transformation is intended.
      st_add_sample(hyperparams, buffer, buffer['p_tree'][0][1], sample)
      state = next_state
      iteration += 1

      # Learning is gated on `start_training_after` here rather than on explore_only
      if episode > hyperparams['start_training_after'] and iteration % hyperparams['update_frequency'] == 0:
        learn_per(hyperparams, output_dim, episode_rewards, buffer, model, target_model)

      # Every C steps set Q' = Q. The explicit `== 0` matters: without it the
      # weights were copied on every step *except* multiples of C.
      # NOTE(review): explore_only is defined outside this chunk and receives
      # the sum-tree dict here -- verify it handles that buffer type.
      if (hyperparams['use_target_network']
          and not explore_only(hyperparams, episode, buffer)
          and iteration % hyperparams['target_update_steps'] == 0):
        target_model.set_weights(model.get_weights())

      if hyperparams['eval_interval'] > 0 and iteration % hyperparams['eval_interval'] == 0:
        avg_returns, _ = eval_model(hyperparams, model, eval_env)
        eval_returns.append(avg_returns)
        print(">>> Iteration {0}, Episode {1}, Avg. returns {2}".format(iteration, episode, avg_returns))

      # Episode done
      if done:
        break
    episode_rewards.append(episode_reward)

    # Save model to fs
    if hyperparams['save_model_every_n_episodes'] > 0 and episode % hyperparams['save_model_every_n_episodes'] == 0:
      model_name = hyperparams['model_name'] + '_ep' + str(episode)
      if episode_reward > hyperparams['episode_solved_score']:
        model_name += '_SOLVED'
      save_model_to_fs(model, hyperparams['env_name'], model_name)
    elif hyperparams['always_save_optimal_models'] and episode_reward > hyperparams['episode_solved_score']:
      model_name = hyperparams['model_name'] + '_ep' + str(episode) + '_SOLVED'
      save_model_to_fs(model, hyperparams['env_name'], model_name)

    # Epsilon decay
    if not explore_only(hyperparams, episode, buffer) and hyperparams['epsilon_decay'] > 0 and epsilon > hyperparams['epsilon_end']:
      epsilon *= hyperparams['epsilon_decay']

    # Early stopping: mean over the episode rewards selected by the
    # (negative) slice start `rewards_mean_episode_limit`.
    last_rewards_mean = np.mean(episode_rewards[hyperparams['rewards_mean_episode_limit']:])
    mean_rewards.append(last_rewards_mean)
    if hyperparams['early_stopping'] and last_rewards_mean > hyperparams['episode_solved_score']:
      print("Training complete with early stopping")
      break

    if hyperparams['log_episodes']:
      print("Iteration {0}, Episode {1}, Reward {2}, Avg. reward {3}, epsilon {4}".format(iteration, episode, episode_reward, last_rewards_mean, epsilon))

  end_time = current_ms()
  print("Evaluation done in", print_minutes(start_time, end_time))
  return episode_rewards, mean_rewards, eval_returns, visited_states

def fill_memory(hyperparams, buffer, env):
  """Fill the sum-tree `buffer` with transitions from uniformly random actions.

  Steps `env` until the buffer's write index wraps back to 0. Each transition
  is stored with max(reward, 0.0) as its error, and the environment is reset
  whenever an episode ends.
  """
  n_actions, state_dim = get_output_input_dim_from_env(env)
  state = np.reshape(env.reset(), [1, state_dim])

  wrapped = False
  while not wrapped:
    action = random.randrange(n_actions)
    next_state, reward, done, _ = env.step(action)
    next_state = np.reshape(next_state, [1, state_dim])
    transition = (state, action, reward, next_state, done)
    st_add_sample(hyperparams, buffer, max(reward, 0.0), transition)

    # Start a new episode when the current one is over
    state = np.reshape(env.reset(), [1, state_dim]) if done else next_state

    # current_idx returns to 0 right after the last slot has been written
    wrapped = buffer['current_idx'] == 0

## MountainCar

In [None]:
# https://github.com/openai/gym/blob/master/gym/envs/classic_control/mountain_car.py

mount_env_name = 'MountainCar-v0'

mount_env = gym.make(mount_env_name)
mount_train_py_env = gym.make(mount_env_name)
mount_test_py_env = gym.make(mount_env_name)
# Seed the training and the test environment with different seeds. The second
# call used to re-seed the training environment, leaving the test one unseeded.
mount_train_py_env.seed(SEED)
mount_test_py_env.seed(SEED * 2)

[84]

In [None]:
# Hyperparameters for the prioritized-experience-replay DQN on MountainCar.
mount_hyper = {
    'env_name': mount_env_name,
    'model_name': 'dqn_PER2',
    'layer_params': (64,),
    'learning_rate': 0.001,
    'gamma': 0.99,
    'epsilon_start': 1.0,
    'epsilon_end': 0.1,
    # Multiplicative decay applied to epsilon once per episode by train_per
    'epsilon_decay': 0.995,
    # NOTE(review): learn_per does not read this flag -- confirm it is honoured elsewhere
    'double_q_learning': False,
    'use_target_network': True,
    # Period, in environment steps, of the target-network sync in train_per
    'target_update_steps': 1000,
    'early_stopping': True,
    # Do not train if the mean of the latest episode rewards selected by the slice start
    # `early_stopping_batch` is higher than `early_stopping_limit`
    'early_stopping_batch': -10,
    'early_stopping_limit': -100,
    # Stop training altogether if the mean episode reward over the slice starting at
    # `rewards_mean_episode_limit` (negative = latest episodes) is higher than `episode_solved_score`
    'rewards_mean_episode_limit': -100,
    'episode_solved_score': -110,
    'max_episode_steps': 200,
    'train_episodes': 2000,
    'update_frequency': 4,
    #'max_train_iterations': 500000, # overrides train_episodes
    'batch_size': 64,
    # Capacity of the sum-tree created by st_get_tree
    'max_buffer_size': 250000,
    # train_per only calls learn_per once the episode index exceeds this value
    'start_training_after': 20,
    'explore_only_episodes': 100,
    'reward_clipping': False,
    'error_clipping': True,
    'loss': 'Huber',
    'optimizer': 'Adam',
    # Values <= 0 disable the periodic evaluation in train_per
    'eval_interval': -1,
    'log_episodes': True,
    # Values <= 0 disable the periodic model checkpoint in train_per
    'save_model_every_n_episodes': -1,
    'always_save_optimal_models': False,
    # NOTE(review): the PER functions in this section do not read the two
    # reward bounds or 'dynamics_network' -- confirm whether they are still needed
    'minimum_reward': -1,
    'maximum_reward': 1,
    'dynamics_network': {
        'layer_params': (24, 24),
        'batch_size': 64,
        'learning_rate': 0.02,
        'F': 50
    },
    # Prioritized replay: st_get_priority computes (error + e) ** a
    'PER': {
        'e': 0.01,
        'a': 0.6,
        # NOTE(review): 'b' is not read by the PER functions in this section
        'b': 0.5
    }
}

In [None]:
# Seed Python's, NumPy's and the training environment's RNGs once, before the runs.
random.seed(SEED)
np.random.seed(SEED)
mount_train_py_env.seed(SEED)

test_name = 'same_parameter_variance_per'

# Train three PER agents with the same hyperparameters and store each run's mean rewards.
for i in range(3):
  model = create_dqn_network(mount_hyper, mount_train_py_env)
  target_model = None
  if mount_hyper['use_target_network']:
    # The target network starts as an exact copy of the online network
    target_model = create_dqn_network(mount_hyper, mount_train_py_env)
    target_model.set_weights(model.get_weights())
  # Fresh sum-tree replay memory, pre-filled with random-action transitions
  buffer = st_get_tree(mount_hyper)
  fill_memory(mount_hyper, buffer, mount_train_py_env)
  rewards, mean_rewards, eval_returns, visited_states = train_per(mount_hyper, mount_train_py_env, mount_test_py_env, model, target_model, buffer)
  save_python_data_to_fs(mean_rewards, mount_env_name, test_name, mount_hyper['model_name'], 'mean_rewards_' + str(i))
  # NOTE(review): the model name does not include `i`, so every run presumably
  # writes to the same location -- verify against save_model_to_fs.
  save_model_to_fs(model, mount_env_name, mount_hyper['model_name'] + '_2000')

## LunarLander

In [None]:
# https://github.com/openai/gym/blob/master/gym/envs/box2d/lunar_lander.py

land_env_name = 'LunarLander-v2'

# Separate environment instances for training and for evaluation
land_train_py_env = gym.make(land_env_name)
land_test_py_env = gym.make(land_env_name)
# The two environments get different seeds
land_train_py_env.seed(SEED)
land_test_py_env.seed(SEED * 2)

[84]

In [None]:
# Hyperparameters for the prioritized-experience-replay DQN on LunarLander.
land_hyper = {
    'env_name': land_env_name,
    'model_name': 'dqn_PER2',
    'layer_params': (64,64),
    'learning_rate': 0.001,
    'gamma': 0.99,
    'epsilon_start': 1.0,
    'epsilon_end': 0.01,
    # Multiplicative decay applied to epsilon once per episode by train_per
    'epsilon_decay': 0.995,
    # NOTE(review): learn_per does not read this flag -- confirm it is honoured elsewhere
    'double_q_learning': False,
    'use_target_network': True,
    # Period, in environment steps, of the target-network sync in train_per
    'target_update_steps': 1000,
    'early_stopping': True,
    # Do not train if the mean of the latest episode rewards selected by the slice start
    # `early_stopping_batch` is higher than `early_stopping_limit`
    'early_stopping_batch': -10,
    'early_stopping_limit': 180,
    # Stop training altogether if the mean episode reward over the slice starting at
    # `rewards_mean_episode_limit` (negative = latest episodes) is higher than `episode_solved_score`
    'rewards_mean_episode_limit': -100,
    'episode_solved_score': 200,
    'max_episode_steps': 1000,
    'train_episodes': 700,
    'update_frequency': 4,
    #'max_train_iterations': 500000, # overrides train_episodes
    'batch_size': 64,
    # Capacity of the sum-tree created by st_get_tree
    'max_buffer_size': 250000,
    'explore_only_episodes': 100, #50,
    # train_per only calls learn_per once the episode index exceeds this value
    'start_training_after': 0,
    'reward_clipping': False,
    'error_clipping': True,
    'loss': 'Huber',
    'optimizer': 'Adam',
    # Values <= 0 disable the periodic evaluation in train_per
    'eval_interval': -1,
    'log_episodes': True,
    # Values <= 0 disable the periodic model checkpoint in train_per
    'save_model_every_n_episodes': -1,
    'always_save_optimal_models': False,
    # Prioritized replay: st_get_priority computes (error + e) ** a
    'PER': {
        'e': 0.01,
        'a': 0.6,
        # NOTE(review): 'b' is not read by the PER functions in this section
        'b': 0.5
    }
}

In [None]:
# Seed Python's, NumPy's and the training environment's RNGs once, before the runs.
random.seed(SEED)
np.random.seed(SEED)
# Seed the LunarLander training environment used below (this line used to
# seed the MountainCar environment, a leftover from the previous cell).
land_train_py_env.seed(SEED)

test_name = 'same_parameter_variance_per'

# Train three PER agents with the same hyperparameters and store each run's mean rewards.
for i in range(3):
  model = create_dqn_network(land_hyper, land_train_py_env)
  target_model = None
  if land_hyper['use_target_network']:
    # The target network starts as an exact copy of the online network
    target_model = create_dqn_network(land_hyper, land_train_py_env)
    target_model.set_weights(model.get_weights())
  # Fresh sum-tree replay memory, pre-filled with random-action transitions
  buffer = st_get_tree(land_hyper)
  fill_memory(land_hyper, buffer, land_train_py_env)
  rewards, mean_rewards, eval_returns, visited_states = train_per(land_hyper, land_train_py_env, land_test_py_env, model, target_model, buffer)
  save_python_data_to_fs(mean_rewards, land_env_name, test_name, land_hyper['model_name'], 'mean_rewards_' + str(i))
  # NOTE(review): the model name does not include `i`, so every run presumably
  # writes to the same location -- verify against save_model_to_fs.
  save_model_to_fs(model, land_env_name, land_hyper['model_name'] + '_2000')

In [None]:
# Helpful references and sources of inspiration
# https://github.com/fakemonk1/Reinforcement-Learning-Lunar_Lander
# https://danieltakeshi.github.io/2019/07/14/per/
# https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/
# https://github.com/jaromiru/AI-blog/blob/master/Seaquest-DDQN-PER.py
# https://wingedsheep.com/lunar-lander-dqn/