# Main Tests
### <font color='5E5D5D'> Descripción </font>

<i><font color='C9614B'> En esta notebook se encuentra la mayor parte de las pruebas realizadas en lo que se refiere al entrenamiento de agentes DQN y NFQ sin ventanas de rollout. Tenga en cuenta que la configuración que se presenta aquí representa únicamente una de las evaluadas.</font></i><br>

***

## Setup

In [None]:
# Switch for the video cells: they only call record_test_episode when this is True.
PLAY_VIDEO: bool = False

#### Libraries

In [None]:
# Prerequisite installation: use only if needed
#! pip install gym pyvirtualdisplay > /dev/null 2>&1
#! apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

# Models
from models.models import DQNModel, FQNModel

# Agents
from agents.dqn_agent import DQNAgent
from agents.fqn_agent import FQNAgent

# Utils
from utils.utils_func import process_state, save_dataset, load_dataset
from utils.visualization import plot_results, wrap_env, show_video

# Gym
import gym

# PyTorch
import torch

# Misc
import numpy as np
import random

# Plots
import matplotlib.pyplot as plt

## Tests

### Deep Q-Learning

Entrenamiento de agentes DQN, utilizando el ambiente ``MountainCar-v0``.

#### Mountain Car

In [None]:
# Global settings for DQN training (forwarded to DQNAgent and its train call)
BUFFER_SIZE: int = 2000
GAMMA: float = 0.99
NUM_EPISODES: int = 1000
MAX_STEPS: int = 200

BATCH_SIZE: int = 64
LEARNING_RATE: float = 0.001

# Settings for experience-dataset generation (RAND, ESA, EMA)
NUM_SAMPLES: int = 1000
NUM_RUNS: int = 1
DATASET_ACTION_TYPE: str = 'greedy'

Con el fin de evitar —en cierta medida— particularidades en las ejecuciones, se realizan ``NUM_RUNS`` entrenamientos. Para cada uno de ellos, se genera un dataset de experiencia y se agrega al dataset final que se utilizará para entrenar NFQ.

In [None]:
# Train NUM_RUNS independent DQN agents on MountainCar-v0. After each run the
# trained agent generates an experience dataset that is appended to `dataset`
# (and its trajectories to `trajectories`). Finally the per-episode rewards and
# steps are averaged across runs.

# Accumulators for the per-run results
rewards_mc_dqn, steps_mc_dqn = [], []
agent_mc_dqn = None   # after the loop, holds the agent of the last run

# Model for DQN agent
model_dqn = None

# Dataset to store experience
dataset = []

# List to store trajectories (for RIS calculation)
trajectories = []

# Initial seed (could be set to any value)
num_seed = np.random.randint(0, 1000)

for _run in range(NUM_RUNS):
    # Print run
    print(f"\nRun #{_run+1} | Seed: {num_seed}")
    print("********************************************************")

    # Environment
    env = gym.make("MountainCar-v0")

    # Seed setup
    env.seed(num_seed)
    random.seed(num_seed)
    np.random.seed(num_seed)
    torch.manual_seed(num_seed)
    torch.backends.cudnn.deterministic = True

    # Model creation
    model_dqn = DQNModel(2, env.action_space.n)

    # Agent creation
    agent_mc_dqn = DQNAgent(env, model_dqn, process_state,
                            BUFFER_SIZE, BATCH_SIZE, LEARNING_RATE, GAMMA,
                            epsilon_i=0.99, epsilon_f=0.1, epsilon_anneal_time=1000)

    # Agent training
    rewards, steps_per_episode = agent_mc_dqn.train(NUM_EPISODES, MAX_STEPS)

    # Save results
    rewards_mc_dqn.append(rewards)
    steps_mc_dqn.append(steps_per_episode)

    # Using the trained agent(s), generate dataset.
    # A fresh environment is created here, so it must be seeded as well;
    # otherwise the generated dataset cannot be reproduced from the seed
    # printed at the top of the run.
    env = gym.make("MountainCar-v0")
    env.seed(num_seed)
    experience, traject = agent_mc_dqn.generate_dataset(
        env,
        action_type=DATASET_ACTION_TYPE,
        epsilon=.1,
        num_samples=NUM_SAMPLES,
        max_steps=MAX_STEPS,
    )
    dataset.extend(experience)
    trajectories.extend(traject)

    # Increment seed so that every run uses a different one
    num_seed += 1

# Results averaging (per run): axis 0 indexes the runs
rewards_mc_dqn = np.mean(rewards_mc_dqn, axis=0)
steps_mc_dqn = np.mean(steps_mc_dqn, axis=0)

In [None]:
# "Smooth" plot
# Inputs are the rewards and steps per episode, already averaged across runs.
plot_results(NUM_EPISODES, rewards_mc_dqn, steps_mc_dqn)

In [None]:
# Optional visual check of the trained DQN agent; skipped unless PLAY_VIDEO is set.
if PLAY_VIDEO:
    # Check learning through video: a fresh MountainCar env is passed through wrap_env
    wrapped_env = wrap_env(gym.make("MountainCar-v0"))

    # The last trained DQN agent runs its test episode on the wrapped env
    agent_mc_dqn.record_test_episode(wrapped_env)

In [None]:
# Pickle dataset with experience.
# The NFQ section loads the file back from this same path.
filename = 'datasets/dataset_sample.pkl'
save_dataset(filename, dataset)

In [None]:
# Pickle trajectories (used for the RIS calculation).
# The NFQ section loads the file back from this same path.
filename = 'datasets/trajectories_sample.pkl'
save_dataset(filename, trajectories)

### Neural Fitted Q-Iteration

Entrenamiento de agentes NFQ, utilizando el ambiente ``MountainCar-v0``.

#### Mountain Car

In [None]:
# Load dataset with experience from the path written by the DQN section.
# This rebinds the global `dataset`.
filename = 'datasets/dataset_sample.pkl'
dataset = load_dataset(filename)

In [None]:
# Load trajectories from the path written by the DQN section.
# This rebinds the global `trajectories`.
filename = 'datasets/trajectories_sample.pkl'
trajectories = load_dataset(filename)

In [None]:
# Settings for NFQ training
GAMMA: float = 0.99
NUM_EPISODES: int = 2000
MAX_STEPS: int = 200
BATCH_SIZE: int = 64
LEARNING_RATE: float = 0.001
# Early-stopping switch and its patience
EARLY_STOPPING: bool = True
EARLY_STOPPING_PATIENCE: int = 100

# Settings for the RIS metric
N_ACTIONS: int = 3
HORIZON: int = 200

# Settings for the test runs
IS_TEST: bool = True
TEST_RUN_TRIALS: int = 100

In [None]:
# NFQ
# Containers for the final results
rewards_mc_nfq = []
steps_mc_nfq = []

# Build the environment
env = gym.make("MountainCar-v0")

# Build the model: first argument is 2 (presumably the state dimension),
# second is the number of actions exposed by the environment
model_nfq = FQNModel(2, env.action_space.n)

# Build the agent from the environment, the model, the state pre-processing
# function, the training settings, the stored experience and the RIS settings
agent_mc_nfq = FQNAgent(
    env,
    model_nfq,
    process_state,
    BATCH_SIZE,
    LEARNING_RATE,
    GAMMA,
    dataset,
    trajectories,
    N_ACTIONS,
    HORIZON,
)

In [None]:
# Train the agent from the stored dataset.
# The first and third returned values are kept (they are plotted below as
# test-run rewards and RIS); the second one is discarded.
rewards_mc_nfq, _, ris_mc_nfq = agent_mc_nfq.train_from_dataset(
    NUM_EPISODES,
    is_test=IS_TEST,
    test_run_trials=TEST_RUN_TRIALS,
    early_stopping=EARLY_STOPPING,
    es_patience=EARLY_STOPPING_PATIENCE,
)

In [None]:
# RIS plot
# The x axis is left to matplotlib, which defaults it to range(len(y)).
# A hard-coded length only matches one specific terminal episode and makes
# plt.plot fail with a shape mismatch whenever early stopping ends elsewhere.
plt.plot(ris_mc_nfq)
plt.title("Episode RIS")
plt.xlabel("Episode Number")
plt.ylabel("Value")
plt.show()

In [None]:
# Rewards plot
# rewards_mc_nfq is a list of lists; it is flattened into a single series.
# The x axis is left to matplotlib (defaults to range(len(y))), so the plot
# keeps working when the number of test runs or TEST_RUN_TRIALS changes.
plt.plot([item for sublist in rewards_mc_nfq for item in sublist])
plt.title("Test Run Rewards")
plt.xlabel("Episode Number")
plt.ylabel("Reward")
plt.show()

In [None]:
# Optional visual check of the trained NFQ agent; skipped unless PLAY_VIDEO is set.
if PLAY_VIDEO:
    # Check learning through video: a fresh MountainCar env is passed through wrap_env
    wrapped_env = wrap_env(gym.make("MountainCar-v0"))

    # The NFQ agent runs its test episode on the wrapped env
    agent_mc_nfq.record_test_episode(wrapped_env)