In [None]:
import os
import sys
from itertools import count

import gym
import matplotlib.pyplot as plt
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

# Make library available in path
!git clone https://github.com/fom-big-data/fom-openai-gym-rl
lib_path = os.path.join(os.getcwd(), 'fom-openai-gym-rl', 'notebooks', '00-basemodel', 'atari-dqn', 'lib')
if not (lib_path in sys.path):
    sys.path.insert(0, lib_path)

# Import library classes
from replay_memory import ReplayMemory
from deep_q_network import DeepQNetwork
from action_selector import ActionSelector
from input_extractor import InputExtractor
from model_optimizer import ModelOptimizer
from environment_enum import Environment

# 0 Setup

In [None]:
# Environment to train on; its .value is what gets passed to gym.make()
ENVIRONMENT_NAME = Environment.PONG_v0
# Length of the training run: number of episodes the training loop iterates over
NUM_EPISODES = 50
# The target net is synced with the policy net every TARGET_UPDATE episodes
TARGET_UPDATE = 10

# Size handed to the ReplayMemory constructor
REPLAY_MEMORY_SIZE = 10000

# Forwarded to ModelOptimizer.optimize_model() as batch_size and gamma
BATCH_SIZE = 128
GAMMA = 0.999

# Forwarded to ActionSelector.select_action() as epsilon_start / epsilon_end / epsilon_decay
EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 200

## 0.1 Configure device

In [None]:
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## 0.2 Set up matplotlib

In [None]:
# Enable interactive mode of matplotlib
# (it is switched off again with plt.ioff() once training has finished)
plt.ion()

## 0.3 Set up TensorBoard

In [None]:
# Load the TensorBoard notebook extension
# (needed for the %tensorboard magic used further down in this notebook)
%load_ext tensorboard

## 0.4 Set up environment

In [None]:
# Create the gym environment named by ENVIRONMENT_NAME and strip its wrappers
env = gym.make(ENVIRONMENT_NAME.value).unwrapped
# Bring the environment into its initial state
env.reset()
# Extract the current screen and plot it as an example of the extracted input
example_screen = InputExtractor.get_sharp_screen(env=env, device=device)
InputExtractor.plot_screen(example_screen, 'Example extracted screen')

# 1 Set up nets

## 1.1 Define nets

In [None]:
# Probe one extracted screen: its last two dimensions (height, width) determine
# how the network layers have to be sized.
# NOTE(review): the earlier remark about frames being "close to 3x40x90" may stem
# from a different environment — confirm the actual shape for this one.
init_screen = InputExtractor.get_screen(env=env, device=device)
_, _, screen_height, screen_width = init_screen.shape

# The gym action space defines how many actions the nets have to score
n_actions = env.action_space.n

# Policy net and target net are built from the same dimensions
net_dimensions = (screen_height, screen_width, n_actions)
policy_net = DeepQNetwork(*net_dimensions).to(device)
target_net = DeepQNetwork(*net_dimensions).to(device)

# Both nets are initialized randomly, so the policy net's state is copied into
# the target net to make them start out equal
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

## 1.2 Define optimizer and replay memory

In [None]:
# RMSprop (library default settings) updates the parameters of the policy net only
optimizer = optim.RMSprop(params=policy_net.parameters())
# Replay memory constructed with the configured REPLAY_MEMORY_SIZE
memory = ReplayMemory(REPLAY_MEMORY_SIZE)

# 2 Training

In [None]:
# Per-episode metric containers (durations, losses, rewards), each starting out empty
episode_durations, episode_losses, episode_rewards = [], [], []

## 2.1 Display TensorBoard

In [None]:
# Initialize writer
# (no log directory is passed; per the PyTorch docs the default is a run-specific
# sub-directory of ./runs, which matches the --logdir used below)
tensorboard_summary_writer = SummaryWriter()
# Show the TensorBoard UI inside the notebook, reading logs from the "runs" directory
%tensorboard --logdir=runs

## 2.2 Training loop

In [None]:
# Train for NUM_EPISODES episodes. Per episode, the duration, the accumulated
# reward and (when available) the last loss are stored in the episode_* lists
# and logged to TensorBoard with the episode index as step.
for i_episode in range(NUM_EPISODES):

    # Initialize the environment and state; a state is the difference between
    # two consecutively extracted screens
    env.reset()
    last_screen = InputExtractor.get_screen(env=env, device=device)
    current_screen = InputExtractor.get_screen(env=env, device=device)
    state = current_screen - last_screen

    # Sum of all step rewards collected during this episode
    episode_reward = 0.0

    # Run episode until status done is reached
    for t in count():
        # Select and perform an action
        action = ActionSelector.select_action(state=state,
                                              n_actions=n_actions,
                                              policy_net=policy_net,
                                              epsilon_end=EPS_END,
                                              epsilon_start=EPS_START,
                                              epsilon_decay=EPS_DECAY,
                                              device=device)
        _, reward, done, _ = env.step(action.item())

        # Accumulate the raw reward before it is wrapped into a tensor
        episode_reward += reward

        # Transform reward into a tensor
        reward = torch.tensor([reward], device=device)

        # Observe new state
        last_screen = current_screen
        current_screen = InputExtractor.get_screen(env=env, device=device)

        # A finished episode has no successor state
        if not done:
            next_state = current_screen - last_screen
        else:
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization on the policy network (the
        # target network is only synced below); the result may be None, which
        # is handled when reporting
        loss = ModelOptimizer.optimize_model(policy_net=policy_net,
                                             target_net=target_net,
                                             optimizer=optimizer,
                                             memory=memory,
                                             batch_size=BATCH_SIZE,
                                             gamma=GAMMA,
                                             device=device)

        # Record and report performance once the episode is done
        if done:
            # t is zero-based, so the episode lasted t + 1 frames
            n_frames = t + 1
            episode_durations.append(n_frames)
            episode_rewards.append(episode_reward)
            tensorboard_summary_writer.add_scalar('episode/duration', n_frames, i_episode)
            tensorboard_summary_writer.add_scalar('episode/reward', episode_reward, i_episode)

            summary = ("Episode  " + str(i_episode+1) + " (" + str(n_frames) + " frames) reward "
                       + str(episode_reward))
            if loss is not None:
                loss_value = loss.item()
                episode_losses.append(loss_value)
                tensorboard_summary_writer.add_scalar('episode/loss', loss_value, i_episode)
                summary += " loss " + str(loss_value)
            print(summary)
            break

    # Update the target network, copying all weights and biases from policy net into target net
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

# Make sure all logged scalars are written to disk so TensorBoard can show them
tensorboard_summary_writer.flush()

# Training is finished: report completion, render the environment one last
# time and release it
print('Complete')
env.render()
env.close()
# Leave matplotlib's interactive mode (enabled earlier with plt.ion()) and
# show any figures that are still open
plt.ioff()
plt.show()