In [None]:
import os
import sys
import time

import matplotlib.pyplot as plt
import torch
import torch.optim as optim
from torch.utils.tensorboard import FileWriter
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm_notebook as tqdm

# Make library available in path
!rm -rf 'fom-openai-gym-rl'
!git clone https://github.com/fom-big-data/fom-openai-gym-rl
lib_path = os.path.join(os.getcwd(), 'fom-openai-gym-rl', 'notebooks', '00-basemodel', 'atari-dqn', 'lib')
if not (lib_path in sys.path):
    sys.path.insert(0, lib_path)

# Import library classes
from replay_memory import ReplayMemory
from deep_q_network import DeepQNetwork
from action_selector import ActionSelector
from input_extractor import InputExtractor
from model_optimizer import ModelOptimizer
from model_storage import ModelStorage
from environment_enum import Environment
from pong_reward_shaper import PongRewardShaper
from reward_shape_enum import RewardShape
from performance_logger import PerformanceLogger
from environment_builder import EnvironmentBuilder
from environment_builder import EnvironmentWrapper

# 0 Setup

In [None]:
# Environment whose `.value` is handed to EnvironmentBuilder.make_environment_with_wrappers
ENVIRONMENT_NAME = Environment.PONG_NO_FRAMESKIP_v4
# Wrappers handed to EnvironmentBuilder together with the environment name.
# NOTE: the training loop treats FRAME_STACK as "Not yet supported".
ENVIRONMENT_WRAPPERS = [
    EnvironmentWrapper.NOOP_RESET_ENV,
    EnvironmentWrapper.MAX_AND_SKIP_ENV,
    # EnvironmentWrapper.FRAME_STACK,
]
# Passed as `batch_size` to ModelOptimizer.optimize_model
BATCH_SIZE = 128
# Passed as `gamma` to ModelOptimizer.optimize_model (presumably the discount factor — confirm in lib)
GAMMA = 0.999
# Passed as `epsilon_start` / `epsilon_end` / `epsilon_decay` to ActionSelector.select_action
# (presumably an epsilon-greedy exploration schedule — confirm in lib)
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
# The target net is re-synced from the policy net (and the model saved) whenever
# the finished-episode counter is a multiple of this value
TARGET_UPDATE = 5
# Passed to the ReplayMemory constructor (presumably its capacity — confirm in lib)
REPLAY_MEMORY_SIZE = 10_000
# Number of iterations (frames) of the training loop
NUM_FRAMES = 1_000_000
# Reward shapings whose PongRewardShaper bonus is added to the environment reward during training
REWARD_SHAPINGS = [
    RewardShape.PONG_PLAYER_RACKET_HITS_BALL,
    RewardShape.PONG_PLAYER_RACKET_CLOSE_TO_BALL_LINEAR,
    RewardShape.PONG_OPPONENT_RACKET_HITS_BALL,
    RewardShape.PONG_OPPONENT_RACKET_CLOSE_TO_BALL_LINEAR,
]

## 0.1 Configure device

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

## 0.2 Set up matplotlib

In [None]:
# Enable interactive mode of matplotlib; it is switched off again with plt.ioff()
# at the end of the training cell
plt.ion()

## 0.3 Set up TensorBoard

In [None]:
# Load the TensorBoard notebook extension (needed for the `%tensorboard` magic
# used in the "Display TensorBoard" cell)
%load_ext tensorboard

## 0.4 Set up environment

In [None]:
# Build the configured environment including all configured wrappers
env = EnvironmentBuilder.make_environment_with_wrappers(
    ENVIRONMENT_NAME.value,
    ENVIRONMENT_WRAPPERS,
)
# Bring the environment into its initial state
env.reset()
# Show what the extracted (sharp) screen looks like right after the reset
example_screen = InputExtractor.get_sharp_screen(env=env, device=device)
InputExtractor.plot_screen(example_screen, 'Example extracted screen')

# 1 Set up nets

## 1.1 Define nets

In [None]:
# Probe one extracted screen to learn the spatial size the networks are built for.
# The shape is unpacked as four entries, of which the last two are height and width.
# NOTE(review): the previous comment quoted typical dimensions close to 3x40x90 —
# confirm that this still holds for the configured environment.
init_screen = InputExtractor.get_screen(env=env, device=device)
_, _, screen_height, screen_width = init_screen.shape

# Number of discrete actions offered by the gym action space
n_actions = env.action_space.n

# Create policy net and target net (in this order) with identical architecture
policy_net, target_net = [
    DeepQNetwork(screen_height, screen_width, n_actions).to(device)
    for _ in range(2)
]

# Each net was constructed separately, so the target net takes over the policy
# net's state to make both equal; afterwards it is put into evaluation mode
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

## 1.2 Define optimizer and replay memory

In [None]:
# Replay memory constructed from the configured size
memory = ReplayMemory(REPLAY_MEMORY_SIZE)
# RMSprop with library defaults, optimizing the parameters of the policy net only
optimizer = optim.RMSprop(params=policy_net.parameters())

# 2 Training

In [None]:
# Run-wide bookkeeping: counters, per-episode reward histories and start timestamp
total_frames, total_episodes = 0, 0
total_original_rewards, total_shaped_rewards = [], []
total_start_time = time.time()

# Per-episode bookkeeping, reset by the training loop whenever an episode ends
episode_frames = 0
episode_original_reward = episode_shaped_reward = 0
episode_start_time = time.time()

# Start a fresh episode; the state is the difference between two consecutively
# extracted screens
env.reset()
last_screen = InputExtractor.get_screen(env=env, device=device)
current_screen = InputExtractor.get_screen(env=env, device=device)
state = current_screen - last_screen

## 2.1 Display TensorBoard

In [None]:
# Initialize writers.
# SummaryWriter() is created without arguments, i.e. with its default log directory.
# NOTE(review): neither writer is referenced by the other cells visible in this
# notebook, and FileWriter writes to "images", which lies outside of the
# `--logdir=runs` shown below — confirm this is intended.
tensorboard_summary_writer = SummaryWriter()
tensorboard_file_writer = FileWriter("images")
# Display TensorBoard inline, reading event files from the "runs" directory
%tensorboard --logdir=runs

## 2.2 Training loop

In [None]:
# Training loop: for every frame an action is selected and performed, the reward
# is (optionally) shaped, the transition is stored in the replay memory and one
# optimization step is run. Whenever an episode ends, it is logged, the target net
# is periodically synced and saved, and the environment is reset.

# Fail fast on unsupported configurations: observations are not yet unwrapped
# when the frame stack wrapper is in use. (The former bare `exit` expression only
# evaluated the name without calling it, so the loop silently kept running.)
if EnvironmentWrapper.FRAME_STACK in ENVIRONMENT_WRAPPERS:
    raise NotImplementedError("Not yet supported")

# Environments for which the Pong-specific reward shaping is applied
pong_environments = (
    Environment.PONG_v0,
    Environment.PONG_v4,
    Environment.PONG_DETERMINISTIC_v0,
    Environment.PONG_DETERMINISTIC_v4,
    Environment.PONG_NO_FRAMESKIP_v0,
    Environment.PONG_NO_FRAMESKIP_v4,
)
is_pong_environment = ENVIRONMENT_NAME in pong_environments

# Resolve once which PongRewardShaper methods contribute to the shaped reward.
# The order of this table is the order in which the bonuses are summed up.
active_reward_methods = []
if is_pong_environment:
    pong_reward_methods = (
        (RewardShape.PONG_PLAYER_RACKET_HITS_BALL,
         'reward_player_racket_hits_ball'),
        (RewardShape.PONG_PLAYER_RACKET_COVERS_BALL,
         'reward_player_racket_covers_ball'),
        (RewardShape.PONG_PLAYER_RACKET_CLOSE_TO_BALL_LINEAR,
         'reward_player_racket_close_to_ball_linear'),
        # The method name below is spelled this way in the library
        (RewardShape.PONG_PLAYER_RACKET_CLOSE_TO_BALL_QUADRATIC,
         'reward_player_racket_close_to_ball_quadractic'),
        (RewardShape.PONG_OPPONENT_RACKET_HITS_BALL,
         'reward_opponent_racket_hits_ball'),
        (RewardShape.PONG_OPPONENT_RACKET_COVERS_BALL,
         'reward_opponent_racket_covers_ball'),
        (RewardShape.PONG_OPPONENT_RACKET_CLOSE_TO_BALL_LINEAR,
         'reward_opponent_racket_close_to_ball_linear'),
        # The method name below is spelled this way in the library
        (RewardShape.PONG_OPPONENT_RACKET_CLOSE_TO_BALL_QUADRATIC,
         'reward_opponent_racket_close_to_ball_quadractic'),
    )
    active_reward_methods = [method_name
                             for reward_shape, method_name in pong_reward_methods
                             if reward_shape in REWARD_SHAPINGS]

# Iterate over frames
progress_bar = tqdm(range(NUM_FRAMES), unit='frames')
for total_frames in progress_bar:

    # Select and perform an action
    action = ActionSelector.select_action(state=state,
                                          n_actions=n_actions,
                                          policy_net=policy_net,
                                          epsilon_end=EPS_END,
                                          epsilon_start=EPS_START,
                                          epsilon_decay=EPS_DECAY,
                                          device=device)

    # Do step
    observation, reward, done, info = env.step(action.item())

    # Shape reward: start from the environment's reward and add the configured bonuses
    original_reward = reward
    shaped_reward = reward

    if is_pong_environment:
        reward_shaper = PongRewardShaper(observation, reward, done, info)
        for method_name in active_reward_methods:
            shaped_reward += getattr(reward_shaper, method_name)()

    # Use shaped reward for further processing
    reward = shaped_reward

    # Add reward to episode reward
    episode_original_reward += original_reward
    episode_shaped_reward += shaped_reward

    # Transform reward into a tensor
    reward = torch.tensor([reward], device=device)

    # Observe new state
    last_screen = current_screen
    current_screen = InputExtractor.get_screen(env=env, device=device)

    # A finished episode has no successor state
    if not done:
        next_state = current_screen - last_screen
    else:
        next_state = None

    # Store the transition in memory
    memory.push(state, action, next_state, reward)

    # Move to the next state (replaced by a fresh start state below if the episode is done)
    state = next_state

    # Perform one step of the optimization. The optimizer was created from the
    # parameters of the policy net, so this is the net being trained.
    # NOTE(review): the `loss is not None` check below suggests that this call
    # may return None — confirm against ModelOptimizer.
    loss = ModelOptimizer.optimize_model(policy_net=policy_net,
                                         target_net=target_net,
                                         optimizer=optimizer,
                                         memory=memory,
                                         batch_size=BATCH_SIZE,
                                         gamma=GAMMA,
                                         device=device)

    if done:
        # Track episode time
        episode_end_time = time.time()
        episode_duration = episode_end_time - episode_start_time
        total_duration = episode_end_time - total_start_time

        # Add rewards to total reward
        total_original_rewards.append(episode_original_reward)
        total_shaped_rewards.append(episode_shaped_reward)

        # Counters are incremented only after logging, hence the "+ 1" for
        # the episode number and the number of frames of this episode
        if loss is not None:
            PerformanceLogger.log_episode_short(total_episodes=total_episodes + 1,
                                                total_frames=total_frames,
                                                total_duration=total_duration,
                                                total_original_rewards=total_original_rewards,
                                                total_shaped_rewards=total_shaped_rewards,
                                                episode_frames=episode_frames + 1,
                                                episode_original_reward=episode_original_reward,
                                                episode_shaped_reward=episode_shaped_reward,
                                                episode_loss=loss.item(),
                                                episode_duration=episode_duration)

        # Update the target network, copying all weights and biases from policy net into target net.
        # The counter is checked before it is incremented, so this also happens after the very first episode.
        if total_episodes % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

            # Save model
            # NOTE(review): `loss` is passed on as is and may be None here — verify
            # that ModelStorage.saveModel copes with that.
            ModelStorage.saveModel(total_frames=total_frames,
                                   net=target_net,
                                   optimizer=optimizer,
                                   loss=loss,
                                   environment_name=ENVIRONMENT_NAME,
                                   environment_wrappers=ENVIRONMENT_WRAPPERS,
                                   batch_size=BATCH_SIZE,
                                   gamma=GAMMA,
                                   eps_start=EPS_START,
                                   eps_end=EPS_END,
                                   eps_decay=EPS_DECAY,
                                   target_update=TARGET_UPDATE,
                                   replay_memory_size=REPLAY_MEMORY_SIZE,
                                   num_frames=NUM_FRAMES,
                                   reward_shapings=REWARD_SHAPINGS)

        # Reset episode variables
        episode_frames = 0
        episode_original_reward = 0
        episode_shaped_reward = 0
        episode_start_time = time.time()

        # Reset the environment and state
        env.reset()
        last_screen = InputExtractor.get_screen(env=env, device=device)
        current_screen = InputExtractor.get_screen(env=env, device=device)
        state = current_screen - last_screen

        # Increment counter
        total_episodes += 1

    # Increment counter
    episode_frames += 1

# Training has finished: report completion, render the environment one last time
# and release it
print('Complete')
env.render()
env.close()
# Leave matplotlib's interactive mode (enabled via plt.ion() during setup) and
# show any figures that are still open
plt.ioff()
plt.show()