# TODO 
- (Later)Add tests for functions

In [None]:
import numpy as np
import random
import gym
import tqdm
import torch
import torch.nn as nn

In [None]:
def calculate_naive_returns(rewards):
    """ Calculates a list of naive returns given a 
    list of rewards."""
    total_returns = np.zeros(len(rewards))
    total_return = 0.0
    for t in range(len(rewards), 0):
        total_return = total_return + reward
        total_returns[t] = total_return
    return total_returns


def discount_rewards(rewards, gamma=0.98):
    discounted_returns = [0 for _ in rewards]
    discounted_returns[-1] = rewards[-1]
    # iterate backwards
    for t in range(len(rewards)-2, -1, -1): 
        discounted_returns[t] = (rewards[t] + 
              discounted_returns[t+1]*gamma)
    return discounted_returns

def epsilon_greedy_action(action_distribution, 
                          epsilon=1e-1):
    if random.random() < epsilon:
        return np.argmax(np.random.random(
           action_distribution.shape))
    else:
        return np.argmax(action_distribution)

def epsilon_greedy_action_annealed(action_distribution,
                                   percentage, 
                                   epsilon_start=1.0, 
                                   epsilon_end=1e-2):
    annealed_epsilon = (epsilon_start*(1.0-percentage) + 
                        epsilon_end*percentage)
    if random.random() < annealed_epsilon:
        return np.argmax(np.random.random(
          action_distribution.shape))
    else:
        return np.argmax(action_distribution)

# Pole-Cart with Policy Gradients

## Creating an Agent

In [22]:
class PGAgent(object):
    def __init__(self, state_size, num_actions, 
                 hidden_size, 
                 learning_rate=1e-3, 
                 explore_exploit_setting= \
                 'epsilon_greedy_annealed_1.0->0.001'):
        self.state_size = state_size
        self.num_actions = num_actions
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        self.explore_exploit_setting = \
                        explore_exploit_setting
        self.build_model()
        self.build_training()


    def build_model(self):
      self.model = torch.nn.Sequential(
        nn.Linear(self.state_size, self.hidden_size),
        nn.Linear(self.hidden_size, self.hidden_size),
        nn.Linear(self.hidden_size, self.num_actions),
        nn.Softmax())

    def train(self, state, action_input, reward_input):
        self.output = self.model(state)
        # Select the logits related to the action taken
        self.output_index_for_actions = (torch.range(0, 
                  self.output.size(dim=0)-1) * \
                  self.output.size(dim=1)) + action_input
        self.logits_for_actions = torch.gather(
            torch.reshape(self.output, (-1,)),0,
            self.output_index_for_actions)
        self.loss = - torch.mean(
            torch.log(self.logits_for_actions) * 
            self.reward_input)
        self.loss.backward()
        self.optimizer = optim.AdamOptimizer(
            self.model.parameters())
        self.optimizer.step()
        self.loss.zero_grad()
        return self.loss.item()
        


    def sample_action_from_distribution(self, 
                                        action_distribution, 
                                        epsilon_percentage):
        # Choose an action based on the action probability
        # distribution and an explore vs exploit
        if self.explore_exploit_setting == \
          'greedy':
              action = greedy_action(
                  action_distribution)
        elif self.explore_exploit_setting == \
          'epsilon_greedy_0.05':
              action = epsilon_greedy_action(
                  action_distribution,0.05)
        elif self.explore_exploit_setting == \
          'epsilon_greedy_0.25':
              action = epsilon_greedy_action(
                  action_distribution,0.25)
        elif self.explore_exploit_setting == \ 
          'epsilon_greedy_0.50':
              action = epsilon_greedy_action(
                  action_distribution,0.50)
        elif self.explore_exploit_setting == \ 
          'epsilon_greedy_0.90':
              action = epsilon_greedy_action(
                  action_distribution,0.90)
        elif self.explore_exploit_setting == \
          'epsilon_greedy_annealed_1.0->0.001':
              action = epsilon_greedy_action_annealed(
                  action_distribution, 
                  epsilon_percentage, 1.0,0.001)
        elif self.explore_exploit_setting == \
          'epsilon_greedy_annealed_0.5->0.001':
              action = epsilon_greedy_action_annealed(
                  action_distribution, 
                  epsilon_percentage, 0.5, 0.001)
        elif self.explore_exploit_setting == \
          'epsilon_greedy_annealed_0.25->0.001':
              action = epsilon_greedy_action_annealed(
                  action_distribution, 
                  epsilon_percentage, 0.25, 0.001)
        return action

    def predict_action(self, state, epsilon_percentage):
        action_distribution = self.model(state)[0]
        action = self.sample_action_from_distribution(
            action_distribution, epsilon_percentage)
        return action

SyntaxError: ignored

## Keeping Track of History

In [None]:
class EpisodeHistory(object):

    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.state_primes = []
        self.discounted_returns = []

    def add_to_history(self, state, action, reward, 
      state_prime):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.state_primes.append(state_prime)


class Memory(object):

    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.state_primes = []
        self.discounted_returns = []

    def reset_memory(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.state_primes = []
        self.discounted_returns = []

    def add_episode(self, episode):
        self.states += episode.states
        self.actions += episode.actions
        self.rewards += episode.rewards
        self.discounted_returns += episode.discounted_returns

## Policy Gradient Main Function

In [None]:
# Configure Settings
total_episodes = 5000
total_steps_max = 10000
epsilon_stop = 3000
train_frequency = 8
max_episode_length = 500
render_start = -1
should_render = False

explore_exploit_setting = 'epsilon_greedy_annealed_1.0->0.001'

env = gym.make('CartPole-v0')
state_size = env.observation_space.shape[0]  # 4 for 
                                              # CartPole-v0
num_actions = env.action_space.n  # 2 for CartPole-v0

solved = False
agent = PGAgent(state_size=state_size,
                num_actions=num_actions,
                hidden_size=16, 
                explore_exploit_setting= \
                  explore_exploit_setting)

In [23]:
episode_rewards = []
batch_losses = []

global_memory = Memory()
steps = 0
for i in range(total_episodes):
  state = torch.tensor(env.reset(), dtype=torch.float32)
  episode_reward = 0.0
  episode_history = EpisodeHistory()
  epsilon_percentage = float(min(i/float(
    epsilon_stop), 1.0))

  for j in range(max_episode_length):
      action = agent.predict_action(state, epsilon_percentage)
        
      state_prime, reward, terminal, _ = env.step(action)
      state_prime = torch.tensor(state_prime, 
                                 dtype=torch.float32)
      
      # if (render_start > 0 and i > 
      #   render_start and should_render) \
      #     or (solved and should_render):
      #     env.render()
      episode_history.add_to_history(
          state, action, reward, state_prime)
      state = state_prime
      episode_reward += reward
      steps += 1
      
      if terminal:
          episode_history.discounted_returns = \
            discount_rewards(episode_history.rewards)
          global_memory.add_episode(episode_history)

          if np.mod(i, train_frequency) == 0:
              reward_input = torch.tensor(
                  global_memory.discounted_returns,
                  dtype=torch.float32)
              action_input = torch.tensor(
                  global_memory.actions,
                  dtype=torch.float32)
              state = torch.stack(global_memory.states)

              # train step 
              batch_loss = agent.train(state, 
                                       action_input, 
                                       reward_input)
              batch_losses.append(batch_loss)
              global_memory.reset_memory()

          episode_rewards.append(episode_reward)
          break

          if i % 10:
              if torch.mean(episode_rewards[:-100]) > 100.0:
                  solved = True
              else:
                  solved = False
          print('Solved:', solved, 'Mean Reward', 
                torch.mean(episode_rewards[:-100]))
  break

NameError: ignored

In [None]:
action_input.shape

# Q-Learning and Deep Q-Networks

## Playing Breakout wth DQN

In [None]:
class DQNAgent(object):

    def __init__(self, num_actions,
                 learning_rate=1e-3, history_length=4,
                 screen_height=84, screen_width=84, 
                 gamma=0.99):
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.history_length = history_length
        self.screen_height = screen_height
        self.screen_width = screen_width
        self.gamma = gamma
        self.optimizer = optim.Adam()

        self.build_prediction_network()
        self.build_target_network()
        self.build_training()

    def build_prediction_network(self):
        self.model_predict = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=8 , stride=4),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.Flatten(),
            nn.Linear(64, 512),
            nn.Linear(512, self.num_actions)
            )
        self.q_action = torch.argmax(self.model_predict, dim=1)


    def build_target_network(self):
        self.model_target = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=8 , stride=4),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.Flatten(),
            nn.Linear(64, 512),
            nn.Linear(512, self.num_actions)
            )
        self.target_q_action = torch.argmax(self.target_q, 
                                         dim=1)
        
    def update_target_q_weights(self):
        self.optimizer_predict.step()

    def sample_and_train_pred(self, replay_table, batch_size):

        s_t, action, reward, s_t_plus_1, terminal = \
          replay_table.sample_batch(batch_size)
        q_t_plus_1 = self.model_target(
            {self.target_s_t: 
              s_t_plus_1})


        terminal = np.array(terminal) + 0.
        max_q_t_plus_1 = torch.max(q_t_plus_1, 
          axis=1)
        target_q_t = (1. - terminal) * self.gamma * 
              max_q_t_plus_1 + reward
        _, q_t, loss = self.train_model(self.action, 
                                        self.s_t, 
                                        self.target_q_t)

        return q_t


    def build_training(self):
        action_one_hot = torch.nn.functional.one_hot(
            self.action, num_classes=self.num_actions)
        
        q_of_action = torch.sum(self.q_t * action_one_hot, 
                                dim=1)

        self.delta = (self.target_q_t - q_of_action)
        self.loss = torch.mean(self.clip_error(self.delta),
                               dim=0)

        self.optimizer = torch.optim.AdamOptimizer(
            self.model_predict.parameters(),
            learning_rate=self.learning_rate)
        self.train_step = self.optimizer.step()

    def sample_action_from_distribution(self, 
      action_distribution, epsilon_percentage):
        # Choose an action based on the action probability 
        # distribution
        action = epsilon_greedy_action_annealed(
            action_distribution, epsilon_percentage)
        return action

    def predict_action(self, state, epsilon_percentage):
        action_distribution = train_target()
        action = self.sample_action_from_distribution(
            action_distribution, epsilon_percentage)
        return action

    def process_state_into_stacked_frames(self, frame, 
      past_frames, past_state=None):
        full_state = np.zeros(
            (self.history_length, self.screen_width, 
              self.screen_height))

        if past_state is not None:
            for i in range(len(past_state)-1):
                full_state[i, :, :] = past_state[i+1, :, :]
            full_state[-1, :, :] = self.preprocess_frame(
                  frame,
                  (self.screen_width, self.screen_height))
        else:
            all_frames = past_frames + [frame]
            for i, frame_f in enumerate(all_frames):
                full_state[i, :, :] = self.preprocess_frame(
                    frame_f, 
                    (self.screen_width, self.screen_height))
        return full_state

    def to_grayscale(self, x):
        return np.dot(x[...,:3], [0.299, 0.587, 0.114])

    def clip_error(self, x):
      try:
        return torch.select(torch.abs(x) < 1.0, 
                            0.5 * torch.square(x), 
                            torch.abs(x) - 0.5)
      except:
        return torch.where(torch.abs(x) < 1.0, 
                           0.5 * torch.square(x), 
                           torch.abs(x) - 0.5)

    def preprocess_frame(self, im, shape):
        cropped = im[16:201,:]
        grayscaled = self.to_grayscale(cropped)
        resized = imresize(grayscaled, shape, 
                           'nearest').astype('float32')
        mean, std = 40.45, 64.15
        frame = (resized-mean)/std
        return frame

## Implementing Experience Replay

In [None]:
class ExperienceReplayTable(object):

    def __init__(self, table_size=50000):
        self.states = []
        self.actions = []
        self.rewards = []
        self.state_primes = []
        self.terminals = []

        self.table_size = table_size

    def add_episode(self, episode):
        self.states += episode.states
        self.actions += episode.actions
        self.rewards += episode.rewards
        self.state_primes += episode.state_primes
        self.terminals += episode.terminals

        self.purge_old_experiences()

    def purge_old_experiences(self):
        while len(self.states) > self.table_size:
            self.states.pop(0)
            self.actions.pop(0)
            self.rewards.pop(0)
            self.state_primes.pop(0)

    def sample_batch(self, batch_size):
        s_t, action, reward = [], [], []
        s_t_plus_1, terminal = [], []
        rands = np.arange(len(self.states))
        np.random.shuffle(rands)
        rands = rands[:batch_size]

        for r_i in rands:
            s_t.append(self.states[r_i])
            action.append(self.actions[r_i])
            reward.append(self.rewards[r_i])
            s_t_plus_1.append(self.state_primes[r_i])
            terminal.append(self.terminals[r_i])
        return (np.array(s_t), np.array(action), 
                np.array(reward), np.array(s_t_plus_1), 
                np.array(terminal)


## DQN Main Loop

In [None]:
# Download Atari ROMS for Breakout
! wget http://www.atarimania.com/roms/Roms.rar
! mkdir /content/ROM/
! unrar e /content/Roms.rar /content/ROM/
! python -m atari_py.import_roms /content/ROM/

In [None]:
import torch.optim as optim

# Configure Settings
learn_start = 15000
# learn_start = 1
scale = 30
total_episodes = 500*scale
epsilon_stop = 200*scale
# epsilon_stop = 20
train_frequency = 4
target_frequency = 1000
# batch_size = 64
batch_size = 32
max_episode_length = 100000
render_start = 10
should_render = False

env = gym.make('Breakout-v4')
num_actions = env.action_space.n

In [None]:
solved = False

agent = DQNAgent(num_actions=num_actions, 
                 learning_rate=1e-4, 
                 history_length=4,
                 gamma=0.98)
    

episode_rewards = []
q_t_list = []
batch_losses = []
past_frames_last_time = None

replay_table = ExperienceReplayTable()
global_step_counter = 0
for i in range(total_episodes):
    frame = env.reset()
    past_frames = [copy.deepcopy(frame) \
                   for _ in range(agent.history_length-1)]
    state = agent.process_state_into_stacked_frames(
        frame, past_frames, past_state=None)
    episode_reward = 0.0
    episode_history = EpisodeHistory()
    epsilon_percentage = float(min(i/float(
      epsilon_stop), 1.0))
    for j in range(max_episode_length):
        action = agent.predict_action(state, 
                                      epsilon_percentage)
        if global_step_counter < learn_start:
            action = torch.argmax(
                torch.random.random((agent.num_actions)))

        reward = 0

        frame_prime, reward, terminal, _ = env.step(action)
        if terminal == True:
            reward -= 1

        state_prime = \
        agent.process_state_into_stacked_frames(
            frame_prime, 
            past_frames, 
            past_state=state)

        past_frames.append(frame_prime)
        past_frames = past_frames[len(past_frames) - \
                                  agent.history_length:]

        past_frames_last_time = past_frames

        if ((i > render_start) and should_render or 
            (solved and should_render)):
              env.render()
        episode_history.add_to_history(
            state, action, reward, state_prime, terminal)
        state = state_prime
        episode_reward += reward
        global_step_counter += 1

        if global_step_counter > learn_start:
            if global_step_counter % train_frequency == 0:
                q_t = agent.sample_and_train_pred(
                    replay_table, batch_size)
                q_t_list.append(q_t)

                if global_step_counter % target_frequency == 0:
                    agent.update_target_q_weights()


        if j == (max_episode_length - 1):
            terminal = True

        if terminal:
            replay_table.add_episode(episode_history)
            episode_rewards.append(episode_reward)
            break

    if i % 50 == 0:
        ave_reward = np.mean(episode_rewards[-100:])
        ep_percent = float(min(i/float(epsilon_stop), 1.0))
        print("Reward Stats (min, max, median, mean): ", 
              np.min(episode_rewards[-100:]), 
              np.max(episode_rewards[-100:]), 
              np.median(episode_rewards[-100:]), 
              np.mean(episode_rewards[-100:]))
        print("Global Stats (ep_percent, global_step_counter): ", 
              ep_percent, global_step_counter)
        if q_t_list:
          print("Qt Stats (min, max, median, mean): ", 
                np.min(q_t_list[-1000:]), 
                np.max(q_t_list[-100:]), 
                np.median(q_t_list[-100:]), 
                np.mean(q_t_list[-100:]))
        if ave_reward > 50.0:
            solved = True
            print('solved')
        else:
            solved = False