In [None]:
%pip install wandb box2d

Collecting wandb
  Downloading wandb-0.12.10-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 4.3 MB/s 
[?25hCollecting box2d
  Downloading Box2D-2.3.10-cp37-cp37m-manylinux1_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 18.2 MB/s 
[?25hCollecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.26-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 36.2 MB/s 
[?25hCollecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting yaspin>=1.0.0
  Downloading yaspin-2.1.0-py3-none-any.whl (18 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.4-py2.py3-none-any.whl (143 kB)
[K     |████████████████████████████████| 143 kB 63.3 MB/s 
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)


## Install External Dependencies

In [None]:
# Mount Google Drive at /content/drive. ARTIFACT_DIRECTORY (defined in a later
# cell) is the relative path 'drive/MyDrive/...', so this presumably relies on
# the working directory being /content — TODO confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import Dependencies

In [None]:
# Data Manipulation
import numpy as np
import pandas as pd
import tensorflow as tf

# Neural Networks
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.optimizers.schedules import PiecewiseConstantDecay
from tensorflow.keras.regularizers import *

# Reinforcement Learning
import gym

# Model Tracking
import wandb

# Data Structures
from collections import deque

# Miscellaneous
import os
import pickle

In [None]:
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# Where model checkpoints, replay-buffer pickles and the progress log are written.
ARTIFACT_DIRECTORY = 'drive/MyDrive/rl/dqn6/'

# Single shared environment instance used by the agent and the training loop.
ENV = gym.make('LunarLander-v2')

# Hyperparameters forwarded to Agent(**AGENT_CONFIG) and recorded by wandb.
# Key order matters: it determines the log-file name built from this dict.
AGENT_CONFIG = dict(
    batch_size=128,        # transitions sampled from the replay buffer per learning step
    epsilon=1.0,           # initial probability of taking a random action
    min_epsilon=0.01,      # floor applied when epsilon is decayed
    epsilon_decay=0.95,    # multiplicative epsilon decay applied once per episode
    alpha=0.001,           # learning rate handed to the Adam optimizer
    gamma=0.95,            # discount applied to the next-state Q-value
    tau=0.8,               # weight of the online network when blending into the target network
    regularization=0.002,  # L2 kernel-regularization factor for every Dense layer
)

In [None]:
# Log-file name that spells out every hyperparameter as key=value, joined by '_'.
config_str = '{}.log'.format(
    '_'.join(f'{name}={value}' for name, value in AGENT_CONFIG.items())
)
config_str

'batch_size=128_epsilon=1.0_min_epsilon=0.01_epsilon_decay=0.95_alpha=0.001_gamma=0.95_tau=0.8_regularization=0.002.log'

In [None]:
# wandb.login()
# Start a Weights & Biases run and attach the agent hyperparameters to it.
wandb.init(
    config=AGENT_CONFIG,
    entity="ethanolx",
    project="lunarlander-dqn",
)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Shape of a single observation; Agent passes this to DQN as the network's input_shape.
ENV.observation_space.shape

(8,)

In [None]:
# Number of discrete actions; Agent passes this to DQN as the size of the output layer.
ENV.action_space.n

4

In [None]:
class ReplayBuffer:
    """Fixed-capacity circular store for experience tuples.

    Memories are written at ``pointer``. After ``max_length`` writes the
    pointer wraps back to slot 0, so the oldest memories are overwritten first.
    ``size`` counts how many slots currently hold a real memory.
    """

    def __init__(self, max_length: int):
        self.max_length = max_length
        self.size = 0
        self.pointer = 0
        # Pre-allocate every slot so writes are plain index assignments.
        self.buffer = [None for _ in range(max_length)]

    def __len__(self):
        """Number of memories stored so far (never more than ``max_length``)."""
        return self.size

    def __getitem__(self, index):
        """Return the raw slot at ``index`` (``None`` if it was never written)."""
        return self.buffer[index]

    def append(self, memory):
        """Store ``memory`` in the next slot, wrapping around when full."""
        if self.pointer >= self.max_length:
            # The previous write filled the last slot: restart from the front.
            self.pointer = 0
        else:
            self.size = min(self.size + 1, self.max_length)
        slot = self.pointer
        self.buffer[slot] = memory
        self.pointer = slot + 1

    def sample(self, batch_size: int):
        """Draw ``batch_size`` stored memories uniformly at random, with replacement."""
        chosen_slots = np.random.randint(0, self.size, batch_size)
        return [self.buffer[slot] for slot in chosen_slots]

In [None]:
class RewardTracker(deque):
    """Bounded history of episode returns with a rolling-mean helper.

    Built on ``deque(maxlen=max_length)``, so once ``max_length`` returns have
    been logged each new one evicts the oldest.
    """

    def __init__(self, max_length: int):
        super().__init__(maxlen=max_length)
        self.max_length = max_length

    def log(self, total_episodic_rewards):
        """Record the total reward of one finished episode."""
        self.append(total_episodic_rewards)

    def get_rolling_mean(self):
        """Return the mean of the returns currently held in the window."""
        window = min(len(self), self.max_length)
        total = sum(self)
        return total / float(window)

In [None]:
def masked_huber_loss(mask_value, clip_delta):
    """Build a Huber loss that ignores target entries equal to ``mask_value``.

    Args:
        mask_value: Entries of ``y_true`` equal to this value contribute
            nothing to the loss.
        clip_delta: Absolute-error threshold below which the loss is quadratic
            and at or above which it is linear.

    Returns:
        A function ``f(y_true, y_pred)`` named ``'masked_huber_loss'`` that
        returns the summed Huber loss over unmasked entries divided by the
        number of unmasked entries.
    """
    def f(y_true, y_pred):
        error = y_true - y_pred
        is_small_error = tf.abs(error) < clip_delta
        # 1.0 where the target should be learned, 0.0 where it is masked out.
        mask_true = tf.cast(tf.not_equal(y_true, mask_value), tf.float32)
        masked_squared_error = 0.5 * tf.square(mask_true * error)
        linear_loss = mask_true * (clip_delta * tf.abs(error) - 0.5 * (clip_delta ** 2))
        huber_loss = tf.where(is_small_error, masked_squared_error, linear_loss)
        # If every entry is masked the denominator is 0; a plain division would
        # return NaN and corrupt the weights, so yield 0 for that batch instead.
        return tf.math.divide_no_nan(tf.reduce_sum(huber_loss), tf.reduce_sum(mask_true))
    # Keras looks the loss up by this name when a saved model is reloaded.
    f.__name__ = 'masked_huber_loss'
    return f

In [None]:
def DQN(input_shape, output_shape, alpha, regularization: float = 0.001):
    """Create and compile the Q-network.

    The network is two ReLU hidden layers of 128 units followed by a linear
    output layer with one unit per action. Every Dense kernel carries its own
    L2 regularizer.

    Args:
        input_shape: Shape of one observation, given to the first layer.
        output_shape: Number of units in the output layer.
        alpha: Learning rate for the Adam optimizer.
        regularization: L2 factor applied to each layer's kernel.

    Returns:
        A compiled ``Sequential`` model using the masked Huber loss
        (mask value 0.0, clip delta 1.0).
    """
    model = Sequential()
    model.add(Dense(128,
                    input_shape=input_shape,
                    activation='relu',
                    kernel_regularizer=l2(regularization)))
    model.add(Dense(128,
                    activation='relu',
                    kernel_regularizer=l2(regularization)))
    # No activation: the outputs are raw Q-value estimates.
    model.add(Dense(output_shape, kernel_regularizer=l2(regularization)))

    loss_fn = masked_huber_loss(0.0, 1.0)
    optimizer = Adam(learning_rate=alpha)
    model.compile(loss=loss_fn, optimizer=optimizer)
    return model

In [None]:
# def fill_replay_buffer(max_length: int, episodes: int, timesteps: int):
#     replay_buffer = ReplayBuffer(max_length=max_length)
#     for episode in range(episodes):
#         state = ENV.reset()
#         while True:
#             action = ENV.action_space.sample()
#             new_state, reward, done, _ = ENV.step(action)

#             replay_buffer.append((state, action, reward, new_state, done))

#             if done:
#                 break
            
#             state = new_state

#     print(len(replay_buffer))
#     return replay_buffer

In [None]:
class FileLogger:
    """Append-only, delimiter-separated log file with a header row.

    The header is written only when the file does not exist yet, so an existing
    log (e.g. from an earlier run) is extended rather than overwritten. The
    column order of each entry follows the header stored in the file.
    """

    def __init__(self, log_file: str, *args, sep: str=';'):
        """
        Args:
            log_file: Path of the log file.
            *args: Column names, in order, for the header row.
            sep: Field separator used for the header and every entry.
        """
        self.log_file = log_file
        self.sep = sep
        self.n_args = len(args)
        if not os.path.exists(log_file):
            self.setup(args)

    def setup(self, args):
        """Create (or truncate) the log file and write the header row."""
        with open(self.log_file, mode='w') as f:
            f.write(self.sep.join(args))
            f.write('\n')

    def log(self, **kwargs):
        """Append one entry; keyword names are the lower-cased column names.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If the log file has no header row.
            KeyError: If a header column has no matching keyword argument.
        """
        assert len(kwargs) == self.n_args
        # Only the first line is needed; reading the whole file on every entry
        # would make logging cost grow with the length of the log.
        with open(self.log_file, mode='r') as f:
            header = f.readline().rstrip('\n')
        if not header:
            raise ValueError(f'Log file {self.log_file!r} has no header row')
        params = header.split(self.sep)
        entry = self.sep.join(str(kwargs[p.lower()]) for p in params)
        with open(self.log_file, mode='a+') as f:
            f.write(entry)
            f.write('\n')
        return self

    def get_logs(self):
        """Return the ``Total_Rewards`` column of the log as a pandas Series."""
        df = pd.read_csv(self.log_file, header=0, sep=self.sep)
        past_rewards = df['Total_Rewards']
        return past_rewards

In [None]:
class Agent:
    """DQN agent with an online network, a target network and a replay buffer.

    Actions are chosen epsilon-greedily from the online network. Learning
    targets pick the next action with the online network and evaluate it with
    the target network.
    """

    def __init__(self,
                 env,
                 batch_size: int,
                 alpha: float,
                 epsilon: float,
                 min_epsilon: float,
                 epsilon_decay: float,
                 gamma: float,
                 tau: float,
                 regularization: float,
                 max_memory_length: int = 250_000,
                 checkpoint_interval: int = 10,
                 log_file: str = 'progress.log'):
        """
        Args:
            env: Environment providing ``observation_space`` and ``action_space``.
            batch_size: Number of memories sampled per learning step.
            alpha: Learning rate of both networks' optimizers.
            epsilon: Initial probability of taking a random action.
            min_epsilon: Lower bound for ``epsilon`` after decay.
            epsilon_decay: Factor ``epsilon`` is multiplied by on each decay.
            gamma: Discount applied to the next-state Q-value.
            tau: Weight of the online network when updating the target network;
                values >= 1.0 copy the online weights outright.
            regularization: L2 factor for the networks' kernels.
            max_memory_length: Capacity of the replay buffer.
            checkpoint_interval: Save model and buffer every this many episodes.
            log_file: Progress-log file name inside ``ARTIFACT_DIRECTORY``.
        """
        self.env = env
        self.batch_size = batch_size
        self.alpha = alpha
        self.epsilon = epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(max_length = max_memory_length)
        self.model = DQN(input_shape=env.observation_space.shape,
                         output_shape=env.action_space.n,
                         alpha=alpha,
                         regularization=regularization)
        # NOTE(review): the target network starts from its own random
        # initialisation rather than a copy of the online weights — confirm
        # this is intended.
        self.target_model = DQN(input_shape=env.observation_space.shape,
                         output_shape=env.action_space.n,
                         alpha=alpha,
                         regularization=regularization)
        self.file_logger = FileLogger(ARTIFACT_DIRECTORY + log_file, 'Episode', 'Steps', 'Total_Rewards')
        self.checkpoint_interval = checkpoint_interval

    def checkpoint(self, episode, steps, total_rewards):
        """Log the episode and, every ``checkpoint_interval`` episodes, save state.

        The online model is saved as ``<episode>.h5`` and the replay buffer is
        pickled as ``replay_buffer-<episode>.p`` under ``ARTIFACT_DIRECTORY``.
        """
        self.file_logger.log(episode=episode, steps=steps, total_rewards=total_rewards)
        if episode % self.checkpoint_interval == 0:
            self.model.save(ARTIFACT_DIRECTORY + f'{episode}.h5')
            with open(f'{ARTIFACT_DIRECTORY}replay_buffer-{episode}.p', 'wb') as saved_buffer:
                pickle.dump(self.replay_buffer, saved_buffer)

    def act(self, state):
        """Return a random action with probability ``epsilon``, else the greedy one."""
        if np.random.uniform(0, 1) < self.epsilon:
            return self.env.action_space.sample()
        # The network expects a batch, so present the state as a batch of one.
        return np.argmax(self.model.predict(state.reshape(1, -1))[0])

    def calculate_target_values(self, memory_batch):
        """Build the training inputs and targets for a batch of memories.

        Args:
            memory_batch: Sequence of ``(state, action, reward, next_state, done)``.

        Returns:
            ``(states, targets)``. ``targets`` has one row per memory and one
            column per action; only the taken action's column is set, and the
            others stay 0.0, which is the value the masked loss ignores.
            A target that is exactly 0.0 is therefore ignored as well.
        """
        states = np.array([sample[0] for sample in memory_batch])
        next_states = np.array([sample[3] for sample in memory_batch])

        # Only next-state predictions are needed to form the targets.
        q_values_next_state = self.model.predict(next_states)
        target_q_values_next_state = self.target_model.predict(next_states)

        # Take the action count from the network output instead of assuming 4.
        n_actions = q_values_next_state.shape[1]
        targets = np.zeros((len(memory_batch), n_actions))
        for index, (_, action, reward, _, done) in enumerate(memory_batch):
            # The online network chooses the action, the target network scores it.
            best_action = np.argmax(q_values_next_state[index])
            best_action_next_state_q_value = target_q_values_next_state[index][best_action]
            if done:
                targets[index, action] = reward
            else:
                targets[index, action] = reward + self.gamma * best_action_next_state_q_value

        return states, targets

    def learn(self):
        """Run one gradient step on a sampled batch once enough memories exist."""
        if len(self.replay_buffer) >= self.batch_size:
            samples = self.replay_buffer.sample(batch_size=self.batch_size)
            states, targets = self.calculate_target_values(samples)
            self.model.fit(states, targets, epochs=1, batch_size=targets.shape[0], verbose=0)
            # self.decay_epsilon()

    def transfer_weights(self):
        """Move the target network towards the online network.

        With ``tau < 1.0`` each target weight becomes
        ``tau * online + (1 - tau) * target``. With ``tau >= 1.0`` the online
        weights are copied outright.
        """
        policy_weights = self.model.get_weights()
        if self.tau < 1.0:
            target_weights = self.target_model.get_weights()
            new_weights = [
                weights * self.tau + target_weights[i] * (1 - self.tau)
                for i, weights in enumerate(policy_weights)
            ]
        else:
            # Hard update; writing the target's own weights back would leave
            # the target network frozen forever.
            new_weights = policy_weights
        self.target_model.set_weights(new_weights)

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.replay_buffer.append((state, action, reward, next_state, done))

    def decay_epsilon(self):
        """Multiply ``epsilon`` by ``epsilon_decay``, never going below ``min_epsilon``."""
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

In [None]:
def main(warm_start: int = 0,
         episodes: int = 1000,
         max_steps_per_episode: int = 1000,
         target_model_train_interval: int = 1):
    """Train the DQN agent on ``ENV``, optionally resuming from a checkpoint.

    Args:
        warm_start: Episode number of the checkpoint to resume from; 0 trains
            from scratch. The matching ``<warm_start>.h5`` model and
            ``replay_buffer-<warm_start>.p`` files must exist in
            ``ARTIFACT_DIRECTORY``.
        episodes: Number of episodes to run in this call.
        max_steps_per_episode: Step cap for a single episode.
        target_model_train_interval: Update the target network every this
            many episodes.
    """
    reward_progress = RewardTracker(max_length=100)
    dqn_agent = Agent(env=ENV, **AGENT_CONFIG, log_file=config_str)
    if warm_start >= 1:
        print(f'Resuming from last episode: {warm_start}')
        # Replay the per-episode decay from the configured starting epsilon
        # (not from an assumed 1.0).
        dqn_agent.epsilon = max(dqn_agent.min_epsilon,
                                dqn_agent.epsilon * dqn_agent.epsilon_decay ** warm_start)
        checkpoint_path = f'{ARTIFACT_DIRECTORY}{warm_start}.h5'
        custom_objects = {'masked_huber_loss': masked_huber_loss(0.0, 1.0)}
        # Load twice so the online and target networks are separate objects.
        dqn_agent.target_model = load_model(checkpoint_path, custom_objects=custom_objects)
        dqn_agent.model = load_model(checkpoint_path, custom_objects=custom_objects)
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only resume from replay buffers this notebook wrote itself.
        with open(f'{ARTIFACT_DIRECTORY}replay_buffer-{warm_start}.p', 'rb') as saved_buffer:
            dqn_agent.replay_buffer = pickle.load(saved_buffer)
        for r in dqn_agent.file_logger.get_logs():
            reward_progress.append(r)
    for episode in range(warm_start + 1, episodes + warm_start + 1):
        cur_state = ENV.reset()
        total_episode_rewards = 0
        steps_taken = 0
        for step in range(max_steps_per_episode):
            action = dqn_agent.act(cur_state)
            new_state, reward, done, _ = ENV.step(action)
            # `step` is a zero-based index; the logged value is a step count.
            steps_taken = step + 1

            total_episode_rewards += reward
            print(f'\rEpisode: {episode}; Step: {step}; Reward: {reward}; Total Episode Rewards (thus far): {total_episode_rewards}', end='')

            dqn_agent.remember(cur_state, action, reward, new_state, done)
            dqn_agent.learn()
            cur_state = new_state
            # Abandon episodes that are already hopelessly bad.
            if done or total_episode_rewards < -300:
                break
        else:
            # Reached only when the step cap ran out without a break.
            print(f'\nFailed to complete episode {episode} within {max_steps_per_episode} steps')
        print()
        if episode % target_model_train_interval == 0:
            dqn_agent.transfer_weights()
        reward_progress.append(total_episode_rewards)
        print(f'Rolling Mean: {reward_progress.get_rolling_mean()}')
        dqn_agent.checkpoint(episode=episode, steps=steps_taken, total_rewards=total_episode_rewards)
        dqn_agent.decay_epsilon()
        if total_episode_rewards < 200:
            print("Episode failed")
        else:
            print(f"Completed in {episode} episodes")
        print(f'New Epsilon: {dqn_agent.epsilon}')
        wandb.log({
            'episode': episode,
            'steps': steps_taken,
            'total rewards': total_episode_rewards
        })

In [None]:
# Train from scratch: warm_start=0 skips the checkpoint-restore branch in main().
main(warm_start=0)

Episode: 1; Step: 103; Reward: -4.000324336028; Total Episode Rewards (thus far): -303.1313142798565
Rolling Mean: -303.1313142798565
Episode failed
New Epsilon: 0.95
Episode: 2; Step: 59; Reward: -100; Total Episode Rewards (thus far): -97.14827819355696
Rolling Mean: -200.13979623670673
Episode failed
New Epsilon: 0.9025
Episode: 3; Step: 77; Reward: -100; Total Episode Rewards (thus far): -119.1424031512761
Rolling Mean: -173.14066520822985
Episode failed
New Epsilon: 0.8573749999999999
Episode: 4; Step: 115; Reward: -100; Total Episode Rewards (thus far): -133.5400980949639
Rolling Mean: -163.24052342991337
Episode failed
New Epsilon: 0.8145062499999999
Episode: 5; Step: 135; Reward: -100; Total Episode Rewards (thus far): -82.6656735150271
Rolling Mean: -147.1255534469361
Episode failed
New Epsilon: 0.7737809374999999
Episode: 6; Step: 110; Reward: -100; Total Episode Rewards (thus far): -102.41459619907285
Rolling Mean: -139.6737272389589
Episode failed
New Epsilon: 0.73509189062

KeyboardInterrupt: ignored

In [None]:
# Finish the active Weights & Biases run so its logged data is uploaded.
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
steps,▁▁▁▁▁▁▁▁▂▂▁▁▆▄▁▂▆█▃██▅▂█████████████████
total rewards,▁▄▄▄▄▃▂▁▇▃▅▆▃▅▅▅▃█▅▅▄▁▆▅▄▄▄▅▄▄▅▄▅▄▃▄▅▄▄▄

0,1
episode,51.0
steps,999.0
total rewards,-104.60815


In [None]:
# !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
# !apt-get install cmake > /dev/null 2>&1
# !pip install --upgrade setuptools 2>&1
# !pip install ez-setup > /dev/null 2>&1
# !pip install pyvirtualdisplay
# !pip install box2d-py

In [None]:
# !pip install pyvirtualdisplay

In [None]:
# !rm -r drive/MyDrive/rl/dqn5/video
# !cd drive/MyDrive/rl;xvfb-run -a -s "-screen 0 640x480x24" python make_video.py