In [2]:
%pip install box2d wandb

Collecting box2d
  Downloading Box2D-2.3.10-cp37-cp37m-manylinux1_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 10.1 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.10-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 48.6 MB/s 
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.26-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 67.9 MB/s 
[?25hCollecting yaspin>=1.0.0
  Downloading yaspin-2.1.0-py3-none-any.whl (18 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.4-py2.py3-none-any.whl (143 kB)
[K     |████████████████████████████████| 143 kB 40.7 MB/s 
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K  

In [3]:
# Data Manipulation
import numpy as np
import pandas as pd
import tensorflow as tf

# Neural Networks
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.optimizers.schedules import PiecewiseConstantDecay
from tensorflow.keras.regularizers import *

# Reinforcement Learning
import gym

# Model Tracking
import wandb

# Data Structures
from collections import deque

# Miscellaneous
import os
import pickle

In [4]:
ARTIFACT_DIRECTORY = 'drive/MyDrive/dqn0/'
ENV = gym.make('LunarLander-v2')
AGENT_CONFIG = {
    'batch_size': 64,
    'epsilon': 1.0,
    'min_epsilon': 0.01,
    'epsilon_decay': 0.995,
    'gamma': 0.99,
    'alpha': 0.0005
}

In [5]:
config_str = '_'.join((f'{k}={v}' for k, v in AGENT_CONFIG.items())) + '.log'
config_str

'batch_size=64_epsilon=1.0_min_epsilon=0.01_epsilon_decay=0.995_gamma=0.99_alpha=0.0005.log'

In [6]:
class ReplayBuffer:
    def __init__(self, max_length: int):
        self.max_length = max_length
        self.buffer = [None] * self.max_length
        self.pointer = 0
        self.size = 0

    def append(self, memory):
        if self.pointer >= self.max_length:
            self.pointer = 0
        elif self.size < self.max_length:
            self.size += 1
        self.buffer[self.pointer] = memory
        self.pointer += 1

    def sample(self, batch_size: int):
        return [self.buffer[i] for i in np.random.randint(0, self.size, batch_size)]
    
    def __getitem__(self, index):
        return self.buffer[index]
    
    def __len__(self):
        return self.size

In [7]:
class FileLogger:
    def __init__(self, log_file: str, *args, sep: str=';'):
        self.log_file = log_file
        self.sep = sep
        self.n_args = len(args)
        if not os.path.exists(log_file):
            self.setup(args)
    
    def setup(self, args):
        header = self.sep.join(args)
        with open(self.log_file, mode='w') as f:
            f.write(header)
            f.write('\n')

    def log(self, **kwargs):
        assert len(kwargs) == self.n_args
        header = ''
        with open(self.log_file, mode='r') as f:
            header = f.readlines()[0].replace('\n', '')
        params = header.split(self.sep)
        values = (str(kwargs[p.lower()]) for p in params)
        entry = self.sep.join(values)
        with open(self.log_file, mode='a+') as f:
            f.write(entry)
            f.write('\n')
        return self
    
    def get_logs(self):
        df = pd.read_csv(self.log_file, header=0, sep=self.sep)
        past_rewards = df['Total_Rewards']
        return past_rewards

In [8]:
class RewardTracker(deque):
    def __init__(self, max_length: int):
        super().__init__(maxlen=max_length)
        self.max_length = max_length

    def log(self, total_episodic_rewards):
        self.append(total_episodic_rewards)
    
    def get_rolling_mean(self):
        base = min(self.__len__(), self.max_length)
        return sum((r for r in self)) / float(base)

In [9]:
def DQN(input_shape, output_shape, alpha):
    model = Sequential(layers=[
        Dense(256, input_shape=input_shape, activation='relu'),
        Dense(256, activation='relu'),
        Dense(output_shape)
    ])
    model.compile(loss='mse', optimizer=Adam(learning_rate=alpha))
    return model

In [10]:
class Agent:
    def __init__(self,
                 env,
                 batch_size: int,
                 alpha: float,
                 epsilon: float,
                 min_epsilon: float,
                 epsilon_decay: float,
                 gamma: float,
                 max_memory_length: int = 300_000,
                 checkpoint_interval: int = 10,
                 log_file: str = 'progress.log'):
        self.env = env
        self.batch_size = batch_size
        self.alpha = alpha
        self.epsilon = epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma
        self.replay_buffer = ReplayBuffer(max_length = max_memory_length)
        self.model = DQN(input_shape=env.observation_space.shape,
                         output_shape=env.action_space.n,
                         alpha=alpha)
        self.file_logger = FileLogger(ARTIFACT_DIRECTORY + log_file, 'Episode', 'Steps', 'Total_Rewards')
        self.checkpoint_interval = checkpoint_interval
    
    def checkpoint(self, episode, steps, total_rewards):
        self.file_logger.log(episode=episode, steps=steps, total_rewards=total_rewards)
        if episode % self.checkpoint_interval == 0:
            self.model.save(ARTIFACT_DIRECTORY + f'{episode}.h5')
            with open(f'{ARTIFACT_DIRECTORY}replay_buffer-{episode}.p', 'wb') as saved_buffer:
                pickle.dump(self.replay_buffer, saved_buffer)
    
    def act(self, state):
        if np.random.uniform(0, 1) < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state.reshape(1, -1))[0])
    
    def calculate_target_values(self, memory_batch):
        states = []
        next_states = []
        for sample in memory_batch:
            states.append(sample[0])
            next_states.append(sample[3])
        states = np.array(states)
        next_states = np.array(next_states)

        q_values = self.model.predict(states)
        # target_q_values = self.target_model.predict(states)

        q_values_next_state = self.model.predict(next_states)
        # target_q_values_next_state = self.target_model.predict(next_states)

        q_copy = q_values.copy()

        # targets = []
        for index, (s, a, r, s_, d) in enumerate(memory_batch):
            best_action = np.max(q_values_next_state[index])
            # best_action_next_state_q_value = q_[index][best_action]
            
            # target_vector = [0, 0, 0, 0]
            q_copy[index, a] = r if d else r + self.gamma * best_action
            # targets.append(target_vector)

        return states, q_copy

    def learn(self):
        if len(self.replay_buffer) >= self.batch_size:
            samples = self.replay_buffer.sample(batch_size=self.batch_size)
            states, targets = self.calculate_target_values(samples)
            self.model.fit(states, targets, epochs=1, batch_size=targets.shape[0], verbose=0)
            self.decay_epsilon()
    
    # def transfer_weights(self):
    #     policy_weights = self.model.get_weights()
    #     target_weights = self.target_model.get_weights()
    #     if self.tau < 1.0:
    #         for i, weights in enumerate(policy_weights):
    #             target_weights[i] = weights * self.tau + target_weights[i] * (1 - self.tau)
    #     self.target_model.set_weights(target_weights)
    
    def remember(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))
    
    def decay_epsilon(self):
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

In [18]:
wandb.init(project="lunarlander-dqn",
           entity="ethanolx",
           config=AGENT_CONFIG)

[34m[1mwandb[0m: Currently logged in as: [33methanolx[0m (use `wandb login --relogin` to force relogin)


In [12]:
def main(warm_start: int = 0):
    episodes = 1000
    # max_steps_per_episode = 800
    # target_model_train_interval = 1
    reward_progress = RewardTracker(max_length=100)
    dqn_agent = Agent(env=ENV, **AGENT_CONFIG, log_file=config_str)
    if warm_start >= 1:
        print(f'Resuming from last episode: {warm_start}')
        dqn_agent.epsilon = dqn_agent.min_epsilon
        # dqn_agent.target_model = load_model(f'{ARTIFACT_DIRECTORY}{warm_start}.h5')
        dqn_agent.model = load_model(f'{ARTIFACT_DIRECTORY}{warm_start}.h5')
        with open(f'{ARTIFACT_DIRECTORY}replay_buffer-{warm_start}.p', 'rb') as saved_buffer:
            dqn_agent.replay_buffer = pickle.load(saved_buffer)
        for r in dqn_agent.file_logger.get_logs():
            reward_progress.append(r)
    for episode in range(warm_start + 1, episodes + warm_start + 1):
        cur_state = ENV.reset()
        total_episode_rewards = 0
        done = False
        step = 0
        while not done:
            action = dqn_agent.act(cur_state)
            new_state, reward, done, _ = ENV.step(action)

            total_episode_rewards += reward
            print(f'\rEpisode: {episode}; Step: {step}; Reward: {reward}; Total Episode Rewards (thus far): {total_episode_rewards}', end='')

            dqn_agent.remember(cur_state, action, reward, new_state, done)
            dqn_agent.learn()
            cur_state = new_state
            if done or total_episode_rewards < -300:
                break
            step += 1

        print()
        reward_progress.append(total_episode_rewards)
        print(f'Rolling Mean: {reward_progress.get_rolling_mean()}')
        dqn_agent.checkpoint(episode=episode, steps=step, total_rewards=total_episode_rewards)
        # dqn_agent.decay_epsilon()
        if total_episode_rewards < 200:
            print("Episode failed")
        else:
            print(f"Completed in {episode} episodes")
        print(f'New Epsilon: {dqn_agent.epsilon}')
        wandb.log({
            'episode': episode,
            'steps': step,
            'total rewards': total_episode_rewards
        })

In [None]:
main(warm_start=320)

Resuming from last episode: 320
Episode: 321; Step: 267; Reward: 100; Total Episode Rewards (thus far): 253.58655207053533
Rolling Mean: 200.41095640376713
Completed in 321 episodes
New Epsilon: 0.01
Episode: 322; Step: 289; Reward: 100; Total Episode Rewards (thus far): 286.3541215788838
Rolling Mean: 201.85619206554358
Completed in 322 episodes
New Epsilon: 0.01
Episode: 323; Step: 262; Reward: 100; Total Episode Rewards (thus far): 312.2952204413689
Rolling Mean: 202.56130980592877
Completed in 323 episodes
New Epsilon: 0.01
Episode: 324; Step: 259; Reward: -0.21635450843138557; Total Episode Rewards (thus far): 146.22484068875045

In [None]:
wandb.finish()

In [15]:
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez-setup > /dev/null 2>&1
!pip install pyvirtualdisplay
!pip install box2d-py

Collecting setuptools
  Downloading setuptools-60.8.2-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 14.2 MB/s 
[?25hInstalling collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 57.4.0
    Uninstalling setuptools-57.4.0:
      Successfully uninstalled setuptools-57.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Successfully installed setuptools-60.8.2


Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-2.2-py3-none-any.whl (15 kB)
Collecting EasyProcess
  Downloading EasyProcess-1.1-py3-none-any.whl (8.7 kB)
Installing collected packages: EasyProcess, pyvirtualdisplay
Successfully installed EasyProcess-1.1 pyvirtualdisplay-2.2
Collecting box2d-py
  Downloading box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 14.6 MB/s 
[?25hInstalling collected packages: box2d-py
Successfully installed box2d-py-2.3.8


In [17]:
!rm -r drive/MyDrive/dqn0/video
!cd drive/MyDrive/dqn0;xvfb-run -a -s "-screen 0 640x480x24" python make_video.py

rm: cannot remove 'drive/MyDrive/dqn0/video': No such file or directory
2022-02-10 14:28:34.872586: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
