In [None]:
%pip install box2d wandb

Collecting box2d
  Downloading Box2D-2.3.10-cp37-cp37m-manylinux1_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 12.3 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.9-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 39.9 MB/s 
Collecting yaspin>=1.0.0
  Downloading yaspin-2.1.0-py3-none-any.whl (18 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.26-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 49.8 MB/s 
[?25hCollecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.4-py2.py3-none-any.whl (143 kB)
[K     |████████████████████████████████| 143 kB 50.4 MB/s 
[?25hCollecting subprocess32>=3.5.3
  Downloading subprocess32-3.5.4.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 6.4 MB/s 
[?25hCollecting configparser>=3.8.1
  Downloading configparser-5.2.0-py3-none-any.whl (19 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0

In [None]:
import numpy as np
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import *
import gym
import wandb
from collections import deque
import os

In [None]:
ARTIFACT_DIRECTORY = 'drive/MyDrive/rl/dqn/'
ENV = gym.make('LunarLander-v2')
CONFIG = {
    'batch_size': 128,
    'alpha': 0.001,
    'epsilon': 1.0,
    'min_epsilon': 0.01,
    'epsilon_decay': 0.995,
    'gamma': 0.95,
    'tau': 1.,
}

In [None]:
config_str = '_'.join((f'{k}={v}' for k, v in CONFIG.items())) + '.log'
config_str

'batch_size=128_alpha=0.001_epsilon=1.0_min_epsilon=0.01_epsilon_decay=0.995_gamma=0.95_tau=1.0.log'

In [None]:
# wandb.login()
wandb.init(project="lunarlander-dqn",
           entity="ethanolx",
           config=CONFIG)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
ENV.observation_space.shape

(8,)

In [None]:
ENV.action_space.n

4

In [None]:
class ReplayBuffer:
    def __init__(self, max_length: int):
        self.max_length = max_length
        self.buffer = [None] * self.max_length
        self.pointer = 0
        self.size = 0

    def append(self, memory):
        if self.pointer >= self.max_length:
            self.pointer = 0
        else:
            self.size += 1
        self.buffer[self.pointer] = memory
        self.pointer += 1

    def sample(self, batch_size: int):
        return [self.buffer[i] for i in np.random.randint(0, self.size, batch_size)]
    
    def __getitem__(self, index):
        return self.buffer[index]
    
    def __len__(self):
        return self.size

In [None]:
class RewardTracker(deque):
    def __init__(self, max_length: int):
        super().__init__(maxlen=max_length)
        self.max_length = max_length

    def log(self, total_episodic_rewards):
        self.append(total_episodic_rewards)
    
    def get_rolling_mean(self):
        base = min(self.__len__(), self.max_length)
        return sum((r for r in self)) / float(base)

In [None]:
def DQN(input_shape, output_shape, alpha: float):
    model = Sequential(layers=[
        Dense(128, input_shape=input_shape, activation='relu', kernel_regularizer='l1'),
        BatchNormalization(),
        Dense(128, activation='relu', kernel_regularizer='l1'),
        BatchNormalization(),
        Dense(128, activation='relu', kernel_regularizer='l1'),
        Dropout(rate=0.2),
        Dense(output_shape)
    ])
    model.compile(loss='mae', optimizer=Adam(learning_rate=alpha))
    return model

In [None]:
class FileLogger:
    def __init__(self, log_file: str, *args, sep: str=';'):
        self.log_file = log_file
        self.sep = sep
        self.n_args = len(args)
        if not os.path.exists(log_file):
            self.setup(args)
    
    def setup(self, args):
        header = self.sep.join(args)
        with open(self.log_file, mode='w') as f:
            f.write(header)
            f.write('\n')

    def log(self, **kwargs):
        assert len(kwargs) == self.n_args
        header = ''
        with open(self.log_file, mode='r') as f:
            header = f.readlines()[0].replace('\n', '')
        params = header.split(self.sep)
        values = (str(kwargs[p.lower()]) for p in params)
        entry = self.sep.join(values)
        with open(self.log_file, mode='a+') as f:
            f.write(entry)
            f.write('\n')
        return self

In [None]:
class Agent:
    def __init__(self,
                 env,
                 batch_size: int,
                 alpha: float,
                 epsilon: float,
                 min_epsilon: float,
                 epsilon_decay: float,
                 gamma: float,
                 tau: float,
                 max_memory_length: int = 100_000,
                 log_file: str = 'progress.log'):
        self.env = env
        self.batch_size = batch_size
        self.alpha = alpha
        self.epsilon = epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(max_length = max_memory_length)
        self.model = DQN(input_shape=env.observation_space.shape,
                         output_shape=env.action_space.n,
                         alpha=alpha)
        self.target_model = DQN(input_shape=env.observation_space.shape,
                         output_shape=env.action_space.n,
                         alpha=alpha)
        self.file_logger = FileLogger(ARTIFACT_DIRECTORY + log_file, 'Episode', 'Steps', 'Total_Rewards')
    
    def checkpoint(self, episode, steps, total_rewards):
        self.file_logger.log(episode=episode, steps=steps, total_rewards=total_rewards)
        self.model.save(ARTIFACT_DIRECTORY + f'{episode}.h5')
    
    def act(self, state):
        if np.random.uniform(0, 1, 1) < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state.reshape(1, -1))[0])
    
    def calculate_target_values(self, memory_batch):
        states = []
        next_states = []
        for sample in memory_batch:
            states.append(sample[0])
            next_states.append(sample[3])
        states = np.array(states)
        next_states = np.array(next_states)

        q_values = self.model.predict(states)
        target_q_values = self.target_model.predict(states)

        q_values_next_state = self.model.predict(next_states)
        target_q_values_next_state = self.target_model.predict(next_states)

        targets = []
        for index, (s, a, r, s_, d) in enumerate(memory_batch):
            best_action = np.argmax(q_values_next_state[index])
            best_action_next_state_q_value = target_q_values_next_state[index][best_action]
            
            target_vector = [0, 0, 0, 0]
            target_vector[a] = r if d else r + self.gamma * best_action_next_state_q_value
            targets.append(target_vector)

        return states, np.array(targets)

    def learn(self):
        if len(self.replay_buffer) >= self.batch_size:
            samples = self.replay_buffer.sample(batch_size=self.batch_size)
            states, targets = self.calculate_target_values(samples)
            self.model.fit(states, targets, epochs=1, verbose=0)
    
    def transfer_weights(self):
        policy_weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        if self.tau < 1.0:
            for i, weights in enumerate(policy_weights):
                target_weights[i] = weights * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)
    
    def remember(self, state, action, reward, next_state, done):
        self.replay_buffer.append((state, action, reward, next_state, done))
    
    def decay_epsilon(self):
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

In [None]:
def main(warm_start: int = 0, checkpoint_interval: int = 50):
    episodes = 1000
    max_steps_per_episode = 500
    target_model_train_interval = 10
    reward_progress = RewardTracker(max_length=100)
    dqn_agent = Agent(env=ENV, **CONFIG, log_file=config_str)
    if warm_start >= 1:
        print(f'Resuming from last episode: {warm_start}')
        dqn_agent.epsilon = max(dqn_agent.min_epsilon, dqn_agent.epsilon_decay ** (warm_start - 1))
        dqn_agent.target_model = load_model(f'{ARTIFACT_DIRECTORY}{warm_start}.h5')
        dqn_agent.model = load_model(f'{ARTIFACT_DIRECTORY}{warm_start}.h5')
    for episode in range(warm_start + 1, episodes + warm_start + 1):
        cur_state = ENV.reset()
        total_episode_rewards = 0
        for step in range(max_steps_per_episode):
            action = dqn_agent.act(cur_state)
            new_state, reward, done, _ = ENV.step(action)

            total_episode_rewards += reward
            print(f'\rEpisode: {episode}; Step: {step}; Reward: {reward}; Total Episode Rewards (thus far): {total_episode_rewards}', end='')

            dqn_agent.remember(cur_state, action, reward, new_state, done)
            dqn_agent.learn()
            cur_state = new_state
            if done:
                break
        else:
            print(f'\nFailed to complete episode {episode} within {max_steps_per_episode} steps')
        print()
        if episode % target_model_train_interval == 0:
            dqn_agent.transfer_weights()
        reward_progress.append(total_episode_rewards)
        print(f'Rolling Mean: {reward_progress.get_rolling_mean()}')
        if episode % checkpoint_interval == 0:
            dqn_agent.checkpoint(episode=episode, steps=step, total_rewards=total_episode_rewards)
        if total_episode_rewards < 200:
            print("Episode failed")
        else:
            print(f"Completed in {episode} episodes")
        dqn_agent.decay_epsilon()
        print(f'New Epsilon: {dqn_agent.epsilon}')
        wandb.log({
            'episode': episode,
            'steps': step,
            'total rewards': total_episode_rewards
        })

In [None]:
main(warm_start=100)

Resuming from last episode: 100
Episode: 101; Step: 140; Reward: -100; Total Episode Rewards (thus far): -355.6257529792642
Rolling Mean: -355.6257529792642
Episode failed
New Epsilon: 0.6057704364907279
Episode: 102; Step: 143; Reward: -100; Total Episode Rewards (thus far): -200.65385316922993
Rolling Mean: -278.1398030742471
Episode failed
New Epsilon: 0.6027415843082743
Episode: 103; Step: 131; Reward: -100; Total Episode Rewards (thus far): -186.56575113312152
Rolling Mean: -247.61511909387187
Episode failed
New Epsilon: 0.599727876386733
Episode: 104; Step: 157; Reward: -100; Total Episode Rewards (thus far): -101.3384995341242
Rolling Mean: -211.04596420393494
Episode failed
New Epsilon: 0.5967292370047993
Episode: 105; Step: 143; Reward: -100; Total Episode Rewards (thus far): -178.05746983908978
Rolling Mean: -204.4482653309659
Episode failed
New Epsilon: 0.5937455908197753
Episode: 106; Step: 91; Reward: -100; Total Episode Rewards (thus far): -244.96103627052645
Rolling Mean

KeyboardInterrupt: ignored

In [None]:
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
steps,▃▃▂▂▄▃▃▂▂▃▃▂▅▃▁▄▆▅▄▃▄▂▁▄▅█▃▅▃▄▅▃▅▃▄▄▄▄▃▅
total rewards,▁▆▃▆█▁█▅▆▇▅▅▃▄▆▆▇▆▆▄▅▆▄▆██▆▆▁▁▃▄▃▁▃▂▃▄▂▆

0,1
episode,222.0
steps,235.0
total rewards,-91.38853


In [1]:
import glob
from gym.wrappers.monitoring import video_recorder
from IPython.display import HTML
from IPython import display 
import io
import base64
import gym

def show_video(env_name):
    mp4list = glob.glob('video/*.mp4')
    if len(mp4list) > 0:
        mp4 = 'video/{}.mp4'.format(env_name)
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        display.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")

In [6]:
def show_video_of_model(env_name):
    env = gym.make(env_name)
    vid = video_recorder.VideoRecorder(env, path="video/{}.mp4".format(env_name))
    # agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))
    state = env.reset()
    done = False
    for i in range(1000):
        print(i)
        frame = env.render(mode='rgb_array')
        vid.capture_frame()
        
        # action = agent.act(state)
        action = env.action_space.sample()

        state, reward, done, _ = env.step(action)
        if done:
            break
    env.close()

In [10]:
show_video_of_model('LunarLander-v2')

AttributeError: module 'gym.envs.box2d' has no attribute 'LunarLander'