In [3]:
%matplotlib tk
# %matplotlib qt
import matplotlib.pyplot as plt
plt.xkcd()

<contextlib.ExitStack at 0x790b0e59fb90>

# Q-Learning Agent

setup
-----

In [4]:
import gymnasium as gym
env = gym.make("LunarLander-v2", render_mode='human')
cont = None

In [5]:
env.observation_space.sample().shape

(8,)

In [6]:
env.action_space

Discrete(4)

In [7]:
import torch
import torch.nn as nn

class LanderNetwork(nn.Module):
    """Small fully connected network mapping an observation vector to one score per action.

    The defaults (8 inputs, 4 outputs) match the shapes printed above for the
    LunarLander environment: observations of shape ``(8,)`` and ``Discrete(4)``
    actions.

    Args:
        obs_dim: Size of the observation vector fed to the first layer.
        hidden_dim: Width of both hidden layers.
        n_actions: Number of outputs of the final layer.
        *args, **kwargs: Forwarded unchanged to ``nn.Module.__init__``.
    """
    def __init__(self, *args, obs_dim=8, hidden_dim=128, n_actions=4, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Kept as `self.layers` with a final nn.Linear: TrainableNetworkAgent.act
        # reads `self.network.layers[-1].out_features` to size its random action draw.
        self.layers = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_actions),
        )
    def forward(self, x):
        """Return the network output for `x` (last dimension must be `obs_dim`)."""
        return self.layers(x)

Simple Network Agent
--------------------
This is just a simple agent that uses the network but does not train it.


In [8]:
from utils import Agent
class SimpleNetworkAgent(Agent):
    """Agent that acts greedily on a freshly initialised LanderNetwork and never trains it."""
    def __init__(self) -> None:
        super().__init__()
        self.network = LanderNetwork()
    def act(self, observation, periphral=None):
        """Return the index of the largest network output for `observation`.

        `observation` is converted with `torch.from_numpy`, so it must be a
        NumPy array. `periphral` is accepted for interface compatibility and
        is not used.
        """
        obs = torch.from_numpy(observation)
        # Inference only: skip building an autograd graph, as TrainableNetworkAgent.act does.
        with torch.no_grad():
            scores = self.network(obs)
        return scores.argmax().item()
    def think(self, observation_old, action, reward, observation, terminated):
        """Delegate to the base class; this agent does no learning of its own."""
        return super().think(observation_old, action, reward, observation, terminated)

sna = SimpleNetworkAgent()
cont = None
runs = [[]]

In [None]:
from utils import run_upto_n_steps, plot_reward_and_episodes
from tqdm import trange

# Advance the untrained agent in repeated calls (step budget 150 each) and
# redraw the reward plot on every other call.
# The loop variable is not called `round` so the builtin stays usable in later cells.
for chunk_idx in trange(20):
    cont, runs = run_upto_n_steps(env, sna, 150, cont, runs)
    if chunk_idx % 2 == 0:
        plt.clf()
        plot_reward_and_episodes(runs)
        plt.pause(0.2)

  0%|          | 0/20 [00:00<?, ?it/s]

In [11]:
env.close()
# plt.close()

In [13]:
import gymnasium as gym
env = gym.make("LunarLander-v2")
cont = None

In [14]:
from utils import run_upto_n_steps, plot_reward_and_episodes
from tqdm import trange

# Same rollout as before on the non-rendering environment, for more iterations.
# The loop variable is not called `round` so the builtin stays usable in later cells.
for chunk_idx in trange(80):
    cont, runs = run_upto_n_steps(env, sna, 150, cont, runs)
    if chunk_idx % 2 == 0:
        plt.clf()
        plot_reward_and_episodes(runs)
        plt.pause(0.2)

  0%|          | 0/80 [00:00<?, ?it/s]findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Fo

In [15]:
plt.close()

Trainable Network Agent
-----------------------
This agent will train the network by optimizing on the most recent few 
(Observation, Action, Reward, Next Observation) quadruples.


### Basic

In [17]:
import gymnasium as gym
env = gym.make("LunarLander-v2")
cont = None

In [18]:
import numpy as np
from utils import Agent
class TrainableNetworkAgent(Agent):
    """Epsilon-greedy agent that trains a LanderNetwork with one-step TD targets.

    Squared TD errors are accumulated in `self.loss` on every `think` call and
    a single optimizer step is taken every `update_interval` calls.

    Args:
        gamma: Discount factor applied to the next state's best value.
        lr: Learning rate passed to AdamW.
        update_interval: Number of `think` calls between optimizer steps.
        epsilon: Probability of taking a random action while the network is
            in training mode.
    """
    def __init__(self, gamma = 0.9, lr=0.001, update_interval=32, epsilon=0.1) -> None:
        super().__init__()
        self.network = LanderNetwork()
        self.gamma = gamma
        self.update_interval = update_interval
        self.epsilon = epsilon
        self.optim = torch.optim.AdamW(self.network.parameters(), lr)
        # Running sum of per-step losses since the last optimizer step.
        self.loss = torch.tensor(0., requires_grad=True)
        self.thoughts = 0  # number of training-mode think() calls so far
        self.losses = []   # per-step loss values, one per training-mode think()
        self.events = [] # List of dictionary of events
    def act(self, observation, periphral=None):
        """Pick an action: random with probability `epsilon` in training mode, else greedy.

        `periphral` is accepted for interface compatibility and is not used.
        """
        if self.network.training and torch.rand((1,)).item() < self.epsilon:
            # Uniform draw over the outputs of the network's last layer.
            return torch.randint(self.network.layers[-1].out_features, (1,)).item()
        obs = torch.from_numpy(observation)
        with torch.no_grad():
            action = self.network(obs)
        return action.argmax().item()
    def think(self, observation, action, reward, observation_next, terminated):
        """Record one transition's squared TD error; step the optimizer when due.

        Does nothing while the network is in eval mode.
        """
        if not self.network.training:
            return 
        self.thoughts += 1
        if isinstance(observation, np.ndarray):
            observation = torch.from_numpy(observation)
        if isinstance(observation_next, np.ndarray):
            observation_next = torch.from_numpy(observation_next)
        outputs = self.network(observation)
        cr_pred = outputs[action]
        # TD target: reward + gamma * max_a Q(s', a). The bootstrap term must be
        # the maximal *value* (not the argmax index) and is treated as a
        # constant, so it is computed without gradient tracking.
        if terminated:
            cr_esti = reward + 0
        else:
            with torch.no_grad():
                best_next_value = self.network(observation_next).max()
            cr_esti = reward + self.gamma*best_next_value

        loss = (cr_esti - cr_pred)**2
        self.loss = self.loss + loss
        self.losses.append(loss.item())
        self.events.append({
            "terminal": terminated,
            "reward": reward
        })
        if self.thoughts % self.update_interval == 0:
            self.update()
        
    def update(self):
        """Back-propagate the accumulated loss, take one optimizer step, and reset the sum."""
        self.optim.zero_grad()
        self.loss.backward()
        self.optim.step()
        self.loss = torch.tensor(0., requires_grad=True)

In [19]:
tna = TrainableNetworkAgent(gamma=0.99, lr=0.01)
cont = None
runs = [[]]

In [20]:
from utils import run_upto_n_steps, plot_reward_and_episodes
from tqdm import trange
from itertools import count
# Training mode enables exploration in act() and learning in think().
tna.network.train()
training_steps  = 80_000
progbar = trange(training_steps)

# Offset so the bar counts only the thoughts made during this run of the cell.
initial_steps = tna.thoughts
breakout = False
try:
    # Not named `round`, so the builtin stays usable in later cells.
    for round_idx in count():
        cont, runs = run_upto_n_steps(env, tna, 150, cont, runs)
        progbar.set_description(f"thoughts {tna.thoughts}")
        progbar.update(tna.thoughts-initial_steps-progbar.n)
        # progbar.n is already relative to initial_steps; subtracting it again
        # would overshoot the budget whenever the agent had prior thoughts.
        if progbar.n >= training_steps:
            breakout = True
        if round_idx % 20 == 0 or breakout:
            plt.clf()
            plt.subplot(2,1,1)
            plt.scatter(*zip(*enumerate(tna.losses)), c=[i['terminal'] for i in tna.events], s=10, alpha=0.8)
            plt.subplot(2, 1, 2)
            plot_reward_and_episodes(runs)
            plt.pause(0.2)
        if round_idx % 300 == 0 or breakout:
            # Periodically watch the current greedy policy in a rendered environment.
            tna.network.eval()
            enviz = gym.make("LunarLander-v2", render_mode='human')
            try:
                run_upto_n_steps(enviz, tna, 1_000)
            finally:
                # Close the window and restore training mode even if the rollout is interrupted.
                enviz.close()
                tna.network.train()
        if breakout:
            break
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop training early.
    pass
finally:
    progbar.close()

thoughts 109:   0%|          | 0/80000 [00:00<?, ?it/s]findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not fo

In [21]:
# Watch the trained agent act greedily (eval mode disables exploration and learning).
enviz = gym.make("LunarLander-v2", render_mode='human')
tna.network.eval()
try:
    for rollout_idx in trange(2):
        run_upto_n_steps(enviz, tna, 1_000)
finally:
    # Always release the render window, even if a rollout raises.
    enviz.close()


100%|██████████| 2/2 [00:11<00:00,  5.52s/it]


In [22]:
# plt.close()
plt.suptitle("Simple DQN")

Text(0.5, 0.98, 'Simple DQN')

### Multiple Parallel environments

This paper [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/abs/1602.01783) 
demonstrated a strategy of training the same agent in multiple different copies 
of the environment at the same time as a method to keep diversity in training 
data.

This section is my attempt to recreate that approach.


steps:
- ~~Separate runs from agent, make run an array returned alongside cont~~
- ~~Verify nothing broke~~
- Implement env&run swap in training loop
- ...?
- Profit?

In [23]:
from utils import TrainableNetworkAgent
import gymnasium as gym
import numpy as np

tna = TrainableNetworkAgent(gamma=0.99, lr=0.01)
envs = [gym.make("LunarLander-v2") for _ in range(5)]
num_envs = len(envs)
# One continuation slot and one independent run history per environment.
cont = [None]*num_envs
# Built with a comprehension: `[ [[]] ]*num_envs` would repeat the *same*
# nested list, so every environment would append into one shared history.
runs = [[[]] for _ in range(num_envs)]
# Alternative: a different exploration rate per environment.
# epsilons = np.linspace(0.9, 0.1, num_envs)
epsilons = [0.1]*num_envs

In [None]:
from utils import run_upto_n_steps, plot_reward_and_episodes
from tqdm import trange
from itertools import count

tna.network.train()
training_steps  = 80_000
progbar = trange(training_steps)

# Offset so the bar counts only the thoughts made during this run of the cell.
initial_steps = tna.thoughts
breakout = False
try:
    # Not named `round`, so the builtin stays usable in later cells.
    for round_idx in count():
        # Visit the environments round-robin. Uses `env_idx` rather than
        # overwriting the notebook-level `env` with an integer.
        env_idx = round_idx % num_envs
        tna.epsilon = epsilons[env_idx]
        cont[env_idx], runs[env_idx] = run_upto_n_steps(
            envs[env_idx], tna, 40, cont[env_idx], runs[env_idx]
        )
        progbar.set_description(f"thoughts {tna.thoughts}")
        progbar.update(tna.thoughts-initial_steps-progbar.n)
        # progbar.n is already relative to initial_steps; subtracting it again
        # would overshoot the budget whenever the agent had prior thoughts.
        if progbar.n >= training_steps:
            breakout = True
        if round_idx % 20 == 0:
            plt.clf()
            plt.subplot(2,1,1)
            plt.scatter(*zip(*enumerate(tna.losses)), c=[i['terminal'] for i in tna.events], s=10, alpha=0.8)
            plt.subplot(2, 1, 2)
            # Only the last environment's history is plotted.
            plot_reward_and_episodes(runs[-1])
            plt.pause(0.2)
        if round_idx % 500 == 0:
            # Periodically watch the current greedy policy in a rendered environment.
            tna.network.eval()
            enviz = gym.make("LunarLander-v2", render_mode='human')
            try:
                run_upto_n_steps(enviz, tna, 1_000)
            finally:
                # Close the window and restore training mode even if the rollout is interrupted.
                enviz.close()
                tna.network.train()
        if breakout:
            break
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop training early.
    pass
finally:
    # The original `progbar.close` lacked parentheses and never closed the bar.
    progbar.close()

thoughts 40:   0%|          | 0/80000 [00:00<?, ?it/s]findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not found.
findfont: Font family 'xkcd Script' not found.
findfont: Font family 'Comic Neue' not found.
findfont: Font family 'Comic Sans MS' not found.
findfont: Font family 'xkcd' not fou

In [6]:
# Watch the trained agent act greedily (eval mode disables exploration and learning).
enviz = gym.make("LunarLander-v2", render_mode='human')
tna.network.eval()
try:
    for rollout_idx in trange(5):
        run_upto_n_steps(enviz, tna, 1_000)
finally:
    # Always release the render window, even if a rollout raises.
    enviz.close()


100%|██████████| 5/5 [00:21<00:00,  4.39s/it]                         


In [22]:
plt.close()

# Random Agent

In [1]:
%matplotlib tk
# %matplotlib qt

In [2]:
from utils import Agent, run_upto_n_steps
import numpy as np
class RandLander(Agent):
    """Baseline agent: ignores what it observes and never learns."""
    def act(self, observation, periphral=None):
        """Return one of the action ids 0-3, drawn with `np.random.choice`."""
        action_ids = [0,1,2,3]
        return np.random.choice(action_ids)
    def think(self, observation_old, action, reward, observation, terminated):
        """Nothing to learn from; the transition is discarded."""
        return None

rand_agent = RandLander()



In [3]:
from utils import Agent, run_upto_n_steps
import gymnasium as gym

env = gym.make("LunarLander-v2", render_mode="human")
cont = None

In [4]:
import logging
# logging.basicConfig(level=logging.INFO) # If you're interested in some logs

In [5]:
from utils import run_and_plot

# Run the random agent in 100 consecutive calls (step argument 50), carrying
# the continuation between calls.
# The loop variable is not called `round` so the builtin stays usable in later cells.
for chunk_idx in range(100):
    cont = run_and_plot(env, rand_agent, 50, cont)

In [6]:
env.close()

In [7]:
import gymnasium as gym
env = gym.make("LunarLander-v2")
cont = None

In [10]:
from utils import run_and_plot

# Same as the rendered run, on the non-rendering environment with a larger step argument.
# The loop variable is not called `round` so the builtin stays usable in later cells.
for chunk_idx in range(100):
    cont = run_and_plot(env, rand_agent, 150, cont)

# Utils

In [3]:
import gymnasium as gym

# Play one episode with uniformly sampled actions, rendering to a window.
env = gym.make("LunarLander-v2", render_mode="human")
try:
    observation, info = env.reset()

    episode_over = False
    while not episode_over:
        action = env.action_space.sample()  # agent policy that uses the observation and info
        observation, reward, terminated, truncated, info = env.step(action)

        # Stop on either a terminal state or a truncation.
        episode_over = terminated or truncated
finally:
    # Release the render window even if stepping the environment raises.
    env.close()

Define a function that takes a model and an environment, and runs the model in the environment for *n* steps.

In [22]:
import gymnasium as gym
from abc import ABC, abstractmethod

class Agent(ABC):
    """Abstract interface for anything `run_n_steps` can drive through an environment.

    Subclasses must override both methods; the base implementations do
    nothing and return None.
    """

    @abstractmethod
    def act(self, observation, periphral=None):
        """Choose the action to take for `observation`.

        `periphral` is an optional extra argument; `run_n_steps` never passes it.
        """
        return None

    @abstractmethod
    def think(self, observation_old, action, reward, observation):
        """Receive the outcome of one step: the observation acted on, the action,
        the reward obtained and the resulting observation."""
        return None

def run_n_steps(env, agent: Agent, n, continuation=None):
    """Drive `agent` through `env` for at most `n` steps and return where it stopped.

    Args:
        env: Object with `reset() -> (observation, info)` and
            `step(action) -> (observation, reward, terminated, truncated, info)`.
        agent: Supplies `act(observation)` and is told about every step via
            `think(observation_old, action, reward, observation)`.
        n: Maximum number of environment steps to take in this call.
        continuation: Tuple returned by a previous call, or None. The
            environment is reset when it is None or marks a finished episode.

    Returns:
        `(observation, reward, terminated, truncated, info)` after the last
        step taken; pass it back as `continuation` to resume.
    """
    must_reset = continuation is None
    if not must_reset:
        observation, reward, terminated, truncated, info = continuation
        must_reset = terminated or truncated
    if must_reset:
        print("Resetting")
        observation, info = env.reset()
        reward, terminated, truncated = 0, False, False
    taken = 0
    # Stop at the step budget or as soon as the episode ends, whichever is first.
    while not (terminated or truncated) and taken < n:
        previous = observation
        action = agent.act(previous)
        observation, reward, terminated, truncated, info = env.step(action)
        agent.think(previous, action, reward, observation)
        taken += 1
    return (observation, reward, terminated, truncated, info)

In [None]:
env = gym.make("LunarLander-v2", render_mode='human')

class RandomLunar(Agent):
    """Agent that samples its actions from the notebook-level `env`'s action space."""
    def act(self, observation, periphral=None):
        """Ignore the observation and return `env.action_space.sample()`."""
        sampled_action = env.action_space.sample()
        return sampled_action
    def think(self, observation_old, action, reward, observation):
        """Forward the transition to the base class; nothing is learned here."""
        outcome = super().think(observation_old, action, reward, observation)
        return outcome
randAgent = RandomLunar()


cont = run_n_steps(env, randAgent, 10)

In [93]:
cont = run_n_steps(env, randAgent, 10, cont)

In [94]:
env.close()

## Add a Buffer, so we can learn from past experiences!

In [1]:
class Buffer:
    """Fixed-capacity ring buffer of (state, action, reward, terminal) rows.

    Rows are written in order and wrap around once `capacity` is reached, so
    the successor of the row at index ``i`` lives at ``(i + 1) % capacity``.

    Args:
        state_dim: Length of each stored state vector.
        action_dim: Length of each stored action vector.
        capacity: Maximum number of rows kept before old ones are overwritten.
    """
    def __init__(self, state_dim, action_dim, capacity=1000):
        self.capacity = capacity
        self.states = np.zeros((capacity, state_dim), dtype=np.float32)
        self.actions = np.zeros((capacity, action_dim), dtype=np.float32)
        self.rewards = np.zeros((capacity, 1), dtype=np.float32)
        self.terminals = np.zeros((capacity, 1), dtype=np.float32)
        self.index = 0  # slot the next add() writes to
        self.size = 0   # number of rows written so far, capped at capacity
    def add(self, state, action, reward, terminal):
        """Write one row at the current slot and advance, wrapping at `capacity`."""
        self.states[self.index] = state
        self.actions[self.index] = action
        self.rewards[self.index] = reward
        self.terminals[self.index] = terminal
        self.index = (self.index + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)
    def sample(self, batch_size):
        """return: (state, next_state, action, reward, state is terminal)

        Draws `batch_size` distinct rows uniformly, without replacement. The
        most recently written row is never drawn because its successor has not
        been stored yet: the following slot is either still unwritten or, once
        the buffer has wrapped, holds the oldest row.

        Raises:
            ValueError: if fewer than `batch_size` rows with a stored
                successor are available.
        """
        newest = (self.index - 1) % self.capacity
        candidates = np.arange(self.size)
        candidates = candidates[candidates != newest]
        indices = np.random.choice(candidates, batch_size, replace=False)
        # Wrap so the last slot's successor is slot 0 instead of running off the arrays.
        successors = (indices + 1) % self.capacity
        return self.states[indices], self.states[successors], self.actions[indices], self.rewards[indices], self.terminals[indices]

In [2]:
class BuffTNA(TrainableNetworkAgent):
    """TrainableNetworkAgent variant that learns from mini-batches drawn from a replay Buffer.

    Args:
        batch_size: Number of transitions sampled per optimizer step.
        n_updates: Stored on the instance; not used by this class.
        buffer_capacity: Capacity handed to the Buffer.
        *args, **kwargs: Forwarded to TrainableNetworkAgent.
    """
    def __init__(self, *args, batch_size=32, n_updates=4, buffer_capacity=50_000, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.batch_size = batch_size
        self.n_updates = n_updates
        self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # 8 state values and a single action id per row.
        self.buffer = Buffer(8, 1, buffer_capacity)
        
    def think(self, observation, action, reward, observation_next, terminated):
        """Store the transition and, every `update_interval` thoughts, train on one sampled batch.

        The transition is stored even in eval mode; counting, event logging
        and training only happen while the network is in training mode.
        """
        self.buffer.add(observation, action, reward, terminated)
        if not self.network.training:
            return 
        self.thoughts += 1
        self.events.append({
            "terminal": terminated,
            "reward": reward
        })
        if self.thoughts % self.update_interval != 0 or self.buffer.size <= self.batch_size:
            return
        observations, observations_next, actions, rewards, terminals = self.buffer.sample(self.batch_size)
        # NOTE(review): once the network lives on a CUDA device, act() must also
        # feed it tensors on that device; that is not handled in this class.
        self.network.to(self.device)
        observations_t = torch.from_numpy(observations).to(self.device)
        observations_next_t = torch.from_numpy(observations_next).to(self.device)
        # Buffer stores actions as float32; gather() needs an int64 index.
        actions_t = torch.from_numpy(actions).long().to(self.device)
        rewards_t = torch.from_numpy(rewards).to(self.device)
        terminals_t = torch.from_numpy(terminals).to(self.device)
        # Q(s, a) for the action actually taken in each sampled row: shape (batch, 1).
        q_pred = self.network(observations_t).gather(1, actions_t)
        # TD target per sampled row, from that row's own reward and terminal
        # flag (not the current step's scalars), using the per-row maximum
        # value of the next state. Computed without gradients so that only
        # q_pred is trained.
        with torch.no_grad():
            q_next = self.network(observations_next_t).max(dim=1, keepdim=True).values
        q_estim = rewards_t + self.gamma*(1.0 - terminals_t)*q_next
        loss = torch.nn.functional.mse_loss(q_pred, q_estim)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        self.losses.append(loss.item())
            

NameError: name 'TrainableNetworkAgent' is not defined

In [None]:
run_n_steps(env, BuffTNA(), 10)