In [1]:
from utils import Agent, run_n_steps
import gymnasium as gym

# Build the LunarLander-v3 environment, requesting the "human" render mode.
env = gym.make("LunarLander-v3", render_mode="human")

class RandLander(Agent):
    """Baseline agent: ignores the observation and picks a random action."""

    def act(self, observation, periphral=None):
        """Return one action sampled from the module-level ``env``'s action space."""
        sampled_action = env.action_space.sample()
        return sampled_action

    def think(self, observation_old, action, reward, observation, terminated):
        """Nothing is learned here; the transition is handed to the base class."""
        base_result = super().think(observation_old, action, reward, observation, terminated)
        return base_result

# Instantiate the random agent and run it with a step budget of 10. The returned
# value is kept in `cont` so a later cell can pass it back in as the continuation.
rand_agent = RandLander()

cont = run_n_steps(env, rand_agent, 10)

Resetting


In [9]:
# Run this cell a few times to get a feel for the environment: each run passes the
# previous result back in via `cont` and gives the random agent another budget of 10 steps.
cont = run_n_steps(env, rand_agent, 10, cont)

In [45]:
# Inspect the shape of one sampled observation.
env.observation_space.sample().shape

(8,)

In [54]:
# Inspect the environment's action space.
env.action_space

Discrete(4)

In [10]:
import torch
import torch.nn as nn

class LanderNetwork(nn.Module):
    """Small fully connected network with a softmax output.

    Architecture: Linear -> ReLU -> Linear -> ReLU -> Linear -> Softmax over the
    last dimension, so every output vector is non-negative and sums to 1.

    Args:
        *args: Forwarded unchanged to ``nn.Module.__init__``.
        obs_dim: Length of the input vector. Defaults to 8.
        n_actions: Number of outputs. Defaults to 4.
        hidden_dim: Width of both hidden layers. Defaults to 128.
        **kwargs: Forwarded unchanged to ``nn.Module.__init__``.
    """

    def __init__(self, *args, obs_dim: int = 8, n_actions: int = 4,
                 hidden_dim: int = 128, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Sizes are keyword-only so the original zero-argument construction
        # keeps producing exactly the same 8 -> 128 -> 128 -> 4 network.
        self.layers = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_actions),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Apply the layer stack to ``x`` (last dimension must be ``obs_dim``)."""
        return self.layers(x)

In [11]:
class SimpleNetworkAgent(Agent):
    """Agent that acts greedily on the output of a ``LanderNetwork``.

    The network is created once in ``__init__`` and never updated by this class.
    """

    def __init__(self) -> None:
        self.network = LanderNetwork()

    def act(self, observation, periphral=None):
        """Return the index of the largest network output for ``observation``.

        Args:
            observation: Array-like observation accepted by ``torch.as_tensor``.
            periphral: Unused; present to match the ``Agent.act`` signature.

        Returns:
            The argmax index as a plain Python ``int``.
        """
        # Cast to float32 so observations of another dtype (e.g. float64) do
        # not clash with the float32 weights of the linear layers.
        obs = torch.as_tensor(observation, dtype=torch.float32)
        # Inference only: skip building an autograd graph on every step.
        with torch.no_grad():
            scores = self.network(obs)
        return scores.argmax().item()

    def think(self, observation_old, action, reward, observation, terminated):
        """No learning: forward the transition to the base class."""
        return super().think(observation_old, action, reward, observation, terminated)

# Agent backed by a freshly constructed LanderNetwork.
sna = SimpleNetworkAgent()

In [42]:
# Run this cell a few times to watch the network-driven agent: each run continues
# from the previous `cont` with a step budget of 10.
cont = run_n_steps(env, sna, 10, cont)

In [43]:
class TrainableNetworkAgent(Agent):
    """Agent that updates a ``LanderNetwork`` online with a one-step TD rule.

    After every transition the prediction for the taken action is moved toward
    ``reward + gamma * max(network(observation_next))``; the bootstrap term is
    dropped when the transition ended the episode.

    NOTE(review): ``LanderNetwork`` ends in a Softmax, so its outputs lie in
    (0, 1) while the targets built here are unbounded. If the outputs are meant
    to be value estimates, a linear output head is probably needed; confirm.

    Args:
        gamma: Discount factor applied to the bootstrapped next-step value.
        lr: Learning rate passed to ``torch.optim.AdamW``.
    """

    def __init__(self, gamma = 0.9, lr=0.001) -> None:
        self.network = LanderNetwork()
        self.gamma = gamma
        self.optim = torch.optim.AdamW(self.network.parameters(), lr=lr)

    def act(self, observation, periphral=None):
        """Return the index of the largest network output for ``observation``.

        Args:
            observation: Array-like observation accepted by ``torch.as_tensor``.
            periphral: Unused; present to match the ``Agent.act`` signature.

        Returns:
            The argmax index as a plain Python ``int``.
        """
        obs = torch.as_tensor(observation, dtype=torch.float32)
        with torch.no_grad():
            values = self.network(obs)
        return values.argmax().item()

    def think(self, observation, action, reward, observation_next, terminated):
        """Take one gradient step on the squared TD error of a transition.

        Args:
            observation: Observation the action was chosen from (array-like).
            action: Index of the action that was taken.
            reward: Scalar reward received for the transition.
            observation_next: Observation that followed the action (array-like).
            terminated: Truthy when the transition ended the episode; the
                bootstrap term is then zero.
        """
        # The environment hands over NumPy arrays; the network needs tensors.
        obs = torch.as_tensor(observation, dtype=torch.float32)
        obs_next = torch.as_tensor(observation_next, dtype=torch.float32)

        # Prediction for the action actually taken (a scalar for a 1-D input).
        cr_pred = self.network(obs)[action]

        # The target is treated as a constant: no gradient flows through it.
        with torch.no_grad():
            if terminated:
                bootstrap = torch.zeros(())
            else:
                # Greedy value of the next observation, reduced to a scalar so
                # the loss below is a scalar and backward() is well-defined.
                bootstrap = self.network(obs_next).max()
            cr_esti = float(reward) + self.gamma * bootstrap

        loss = (cr_esti - cr_pred) ** 2
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()


In [44]:
# Close the environment now that the experiment cells are done.
env.close()

# Utils

In [3]:
import gymnasium as gym

# Roll out one full episode with uniformly random actions.
env = gym.make("LunarLander-v3", render_mode="human")
try:
    observation, info = env.reset()

    episode_over = False
    while not episode_over:
        # Placeholder policy: a random action that ignores observation and info.
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)

        episode_over = terminated or truncated
finally:
    # Always close the environment, even if the loop raises or the cell is
    # interrupted, so it is not left open.
    env.close()

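Define a function that takes a model (an `Agent`) and an environment, and runs the model in the environment for up to *n* steps, resetting the environment first when no continuation is given or the previous episode has ended.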

In [22]:
import gymnasium as gym
from abc import ABC, abstractmethod

class Agent(ABC):
    """Abstract interface for something that can act in an environment.

    Subclasses must implement both ``act`` and ``think``; the base
    implementations do nothing and return ``None``.
    """

    @abstractmethod
    def act(self, observation, periphral=None):
        """Choose an action for ``observation``.

        Args:
            observation: Current observation from the environment.
            periphral: Optional extra input; the parameter name is kept as-is
                because subclasses override this method with the same name.
        """
        pass

    @abstractmethod
    def think(self, observation_old, action, reward, observation, terminated=False):
        """Process one transition after the environment has been stepped.

        Args:
            observation_old: Observation the action was chosen from.
            action: Action that was taken.
            reward: Reward returned by the environment for that action.
            observation: Observation that followed the action.
            terminated: Whether the transition ended the episode. Optional so
                that both four-argument and five-argument ``super().think(...)``
                calls from subclasses are accepted.
        """
        pass

def run_n_steps(env, agent: Agent, n, continuation=None):
    """Drive ``agent`` in ``env`` for at most ``n`` steps.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        agent: Object providing ``act(observation)`` and
            ``think(observation_old, action, reward, observation)``.
        n: Maximum number of steps to take in this call.
        continuation: Tuple previously returned by this function, or ``None``.
            The environment is reset when it is ``None`` or when it records a
            terminated or truncated episode.

    Returns:
        ``(observation, reward, terminated, truncated, info)`` as of the last
        step taken (or of the reset, when no step was taken).
    """
    needs_reset = continuation is None
    if not needs_reset:
        # Resume from where the previous call stopped, unless that episode ended.
        observation, reward, terminated, truncated, info = continuation
        needs_reset = terminated or truncated

    if needs_reset:
        print("Resetting")
        observation, info = env.reset()
        reward = 0
        terminated = truncated = False

    steps_taken = 0
    while not (terminated or truncated) and steps_taken < n:
        previous = observation
        action = agent.act(previous)
        observation, reward, terminated, truncated, info = env.step(action)
        # Let the agent see the full transition right after it happened.
        agent.think(previous, action, reward, observation)
        steps_taken += 1

    return (observation, reward, terminated, truncated, info)

In [None]:
# Build the LunarLander-v3 environment, requesting the 'human' render mode.
env = gym.make("LunarLander-v3", render_mode='human')

class RandomLunar(Agent):
    """Baseline agent: ignores the observation and picks a random action."""

    def act(self, observation, periphral=None):
        """Return one action sampled from the module-level ``env``'s action space."""
        sampled_action = env.action_space.sample()
        return sampled_action

    def think(self, observation_old, action, reward, observation):
        """Nothing is learned here; the transition is handed to the base class."""
        base_result = super().think(observation_old, action, reward, observation)
        return base_result
# Instantiate the random agent and run it for at most 10 steps; the result is kept
# in `cont` so the next cell can continue from the same point.
randAgent = RandomLunar()


cont = run_n_steps(env, randAgent, 10)

In [93]:
# Continue from the previous result for at most 10 more steps; run_n_steps resets
# the environment first if that episode was terminated or truncated.
cont = run_n_steps(env, randAgent, 10, cont)

In [94]:
# Close the environment once the manual runs are finished.
env.close()