## REINFORCE + Baseline - Lunar Lander v3

In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# ============================================================
# Networks
# ============================================================
class PolicyNet(nn.Module):
    """Stochastic policy network: observation -> action probabilities.

    A one-hidden-layer MLP followed by a softmax over the last dimension,
    so every output row is a probability distribution over the actions.

    Args:
        obs_dim: Number of features in one observation. Defaults to 8, the
            size this file uses with its LunarLander-v3 environment.
        hidden_dim: Width of the hidden layer. Defaults to 256.
        n_actions: Number of discrete actions. Defaults to 4.
    """

    def __init__(self, obs_dim=8, hidden_dim=256, n_actions=4):
        super().__init__()
        # Kept as a single ``fc`` Sequential so parameter names
        # (fc.0.*, fc.2.*) stay compatible with existing checkpoints.
        self.fc = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_actions),
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return action probabilities for observation tensor ``x``.

        ``x`` has ``obs_dim`` features in its last dimension; the output has
        ``n_actions`` entries in its last dimension that sum to 1.
        """
        return self.fc(x)


class ValueNet(nn.Module):
    """State-value network: observation -> scalar value estimate.

    A one-hidden-layer MLP with a single linear output unit. It is used as
    the baseline that is subtracted from the returns in the training loop.

    Args:
        obs_dim: Number of features in one observation. Defaults to 8, the
            size this file uses with its LunarLander-v3 environment.
        hidden_dim: Width of the hidden layer. Defaults to 256.
    """

    def __init__(self, obs_dim=8, hidden_dim=256):
        super().__init__()
        # Kept as a single ``fc`` Sequential so parameter names
        # (fc.0.*, fc.2.*) stay compatible with existing checkpoints.
        self.fc = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, x):
        """Return the value estimate for ``x``; the last dimension has size 1."""
        return self.fc(x)


# ============================================================
# Returns (discounted sum of rewards)
# ============================================================
def compute_returns(rewards, gamma=0.99):
    """Compute the discounted return-to-go for every step of an episode.

    The result satisfies ``returns[t] = rewards[t] + gamma * returns[t + 1]``,
    with the return after the final step taken to be 0.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor applied per step.

    Returns:
        A list with one return per reward, in the same (chronological)
        order. An empty ``rewards`` yields an empty list.
    """
    running = 0
    returns = []
    # Accumulate from the last step backwards, appending as we go and
    # reversing once at the end: O(n) overall, whereas inserting at index 0
    # on every step shifts the whole list each time (O(n^2)).
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


# ============================================================
# Training Loop
# ============================================================
# Environment, the two networks, and ONE optimizer that updates both of them
# from the combined (policy + value) loss computed at the end of each episode.
env = gym.make("LunarLander-v3")
policy = PolicyNet()
value_net = ValueNet()

optimizer = optim.Adam(
    list(policy.parameters()) + list(value_net.parameters()),
    lr=0.0005
)

episodes = 1500
all_rewards = []  # undiscounted total reward of each episode, in order

for ep in range(episodes):
    state, _ = env.reset()
    log_probs, values, rewards = [], [], []

    # Roll out one complete episode with the current policy, recording what
    # the update below needs: log pi(a|s), V(s) and the reward of every step.
    done = False
    while not done:
        state_t = torch.tensor(state, dtype=torch.float32)

        probs = policy(state_t)
        dist = torch.distributions.Categorical(probs)
        action = dist.sample()

        value = value_net(state_t)

        next_state, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated

        log_probs.append(dist.log_prob(action))
        values.append(value)
        rewards.append(reward)

        state = next_state

    # episode reward
    total_reward = sum(rewards)
    all_rewards.append(total_reward)

    # returns + advantages
    returns = torch.tensor(compute_returns(rewards), dtype=torch.float32)
    # Each stored value has shape (1,), so cat() already yields shape (T,).
    # No squeeze(): it would collapse a one-step episode to a 0-d tensor and
    # break the shape agreement with `returns`.
    values = torch.cat(values)

    # normalize returns (helps a lot)
    # std() of a single element is NaN, which would poison the whole update;
    # use 0 in that case so the lone return is simply centred to 0.
    returns_std = returns.std() if returns.numel() > 1 else returns.new_zeros(())
    returns = (returns - returns.mean()) / (returns_std + 1e-7)

    # Baseline is detached: the policy loss must not push gradients into the
    # value network (that network is trained by value_loss only).
    advantages = returns - values.detach()

    # losses
    policy_loss = -(advantages * torch.stack(log_probs)).sum()
    value_loss = nn.functional.mse_loss(values, returns)
    loss = policy_loss + value_loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if ep % 50 == 0:
        print(f"[REINFORCE] Episode {ep}, Reward: {total_reward}")

# Release the environment's resources once training is finished.
env.close()

# ============================================================
# PLOT
# ============================================================
import matplotlib.pyplot as plt

window = 20  # moving-average width, in episodes

# Raw per-episode rewards (faint) with a moving average drawn on top.
plt.plot(all_rewards, alpha=0.4)
# With fewer episodes than the window, a "valid" convolution is meaningless
# (numpy swaps the operands), so only draw the average when it is defined.
if len(all_rewards) >= window:
    smoothed = np.convolve(all_rewards, np.ones(window) / window, mode="valid")
    # smoothed[i] averages episodes i .. i + window - 1; plot it at the last
    # episode of its window so the curve lines up with the raw rewards.
    plt.plot(np.arange(window - 1, len(all_rewards)), smoothed)
# The environment trained on above is LunarLander-v3.
plt.title("REINFORCE + Baseline on LunarLander-v3")
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.grid()
plt.show()


## DQN + Target - Lunar Lander v3

In [None]:
import gymnasium as gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque

# ============================================================
# Q-Network
# ============================================================
class QNet(nn.Module):
    """Action-value network: observation -> one Q-value per action.

    A two-hidden-layer MLP with a linear output layer (no activation on the
    output, so Q-values are unbounded).

    Args:
        obs_dim: Number of features in one observation. Defaults to 8, the
            size this file uses with its LunarLander-v3 environment.
        hidden_dim: Width of both hidden layers. Defaults to 256.
        n_actions: Number of discrete actions. Defaults to 4.
    """

    def __init__(self, obs_dim=8, hidden_dim=256, n_actions=4):
        super().__init__()
        # Kept as a single ``fc`` Sequential so parameter names
        # (fc.0.*, fc.2.*, fc.4.*) stay compatible with load_state_dict().
        self.fc = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_actions),
        )

    def forward(self, x):
        """Return Q-values for ``x``; the last dimension has ``n_actions`` entries."""
        return self.fc(x)


# ============================================================
# Hyperparameters
# ============================================================
# --- Environment ------------------------------------------------------------
env = gym.make("LunarLander-v3")

# --- Optimisation -----------------------------------------------------------
lr = 5e-4          # Adam learning rate
gamma = 0.99       # discount factor used in the TD target
batch_size = 64    # transitions sampled per gradient step
episodes = 600     # number of training episodes

# --- Replay buffer / target network -----------------------------------------
buffer_size = 100_000   # capacity of the replay deque (oldest entries drop out)
min_replay = 1000       # transitions required before gradient steps begin
target_update = 1000    # environment steps between target-network syncs

# --- Exploration (epsilon-greedy) -------------------------------------------
# Start fully random; epsilon is multiplied by eps_decay after every episode
# and never goes below eps_min.
epsilon, eps_min, eps_decay = 1.0, 0.01, 0.995

replay = deque(maxlen=buffer_size)

# Online network plus a target network that starts as an exact copy of it.
q_net = QNet()
target_net = QNet()
target_net.load_state_dict(q_net.state_dict())

# Only the online network is optimised; the target is refreshed by copying.
optimizer = optim.Adam(q_net.parameters(), lr=lr)

# Per-episode total rewards, and a global environment-step counter.
all_rewards, step_count = [], 0

# ============================================================
# Training Loop
# ============================================================
# One iteration per episode: act epsilon-greedily, store every transition in
# the replay buffer, and (once the buffer holds min_replay transitions) take
# one gradient step per environment step on a uniformly sampled mini-batch.
for ep in range(episodes):
    state, _ = env.reset()
    total_reward = 0
    done = False

    while not done:
        step_count += 1

        # epsilon-greedy
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                action = q_net(torch.tensor(state, dtype=torch.float32)).argmax().item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Store `terminated`, not `done`: only a true terminal state has zero
        # future value. A time-limit truncation merely stops the rollout, so
        # the TD target for such a transition must still bootstrap from
        # next_state.
        replay.append((state, action, reward, next_state, terminated))
        state = next_state
        total_reward += reward

        # training only when enough samples
        if len(replay) >= min_replay:
            batch = random.sample(replay, batch_size)
            states, actions, rewards, next_states, terminals = zip(*batch)

            # Stack into one ndarray first: building a tensor directly from a
            # tuple of ndarrays is very slow (PyTorch warns about it).
            states = torch.as_tensor(np.array(states), dtype=torch.float32)
            actions = torch.as_tensor(np.array(actions), dtype=torch.int64)
            rewards = torch.as_tensor(np.array(rewards), dtype=torch.float32)
            next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32)
            terminals = torch.as_tensor(np.array(terminals), dtype=torch.float32)

            # Q(s,a): pick the Q-value of the action actually taken.
            # squeeze(1) drops only the gather dimension, so the result stays
            # 1-D whatever the batch size is.
            q_values = q_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

            # target = r + γ max_a Q_target(s',a), with the bootstrap term
            # zeroed for terminal transitions
            with torch.no_grad():
                next_q = target_net(next_states).max(1)[0]
                target = rewards + gamma * next_q * (1 - terminals)

            loss = nn.functional.mse_loss(q_values, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # update target network
        if step_count % target_update == 0:
            target_net.load_state_dict(q_net.state_dict())

    all_rewards.append(total_reward)

    # epsilon decay
    epsilon = max(eps_min, epsilon * eps_decay)

    if ep % 20 == 0:
        print(f"[DQN] Episode {ep}, Reward: {total_reward}, epsilon={epsilon:.3f}")

# Release the environment's resources once training is finished.
env.close()

# ============================================================
# PLOT
# ============================================================
import matplotlib.pyplot as plt

window = 20  # moving-average width, in episodes

# Raw per-episode rewards (faint) with a moving average drawn on top.
plt.plot(all_rewards, alpha=0.4)
# With fewer episodes than the window, a "valid" convolution is meaningless
# (numpy swaps the operands), so only draw the average when it is defined.
if len(all_rewards) >= window:
    smoothed = np.convolve(all_rewards, np.ones(window) / window, mode="valid")
    # smoothed[i] averages episodes i .. i + window - 1; plot it at the last
    # episode of its window so the curve lines up with the raw rewards.
    plt.plot(np.arange(window - 1, len(all_rewards)), smoothed)
# The environment trained on above is LunarLander-v3.
plt.title("DQN on LunarLander-v3")
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.grid()
plt.show()
