In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import gymnasium as gym
from collections import deque

In [2]:
class DQN(nn.Module):
    def __init__(self, n_observations, n_actions):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(n_observations, 16),
            # nn.Tanh(),
            # nn.Linear(256, 256),
            nn.Tanh(),
            nn.Linear(16, n_actions)
        )

    def forward(self,x):
        return self.model(x)

In [3]:
class Memory:
    def __init__(self, capacity):
        self.q = deque(maxlen = capacity)

    def push(self, transition):
        self.q.append(transition)

    def sample(self, batch_size = 512):
        batch = random.sample(self.q, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            torch.tensor(states, dtype = torch.float32),
            torch.tensor(actions),
            torch.tensor(rewards, dtype = torch.float32),
            torch.tensor(next_states, dtype = torch.float32),
            torch.tensor(dones, dtype = torch.float32),
        )

    def __len__(self):
        return len(self.q)

In [4]:
def train(env, memory, policyNN, targetNN, optimizer, epochs = 100, steps_per_epoch = 500, gamma = 0.95, batch_size = 1024):
    EPS_START = 1.0
    EPS_END = 0.05
    EPS_DECAY = 0.999

    epsilon = EPS_START
    all_rewards = []   
    
    targetNN.eval()
    for epoch in range(epochs):
        state, _ = env.reset()

        epoch_cum_reward = 0

        for step in range(steps_per_epoch):

            # choose action
            if random.random() < epsilon:
                action = env.action_space.sample()

            else:
                with torch.no_grad():
                    state_tensor = torch.tensor(state, dtype = torch.float32).unsqueeze(0)
                    action = policyNN(state_tensor).squeeze(0).argmax().item()

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            epoch_cum_reward += reward
            
            # save transition
            memory.push((state, action, reward, next_state, done))
            state = next_state

            #learn
            if len(memory) >= 32:

                states, actions, rewards, next_states, dones = memory.sample(batch_size)

                q_actions_were_taken = policyNN(states).gather(1, actions.unsqueeze(1)).squeeze(1)
                q_next_actions = targetNN(next_states).max(1)[0]

                q_expected = rewards + gamma * q_next_actions * (1 - dones)

                loss = nn.MSELoss()(q_actions_were_taken, q_expected)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if step % 20 == 0:
                targetNN.load_state_dict(policyNN.state_dict())

            if done:
                # print(step)
                break

        all_rewards.append(epoch_cum_reward)

        if epoch % 1 == 0:
            epsilon = max(EPS_END, epsilon * EPS_DECAY)
            
        if epoch % 100 == 0 and epoch > 0:
            print(f"Epoch {epoch}, Epsilon {epsilon}, mean reward {sum(all_rewards[-10:]) / 10.0}")


In [5]:
env = gym.make('CartPole-v1')
state, _ = env.reset()

n_observations = len(state)
n_actions = env.action_space.n

print(n_observations)
print(n_actions)

4
2


In [7]:
LR = 0.003

memory = Memory(100)

policyNN = DQN(n_observations, n_actions)
targetNN = DQN(n_observations, n_actions)
optimizer = optim.Adam(policyNN.parameters(), lr = LR)

train(env, memory, policyNN, targetNN, optimizer, epochs = 10000, steps_per_epoch = 50000, gamma = 0.99, batch_size = 16)

Epoch 100, Epsilon 0.9038873549665959, mean reward 22.6
Epoch 200, Epsilon 0.8178301806491574, mean reward 36.9
Epoch 300, Epsilon 0.7399663251239436, mean reward 38.3
Epoch 400, Epsilon 0.6695157201007336, mean reward 47.4
Epoch 500, Epsilon 0.6057725659163237, mean reward 94.3
Epoch 600, Epsilon 0.548098260578011, mean reward 88.2
Epoch 700, Epsilon 0.4959150020176678, mean reward 97.9
Epoch 800, Epsilon 0.44869999946146477, mean reward 89.2
Epoch 900, Epsilon 0.4059802359226587, mean reward 136.1
Epoch 1000, Epsilon 0.36732772934619257, mean reward 275.0
Epoch 1100, Epsilon 0.33235524492954527, mean reward 188.9
Epoch 1200, Epsilon 0.3007124156643058, mean reward 103.0
Epoch 1300, Epsilon 0.2720822322326576, mean reward 190.5
Epoch 1400, Epsilon 0.2461778670932771, mean reward 174.4
Epoch 1500, Epsilon 0.22273980093919937, mean reward 307.6
Epoch 1600, Epsilon 0.2015332227394583, mean reward 322.3
Epoch 1700, Epsilon 0.18234567731717977, mean reward 437.0


KeyboardInterrupt: 