In [86]:
import torch
import torch.nn as nn
import torch.optim as optim

import random
import numpy as np
from collections import deque

import gymnasium as gym
# Module-level Breakout environment; later cells read its action/observation
# spaces and call reset()/step() on it.
env = gym.make("ALE/Breakout-v5")

In [91]:
class ReplayMemory():
    """Bounded FIFO store of (state, action, next_state, reward) transitions."""

    def __init__(self, max_samples):
        # A deque with `maxlen` drops its oldest entry once the capacity is hit.
        self.memory = deque(maxlen=max_samples)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.memory)

    def push(self, state, action, next_state, reward):
        """Append one transition, stored as a 4-tuple, at the newest end."""
        transition = (state, action, next_state, reward)
        self.memory.append(transition)

    def sample(self, sample_size):
        """Return `sample_size` stored transitions drawn without replacement."""
        return random.sample(self.memory, k=sample_size)


In [14]:
class DQN(nn.Module):
    """Convolutional Q-network: image batch in, one value per action out.

    Args:
        width: Width in pixels of the input frames.
        height: Height in pixels of the input frames.
        channels: Number of channels of the input frames.
        output_size: Number of output units (one per action).

    The network expects input shaped (batch, channels, height, width).
    """

    def __init__(self, width, height, channels, output_size):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels=channels, out_channels=12, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(in_channels=12, out_channels=24, kernel_size=5, padding=2)
        self.pool2 = nn.MaxPool2d(2, 2)

        # Both convolutions keep H and W (padding == kernel_size // 2) and each
        # 2x2 pool floors the halved size, so the spatial size after two pools
        # is the floor of dim / 4. Float division (e.g. 210 / 4 = 52.5) would
        # over-count the features whenever a dimension is not a multiple of 4.
        pooled_width = int(width) // 4
        pooled_height = int(height) // 4

        self.hidden1 = nn.Linear(24 * pooled_width * pooled_height, 128)
        self.relu1 = nn.ReLU()
        # in_features must match hidden1's 128 outputs.
        self.hidden2 = nn.Linear(128, 512)
        self.relu2 = nn.ReLU()
        self.out = nn.Linear(512, output_size)

    def forward(self, x):
        """Return a (batch, output_size) tensor for a (batch, C, H, W) input."""
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = self.pool2(x)

        # Collapse (C, H, W) into one feature axis; nn.Linear acts on the last
        # dimension only, so the conv output cannot be fed to it directly.
        x = torch.flatten(x, start_dim=1)

        x = self.hidden1(x)
        x = self.relu1(x)
        x = self.hidden2(x)
        x = self.relu2(x)
        x = self.out(x)

        return x

In [80]:
# Frame shape is unpacked as (height, width, channels); the action count comes
# from the environment's action space.
height, width, channels = env.observation_space.shape
action_size = env.action_space.n

In [16]:
# --- configuration ---
# Sizes read from the environment.
height, width, channels = env.observation_space.shape
action_size = env.action_space.n

# Optimisation settings.
learning_rate = 0.005      # step size handed to the Adam optimizer
batch_size = 150           # transitions drawn per optimize() call
replay_memory_size = 5000  # capacity of the replay buffer
tau = 0.01                 # blend factor of the soft target-network update

# Run settings.
episodes = 600
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [121]:
# Target and online Q-networks. Both are moved to `device` because the training
# loop creates its state tensors on `device`; leaving the modules on the CPU
# would raise a device-mismatch error whenever CUDA is available.
target_policy = DQN(width, height, channels, action_size).to(device)
policy_policy = DQN(width, height, channels, action_size).to(device)
# Start the target network as an exact copy of the online network.
target_policy.load_state_dict(policy_policy.state_dict())

memory = ReplayMemory(replay_memory_size)

In [18]:
# Train the online network (`policy_policy`): it is the one that picks actions,
# and the target network only ever receives a tau-weighted blend of its weights.
# Optimising `target_policy` here would leave the online network untrained.
policy_optimizer = optim.Adam(policy_policy.parameters(), lr=learning_rate, amsgrad=True)
bellmann_error = nn.HuberLoss()

In [122]:
def choose_action(state, steps):
    """Pick an action epsilon-greedily.

    The exploration threshold decays exponentially from ``eps_max`` towards
    ``eps_min`` as ``steps`` grows, with ``eps_step`` as the decay scale.

    Args:
        state: Network input for a single observation (batch size 1, because
            the greedy branch calls ``.item()`` on the arg-max).
        steps: Number of steps taken so far; larger values mean less exploration.

    Returns:
        int: The chosen action index. Both branches return a plain Python int
        so callers get one consistent type to pass to ``env.step`` and to store.
    """
    eps_max = 0.95
    eps_min = 0.05
    eps_step = 1000

    threshold = eps_min + (eps_max - eps_min) * np.exp(-1 * (steps / eps_step))

    if np.random.rand() > threshold:
        # Exploit: arg-max over the online network's outputs; no gradient needed.
        with torch.no_grad():
            return policy_policy(state).max(1).indices.item()

    # Explore: uniformly random action from the environment's action space.
    return int(env.action_space.sample())


In [157]:
def optimize(gamma=0.99):
    """Run one gradient step on the online network from a replay-memory batch.

    Reads the module-level ``memory``, ``batch_size``, ``device``,
    ``policy_policy``, ``target_policy``, ``policy_optimizer`` and
    ``bellmann_error``.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.

    Returns:
        float | None: The batch loss, or ``None`` when the memory holds fewer
        than ``batch_size`` transitions and no update was performed.
    """
    if len(memory) < batch_size:
        return None

    transitions = memory.sample(batch_size)
    states, actions, next_states, rewards = zip(*transitions)

    # torch.cat accepts no `device` keyword: concatenate first, then move/cast.
    state_batch = torch.cat(states).to(device=device, dtype=torch.float32)
    # int()/float() also accept 0-d tensors and numpy scalars.
    action_batch = torch.tensor([int(a) for a in actions], dtype=torch.int64, device=device).unsqueeze(1)
    reward_batch = torch.tensor([float(r) for r in rewards], dtype=torch.float32, device=device)

    # Terminal transitions are stored with next_state=None and bootstrap from 0.
    non_final = torch.tensor([s is not None for s in next_states], dtype=torch.bool, device=device)

    # Q(s, a) of the online network for the actions that were actually taken.
    q_taken = policy_policy(state_batch).gather(1, action_batch).squeeze(1)

    # max_a' Q_target(s', a') for non-terminal next states, without gradients.
    next_q = torch.zeros(len(transitions), dtype=torch.float32, device=device)
    if non_final.any():
        next_batch = torch.cat([s for s in next_states if s is not None])
        next_batch = next_batch.to(device=device, dtype=torch.float32)
        with torch.no_grad():
            next_q[non_final] = target_policy(next_batch).max(1).values

    targets = reward_batch + gamma * next_q
    loss = bellmann_error(q_taken, targets)

    policy_optimizer.zero_grad()
    loss.backward()
    policy_optimizer.step()

    return loss.item()

In [159]:
# Scratch check of the buffer API. It uses its own buffer so the global
# `memory` used for training is not replaced by a 10-slot one (which could
# never reach `batch_size`, so optimize() would always return early).
probe_memory = ReplayMemory(10)

for _ in range(10):
    frame = torch.tensor(env.reset()[0]).permute(2, 0, 1).unsqueeze(0)
    probe_memory.push(frame, 1, frame, 10)
torch.tensor([t[1] for t in probe_memory.sample(4)])


tensor([1, 1, 1, 1])

In [160]:
# Steps taken across ALL episodes. Epsilon decay is driven by this counter, so
# exploration keeps shrinking instead of restarting at eps_max every episode.
total_steps = 0

for episode in range(episodes):
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).permute(2, 0, 1).unsqueeze(0)

    done = False
    while not done:
        # int() gives env.step and the replay memory a plain Python action index.
        action = int(choose_action(state, total_steps))

        new_state, reward, terminated, truncated, _ = env.step(action)

        done = terminated or truncated

        if terminated:
            # Terminal transitions are stored with no successor state.
            new_state = None
        else:
            # Same dtype/layout as the reset state: float32, (1, C, H, W).
            new_state = torch.tensor(new_state, dtype=torch.float32, device=device).permute(2, 0, 1).unsqueeze(0)

        memory.push(state, action, new_state, reward)

        state = new_state

        # One optimisation step on the online network.
        optimize()

        # Soft update: target <- tau * policy + (1 - tau) * target.
        target_dic = target_policy.state_dict()
        policy_dic = policy_policy.state_dict()

        for keys in target_dic:
            target_dic[keys] = policy_dic[keys] * tau + target_dic[keys] * (1-tau)

        target_policy.load_state_dict(target_dic)

        total_steps += 1

    

TypeError: choose_action() missing 1 required positional argument: 'steps'

In [74]:
# Debug dump: prints every entry of the target network's state dict in full
# (all tensor values, not a summary), so the output is very long.
print(target_policy.state_dict())

OrderedDict([('conv1.weight', tensor([[[[ 0.1819,  0.0235,  0.1259],
          [ 0.1018,  0.0176, -0.0120],
          [-0.1712,  0.1426, -0.0567]],

         [[-0.0192,  0.1696,  0.0966],
          [ 0.0959,  0.1023,  0.1002],
          [-0.1446, -0.1589,  0.0539]],

         [[ 0.1669, -0.1763, -0.1538],
          [ 0.0522,  0.1354, -0.0399],
          [-0.1116,  0.0119, -0.0771]]],


        [[[ 0.1890,  0.0701,  0.1324],
          [ 0.0195, -0.0197,  0.1030],
          [ 0.0320, -0.1082,  0.0821]],

         [[-0.0294, -0.1322,  0.1666],
          [-0.1491,  0.0735,  0.1108],
          [-0.1231,  0.1721,  0.0852]],

         [[-0.1840, -0.0517,  0.0856],
          [ 0.0210, -0.0728, -0.0376],
          [-0.0536,  0.1712, -0.0814]]],


        [[[ 0.1835, -0.0822, -0.0426],
          [ 0.1890, -0.0542,  0.0831],
          [ 0.1889,  0.0287, -0.0732]],

         [[-0.0945,  0.0141,  0.1431],
          [-0.1357,  0.1882, -0.0031],
          [ 0.1725, -0.1664, -0.1630]],

         [[-0.