In [479]:
import torch
import torch.nn as nn
import numpy as np
import random
from collections import namedtuple, deque, Iterable

In [480]:
BUFFER_SIZE         = int(1e5)
BATCH_SIZE          = 64
GAMMA               = 0.99  # discount factor
TAU                 = 1e-3  # soft update of target parameter
LEARNING_RATE       = 5e-4
UPDATE_EVERY        = 10    # how often to update the local
TARGET_UPDATE_EVERY = 50    # how often to update the target


In [481]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [482]:
from Board import Board
env = Board()

In [483]:
class QNetwork(nn.Module):
    """ Agent Policy Network Model """
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)
        self.flatten = torch.nn.Flatten()
        
    def forward(self, x:torch.Tensor):
        """ state -> action values """
        x = self.flatten(x)
        x = self.fc1(x)
        x = nn.functional.relu(x)
        x = self.fc2(x)
        x = nn.functional.relu(x)
        x = self.fc3(x)
        return x


In [484]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(16, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 4),
)
state = torch.from_numpy(np.array([12]*2*16).reshape(2,4,4)).float()
print(model(state))


tensor([[-4.9223,  1.1704,  1.0265,  2.0215],
        [-4.9223,  1.1704,  1.0265,  2.0215]], grad_fn=<AddmmBackward0>)


In [485]:
q = QNetwork(16,4)
state = torch.from_numpy(np.array(list(range(16))*2).reshape(2,4,4)).float()
# print(state)
print(state.shape)
print(q(state))

torch.Size([2, 4, 4])
tensor([[ 0.1359, -0.4027, -0.7870,  1.1597],
        [ 0.1359, -0.4027, -0.7870,  1.1597]], grad_fn=<AddmmBackward0>)


In [486]:
class ReplayBuffer:
    """ Fixed size buffer to store experience tuples """

    def __init__(self, action_size, buffer_size, batch_size):
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.experience = namedtuple('experience', field_names=[
                                     'state', 'action', 'reward', 'next_state', 'done'])
        self.memory: deque[self.experience] = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        """  """
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(
            np.stack([e.state for e in experiences if e is not None], axis=0)).to(device).float()
        actions = torch.from_numpy(
            np.stack([e.action for e in experiences if e is not None], axis=0)).to(device).float()
        rewards = torch.from_numpy(
            np.stack([e.reward for e in experiences if e is not None], axis=0)).to(device).float()
        next_states = torch.from_numpy(
            np.stack([e.next_state for e in experiences if e is not None], axis=0)).to(device).float()
        dones = torch.from_numpy(np.stack(
            [e.done for e in experiences if e is not None], axis=0).astype(np.uint8)).to(device).float()
        return self.experience(states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)

In [487]:
b=ReplayBuffer(4, BUFFER_SIZE, BATCH_SIZE)
for i in range(100):
    b.add(np.array([12.3]*16).reshape(4,4),0,3,np.array([12.3]*16).reshape(4,4),False)

In [488]:
t=random.sample(b.memory, k=b.batch_size)
a = torch.cat(tuple((torch.tensor(e.state, device=device) for e in t if e is not None)),-1).to(device).float()
a.shape
# torch.tensor(t[-1].state).shape

# tuple((e.state for e in t if e is not None))

torch.Size([4, 256])

In [489]:
b.sample()

experience(state=tensor([[[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        ...,

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 

In [490]:
b.sample().state

tensor([[[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        ...,

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000,

In [491]:
class Agent:
    def __init__(self, state_size: int, action_size: int, learning_rate: float, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size

        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size).to(device)
        self.optimizer = torch.optim.Adam(
            self.qnetwork_local.parameters(), lr=learning_rate)
        # replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size)

        self.time_step = 0
        self.eps = 0.0
        self.gamma = 0.9

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.time_step += 1
        

    def learn_from_experience(self):
        experience = self.memory.sample()
        return self.learn(experience)

    def q_value(self, state, eps=0.0, train=True)->torch.Tensor:
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        """ (w,h)->(1,w,h) Add one dimension to the state, as the nn expect a batch.  """
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state).to(device)
        self.qnetwork_local.train()
        return action_values
    
    def decide(self, action_values:torch.Tensor, eps=0.0)->int:
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()) #  "addmm_cuda" not implemented for 'Long'
        else:
            return random.choice(np.arange(self.action_size))

    def act(self, state, eps=0.0, train=True)->int:
        action_values = self.q_value(state, eps=eps, train=train)
        return self.decide(action_values, eps=eps)

    def learn(self, experience: Iterable[torch.Tensor]):
        """ Update parameters using batch of experience tuples """
        q_current, q_targets = self._double_dqn(experience)
        # Compute the loss and gradient
        loss = torch.nn.functional.mse_loss(q_current, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss

    def _dqn(self, experience: Iterable[torch.Tensor])->Iterable[torch.Tensor]:
        states, actions, rewards, next_states, dones = experience
        rewards = rewards.reshape(self.batch_size,1)
        dones = dones.reshape(self.batch_size,1)
        q_targets_next = self.qnetwork_target( next_states).detach().max(1)[0].unsqueeze(1)
        q_targets = rewards+self.gamma*q_targets_next*(1 - dones)
        q_current = self.qnetwork_local(states).gather(1, actions.type(torch.int64).unsqueeze(1))
        return (q_current, q_targets)
    
    def _double_dqn(self, experience: Iterable[torch.Tensor])->Iterable[torch.Tensor]:
        states, actions, rewards, next_states, dones = experience
        rewards = rewards.reshape(self.batch_size,1)
        dones = dones.reshape(self.batch_size,1)
        action_q_local_next = torch.argmax(self.qnetwork_local(next_states),-1)
        q_targets_next = self.qnetwork_target(next_states).gather(1, action_q_local_next.type(torch.int64).unsqueeze(1))
        q_targets = rewards+self.gamma*q_targets_next*(1 - dones)
        q_current = self.qnetwork_local(states).gather(1, actions.type(torch.int64).unsqueeze(1))
        return (q_current, q_targets)


    def state_to_features(self, state: np.ndarray):
        return torch.from_numpy(state.flatten()).to(device).float().unsqueeze(0)

    def soft_update(self):
        self._soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    @staticmethod
    def _soft_update(local_model:torch.nn.Module, target_model:torch.nn.Module, tau: float):
        """ θ_target = τ*θ_local + (1 - τ)*θ_target 
        copy the weights of the local model to the target model
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(
                tau*local_param.data+(1.0-tau)*target_param.data)

In [492]:
a=Agent(16, 4, LEARNING_RATE, BUFFER_SIZE, BATCH_SIZE)

In [493]:
a.optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: False
    lr: 0.0005
    maximize: False
    weight_decay: 0
)

In [494]:
a.step(np.array([123]*16),0,3,np.array([123]*16),False)

In [495]:
a.act(np.array(list(range(16))).reshape(4,4))

3

In [496]:
state = torch.tensor(np.array(list(range(16))).reshape(4,4),device=device).float()
print(state)
state = state.unsqueeze(0)
print('added dimension',state)
print(a.qnetwork_target(state))
print(torch.argmax(a.qnetwork_target(state)))


tensor([[ 0.,  1.,  2.,  3.],
        [ 4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11.],
        [12., 13., 14., 15.]])
added dimension tensor([[[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [12., 13., 14., 15.]]])
tensor([[ 0.7358, -0.3510,  1.0113, -1.0490]], grad_fn=<AddmmBackward0>)
tensor(2)


In [497]:
torch.argmax(torch.tensor([[ 0.1125,  0.1978,  0.0226, -0.2571]]))

tensor(1)

In [498]:
b.sample()

experience(state=tensor([[[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        ...,

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 

In [499]:
b.sample().state.shape

torch.Size([64, 4, 4])

In [500]:
a.learn(b.sample())

tensor(6.2372, grad_fn=<MseLossBackward0>)

In [501]:
def train(agent: Agent, n_episodes, max_time_step, eps_start, eps_end, eps_decay):
    scores = []
    num_rounds = []
    scores_window = deque(maxlen=100)
    eps = eps_start
    for episode in range(n_episodes):
        env.reset()
        state = env.board
        accumulate_reward = 0
        rounds = 0
        for time_step in range(max_time_step):
            action_values = agent.q_value(state, eps)
            action = agent.decide(action_values, eps)
            next_state, reward, done = env.step(action)
            agent.step(state, action, reward, next_state, done)
            """ === this step has finished === """
            if agent.time_step % UPDATE_EVERY == 0 and len(agent.memory) > BATCH_SIZE:
                loss = agent.learn_from_experience()
                print(loss)
            if agent.time_step % TARGET_UPDATE_EVERY == 0:
                agent.soft_update()
            """ === next iteration === """
            state = next_state
            accumulate_reward += reward
            rounds += 1
            if done:
                break
        scores_window.append(accumulate_reward)
        scores.append(accumulate_reward)
        num_rounds.append(rounds)
        eps = max(eps_end, eps-eps_decay)
        if episode % 100 == 0:
            print(episode, np.mean(scores_window))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pt')
    return scores, num_rounds

In [504]:
agent = Agent(state_size=16, action_size=4, learning_rate=LEARNING_RATE)
scores, num_rounds = train(agent, n_episodes=1000, max_time_step=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.01)


tensor(500.9095, grad_fn=<MseLossBackward0>)
tensor(501.6540, grad_fn=<MseLossBackward0>)
tensor(450.5762, grad_fn=<MseLossBackward0>)
tensor(422.7862, grad_fn=<MseLossBackward0>)
tensor(317.7338, grad_fn=<MseLossBackward0>)
tensor(318.3767, grad_fn=<MseLossBackward0>)
0 1950.0
tensor(304.1623, grad_fn=<MseLossBackward0>)
tensor(340.5092, grad_fn=<MseLossBackward0>)
tensor(486.4716, grad_fn=<MseLossBackward0>)
tensor(414.3141, grad_fn=<MseLossBackward0>)
tensor(417.8317, grad_fn=<MseLossBackward0>)
tensor(388.7697, grad_fn=<MseLossBackward0>)
tensor(415.2410, grad_fn=<MseLossBackward0>)
tensor(406.4557, grad_fn=<MseLossBackward0>)
tensor(298.4278, grad_fn=<MseLossBackward0>)
tensor(400.0021, grad_fn=<MseLossBackward0>)
tensor(342.5425, grad_fn=<MseLossBackward0>)
tensor(423.0978, grad_fn=<MseLossBackward0>)
tensor(300.8683, grad_fn=<MseLossBackward0>)
tensor(335.5204, grad_fn=<MseLossBackward0>)
tensor(505.6613, grad_fn=<MseLossBackward0>)
tensor(364.9566, grad_fn=<MseLossBackward0>)
t

In [505]:
num_rounds

[171,
 111,
 100,
 82,
 124,
 92,
 140,
 86,
 89,
 64,
 87,
 69,
 211,
 134,
 145,
 154,
 84,
 109,
 73,
 107,
 96,
 133,
 97,
 96,
 114,
 137,
 192,
 122,
 85,
 91,
 92,
 200,
 128,
 250,
 89,
 168,
 144,
 77,
 128,
 258,
 60,
 170,
 79,
 203,
 162,
 89,
 70,
 111,
 79,
 59,
 56,
 130,
 109,
 160,
 98,
 90,
 113,
 107,
 149,
 88,
 74,
 183,
 66,
 77,
 123,
 125,
 82,
 117,
 106,
 76,
 118,
 230,
 105,
 102,
 91,
 72,
 51,
 68,
 78,
 105,
 118,
 88,
 91,
 51,
 120,
 96,
 96,
 57,
 61,
 104,
 78,
 99,
 134,
 105,
 114,
 109,
 66,
 141,
 87,
 111,
 78,
 78,
 72,
 88,
 72,
 79,
 76,
 78,
 88,
 85,
 70,
 128,
 81,
 71,
 93,
 69,
 115,
 85,
 87,
 164,
 77,
 84,
 71,
 103,
 85,
 103,
 88,
 87,
 103,
 85,
 72,
 93,
 85,
 78,
 104,
 85,
 84,
 87,
 80,
 78,
 123,
 81,
 84,
 116,
 88,
 85,
 154,
 62,
 101,
 79,
 74,
 122,
 121,
 72,
 114,
 107,
 105,
 136,
 95,
 97,
 62,
 145,
 84,
 108,
 87,
 63,
 117,
 79,
 109,
 137,
 76,
 87,
 85,
 85,
 88,
 115,
 153,
 108,
 132,
 90,
 47,
 103,
 115,
 159,

## DQN

In [503]:

states, actions, rewards, next_states, dones = b.sample()
# actions = actions.reshape(64,1)
rewards = rewards.reshape(64,1)
dones = dones.reshape(64,1)
print('input states=',states, states.shape)
print('input actions=',actions, actions.shape)
print('input rewards=',rewards, rewards.shape)
print('input next_states=',next_states, next_states.shape)
print('input dones=',dones, dones.shape)
print('q_targets_next_states=',agent.qnetwork_target( next_states))

q_targets_next = agent.qnetwork_target( next_states).detach().max(1)[0].unsqueeze(1)
print('q_targets_next',q_targets_next,q_targets_next.shape)
q_targets = rewards+agent.gamma*q_targets_next*(1-dones)
print('q_targets',q_targets,q_targets.shape)
q_current = agent.qnetwork_local(states).gather(1, actions.type(torch.int64).unsqueeze(1))
print('q_current',q_current,q_current.shape)
loss = torch.nn.functional.mse_loss(q_current, q_targets)
agent.optimizer.zero_grad()
loss.backward()
agent.optimizer.step()

agent.soft_update(agent.qnetwork_local, agent.qnetwork_target, TAU)
print('loss',loss)


input states= tensor([[[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        ...,

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.

TypeError: soft_update() takes 1 positional argument but 4 were given

## Double DQN

In [None]:
states, actions, rewards, next_states, dones = b.sample()
# actions = actions.reshape(64,1)
rewards = rewards.reshape(64,1)
dones = dones.reshape(64,1)
print('input states=',states, states.shape)
print('input actions=',actions, actions.shape)
print('input rewards=',rewards, rewards.shape)
print('input next_states=',next_states, next_states.shape)
print('input dones=',dones, dones.shape)

print('q values in next state',agent.qnetwork_local(next_states),agent.qnetwork_local(next_states).shape)
action_q_local_next = torch.argmax(agent.qnetwork_local(next_states),-1)
print('action_q_local_next',action_q_local_next,action_q_local_next.shape)

print('q_targets_next_states=',agent.qnetwork_target( next_states),agent.qnetwork_target( next_states).shape)
q_targets_next = agent.qnetwork_target(next_states).gather(1, action_q_local_next.type(torch.int64).unsqueeze(1))
print('q_targets_next',q_targets_next,q_targets_next.shape)
q_targets = rewards+agent.gamma*q_targets_next*(1-dones)
print('q_targets',q_targets,q_targets.shape)
q_current = agent.qnetwork_local(states).gather(1, actions.type(torch.int64).unsqueeze(1))
print('q_current',q_current,q_current.shape)
loss = torch.nn.functional.mse_loss(q_current, q_targets)
agent.optimizer.zero_grad()
loss.backward()
agent.optimizer.step()

agent.soft_update(agent.qnetwork_local, agent.qnetwork_target, TAU)
print('loss',loss)

input states= tensor([[[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        ...,

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000],
         [12.3000, 12.3000, 12.3000, 12.3000]],

        [[12.3000, 12.