# Step Wise Test of Framework

This notebook is used for peice wise testing of the complete agent framework. Each cell tests limited functionality to specially make sure that tensors have the right dimensions (column / row)

In [1]:
import torch
import torch.nn as n
import torch.optim as optim

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Device: {device}')

Device: cuda


## Generate Sample Data

In [2]:
import numpy as np

state_space = 37
action_space = 4
batch_size = 64

states_np = np.random.rand(batch_size, state_space)
actions_np = np.random.randint(action_space, size=batch_size)
rewards_np = np.random.rand(batch_size)
next_states_np = np.random.rand(batch_size, state_space)
dones_np = np.random.randint(2, size=batch_size, dtype = 'bool')

print(f'states: {states_np.shape}')
print(f'actions: {actions_np.shape}')
print(f'rewards: {rewards_np.shape}')
print(f'next_states = {next_states_np.shape}')
print(f'dones = {dones_np.shape}')


states: (64, 37)
actions: (64,)
rewards: (64,)
next_states = (64, 37)
dones = (64,)


## QNetwork

Load the qnetwork from model.py file

In [4]:
from model import QNetwork

LR = 5e-4
seed = None

qtarget_network = QNetwork(state_space, action_space, seed).to(device)
qlocal_network = QNetwork(state_space, action_space, seed).to(device)

# qtarget is never learnt, always evaluated and copied into
qtarget_network.eval()

optimizer = optim.Adam(qlocal_network.parameters(), lr = LR)

print('QNetworks loaded')

QNetworks loaded


In [40]:
import torch.nn as nn

device = 'cpu'

def model():
    return nn.Sequential(
        nn.Linear(state_space, 64),
        nn.ReLU(),
        nn.Linear(64, 32),
        nn.ReLU(),
        nn.Linear(32, action_space),
        ).double().to(device)


qlocal_network = model()
qtarget_network = model()

print(qlocal_network)

Sequential(
  (0): Linear(in_features=37, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=32, bias=True)
  (3): ReLU()
  (4): Linear(in_features=32, out_features=4, bias=True)
)


In [28]:
random_state_np = states_np[np.random.randint(batch_size)]
random_state = torch.from_numpy(random_state_np).to(device)
actions = model(random_state).max()
print(actions)
# action_chosen = int(np.argmax(actions.to('cpu').numpy()))

# print(f'Action: {actions}')
# print(f'Choosen action: {action_chosen}')

tensor(0.1984, device='cuda:0', dtype=torch.float64, grad_fn=<MaxBackward1>)


## Confirm Network Definition

If one state is passed to it, will it give us the right dimension actions?

In [41]:
import timeit

random_state_np = states_np[np.random.randint(batch_size)]
qlocal_network.eval()

def numpy_style(random_state):
    #random_state = torch.from_numpy(random_state_np).to(device)
    actions = qlocal_network(random_state).detach()

    # when actual code was run and action was passed to the unity environment it complained about the action
    # being int64 so had to be converted

    action_chosen = int(np.argmax(actions.to('cpu').numpy()))
    return action_chosen

def torch_style(random_state):
    #random_state = torch.from_numpy(random_state_np).to(device)
    action = qlocal_network(random_state).detach().max()
    return int(action)


random_state = torch.from_numpy(random_state_np).to(device)

%timeit numpy_style(random_state)
%timeit torch_style(random_state)

#print(f'Action: {actions}')
#print(f'Choosen action: {action_chosen}')

142 µs ± 3.69 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
135 µs ± 5.76 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Convert Numpy Arrays to Tensors

Use PyTorch confirm that the value of the actions that were chosen in the batch can be found using qlocal_network

Also confirm that **gradient function** is mentioned against all network nodes that require a gradient to be computed

**Limit** the batch down to 5 samples so that it is easier to confirm

In [90]:
limited_batch = 5

states = torch.from_numpy(states_np[:limited_batch]).float().to(device)
actions = torch.from_numpy(actions_np[:limited_batch]).long().to(device).unsqueeze(1)
rewards = torch.from_numpy(rewards_np[:limited_batch]).float().to(device)
next_states = torch.from_numpy(next_states_np[:limited_batch]).float().to(device)
dones = torch.from_numpy(dones_np[:limited_batch]).float().to(device)

### First Step: Get Action Values (Using Local Network)

First step in DQN is to get the action values using the local network

In [124]:
q_s_local = qlocal_network(states).gather(1, actions)
print(f'Local Network - all action values:\n{qlocal_network(states)}')
print(f'Actions that were chosen in the batch:\n{actions}')
print(f'Confirm the value corresponds to the correct action\n {q_s_local}')

Local Network - all action values:
tensor([[ 0.0359, -0.0744,  0.0047, -0.1696],
        [-0.0039, -0.0258, -0.0824, -0.2261],
        [-0.0249, -0.0343, -0.0800, -0.1715],
        [ 0.0128, -0.0763, -0.0417, -0.1786],
        [-0.0135, -0.0400, -0.0716, -0.2183]], device='cuda:0',
       grad_fn=<AddmmBackward>)
Actions that were chosen in the batch:
tensor([[0],
        [1],
        [0],
        [2],
        [3]], device='cuda:0')
Confirm the value corresponds to the correct action
 tensor([[ 0.0359],
        [-0.0258],
        [-0.0249],
        [-0.0417],
        [-0.2183]], device='cuda:0', grad_fn=<GatherBackward>)


## Second Step: Compute TD_ERROR

For next_state:

1) Use the local network to find which action is the best in next_state (the one that has max value)    
2) But find and use that particular action's value in target network not local network     
3) y = r + γ * QTarget(s', max_a QLocal(s'))    
4) Compute td_error = y - q(s,a)    

### Confirm QLocal(next_state) and the Maximum Actions

In [125]:
qlocal_netowrk.eval()

q_s_prime_local = qlocal_network(next_states).detach()
print(q_s_prime_local)
print(torch.max(q_s_prime_local, axis = 1))

qlocal_netowrk.train()


tensor([[-0.0310, -0.0276, -0.0750, -0.1856],
        [ 0.0005, -0.0472, -0.0428, -0.1894],
        [-0.0500,  0.0090, -0.1007, -0.2008],
        [-0.0096,  0.0020, -0.0830, -0.1934],
        [-0.0093, -0.0511, -0.0646, -0.1829]], device='cuda:0')
torch.return_types.max(
values=tensor([-0.0276,  0.0005,  0.0090,  0.0020, -0.0093], device='cuda:0'),
indices=tensor([1, 0, 1, 1, 0], device='cuda:0'))


QNetwork(
  (fc1): Linear(in_features=37, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=4, bias=True)
)

### Complete TD_Error calculation

In [126]:
GAMMA = 0.99
debug = True

with torch.no_grad():
    q_s_prime_target = qtarget_network(next_states).detach()

    # put the local network in eval mode so that in future if it has dropout layers etc.
    # they are used in eval
    qlocal_netowrk.eval()
    q_s_prime_local = qlocal_network(next_states).detach()
    qlocal_netowrk.train()

    a_s_prime_local = torch.max(q_s_prime_local, axis = 1).indices.unsqueeze(1)
    v_s_prime_target = q_s_prime_target.gather(1, a_s_prime_local)

    # delta = r + (1-terminal) * gamma * max_a Q(s2, a) - Q(s, a)
    # dones is a row vector, we need to convert it to column vector to multiply
    # it with gamma * max_a.values
    dones_row = (1 - dones).unsqueeze(1)
    future_rewards = dones_row * GAMMA * v_s_prime_target

    y = rewards.unsqueeze(1) + future_rewards

    td_error = (y - q_s_local).to('cpu').numpy().reshape(-1)

    if debug:
        print(f"y = r + 𝛾 * max_a V(s')")
        print(f'r: {rewards}')
        print(f'Actions: {actions}')
        print(f"Q(s' target): {q_s_prime_target}")
        print(f"Q(s' local): {q_s_prime_local}")
        print(f"A(s' local): {a_s_prime_local}")
        print(f"QTarget(s', max_a QLocal(s')):\n{v_s_prime_target}")
        print(f"future_rewards: gamma * max_a V(s'):\n{future_rewards}")
        print(f"rewards: {rewards}")
        print(f"y = r + future: {y}")
        print(f'q_s_local: {q_s_local}')
        print(f'td_error: {td_error}')


y = r + 𝛾 * max_a V(s')
r: tensor([0.0134, 0.3142, 0.6854, 0.3508, 0.9808], device='cuda:0')
Actions: tensor([[0],
        [1],
        [0],
        [2],
        [3]], device='cuda:0')
Q(s' target): tensor([[-0.0365,  0.1683,  0.0832, -0.0310],
        [-0.0444,  0.1373,  0.0966,  0.0100],
        [-0.0098,  0.1504,  0.0865,  0.0382],
        [-0.0524,  0.1961,  0.0484,  0.0316],
        [-0.0114,  0.1336,  0.0442, -0.0370]], device='cuda:0')
Q(s' local): tensor([[-0.0310, -0.0276, -0.0750, -0.1856],
        [ 0.0005, -0.0472, -0.0428, -0.1894],
        [-0.0500,  0.0090, -0.1007, -0.2008],
        [-0.0096,  0.0020, -0.0830, -0.1934],
        [-0.0093, -0.0511, -0.0646, -0.1829]], device='cuda:0')
A(s' local): tensor([[1],
        [0],
        [1],
        [1],
        [0]], device='cuda:0')
QTarget(s', max_a QLocal(s')):
tensor([[ 0.1683],
        [-0.0444],
        [ 0.1504],
        [ 0.1961],
        [-0.0114]], device='cuda:0')
future_rewards: gamma * max_a V(s'):
tensor([[0.1666

### Reason for Unsqueezing

In the following cell, one can see that v_s_prime_target is a column vector, where each entry in the vector depicts the value of the maximum action for next_state (the index of which was found using the local network NOT target network)

In [127]:
def print_heading(heading):
    print('-' * 20, end = '')
    print(f' {heading} ', end ='')
    print('-' * 20)

dones_row = (1 - dones).unsqueeze(1)
print_heading('Dones & v_s_prime_target Vectors')
print(dones_row)
print(v_s_prime_target)

future_rewards = dones_row * GAMMA * v_s_prime_target
print_heading('Future Rewards')
print(future_rewards)

print_heading('Immediate Rewards (r)')
print(rewards.unsqueeze(1))

y = rewards.unsqueeze(1) + future_rewards
print_heading('r + Immediate Rewards')
print(y)

td_error = (y - q_s_local).detach()
print_heading('td_error')
print(td_error)

print_heading('td_error in row format similar to indices')
print(td_error.data.to('cpu').numpy().reshape(-1))



-------------------- Dones & v_s_prime_target Vectors --------------------
tensor([[1.],
        [0.],
        [1.],
        [1.],
        [0.]], device='cuda:0')
tensor([[ 0.1683],
        [-0.0444],
        [ 0.1504],
        [ 0.1961],
        [-0.0114]], device='cuda:0')
-------------------- Future Rewards --------------------
tensor([[0.1666],
        [-0.0000],
        [0.1489],
        [0.1942],
        [-0.0000]], device='cuda:0')
-------------------- Immediate Rewards (r) --------------------
tensor([[0.0134],
        [0.3142],
        [0.6854],
        [0.3508],
        [0.9808]], device='cuda:0')
-------------------- r + Immediate Rewards --------------------
tensor([[0.1800],
        [0.3142],
        [0.8344],
        [0.5450],
        [0.9808]], device='cuda:0')
-------------------- td_error --------------------
tensor([[0.1441],
        [0.3399],
        [0.8592],
        [0.5867],
        [1.1992]], device='cuda:0')
-------------------- td_error in row format similar to

## Loss Function

`y - q(s,a)` has to be multiplied by the weights returned by prioritized replay buffer therefore we cannot use MSELoss as is and will have to compute Squared Error first, then multiply by weights and then take the mean

In [128]:
import torch.nn as nn

td_error = y - q_s_local

optimizer = optim.Adam(qlocal_netowrk.parameters(), lr = LR)
optimizer.zero_grad()

# loss function should only compute (x - y) ** 2 and not mean it
# loss = F.mse_loss(y, q_s_local) 
loss_fn = nn.MSELoss(reduction='none')
loss = loss_fn(y, q_s_local)
loss_mean = torch.mean(loss)
loss_mean.backward()

print(f'y: {y}')
print(f'q_s_local: {q_s_local}')
print(f'td_error = y - q_s_local = {td_error}')
print('-' * 100)
print('Confirmation that reduction=None in the loss function gives us just (y - Q(s)) ** 2')
print('-' * 100)
print(f'td_error ** 2: {td_error ** 2}')
print(f'loss: {loss}')
print('-' * 100)
print(f'Mean loss: {loss_mean}')

y: tensor([[0.1800],
        [0.3142],
        [0.8344],
        [0.5450],
        [0.9808]], device='cuda:0')
q_s_local: tensor([[ 0.0359],
        [-0.0258],
        [-0.0249],
        [-0.0417],
        [-0.2183]], device='cuda:0', grad_fn=<GatherBackward>)
td_error = y - q_s_local = tensor([[0.1441],
        [0.3399],
        [0.8592],
        [0.5867],
        [1.1992]], device='cuda:0', grad_fn=<SubBackward0>)
----------------------------------------------------------------------------------------------------
Confirmation that reduction=None in the loss function gives us just (y - Q(s)) ** 2
----------------------------------------------------------------------------------------------------
td_error ** 2: tensor([[0.0208],
        [0.1156],
        [0.7383],
        [0.3442],
        [1.4380]], device='cuda:0', grad_fn=<PowBackward0>)
loss: tensor([[0.0208],
        [0.1156],
        [0.7383],
        [0.3442],
        [1.4380]], device='cuda:0', grad_fn=<MseLossBackward>)
------