In [30]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [31]:
class Network(nn.Module):
    
    def __init__(self,input_size,hidden_layers,output_size,drop_p=0.5):
        super().__init__()
        
        self.hidden_layers = nn.ModuleList([nn.Linear(input_size,hidden_layers[0])])
        self.hidden_layers.extend([nn.Linear(h1,h2) for h1,h2 in zip(hidden_layers[:-1],hidden_layers[1:])])
        self.output = nn.Linear(hidden_layers[-1],output_size)
        
        self.dropout = nn.Dropout(p=drop_p)
        
    def forward(self,x):
        
        for linear in self.hidden_layers:
            x = F.relu(linear(x))
            x = self.dropout(x)
            
        x = self.output(x)
        
        return x
        

In [32]:
input_size = 8
hidden_layers = [64,64]
output_size = 4

model = Network(input_size,hidden_layers,output_size)
model

Network(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=64, bias=True)
  )
  (output): Linear(in_features=64, out_features=4, bias=True)
  (dropout): Dropout(p=0.5)
)

In [33]:
env = gym.make('LunarLander-v2')
env.seed(0)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


[0]

In [34]:
state = torch.rand(1,8);state

tensor([[0.4069, 0.9438, 0.5586, 0.5887, 0.6077, 0.2465, 0.5846, 0.8088]])

In [35]:
ps = model.forward(state);torch.exp(ps)

tensor([[0.9294, 0.9455, 1.0565, 0.8958]], grad_fn=<ExpBackward>)

In [36]:
state_alt = env.reset()
state_alt = torch.from_numpy(state_alt).float().unsqueeze(0)

state_alt

tensor([[-5.9156e-04,  9.4230e-01, -5.9936e-02,  1.1277e-01,  6.9229e-04,
          1.3576e-02,  0.0000e+00,  0.0000e+00]])

In [37]:
model.forward(state_alt)

tensor([[-0.0838, -0.1902, -0.1937, -0.2508]], grad_fn=<AddmmBackward>)

In [38]:
states = []
for i in range(10):
    states.append(env.reset())

states = np.array(states)
states = torch.from_numpy(states).float().unsqueeze(0)
states

tensor([[[ 1.1835e-04,  9.3636e-01,  1.1968e-02, -2.8384e-01, -1.3030e-04,
          -2.7110e-03,  0.0000e+00,  0.0000e+00],
         [ 7.5161e-03,  9.3632e-01,  7.6127e-01, -2.8623e-01, -8.7023e-03,
          -1.7244e-01,  0.0000e+00,  0.0000e+00],
         [ 5.9081e-03,  9.3416e-01,  5.9841e-01, -4.2994e-01, -6.8392e-03,
          -1.3555e-01,  0.0000e+00,  0.0000e+00],
         [-3.0343e-03,  9.4824e-01, -3.0736e-01,  5.0837e-01,  3.5228e-03,
           6.9622e-02,  0.0000e+00,  0.0000e+00],
         [-4.9807e-03,  9.3840e-01, -5.0451e-01, -1.4785e-01,  5.7782e-03,
           1.1428e-01,  0.0000e+00,  0.0000e+00],
         [-4.0418e-03,  9.4787e-01, -4.0939e-01,  4.8377e-01,  4.6900e-03,
           9.2733e-02,  0.0000e+00,  0.0000e+00],
         [ 4.2742e-03,  9.4790e-01,  4.3291e-01,  4.8599e-01, -4.9459e-03,
          -9.8060e-02,  0.0000e+00,  0.0000e+00],
         [ 2.7787e-03,  9.4285e-01,  2.8143e-01,  1.4907e-01, -3.2129e-03,
          -6.3747e-02,  0.0000e+00,  0.0000e+00],


In [61]:
output = model.forward(states)
output

tensor([[[-0.0688,  0.0126, -0.1291, -0.0143],
         [ 0.0522, -0.0377,  0.0605, -0.1899],
         [ 0.1343,  0.1069, -0.0826, -0.1410],
         [ 0.1208, -0.0482, -0.0901, -0.2191],
         [-0.0258, -0.1528, -0.1446, -0.2101],
         [ 0.0803, -0.0123, -0.1398, -0.1197],
         [-0.1032,  0.0105,  0.0314, -0.1599],
         [-0.0590, -0.0328, -0.0761, -0.2169],
         [ 0.0846, -0.1463, -0.0779, -0.2145],
         [ 0.0371,  0.0918,  0.0176, -0.0718]]], grad_fn=<AddBackward0>)

In [82]:
max_actions = output.detach().max(2)[1]; max_actions.size()
max_actions.resize_(10,1)

max_actions.dtype

torch.int64

In [73]:
actions = np.array([1,2,1,0,0,0,1,0,0,1])
actions = torch.from_numpy(actions)

model(states).detach().max(2)[0].dtype

torch.float32

In [42]:
model(states).detach().max(2)[1].unsqueeze(dim=1)

tensor([[[-0.0588,  0.0607,  0.0804,  0.2209, -0.0156,  0.0536,  0.0736,
          -0.0072,  0.1033,  0.0881]]])

In [43]:
class QNetwork(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, seed):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
        """
        super(QNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        "*** YOUR CODE HERE ***"
        
        self.fc1 = nn.Linear(state_size,64)
        self.fc2 = nn.Linear(64,64)
        self.fc3 = nn.Linear(64,action_size)

    def forward(self, state):
        """Build a network that maps state -> action values."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        
        return x

In [44]:
modelQ = QNetwork(8,4,0)

In [45]:
modelQ

QNetwork(
  (fc1): Linear(in_features=8, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=4, bias=True)
)

In [46]:
outputQ = modelQ(states).detach().max(2)[0];outputQ

tensor([[0.0954, 0.0761, 0.0820, 0.0908, 0.1100, 0.0924, 0.0756, 0.0793, 0.0776,
         0.0720]])

In [47]:
rewards = torch.rand(1,10);rewards

tensor([[0.0422, 0.2876, 0.5166, 0.1132, 0.9388, 0.1748, 0.3052, 0.4331, 0.0950,
         0.4960]])

In [48]:
rewards + 0.2*(outputQ)

tensor([[0.0613, 0.3028, 0.5330, 0.1313, 0.9608, 0.1933, 0.3203, 0.4490, 0.1105,
         0.5104]])

In [49]:
actions = [1,2,1,0,3,1,0,2,1,3]

In [50]:
output_target = []
for index in range(len(states[0])):
    print(np.around(float(modelQ(states[0][index])[actions[index]].detach()),decimals=5))
    
output_target

-0.03213
-0.02443
-0.04881
0.09076
-0.0173
-0.01169
0.07564
-0.0318
-0.03616
-0.01078


[]

In [51]:
modelQ(states[0])[0].detach()

tensor([ 0.0954, -0.0321, -0.0560, -0.0315])

In [52]:
output = modelQ.forward(states[0]).detach().max(1)[0].unsqueeze(0);output

tensor([[0.0954, 0.0761, 0.0820, 0.0908, 0.1100, 0.0924, 0.0756, 0.0793, 0.0776,
         0.0720]])

In [53]:
output.size()

torch.Size([1, 10])

In [86]:
output_aux = modelQ.forward(states).detach().squeeze()
print(max_actions)
print(output_aux)
output_aux.gather(1,max_actions)

tensor([[1],
        [2],
        [0],
        [0],
        [0],
        [0],
        [2],
        [1],
        [0],
        [1]])
tensor([[ 0.0954, -0.0321, -0.0560, -0.0315],
        [ 0.0761, -0.0497, -0.0244, -0.0441],
        [ 0.0820, -0.0488, -0.0346, -0.0545],
        [ 0.0908, -0.0120, -0.0496, -0.0115],
        [ 0.1100, -0.0151, -0.0774, -0.0173],
        [ 0.0924, -0.0117, -0.0549, -0.0094],
        [ 0.0756, -0.0332, -0.0209, -0.0139],
        [ 0.0793, -0.0306, -0.0318, -0.0187],
        [ 0.0776, -0.0362, -0.0289, -0.0234],
        [ 0.0720, -0.0526, -0.0166, -0.0108]])


tensor([[-0.0321],
        [-0.0244],
        [ 0.0820],
        [ 0.0908],
        [ 0.1100],
        [ 0.0924],
        [-0.0209],
        [-0.0306],
        [ 0.0776],
        [-0.0526]])