In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [2]:
class Network(nn.Module):
    
    def __init__(self,input_size,hidden_layers,output_size,drop_p=0.5):
        super().__init__()
        
        self.hidden_layers = nn.ModuleList([nn.Linear(input_size,hidden_layers[0])])
        self.hidden_layers.extend([nn.Linear(h1,h2) for h1,h2 in zip(hidden_layers[:-1],hidden_layers[1:])])
        self.output = nn.Linear(hidden_layers[-1],output_size)
        
        self.dropout = nn.Dropout(p=drop_p)
        
    def forward(self,x):
        
        for linear in self.hidden_layers:
            x = F.relu(linear(x))
            x = self.dropout(x)
            
        x = self.output(x)
        
        return x
        

In [3]:
input_size = 8
hidden_layers = [64,64]
output_size = 4

model = Network(input_size,hidden_layers,output_size)
model

Network(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=64, bias=True)
  )
  (output): Linear(in_features=64, out_features=4, bias=True)
  (dropout): Dropout(p=0.5)
)

In [4]:
env = gym.make('LunarLander-v2')
env.seed(0)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


[0]

In [5]:
state = torch.rand(1,8);state

tensor([[0.4368, 0.3003, 0.9237, 0.7213, 0.5476, 0.6316, 0.4579, 0.3041]])

In [6]:
ps = model.forward(state);torch.exp(ps)

tensor([[0.9387, 0.9101, 0.7764, 0.9486]], grad_fn=<ExpBackward>)

In [8]:
state_alt = env.reset()
state_alt = torch.from_numpy(state_alt).float().unsqueeze(0)

state_alt

tensor([[ 1.1835e-04,  9.3636e-01,  1.1968e-02, -2.8384e-01, -1.3030e-04,
         -2.7110e-03,  0.0000e+00,  0.0000e+00]])

In [87]:
model.forward(state_alt)

tensor([[-0.0311,  0.0725, -0.1546,  0.0468]], grad_fn=<AddmmBackward>)

In [9]:
states = []
for i in range(10):
    states.append(env.reset())

states = np.array(states)
states = torch.from_numpy(states).float().unsqueeze(0)
states

tensor([[[ 7.5161e-03,  9.3632e-01,  7.6127e-01, -2.8623e-01, -8.7023e-03,
          -1.7244e-01,  0.0000e+00,  0.0000e+00],
         [ 5.9081e-03,  9.3416e-01,  5.9841e-01, -4.2994e-01, -6.8392e-03,
          -1.3555e-01,  0.0000e+00,  0.0000e+00],
         [-3.0343e-03,  9.4824e-01, -3.0736e-01,  5.0837e-01,  3.5228e-03,
           6.9622e-02,  0.0000e+00,  0.0000e+00],
         [-4.9807e-03,  9.3840e-01, -5.0451e-01, -1.4785e-01,  5.7782e-03,
           1.1428e-01,  0.0000e+00,  0.0000e+00],
         [-4.0418e-03,  9.4787e-01, -4.0939e-01,  4.8377e-01,  4.6900e-03,
           9.2733e-02,  0.0000e+00,  0.0000e+00],
         [ 4.2742e-03,  9.4790e-01,  4.3291e-01,  4.8599e-01, -4.9459e-03,
          -9.8060e-02,  0.0000e+00,  0.0000e+00],
         [ 2.7787e-03,  9.4285e-01,  2.8143e-01,  1.4907e-01, -3.2129e-03,
          -6.3747e-02,  0.0000e+00,  0.0000e+00],
         [ 3.7238e-03,  9.4131e-01,  3.7717e-01,  4.6237e-02, -4.3082e-03,
          -8.5435e-02,  0.0000e+00,  0.0000e+00],


In [21]:
output = model.forward(states)
output

tensor([[[ 0.2107,  0.0259, -0.1741, -0.1223],
         [ 0.1079, -0.1645, -0.2704,  0.0881],
         [-0.0433, -0.0275, -0.0446, -0.0041],
         [-0.0383,  0.0431, -0.0854, -0.0647],
         [ 0.1106,  0.0180, -0.0566, -0.2072],
         [ 0.1537,  0.0505, -0.2691, -0.0822],
         [ 0.0646,  0.1061, -0.1396,  0.1183],
         [ 0.0193, -0.0141, -0.1404,  0.0119],
         [ 0.1483, -0.1314, -0.2473,  0.0055],
         [ 0.1051,  0.1273, -0.1862, -0.0260]]], grad_fn=<AddBackward0>)

In [28]:
max_actions = output.detach().max(2)[1].unsqueeze(0); max_actions.size()

torch.Size([1, 1, 10])

In [135]:
actions = np.array([1,2,1,0,0,0,1,0,0,1])
actions = torch.from_numpy(actions)

model(states).detach().max(2)[0].size()

torch.Size([1, 10])

In [140]:
model(states).detach().max(2)[0].unsqueeze(dim=1)

tensor([[[ 0.0875,  0.1424,  0.1077,  0.1002,  0.1377,  0.0021,  0.1228,
          -0.0229,  0.0152,  0.1007]]])

In [141]:
class QNetwork(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, seed):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
        """
        super(QNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        "*** YOUR CODE HERE ***"
        
        self.fc1 = nn.Linear(state_size,64)
        self.fc2 = nn.Linear(64,64)
        self.fc3 = nn.Linear(64,action_size)

    def forward(self, state):
        """Build a network that maps state -> action values."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        
        return x

In [142]:
modelQ = QNetwork(8,4,0)

In [143]:
modelQ

QNetwork(
  (fc1): Linear(in_features=8, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=4, bias=True)
)

In [152]:
outputQ = modelQ(states).detach().max(2)[0];outputQ

tensor([[0.0764, 0.0731, 0.1081, 0.1088, 0.1123, 0.0994, 0.0748, 0.0784, 0.1182,
         0.0842]])

In [150]:
rewards = torch.rand(1,10);rewards

tensor([[0.0374, 0.6094, 0.7492, 0.2011, 0.6760, 0.3226, 0.4393, 0.3893, 0.3931,
         0.0932]])

In [154]:
rewards + 0.2*(outputQ)

tensor([[0.0527, 0.6240, 0.7708, 0.2229, 0.6985, 0.3425, 0.4543, 0.4050, 0.4167,
         0.1100]])

In [155]:
actions = [1,2,1,0,3,1,0,2,1,3]

In [191]:
output_target = []
for index in range(len(states[0])):
    print(np.around(float(modelQ(states[0][index])[actions[index]].detach()),decimals=5))
    
output_target

-0.0225
-0.0178
-0.00873
0.08769
-0.02485
-0.03951
0.08887
-0.04289
-0.022
-0.01337


[]

In [212]:
modelQ(states[0])[0].detach()

tensor([ 0.1003, -0.0225, -0.0640, -0.0201])

In [203]:
output = modelQ.forward(states[0]).detach().max(1)[0].unsqueeze(0);output

tensor([[0.1003, 0.0723, 0.0986, 0.0877, 0.0880, 0.0753, 0.0889, 0.0851, 0.0848,
         0.0744]])

In [204]:
output.size()

torch.Size([1, 10])