In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np

from mlagents_envs.environment import UnityEnvironment

In [2]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Also report the name of GPU 0 so the run records which card was used.
    print(device, torch.cuda.get_device_name(0))
else:
    print(device)

cuda NVIDIA GeForce RTX 2060


In [3]:
# Network dimensions used by the cells below:
#   N_STATES  - input width of the actor's first Linear layer
#   N_ACTIONS - output width of the actor and length of the log-std vector
N_STATES, N_ACTIONS = 210, 4

In [4]:
# Build a (1, N_ACTIONS) tensor filled with ones and show its value and shape.
a = torch.ones((1, N_ACTIONS))
print(a, a.size())

tensor([[1., 1., 1., 1.]]) torch.Size([1, 4])


In [5]:
# Wrap an all-zero (1, N_ACTIONS) tensor in nn.Parameter; the wrapper turns on
# requires_grad, as the printed repr confirms.
a = nn.Parameter(torch.zeros(1, N_ACTIONS))
print(a)

Parameter containing:
tensor([[0., 0., 0., 0.]], requires_grad=True)


In [6]:
# Same construction under the name the policy uses: a log-standard-deviation
# parameter holding one zero per action dimension (N_ACTIONS entries).
log_std = nn.Parameter(torch.zeros(1, N_ACTIONS))
print(log_std)

Parameter containing:
tensor([[0., 0., 0., 0.]], requires_grad=True)


In [7]:
# exp(0) == 1, so a zero log-std gives a standard deviation of 1 per action.
torch.exp(log_std)

tensor([[1., 1., 1., 1.]], grad_fn=<ExpBackward>)

In [8]:
# Stand-in for an actor output: a single row of four means, all equal to 1.
mu = torch.tensor([[1, 1, 1, 1]], dtype=torch.float32)

In [9]:
# Broadcast the standard deviation to the shape of mu (no data is copied by expand).
torch.exp(log_std).expand_as(mu)

tensor([[1., 1., 1., 1.]], grad_fn=<ExpandBackward>)

In [10]:
def init_weights(m):
    """Initialise an ``nn.Linear`` layer in place; leave any other module untouched.

    Weights are drawn from a normal distribution with mean 0 and std 0.1, and
    the bias (when the layer has one) is filled with the constant 0.1.
    Intended to be passed to ``nn.Module.apply`` so that it is called once for
    every sub-module.

    Args:
        m: Any ``nn.Module``; only ``nn.Linear`` instances are modified.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.normal_(m.weight, mean=0., std=0.1)
    # nn.Linear(..., bias=False) stores ``bias`` as None; filling it would raise.
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.1)

In [11]:
class Net(nn.Module):
    """Gaussian policy head: maps observations to a ``Normal`` over actions.

    The mean is produced by three ``nn.Linear`` layers with ``nn.LayerNorm``
    after each of the two hidden layers. The standard deviation does not depend
    on the input: it is ``exp(log_std)`` for a single trainable ``log_std``
    parameter of shape ``(1, N_ACTIONS)``.

    NOTE(review): there is no activation function between the layers — confirm
    that this is intended.
    """

    def __init__(self):
        super().__init__()

        hidden = 256
        self.actor = nn.Sequential(
            nn.Linear(N_STATES, hidden),
            nn.LayerNorm(hidden),
            nn.Linear(hidden, hidden),
            nn.LayerNorm(hidden),
            nn.Linear(hidden, N_ACTIONS),
        )
        # One log-std per action dimension, starting at 0 (i.e. std == 1).
        self.log_std = nn.Parameter(torch.zeros(1, N_ACTIONS))
        # Walk every sub-module and let init_weights re-initialise the Linear layers.
        self.apply(init_weights)

    def forward(self, x):
        """Return ``Normal(mean, scale)`` with ``mean = actor(x)`` and
        ``scale = exp(log_std)`` expanded to the shape of ``mean``."""
        mean = self.actor(x)
        scale = torch.exp(self.log_std).expand_as(mean)
        return Normal(mean, scale)

In [12]:
# Build the policy, then move its parameters to the selected device
# (Module.to returns the module itself).
net = Net()
net = net.to(device)

### Connect to Unity

In [13]:
# Open the connection to Unity on port 5004.
# NOTE(review): file_name=None presumably means "attach to a running Unity
# Editor" rather than launching a built executable — confirm against the
# mlagents_envs documentation for the installed version.
env = UnityEnvironment(file_name= None, base_port=5004)

In [14]:
# Reset the environment, then take the first behavior name it exposes.
env.reset()
behaviorNames = [name for name in env.behavior_specs.keys()]
behaviorName = behaviorNames[0]

In [15]:
# Fetch the current (decision, terminal) step batches for the chosen behavior.
DecisionSteps, TerminalSteps = env.get_steps(behaviorName)

### Send decision steps to NN to calculate actions

In [16]:
# Inspect the raw observations: a list of arrays, one row per agent in each.
# NOTE(review): this dumps every value; showing only the array shapes would be
# easier to read.
DecisionSteps.obs

[array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0.,
         0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0.,
         1., 1.]], dtype=float32),
 array([[0., 0.,

In [17]:
# Keep only the first observation array (210 values per row in the output
# above, matching N_STATES); the other arrays in obs are not used.
states = DecisionSteps.obs[0]

In [18]:
# Convert the observation array to a float32 tensor, then move it to the
# device the network lives on.
states = torch.FloatTensor(states)
states = states.to(device)

In [19]:
# Forward pass: the policy returns a Normal distribution whose loc and scale
# have one row per observation row and one column per action.
dist = net(states)
print(dist)

Normal(loc: torch.Size([1, 4]), scale: torch.Size([1, 4]))


In [20]:
# Draw one action vector per row from the policy distribution.
actions = dist.sample()
print(actions, actions.shape)

tensor([[-1.5006, -0.0490,  2.3654,  2.7857]], device='cuda:0') torch.Size([1, 4])


In [21]:
# Convert the sampled actions to a NumPy array: detach from autograd and
# copy to host memory first.
actions = actions.detach().cpu().numpy()
print(actions)

[[-1.5005676  -0.04895395  2.3654416   2.7857432 ]]


In [22]:
# Queue the sampled actions for this behavior; they are consumed by the next env.step().
env.set_actions(behaviorName, actions)

In [23]:
# Advance the Unity simulation by one step using the queued actions.
env.step()

In [24]:
# Shut down the connection so the port is free for the next UnityEnvironment.
env.close()

# Play for N steps

In [25]:
# Open a fresh connection to Unity on port 5004 for the multi-step rollout.
# NOTE(review): file_name=None presumably means "attach to a running Unity
# Editor" rather than launching a built executable — confirm against the
# mlagents_envs documentation for the installed version.
env = UnityEnvironment(file_name= None, base_port=5004)

In [26]:
# Reset the new environment, then take the first behavior name it exposes.
env.reset()
behaviorNames = [name for name in env.behavior_specs.keys()]
behaviorName = behaviorNames[0]

In [27]:
# Roll the current (untrained) policy out for 200 environment frames.
for frame in range(200):
    DecisionSteps, TerminalSteps = env.get_steps(behaviorName)
    for AgentID in TerminalSteps.agent_id:
        print("step", frame, "agent ", AgentID, "has terminal step")

    # Only compute and queue actions when at least one agent is in the decision batch.
    if len(DecisionSteps.agent_id) > 0:
        state = torch.FloatTensor(DecisionSteps.obs[0]).to(device)
        # Pure inference: skip building an autograd graph for the forward pass.
        with torch.no_grad():
            dist = net(state)
            action = dist.sample()
        action = action.cpu().numpy()
        env.set_actions(behaviorName, action)

    # Advance the simulation on every frame, even when no agent asked for a
    # decision. With the step inside the `if`, a frame without decision
    # requests left the environment untouched, so the remaining iterations
    # re-read the same step batch and made no progress.
    env.step()

step 1 agent  0 has terminal step
step 2 agent  0 has terminal step
step 3 agent  0 has terminal step
step 4 agent  0 has terminal step
step 5 agent  0 has terminal step
step 6 agent  0 has terminal step
step 7 agent  0 has terminal step
step 8 agent  0 has terminal step
step 9 agent  0 has terminal step
step 10 agent  0 has terminal step
step 11 agent  0 has terminal step
step 12 agent  0 has terminal step
step 13 agent  0 has terminal step
step 14 agent  0 has terminal step
step 15 agent  0 has terminal step
step 16 agent  0 has terminal step
step 17 agent  0 has terminal step
step 18 agent  0 has terminal step
step 19 agent  0 has terminal step
step 20 agent  0 has terminal step
step 21 agent  0 has terminal step
step 22 agent  0 has terminal step
step 23 agent  0 has terminal step
step 24 agent  0 has terminal step
step 25 agent  0 has terminal step
step 26 agent  0 has terminal step
step 27 agent  0 has terminal step
step 28 agent  0 has terminal step
step 29 agent  0 has terminal

In [28]:
# Shut down the connection to Unity once the rollout is finished.
env.close()