In [1]:
import os

from unityagents import UnityEnvironment
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.nn.functional as F
import torch.distributed as dist
from torch import optim
from torch.autograd import Variable
from collections import deque
import random
import copy

In [2]:
# env.close()
env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64", no_graphics=True)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [3]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [4]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents 
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size # 2
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1] # 24
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [5]:
from collections import namedtuple
class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)  # internal memory (deque)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)
    
    def add(self, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)
    
    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float()
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float()

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [6]:
dim_actor_in = state_size
dim_actor_h1 = 256
dim_actor_h2 = 128
dim_actor_h3 = 64
dim_actor_out = action_size

dim_critic_in = num_agents * state_size
dim_critic_h1 = 256
dim_critic_hi = dim_critic_h1 + num_agents * dim_actor_out
dim_critic_h2 = 128
dim_critic_h3 = 64
dim_critic_out = 1

class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * random.gauss(self.mu, self.sigma)/2
        self.state = x + dx
        return self.state

def hidden_init(layer):
    fan_in = layer.weight.data.size()[0]
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(dim_actor_in, dim_actor_h1)
        self.bn1 = nn.BatchNorm1d(dim_actor_h1)
        self.fc2 = nn.Linear(dim_actor_h1, dim_actor_h2)
        self.bn2 = nn.BatchNorm1d(dim_actor_h2)
        self.fc3 = nn.Linear(dim_actor_h2, dim_actor_h3)
        self.fc4 = nn.Linear(dim_actor_h3, dim_actor_out)
        self.reset_parameters()
        
    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
        self.fc4.weight.data.uniform_(-3e-3, 3e-3)
        
    def forward(self, state):
        h1 = self.bn1(
            self.fc1(state))
        h2 = f.relu(self.fc2(h1))
        h3 = f.relu(self.fc3(h2))
        out = self.fc4(h3)
        return out
    
class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(dim_critic_in, dim_critic_h1)
        self.bn1 = nn.BatchNorm1d(dim_critic_h1)
        self.fc2 = nn.Linear(dim_critic_hi, dim_critic_h2)
        self.bn2 = nn.BatchNorm1d(dim_critic_h2)
        self.fc3 = nn.Linear(dim_critic_h2, dim_critic_h3)
        self.fc4 = nn.Linear(dim_critic_h3, dim_critic_out)
        self.reset_parameters()
        
    def reset_parameters(self):
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
        self.fc4.weight.data.uniform_(-3e-3, 3e-3)
        
    def forward(self, state, action):
#         x = torch.cat((state), dim=1)
        h1 = self.bn1(
            self.fc1(state))
        hi = torch.cat((h1, nn.BatchNorm1d(4)(action)), dim=1)
        h2 = f.relu(self.fc2(hi))
        h3 = f.relu(self.fc3(h2))
        out = self.fc4(h3)
        return out

In [7]:
class Agent:
    def __init__(self):
        self.actor_local = Actor()
        self.actor_target = Actor()
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)
        
        self.tau = tau
        self.noise = OUNoise(action_size, seed)
        
    def act(self, state, episode, t):
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).data.numpy()
            action += max(0,(1.0/t)) * max(1-episode/3000., 0) * self.noise.sample()
        self.actor_local.train()
        return action
        
    def _update_actor(self):
        for target_param, local_param in zip(self.actor_target.parameters(), self.actor_local.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def _update_critic(self):
        for target_param, local_param in zip(self.critic_target.parameters(), self.critic_local.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def soft_update(self):
#         self._update_critic()
        self._update_actor()

class Agency:
    def __init__(self):
        self.tau = tau
        self.critic_local = Critic()
        self.critic_target = Critic()
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)
        self.agents = [Agent() for _ in range(num_agents)]
        self.replay_buffer = ReplayBuffer(action_size*num_agents, buffer_size, batch_size, seed)
        
    def act(self, states, episode, t):
        states = states.reshape(-1, 1, 24)
        actions = np.array([
            agent.act(torch.from_numpy(states[idx]).float(), episode, t)[0] for idx, agent in enumerate(self.agents)
        ])
        return actions
    
    def add_experience(self, states,actions,rewards,next_states,dones):
        self.replay_buffer.add(
            states.reshape(1,-1),
            actions.reshape(1, -1),
            rewards.reshape(1, -1),
            next_states.reshape(1, -1),
            dones.reshape(1,-1)
        )
        self.replay_buffer.add(
            np.flip(states, axis=0).reshape(1,-1),
            np.flip(actions, axis=0).reshape(1, -1),
            np.flip(rewards, axis=0).reshape(1, -1),
            np.flip(next_states, axis=0).reshape(1, -1),
            np.flip(dones, axis=0).reshape(1,-1)
        )
    def train(self):
        if len(self.replay_buffer) >= batch_size:
            states, actions, rewards, next_states, dones = self.replay_buffer.sample()
            for idx in range(num_agents):
                # This process is run twice so that both agents may be in the "This" and "That" states.
                # We're training the critic twice and each actor only once.
                agents = np.roll(self.agents, idx, axis=0)
                this_agent = agents[0]
                that_agent = agents[1]
                actions_future = torch.cat((
                    this_agent.actor_target(next_states[:, :state_size]),
                    that_agent.actor_target(next_states[:, state_size:])
                ), dim=1)
                Q_future = self.critic_target(next_states, actions_future)
                # Gamma is scalar, Q_future is 1024x1, 
                Q_targets = (rewards[:, 0].reshape(-1, 1) + (gamma * Q_future))
                # Compute critic loss
                Q_expected = self.critic_local(states, actions)
                critic_loss = F.mse_loss(Q_expected, Q_targets)
                # Minimize the loss
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                self.critic_optimizer.step()
                # Compute actor loss
                actions_pred = torch.cat((
                    this_agent.actor_local(states[:, :state_size]),
                    that_agent.actor_local(states[:, state_size:])
                ), dim=1)
                actor_loss = -self.critic_local(states, actions_pred).mean()
                # Minimize the loss
                this_agent.actor_optimizer.zero_grad()
                actor_loss.backward()
                this_agent.actor_optimizer.step()
                # Update Actor target
                this_agent.soft_update()
                # Update Critic Target
                for target_param, local_param in zip(self.critic_target.parameters(), self.critic_local.parameters()):
                    target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)


In [8]:
num_episodes = 5000
buffer_size = int(1e4)
batch_size = 1024
num_agents = 2
gamma = 0.99

tau = 5e-2
lr_actor = 1e-3
lr_critic = 1e-3

seed = 2

In [None]:
scores = np.array([]) 

agency = Agency()
avg_max = 0
for ep_num in range(num_episodes):
    ep_scores = np.zeros(num_agents)
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    t = 0
    while True:
        t += 1
        actions = agency.act(states, ep_num, t)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = np.array(env_info.rewards)
        dones = np.array(env_info.local_done)
        agency.add_experience(states,actions,rewards,next_states,dones)
        agency.train() 
        # Environment Rules take the max of the agents as the "episode score"
        ep_scores += rewards
        states = next_states
        if sum(dones) > 0:
            break
    ep_score = np.max(ep_scores)
    scores = np.append(scores, ep_score)
    avg_score = np.mean(scores[-100:])
    avg_max = max(avg_score, avg_max)
    if avg_score > 0.5:
        print("SOLVED IN {} EPISODES".format(ep_num))
        break
    print("\rEPISODE={}:{}\tSCORE={}\tAVG={}\tAVG_MAX={}-------".format(ep_num,t, ep_score, avg_score, avg_max), end="")



EPISODE=1094:21	SCORE=0.0	AVG=0.003800000101327896	AVG_MAX=0.006700000166893006-------166893006--------

In [None]:
scores