# Collaboration and Competition: Tennis

In [2]:
from unityagents import UnityEnvironment
import numpy as np

In [3]:
env = UnityEnvironment(file_name="./Tennis_Windows_x86_64/Tennis.exe")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [4]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [5]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents 
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [9]:
def interact(action):
    action = action.reshape(num_agents, action_size)
    env_info = env.step(action)[brain_name]
    next_state, reward, done = env_info.vector_observations, env_info.rewards, env_info.local_done
    return next_state.reshape(num_agents, -1), np.array(reward).reshape(num_agents, -1), np.array(done).reshape(num_agents, -1)

def reset():
    state = env.reset()[brain_name].vector_observations.reshape(num_agents, -1)
    return state

In [10]:
NET_SIZE = 128

class Policy(nn.Module):
    
    def __init__(self, state_size, action_size=1, n_agents=1, fc1_size=NET_SIZE, fc2_size=NET_SIZE):
        super(Policy, self).__init__()
        
        self.bn0 = nn.BatchNorm1d(state_size)
        self.fc1 = nn.Linear(state_size, fc1_size)
        self.bn1 = nn.BatchNorm1d(fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.bn2 = nn.BatchNorm1d(fc1_size)
        self.fc3_mu = nn.Linear(fc2_size, action_size)
        self.fc3_std = nn.Linear(fc2_size, action_size)

    def forward(self, state, log_std_min=-20, log_std_max=2):
        x = self.bn0(state)
        x = torch.relu(self.bn1(self.fc1(state)))
        x = torch.relu(self.bn2(self.fc2(x)))

        mean = self.fc3_mu(x)
        std = self.fc3_std(x)
        std = torch.clamp(std, log_std_min, log_std_max).exp()

        return mean, std
    
class Value(nn.Module):
    
    def __init__(self, state_size, action_size=1, n_agents=1, fc1_size=NET_SIZE, fc2_size=NET_SIZE):
        
        super(Value, self).__init__()
        
        self.bn0 = nn.BatchNorm1d(state_size)
        self.fc1 = nn.Linear(state_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.fc3 = nn.Linear(fc2_size, 1)
        
    def forward(self, x):
        x = self.bn0(x)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
    
class Q(nn.Module):
    
    def __init__(self, state_size, action_size, n_agents=1, fc1_size=NET_SIZE, fc2_size=NET_SIZE):
        
        super(Q, self).__init__()
        
        self.bn0 = nn.BatchNorm1d(state_size+action_size)
        self.fc1 = nn.Linear(state_size + action_size, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.fc3 = nn.Linear(fc2_size, 1)
        
    def forward(self, s, a):
        x = torch.cat([s, a], 1)
        x = self.bn0(x)
        x = torch.relu(self.fc1(x)) 
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

In [11]:
from algorithms.sac import Agent

agent = Agent(
    state_size=state_size, 
    action_size=action_size,
    policy_network=Policy,
    value_network=Value,
    q_network=Q,
    n_agents=num_agents, 
    device=device,
)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [17]:
from collections import deque
import progressbar as pb

def run(n_episodes, t_max, print_every):
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=print_every)  # last 100 scores
    widget = ['training loop: ', pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA()]
    timer = pb.ProgressBar(widgets=widget, maxval=n_episodes).start()
    
    frame_counter = 0
    
    for i_episode in range(1, n_episodes+1):
        states = reset()
        score = 0
        for t in range(t_max):
            frame_counter += 1
            actions = agent.act(states)
            next_states, rewards, dones = interact(actions)
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            score += rewards.max()
            if np.any(dones):
                break 
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        agent.writer.add_scalar('score/mean', score, i_episode)
        if i_episode % print_every == 0:
            print('\rEpisode {}\tScore Mean: {:.2f}\tScore STD: {:.2f}'.format(i_episode, np.mean(scores_window), np.std(scores_window)))
        if np.mean() > 0.5:
            print("Environment solved")
            break
        timer.update(i_episode)
    return scores

When finished, you can close the environment.

In [18]:
%time scores = run(t_max=int(10000), n_episodes=int(10000), print_every=100)

training loop:   0% |                                          | ETA:   1:19:19

Episode 100	Score Mean: 0.00	Score STD: 0.02


training loop:   1% |                                          | ETA:   1:18:55

Episode 200	Score Mean: 0.00	Score STD: 0.02


training loop:   2% |#                                         | ETA:   1:30:28

Episode 300	Score Mean: 0.04	Score STD: 0.06


training loop:   3% |#                                         | ETA:   1:52:11

Episode 400	Score Mean: 0.12	Score STD: 0.07


training loop:   4% |##                                        | ETA:   1:59:05

Episode 500	Score Mean: 0.08	Score STD: 0.06


training loop:   5% |##                                        | ETA:   2:09:18

Episode 600	Score Mean: 0.12	Score STD: 0.06


training loop:   6% |##                                        | ETA:   2:13:54

Episode 700	Score Mean: 0.11	Score STD: 0.05


training loop:   7% |###                                       | ETA:   2:16:12

Episode 800	Score Mean: 0.11	Score STD: 0.04


training loop:   8% |###                                       | ETA:   2:23:14

Episode 900	Score Mean: 0.15	Score STD: 0.10


training loop:   9% |####                                      | ETA:   2:27:09

Episode 1000	Score Mean: 0.14	Score STD: 0.09


training loop:  10% |####                                      | ETA:   2:33:52

Episode 1100	Score Mean: 0.19	Score STD: 0.13


training loop:  11% |#####                                     | ETA:   2:39:30

Episode 1200	Score Mean: 0.19	Score STD: 0.19


training loop:  12% |#####                                     | ETA:   2:42:23

Episode 1300	Score Mean: 0.17	Score STD: 0.12


training loop:  13% |#####                                     | ETA:   2:45:19

Episode 1400	Score Mean: 0.18	Score STD: 0.14


training loop:  14% |######                                    | ETA:   2:49:27

Episode 1500	Score Mean: 0.23	Score STD: 0.22


training loop:  15% |######                                    | ETA:   2:59:09

Episode 1600	Score Mean: 0.32	Score STD: 0.51


training loop:  16% |#######                                   | ETA:   3:07:05

Episode 1700	Score Mean: 0.33	Score STD: 0.56


training loop:  17% |#######                                   | ETA:   3:16:42

Episode 1800	Score Mean: 0.38	Score STD: 0.50


training loop:  18% |#######                                   | ETA:   3:59:31

Episode 1900	Score Mean: 1.16	Score STD: 1.61


training loop:  19% |########                                  | ETA:   5:09:02

Episode 2000	Score Mean: 1.88	Score STD: 2.15


training loop:  20% |########                                  | ETA:   5:17:32

KeyboardInterrupt: 

In [19]:
env.close()