
Implement environment reward based on goals scored/conceded at each timestep

Previously the reward returned by the environment was always 0 at every timestep (in practice this did not matter, since we were deriving shaping rewards from statistics in the observations rather than using the environment reward). I have also implemented `get_reward_spec` and `get_discount_spec` methods for the soccer task, and added some information to the `README.md` describing how rewards and episode terminations work for the soccer environment.

PiperOrigin-RevId: 254631394
liusiqi43 authored and alimuldal committed Jun 23, 2019
1 parent 14037b6 commit f4fe15ea1f062bcbe9e7d3d403d64cf1d3a714a2
@@ -38,6 +38,24 @@ while not time_step.last():
time_step.observation[i]))
```

## Rewards

The environment provides a reward of +1 to each player when their team
scores a goal, -1 when their team concedes a goal, or 0 if neither team scores
on the current timestep.

In addition to the sparse reward returned by the environment, the player
observations also contain various environment statistics that may be used to
derive custom per-player shaping rewards (as was done in
http://arxiv.org/abs/1902.07151, where the environment reward was ignored).
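
For example, a custom shaping reward can be derived from these statistics. The
following is a minimal sketch only: the observation key `stats_vel_to_ball` and
the weighting are assumptions, and the keys actually available can be inspected
via `env.observation_spec()`.

```python
import numpy as np
from dm_control.locomotion import soccer as dm_soccer

env = dm_soccer.load(team_size=2, time_limit=10.)

def shaping_rewards(time_step, weight=0.01):
  """Sketch of a per-player shaping reward from observation statistics."""
  rewards = []
  for player_obs in time_step.observation:
    # Reward each player for moving towards the ball (key name assumed).
    vel_to_ball = float(np.squeeze(player_obs.get('stats_vel_to_ball', 0.)))
    rewards.append(weight * vel_to_ball)
  return rewards

time_step = env.reset()
actions = [np.zeros(spec.shape, spec.dtype) for spec in env.action_spec()]
time_step = env.step(actions)
print(shaping_rewards(time_step))
```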

## Episode terminations

Episodes will terminate immediately with a discount factor of 0 when either side
scores a goal. There is also a per-episode `time_limit` (45 seconds by default).
If neither team scores within this time then the episode will terminate with a
discount factor of 1.
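
The terminal discount therefore distinguishes a goal (discount 0) from a
time-limit truncation (discount 1). The sketch below is illustrative only: it
accumulates the sparse per-player rewards over an episode and reports why the
episode ended.

```python
import numpy as np
from dm_control.locomotion import soccer as dm_soccer

env = dm_soccer.load(team_size=2, time_limit=10.)
actions = [np.zeros(spec.shape, spec.dtype) for spec in env.action_spec()]

time_step = env.reset()
returns = np.zeros(len(actions))
while not time_step.last():
  time_step = env.step(actions)
  # Accumulate the sparse per-player environment reward.
  returns += np.hstack(time_step.reward)

if time_step.discount == 0.:
  print('Episode terminated by a goal. Per-player returns:', returns)
else:
  # discount == 1: the per-episode time_limit was reached without a goal, so a
  # value estimate for the final state could still be bootstrapped.
  print('Episode truncated by the time limit. Per-player returns:', returns)
```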

## Environment Viewer

To visualize an example 2-vs-2 soccer environment in the `dm_control`
@@ -26,19 +26,15 @@
import numpy as np
from six.moves import zip

_THROW_IN_BALL_Z = 0.5

_REWARD_LOSE = -30.
_REWARD_WIN = 30.
from dm_control.rl import specs

_HOME_XMAT = np.asarray([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
_AWAY_XMAT = np.asarray([[-1, 0, 0], [0, -1, 0], [0, 0, 1]])
_THROW_IN_BALL_Z = 0.5


def _disable_geom_contacts(entities):
  for entity in entities:
    mjcf_model = entity.mjcf_model
    for geom in mjcf_model.find_all('geom'):
    for geom in mjcf_model.find_all("geom"):
      geom.set_attributes(contype=0)


@@ -137,16 +133,50 @@ def root_entity(self):
    return self.arena

  def get_reward(self, physics):
    return [np.zeros((), dtype=np.float32)] * len(self.players)
    """Returns a list of per-player rewards.

    Each player will receive a reward of:
      +1 if their team scored a goal
      -1 if their team conceded a goal
       0 if no goals were scored on this timestep.

    Note: the observations also contain various environment statistics that may
      be used to derive per-player rewards (as done in
      http://arxiv.org/abs/1902.07151).

    Args:
      physics: An instance of `Physics`.

    Returns:
      A list of 0-dimensional numpy arrays, one per player.
    """
    scoring_team = self.arena.detected_goal()
    if not scoring_team:
      return [np.zeros((), dtype=np.float32)] * len(self.players)

    rewards = []
    for p in self.players:
      if p.team == scoring_team:
        rewards.append(np.ones((), dtype=np.float32))
      else:
        rewards.append(-np.ones((), dtype=np.float32))
    return rewards

  def get_reward_spec(self):
    reward_spec = specs.ArraySpec(name="reward", shape=(), dtype=np.float32)
    return [reward_spec] * len(self.players)

  def get_discount(self, physics):
    if self.arena.detected_goal():
      return np.zeros((), np.float32)
    return np.ones((), np.float32)

  def get_discount_spec(self):
    return specs.ArraySpec(name="discount", shape=(), dtype=np.float32)

  def should_terminate_episode(self, physics):
    # TerminationType determined by get_discount(physics).
    return self.arena.detected_goal()
    """Returns True if a goal was scored by either team."""
    return self.arena.detected_goal() is not None

  def before_step(self, physics, actions, random_state):
    for player, action in zip(self.players, actions):
@@ -162,6 +162,15 @@ def test_num_players(self, home_size, away_size, num_observations):
    while not timestep.last():
      timestep = env.step(actions)

    self.assertLen(timestep.observation, home_size + away_size)

    self.assertLen(timestep.reward, home_size + away_size)
    for player_spec, player_reward in zip(env.reward_spec(), timestep.reward):
      player_spec.validate(player_reward)

    discount_spec = env.discount_spec()
    discount_spec.validate(timestep.discount)

  def test_all_contacts(self):
    env = _env(_home_team(1) + _away_team(1))

@@ -278,6 +287,70 @@ def test_prev_actions(self):
actions[walker_idx],
err_msg="Walker {}: incorrect previous action.".format(walker_idx))

  @parameterized.named_parameters(
      dict(testcase_name="1vs2_draw",
           home_size=1, away_size=2, ball_vel_x=0, expected_home_score=0),
      dict(testcase_name="1vs2_home_score",
           home_size=1, away_size=2, ball_vel_x=50, expected_home_score=1),
      dict(testcase_name="2vs1_away_score",
           home_size=2, away_size=1, ball_vel_x=-50, expected_home_score=-1),
      dict(testcase_name="3vs0_home_score",
           home_size=3, away_size=0, ball_vel_x=50, expected_home_score=1),
      dict(testcase_name="0vs2_home_score",
           home_size=0, away_size=2, ball_vel_x=50, expected_home_score=1),
      dict(testcase_name="2vs2_away_score",
           home_size=2, away_size=2, ball_vel_x=-50, expected_home_score=-1),
  )
  def test_scoring_rewards(
      self, home_size, away_size, ball_vel_x, expected_home_score):
    env = _env(_home_team(home_size) + _away_team(away_size))

    def _score_configuration(physics, random_state):
      del random_state  # Unused.
      # Send the ball shooting towards either the home or away goal.
      env.task.ball.set_pose(physics, [0., 0., 0.5])
      env.task.ball.set_velocity(physics,
                                 velocity=[ball_vel_x, 0., 0.],
                                 angular_velocity=[0., 0., 0.])

    env.add_extra_hook("initialize_episode", _score_configuration)

    actions = [np.zeros(s.shape, s.dtype) for s in env.action_spec()]

    # Disable contacts and gravity so that the ball follows a straight path.
    with env.physics.model.disable("contact", "gravity"):

      timestep = env.reset()
      with self.subTest("Reward and discount are None on the first timestep"):
        self.assertTrue(timestep.first())
        self.assertIsNone(timestep.reward)
        self.assertIsNone(timestep.discount)

      # Step until the episode ends.
      timestep = env.step(actions)
      while not timestep.last():
        self.assertTrue(timestep.mid())
        # For non-terminal timesteps, the reward should always be 0 and the
        # discount should always be 1.
        np.testing.assert_array_equal(np.hstack(timestep.reward), 0.)
        self.assertEqual(timestep.discount, 1.)
        timestep = env.step(actions)

    # If a goal was scored then the episode should have ended with a discount of
    # 0. If neither team scored and the episode ended due to hitting the time
    # limit then the discount should be 1.
    with self.subTest("Correct terminal discount"):
      if expected_home_score != 0:
        expected_discount = 0.
      else:
        expected_discount = 1.
      self.assertEqual(timestep.discount, expected_discount)

    with self.subTest("Correct terminal reward"):
      reward = np.hstack(timestep.reward)
      np.testing.assert_array_equal(reward[:home_size], expected_home_score)
      np.testing.assert_array_equal(reward[home_size:], -expected_home_score)

  def test_throw_in(self):
    env = _env(_home_team(1) + _away_team(1))

@@ -287,7 +360,7 @@ def _throw_in_configuration(physics, unused_random_state):

      x, y, rotation = 0., 3., np.pi / 6.
      ball.set_pose(physics, [x, y, 0.5])
      # Ball shooting up. Walkers going tangent.
      # Ball shooting out of bounds.
      ball.set_velocity(physics, velocity=[0., 50., 0.],
                        angular_velocity=[0., 0., 0.])
