# Notebook 06. End-to-end demo: Learning a Multiplayer Game using RLlib, Ray Tune, and Ray Serve

© 2019-2022, Anyscale. All Rights Reserved <br>
📖 [Back to Table of Contents](./ex_00_rllib_notebooks_table_of_contents.ipynb)<br>
⬅️ [Previous notebook](./ex_05_rllib_and_ray_serve.ipynb) <br>


In this notebook, you will learn how to:
* Reuse our multi-player game from a previous notebook in this tutorial
* Interrupt the game in the middle of an episode with an in-game item sale (a power-up is offered to both players at a price determined by a trained RecSys model served via Ray Serve)
* Use a user model to decide whether to buy the item or not
* Continue the game with or without the bought item

In [68]:
# Import required packages.

import gym
import numpy as np
import os
import pandas
import requests
import time

import ray
from ray import serve
from ray import tune
from ray.rllib.algorithms.crr import CRRConfig
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.examples.env.random_env import RandomEnv

from multi_agent_arena.multi_agent_arena import MultiAgentArena


# Print the versions of the installed `gym` and `ray` packages, so readers can
# compare their setup against the one this notebook was last run with.
print(f"gym: {gym.__version__}")
print(f"ray: {ray.__version__}")

# !ale-import-roms --import-from-pkg atari_py.atari_roms

gym: 0.21.0
ray: 3.0.0.dev0


## Modifying our Game

So far, we have been using our own custom `MultiAgentEnv` sub-class to define our game and asked RLlib to train two policies (one for each agent/player in the game) on how to play the game close to optimal.

In this end-to-end example, we would like to extend this idea and include an in-game power-up (item) sale in the middle of the episode.
The type of the offered item is fixed and always the same for both players. Buying it will allow the respective agent to move twice as fast as before.
Remember that each episode had a fixed number of timesteps (configurable via the `timestep_limit` constructor argument). We will now add some logic such that the game pauses after half of this number of timesteps and asks our price-recommender service (deployed via Ray Serve) for the price at which the item is offered to each player. A simple user model then decides, per player, whether to buy the item at that price.

<img src="images/multi_agent_arena_3.png" width=800 />


In [26]:
# Use this simple script to generate some RecSys (price recommender) offline data:
# PPO is configured on a `RandomEnv` whose spaces mimic the pricing problem, with
# "offline_rl_data" set as the output location for the sampled experiences.

dummy_config = PPOConfig().environment(env=RandomEnv, env_config={
    # Observation space: agent1 total reward, agent2 total reward
    "observation_space": gym.spaces.Box(-100, 100.0, (2, ), np.float32),
    # Price for the offered item (between $0 and $100).
    "action_space": gym.spaces.Box(0.0, 100.0, (1,), np.float32),
    "reward_space": gym.spaces.Box(0.0, 1.0, (), np.float32),
    "p_done": 0.0,
    # One-step episode len:
    # reset() -> obs=game state
    # step(action=recommended price) -> reward=bought or not + done?
    "max_episode_len": 1,
}).offline_data(output="offline_rl_data")

# Uncomment to train and generate the json output.
"""
algo = dummy_config.build()

for _ in range(4):
    algo.train()
"""

# Display the config object as the cell's output.
# (This used to read `config`, a name that is never defined in this notebook and
# therefore raised a NameError on a fresh kernel.)
dummy_config

<ray.rllib.algorithms.ppo.ppo.PPOConfig at 0x7fbdb9faecd0>

In [27]:
# Peek at the recorded offline (JSON) data with pandas.
json_file = "offline_rl_data/in_game_item_price_recsys.json"
# `lines=True` is essential here: the file is read as JSON-lines, i.e. every
# line is parsed as its own JSON object (one recorded batch per line).
dataframe = pandas.read_json(path_or_buf=json_file, lines=True)
dataframe.head(n=5)

Unnamed: 0,type,obs,actions,prev_actions,rewards,prev_rewards,dones,t,eps_id,unroll_id,agent_index,action_prob,action_logp,action_dist_inputs,advantages,value_targets,new_obs
0,SampleBatch,"[[4.8118495941, -2.3441574574], [2.9057257175,...","[[1.0178880692], [0.9851945043], [-0.418107837...","[[1.0178880692], [0.9851945043], [-0.418107837...","[0.9583539367, 0.4122557342, 0.8586096168, 0.2...","[0.9583539367, 0.4122557342, 0.8586096168, 0.2...","[True, True, True, True, True, True, True, Tru...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1473058350, 1302222902, 1192158565, 138371370...","[26600, 26601, 26602, 26603, 26604, 26605, 266...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.3876596391, 0.29237189890000004, 0.33490023...","[-0.9476275444000001, -1.2297286987, -1.093922...","[[0.4882035851, -0.16736510400000001], [0.1870...","[0.4965085387, -0.0309310257, 0.33864313360000...","[0.9583539367, 0.4122557342, 0.8586096168, 0.2...","[[4.8118495941, -2.3441574574], [2.9057257175,..."
1,SampleBatch,"[[-2.5245726109, -6.7920980453], [2.82340765, ...","[[0.6935230494], [0.5013949871], [-0.285651296...","[[0.6935230494], [0.5013949871], [-0.285651296...","[0.556561172, 0.2602818906, 0.5748660564, 0.44...","[0.556561172, 0.2602818906, 0.5748660564, 0.44...","[True, True, True, True, True, True, True, Tru...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1548731360, 1146866576, 868049208, 1059559815...","[26800, 26801, 26802, 26803, 26804, 26805, 268...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.2448666096, 0.2235761732, 0.5428897738, 0.1...","[-1.4070416689, -1.4980031252, -0.6108490229, ...","[[-0.2137183249, 0.226452902], [-0.5752148628,...","[0.0268349648, -0.23261126880000002, 0.1082856...","[0.556561172, 0.2602818906, 0.5748660564, 0.44...","[[-2.5245726109, -6.7920980453], [2.82340765, ..."
2,SampleBatch,"[[-8.8964138031, -2.5550217628], [5.9735541344...","[[-0.6430661082], [-0.5806437731], [-0.9546555...","[[-0.6430661082], [-0.5806437731], [-0.9546555...","[0.1523697376, 0.3082717061, 0.1305330247, 0.7...","[0.1523697376, 0.3082717061, 0.1305330247, 0.7...","[True, True, True, True, True, True, True, Tru...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1576878179, 231362190, 98657002, 1156300994, ...","[27000, 27001, 27002, 27003, 27004, 27005, 270...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.36427187920000004, 0.4829119146, 0.42937782...","[-1.0098547935, -0.7279210091, -0.845418036000...","[[-0.2679643631, 0.0238414705], [-0.2201347053...","[-0.3653242588, -0.1588068306, -0.3405982852, ...","[0.1523697376, 0.3082717061, 0.1305330247, 0.7...","[[-8.8964138031, -2.5550217628], [5.9735541344..."
3,SampleBatch,"[[-9.1011753082, 3.4522781372], [-9.4919176102...","[[-1.2645549774], [0.5220996141], [1.653925538...","[[-1.2645549774], [0.5220996141], [1.653925538...","[0.820802331, 0.5500279665, 0.9657452106000001...","[0.820802331, 0.5500279665, 0.9657452106000001...","[True, True, True, True, True, True, True, Tru...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1646655329, 1720896215, 524391847, 1017520255...","[27200, 27201, 27202, 27203, 27204, 27205, 272...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.3244321644, 0.2451513559, 0.102826900800000...","[-1.1256787777, -1.4058794975, -2.274708271, -...","[[-0.5836541057, -0.0488789529], [-0.446589678...","[0.2631308436, 0.0150480866, 0.431158185, 0.21...","[0.820802331, 0.5500279665, 0.9657452106000001...","[[-9.1011753082, 3.4522781372], [-9.4919176102..."
4,SampleBatch,"[[4.7014288902, 0.4653792679], [-5.8722491264,...","[[-0.3971097469], [-0.8547632098], [0.50789344...","[[-0.3971097469], [-0.8547632098], [0.50789344...","[0.8241453171, 0.5656263828, 0.6364953518, 0.1...","[0.8241453171, 0.5656263828, 0.6364953518, 0.1...","[True, True, True, True, True, True, True, Tru...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[125557271, 1190899551, 660656716, 1153393864,...","[27400, 27401, 27402, 27403, 27404, 27405, 274...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.36774969100000005, 0.2262004018, 0.30153703...","[-1.0003527403, -1.4863339663000001, -1.198862...","[[0.0219877884, -0.0077851862], [0.1896262765,...","[0.3771229088, 0.060254335400000004, 0.1244024...","[0.8241453171, 0.5656263828, 0.6364953518, 0.1...","[[4.7014288902, 0.4653792679], [-5.8722491264,..."


In [30]:
# Configure CRR to learn the price recommender from the recorded JSON data only.
crr_config = CRRConfig()

# No environment is available for sampling (`env=None`), so the spaces are
# passed explicitly; they are the same ones used to generate the offline data.
crr_config.environment(
    env=None,
    observation_space=dummy_config.env_config["observation_space"],
    action_space=dummy_config.env_config["action_space"],
)

crr_config.offline_data(
    input_="dataset",
    input_config={
        # Absolute path to the in-game item price (RecSys) data that we inspected
        # with pandas above. There is no environment to evaluate against, so
        # training is stopped after a fixed number of iterations instead of at a
        # target episode reward.
        "paths": os.path.join(os.getcwd(), "offline_rl_data/in_game_item_price_recsys.json"),
        "format": "json",
    },
    # NOTE(review): declares the recorded actions as already normalized; confirm
    # this matches how the data-generating PPO run wrote its actions.
    actions_in_input_normalized=True,
)

crr_config.framework("torch")

<ray.rllib.algorithms.crr.crr.CRRConfig at 0x7fbdbb294b20>

In [31]:
# Train CRR on the offline data via Ray Tune.
results = tune.run(
    run_or_experiment="CRR",  # Registered name for the CRR Algorithm.
    config=crr_config.to_dict(),  # Our config, converted to a plain python dict.
    # We are learning from dummy data, so a few iterations are enough.
    stop=dict(training_iteration=3),
    checkpoint_freq=1,  # Create a checkpoint every iteration.
    local_dir="results",
    verbose=1,
)


[2m[36m(CRR pid=28988)[0m Checking /Users/sven/Dropbox/Projects/ray-summit-2022-training/ray-rllib/offline_rl_data/in_game_item_price_recsys.json ...
[2m[36m(CRR pid=28988)[0m fpath=/Users/sven/Dropbox/Projects/ray-summit-2022-training/ray-rllib/offline_rl_data/in_game_item_price_recsys.json ...


[2m[36m(CRR pid=28988)[0m 2022-08-04 13:04:56,531	INFO algorithm.py:332 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(CRR pid=28988)[0m [dataset]: Run `pip install tqdm` to enable progress reporting.


[2m[36m(CRR pid=28988)[0m 2022-08-04 13:05:07,350	INFO trainable.py:160 -- Trainable.setup took 10.820 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(RolloutWorker pid=28995)[0m DatasetReader 1 has 1, samples.
[2m[36m(RolloutWorker pid=28996)[0m DatasetReader 2 has 2, samples.
[2m[36m(RolloutWorker pid=28997)[0m DatasetReader 3 has 2, samples.
[2m[36m(RolloutWorker pid=28998)[0m DatasetReader 4 has 2, samples.


2022-08-04 13:05:38,389	INFO tune.py:737 -- Total run time: 52.12 seconds (51.89 seconds for the tuning loop).


In [35]:
# The experiment ran a single trial: select it and fetch its most recent checkpoint.
best_trial = results.get_best_trial()
last_checkpoint = results.get_last_checkpoint(best_trial)
print(f"Last checkpoint from training: {last_checkpoint}")

Last checkpoint from training: <ray.air.checkpoint.Checkpoint object at 0x7fbdbda90460>


In [38]:
# Call `serve.start()` to get a running Ray Serve instance to deploy to.
serve.start()


@serve.deployment(route_prefix="/in-game-recommendations")
class ServeModel:
    """Serve deployment that answers price requests with a restored algorithm."""

    def __init__(self, config, checkpoint) -> None:
        # Build a fresh (untrained) algorithm from the config, then load the
        # already trained state from the checkpoint into it.
        algo = config.build()
        algo.restore(checkpoint)
        self.algo = algo

    async def __call__(self, request):
        payload = await request.json()
        # The observation arrives as a plain JSON list -> convert back to numpy.
        observation = np.array(payload["observation"])
        return {
            "action": self.algo.compute_single_action(observation, explore=False)
        }


# Create the deployment; the arguments are forwarded to `ServeModel.__init__`.
ServeModel.deploy(crr_config, last_checkpoint)

# That's it: Deployment created!

[2m[36m(ServeController pid=29159)[0m INFO 2022-08-04 13:17:29,549 controller 29159 checkpoint_path.py:17 - Using RayInternalKVStore for controller checkpoint and recovery.
[2m[36m(ServeController pid=29159)[0m INFO 2022-08-04 13:17:29,551 controller 29159 http_state.py:115 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:jIZYdn:SERVE_PROXY_ACTOR-node:127.0.0.1-0' on node 'node:127.0.0.1-0' listening on '127.0.0.1:8000'
[2m[36m(HTTPProxyActor pid=29160)[0m INFO:     Started server process [29160]
  ServeModel.deploy(crr_config, last_checkpoint)
[2m[36m(ServeController pid=29159)[0m INFO 2022-08-04 13:17:31,504 controller 29159 deployment_state.py:1280 - Adding 1 replicas to deployment 'ServeModel'.


[2m[36m(ServeModel pid=29161)[0m Checking /Users/sven/Dropbox/Projects/ray-summit-2022-training/ray-rllib/offline_rl_data/in_game_item_price_recsys.json ...
[2m[36m(ServeModel pid=29161)[0m fpath=/Users/sven/Dropbox/Projects/ray-summit-2022-training/ray-rllib/offline_rl_data/in_game_item_price_recsys.json ...


[2m[36m(ServeModel pid=29161)[0m 2022-08-04 13:17:40,635	INFO algorithm.py:332 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(RolloutWorker pid=29171)[0m DatasetReader 4 has 2, samples.
[2m[36m(RolloutWorker pid=29169)[0m DatasetReader 2 has 2, samples.
[2m[36m(RolloutWorker pid=29170)[0m DatasetReader 3 has 2, samples.
[2m[36m(RolloutWorker pid=29168)[0m DatasetReader 1 has 1, samples.


[2m[36m(ServeModel pid=29161)[0m 2022-08-04 13:17:51,465	INFO trainable.py:160 -- Trainable.setup took 10.834 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(ServeModel pid=29161)[0m 2022-08-04 13:17:51,492	INFO trainable.py:654 -- Restored on 127.0.0.1 from checkpoint: /var/folders/j4/brrn254576lgnbqqtp5p1z280000gn/T/checkpoint_tmp_0m3qjtnl
[2m[36m(ServeModel pid=29161)[0m 2022-08-04 13:17:51,492	INFO trainable.py:663 -- Current state after restoring: {'_iteration': 3, '_timesteps_total': None, '_time_total': 30.137645483016968, '_episodes_total': 0}


In [58]:

# Convenience function to send action requests to the service.
def get_price(
    rewards1,
    rewards2,
    url="http://localhost:8000/in-game-recommendations",
    timeout=30.0,
):
    """Ask the deployed price recommender for an item price.

    Args:
        rewards1: Total reward of the player the price is requested for.
        rewards2: Total reward of the other player.
        url: Endpoint of the Serve deployment (defaults to the local one
            created in this notebook).
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The first component of the "action" returned by the service, which is
        used as the item price.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the service does not answer within `timeout`.
    """
    # Convert numpy array to list (needed for http transfer).
    obs = np.array([rewards1, rewards2]).tolist()
    resp = requests.get(url, json={"observation": obs}, timeout=timeout)
    # Fail loudly on HTTP errors instead of with an obscure KeyError/JSON error below.
    resp.raise_for_status()
    response_json = resp.json()
    price = response_json["action"][0]
    return price

# Test our deployment
get_price(0.0, -10.0)


55.382530212402344

[2m[36m(HTTPProxyActor pid=29160)[0m INFO 2022-08-04 14:10:29,662 http_proxy 127.0.0.1 http_proxy.py:316 - GET /in-game-recommendations 200 4.0ms
[2m[36m(ServeModel pid=29161)[0m INFO 2022-08-04 14:10:29,661 ServeModel ServeModel#xPKpku replica.py:467 - HANDLE __call__ OK 1.2ms


In [69]:
class MultiAgentArenaWithItemSale(MultiAgentArena):
    """`MultiAgentArena` with a one-time power-up sale in the middle of an episode.

    When the timestep counter reaches `timestep_limit // 2`, a price is requested
    from the price service (via `get_price`) for each agent. A hard-coded user
    model per agent buys the item if the offered price is below that user's
    threshold. A purchase is charged to the buyer's reward of that timestep
    (price / 10).

    NOTE(review): as implemented, agent1's power-up makes agent1 move before
    agent2, whereas agent2's power-up lets agent2 move twice per timestep.
    Confirm that this asymmetry is intended.
    """

    # Highest price (exclusive) at which the respective user model still buys.
    USER1_MAX_PRICE = 50.0
    USER2_MAX_PRICE = 45.0

    def __init__(self, config=None):
        super().__init__(config=config)

        # The item is offered once half of the episode's timesteps have passed.
        self.sell_item_at_ts = self.timestep_limit // 2

    def reset(self):
        """Resets the arena and removes any power-ups from the previous episode."""
        obs = super().reset()
        self.agent1_moves_first = False
        self.agent2_double_speed = False
        return obs

    def step(self, action: dict):
        """Advances the game by one timestep, running the item sale when due.

        Args:
            action: Dict with the keys "agent1" and "agent2", holding the
                respective agent's action.

        Returns:
            Tuple of (observations, rewards dict, dones dict, empty info dict).
        """
        # Increase our time steps counter by 1.
        self.timesteps += 1
        # An episode is "done" when we reach the time step limit.
        is_done = self.timesteps >= self.timestep_limit

        ######################
        # NEW BEHAVIOR
        ######################
        # Amounts actually paid in this timestep; these stay 0.0 unless the
        # respective user accepts the offer below.
        paid_agent1 = paid_agent2 = 0.0
        # It's time to do the item sale.
        if self.timesteps == self.sell_item_at_ts:
            # Send a price request to our price service.
            price_agent1_item = get_price(self.agent1_R, self.agent2_R)
            price_agent2_item = get_price(self.agent2_R, self.agent1_R)

            # User model agent1: User of agent1 buys if item price < 50.0.
            if price_agent1_item < self.USER1_MAX_PRICE:
                print("User1 bought power-up!")
                time.sleep(1.0)
                self.agent1_moves_first = True
                paid_agent1 = price_agent1_item
            # User model agent2: User of agent2 buys if item price < 45.0.
            if price_agent2_item < self.USER2_MAX_PRICE:
                print("User2 bought power-up!")
                time.sleep(1.0)
                self.agent2_double_speed = True
                paid_agent2 = price_agent2_item

        # Who moves first?
        # events = [collision|agent1_new_field]
        if self.agent1_moves_first:
            events = self._move(self.agent1_pos, action["agent1"], is_agent1=True)
            events |= self._move(self.agent2_pos, action["agent2"], is_agent1=False)
            # Agent2 is allowed to move twice (double the speed).
            if self.agent2_double_speed:
                events |= self._move(self.agent2_pos, action["agent2"], is_agent1=False)
        else:
            events = self._move(self.agent2_pos, action["agent2"], is_agent1=False)
            # Agent2 is allowed to move twice (double the speed).
            if self.agent2_double_speed:
                events |= self._move(self.agent2_pos, action["agent2"], is_agent1=False)
            events |= self._move(self.agent1_pos, action["agent1"], is_agent1=True)

        # Determine rewards based on the collected events AND on the prices paid.
        # Only an actual purchase is charged: a declined offer costs nothing.
        r1 = -1.0 if "collision" in events else 1.0 if "agent1_new_field" in events else -0.5
        r2 = 1.0 if "collision" in events else -0.1
        r1 -= paid_agent1 / 10.0
        r2 -= paid_agent2 / 10.0
        self.agent1_R += r1
        self.agent2_R += r2
        ######################
        # END: NEW BEHAVIOR
        ######################

        rewards = {
            "agent1": r1,
            "agent2": r2,
        }

        # Generate a `done` dict (per-agent and total).
        dones = {
            "agent1": is_done,
            "agent2": is_done,
            # special `__all__` key indicates that the episode is done for all agents.
            "__all__": is_done,
        }

        # Useful for rendering.
        self.collision = "collision" in events
        if self.collision is True:
            self.num_collisions += 1

        return self._get_obs(), rewards, dones, {}  # <- info dict (not needed here).

    

In [70]:
env = MultiAgentArenaWithItemSale(
    config={"render": True, "width": 5, "height": 5, "timestep_limit": 10}
)
obs = env.reset()

# Scripted joint actions, one entry per timestep. With `timestep_limit=10`,
# the item sale is triggered during the fifth step.
scripted_actions = [
    {"agent1": 2, "agent2": 0},  # Agent1 moves down, Agent2 moves up.
    {"agent1": 1, "agent2": 3},  # Agent1 moves right, Agent2 moves left.
    {"agent1": 1, "agent2": 3},  # Agent1 moves right, Agent2 moves left.
    {"agent1": 2, "agent2": 0},  # Agent1 moves down, Agent2 moves up.
    {"agent1": 3, "agent2": 0},  # Agent1 moves left, Agent2 moves up.
    {"agent1": 2, "agent2": 0},  # Agent1 moves down, Agent2 moves up.
]

with env.out:
    # Step through the script and re-render the arena after every step.
    for joint_action in scripted_actions:
        obs, rewards, dones, infos = env.step(action=joint_action)
        env.render()


print("Agent1's x/y position={}".format(env.agent1_pos))
print("Agent2's x/y position={}".format(env.agent2_pos))
print("Env timesteps={}".format(env.timesteps))

Output()

Agent1's x/y position=[2, 1]
Agent2's x/y position=[0, 2]
Env timesteps=6
