https://medium.com/distributed-computing-with-ray/anatomy-of-a-custom-environment-for-rllib-327157f269e5

In [8]:
import gym
from gym.utils import seeding

## Define Custom Env

Gym classes should inherit from the gym.Env class

Gym classes should have 6 methods:
- **__init__()** with self.action_space and self.observation_space
- **reset()** resets the state of the environment for a new episode and returns initial observation
- **step(action)** how an agent takes an action during one step in an episode
- **render()** (*optional*) visualise the state of the environment
- **seed()** (*optional*) set seet for env's random generators
- **close()** (*optional*) how to close an environment

In [40]:
import numpy as np

class Example_v0(gym.Env):
    
    def __init__(self, env_config=None):
        # extra vars useful for this specific env
        self.left_min, self.right_max = 1, 10
        self.move_left, self.move_right = 0, 1
        self.max_steps = 10
        self.reward_away = -2
        self.reward_step = -1
        self.reward_goal = self.max_steps
        self.metadata = {'render.modes': ['human']}
        self.goal = int((self.left_min+self.right_max-1)/2) # place goal in middle of observation space array (makes env simpler)
        self.init_positions = list(range(self.left_min, self.right_max))
        self.init_positions.remove(self.goal)
        
        # vars required by gym
        self.action_space = gym.spaces.Discrete(2) # 2 poss actions
        self.observation_space = gym.spaces.Discrete(self.right_max+1) # observation space recieved by agent
        
        # optional
        self.seed()
        
        
    def reset(self):
        # extra vars useful for this specific env
        self.position = self.np_random.choice(self.init_positions) # agent position in array
        self.count = 0 # number of steps taken this episode
        
        # vars required by gym
        self.state = self.position
        self.reward = 0
        self.done = False
        self.info = {}
        
        return self.state
        
    
    def step(self, action):
        if self.done:
            # should never happen!
            print('Episode done.')
        elif self.count == self.max_steps:
            self.done = True
        else:
            assert self.action_space.contains(action)
            self.count += 1
            
            # simulation logic to handle action
            if action == self.move_left:
                if self.position == self.left_min:
                    # invalid action
                    self.reward = self.reward_away
                else:
                    # update position
                    self.position -= 1
                
                if self.position == self.goal:
                    # agent reached goal
                    self.reward = self.reward_goal
                    self.done = True
                elif self.position < self.goal:
                    # moving away from goal
                    self.reward = self.reward_away
                else:
                    # moving towards goal
                    self.reward = self.reward_step
                    
            elif action == self.move_right:
                if self.position == self.right_max:
                    # invalid action
                    self.reward = self.reward_away
                else:
                    # update position
                    self.position += 1
                
                if self.position == self.goal:
                    # agent reached goal
                    self.reward = self.reward_goal
                    self.done = True
                elif self.position > self.goal:
                    # moving away from goal
                    self.reward = self.reward_away
                else:
                    # moving towards goal
                    self.reward = self.reward_step
                    
        # update env state
        self.state = self.position
        try:
            assert self.observation_space.contains(self.state)
        except AssertionError:
            print('Invalid state', self.state)
        
        # (optional) define info dict (useful for diagnositc info & troubleshooting)
        self.info['dist'] = self.goal - self.position
            
        return [self.state, self.reward, self.done, self.info]
            
        
        
    
    def render(self, mode='human'):
        s = "position: {:2d}  reward: {:2d}  info: {}"
        print(s.format(self.state, self.reward, self.info))
        
    
    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        
        return [seed]
    
    def close(self):
        pass

## Measure Random-Action Baseline

In [29]:
def run_one_episode(env):
    env.reset()
    sum_reward = 0
    
    for i in range(env.max_steps):
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        sum_reward += reward
        
        if done:
            break
            
    return sum_reward

In [30]:
env = Example_v0()

history = []

for _ in range(10000):
    sum_reward = run_one_episode(env)
    history.append(sum_reward)
    
avrg_sum_reward = sum(history) / len(history)
print('Random action baseline: {}'.format(avrg_sum_reward))

Random action baseline: -4.9992


## Train a Policy with RLLib

In [32]:
import ray
import ray.rllib.agents.ppo as ppo

ray.shutdown()
ray.init(ignore_reinit_error=True)

Instructions for updating:
non-resource variables are not supported in the long term


2020-11-08 11:36:33,934	INFO services.py:1164 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '128.40.41.23',
 'raylet_ip_address': '128.40.41.23',
 'redis_address': '128.40.41.23:41352',
 'object_store_address': '/tmp/ray/session_2020-11-08_11-36-31_984135_20585/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-11-08_11-36-31_984135_20585/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2020-11-08_11-36-31_984135_20585',
 'metrics_export_port': 53385}

### Configure Checkpoint Saving

In [31]:
import shutil
import os

# clear saved agent folder
CHECKPOINT_ROOT = 'tmp/ppo/custom_env'
shutil.rmtree(CHECKPOINT_ROOT, ignore_errors=True, onerror=None)

# clear ray results folder
RAY_RESULTS = os.getenv('HOME') + '/ray_results'
print(RAY_RESULTS)
shutil.rmtree(RAY_RESULTS, ignore_errors=True, onerror=None)

/home/zciccwf/ray_results


### Configure RL Params

In [37]:
config = ppo.DEFAULT_CONFIG.copy() # use 'proximal policy optimisation' policy optimiser
config['num_gpus'] = 1
config['num_workers'] = 1
config['eager_tracing'] = False
config['log_level'] = 'WARN'

agent = ppo.PPOTrainer(config=config, env=Example_v0)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


[2m[36m(pid=80409)[0m Instructions for updating:
[2m[36m(pid=80409)[0m non-resource variables are not supported in the long term
[2m[36m(pid=80409)[0m Instructions for updating:
[2m[36m(pid=80409)[0m If using Keras pass *_constraint arguments to layers.
2020-11-08 11:40:12,300	INFO trainable.py:252 -- Trainable.setup took 22.998 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


### Train Agent

In [39]:
N_ITER = 5
s = "{:3d} | reward {:6.2f}/{:6.2f}/{:6.2f} | len {:6.2f} | saved agent to {}"

for i in range(N_ITER):
    result = agent.train()
    file_name = agent.save(CHECKPOINT_ROOT)
    
    print(s.format(
    i + 1,
    result["episode_reward_min"],
    result["episode_reward_mean"],
    result["episode_reward_max"],
    result["episode_len_mean"],
    file_name
   ))

Instructions for updating:
Prefer Variable.assign which has equivalent behavior in 2.X.


[2m[36m(pid=80409)[0m Instructions for updating:
[2m[36m(pid=80409)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


  1 | reward -21.00/ -4.95/ 10.00 | len   7.38 | saved agent to tmp/ppo/custom_env/checkpoint_1/checkpoint-1
  2 | reward -20.00/  1.23/ 10.00 | len   5.55 | saved agent to tmp/ppo/custom_env/checkpoint_2/checkpoint-2
  3 | reward -18.00/  5.77/ 10.00 | len   3.90 | saved agent to tmp/ppo/custom_env/checkpoint_3/checkpoint-3
  4 | reward -17.00/  7.02/ 10.00 | len   3.41 | saved agent to tmp/ppo/custom_env/checkpoint_4/checkpoint-4
  5 | reward -18.00/  7.66/ 10.00 | len   3.03 | saved agent to tmp/ppo/custom_env/checkpoint_5/checkpoint-5


### Rollout Trained Agent

In [41]:
agent.restore(file_name) # load last saved agent
env = Example_v0()
state = env.reset()

sum_reward = 0
n_step = 20

for i in range(n_step):
    action = agent.compute_action(state)
    state, reward, done, info = env.step(action)
    
    sum_reward += reward
    env.render()
    
    if done:
        print('Cumulative reward: {}'.format(sum_reward))
        state = env.reset()
        sum_reward = 0

2020-11-08 11:46:57,951	INFO trainable.py:481 -- Restored on 128.40.41.23 from checkpoint: tmp/ppo/custom_env/checkpoint_5/checkpoint-5
2020-11-08 11:46:57,953	INFO trainable.py:489 -- Current state after restoring: {'_iteration': 5, '_timesteps_total': None, '_time_total': 36.64543581008911, '_episodes_total': 4779}


position:  6  reward: -1  info: {'dist': -1}
position:  5  reward: 10  info: {'dist': 0}
Cumulative reward: 9
position:  5  reward: 10  info: {'dist': 0}
Cumulative reward: 10
position:  4  reward: -1  info: {'dist': 1}
position:  5  reward: 10  info: {'dist': 0}
Cumulative reward: 9
position:  3  reward: -1  info: {'dist': 2}
position:  4  reward: -1  info: {'dist': 1}
position:  5  reward: 10  info: {'dist': 0}
Cumulative reward: 8
position:  5  reward: 10  info: {'dist': 0}
Cumulative reward: 10
position:  7  reward: -1  info: {'dist': -2}
position:  6  reward: -1  info: {'dist': -1}
position:  5  reward: 10  info: {'dist': 0}
Cumulative reward: 8
position:  5  reward: 10  info: {'dist': 0}
Cumulative reward: 10
position:  6  reward: -1  info: {'dist': -1}
position:  5  reward: 10  info: {'dist': 0}
Cumulative reward: 9
position:  6  reward: -1  info: {'dist': -1}
position:  5  reward: 10  info: {'dist': 0}
Cumulative reward: 9
position:  2  reward: -1  info: {'dist': 3}
position:  