In [10]:
#automatically reload modules when they have changed
%reload_ext autoreload
import os
import ray
import ray.tune as tune
from ray.rllib import rollout
from ray.tune.registry import get_trainable_cls

from yaml_config_wrapper import Configuration
from RLcraft import MalmoMazeEnv

In [11]:
def create_env(config):
    """Build the custom MalmoMazeEnv from an environment-config mapping.

    Every key below is read with ``config[key]``, so each one must be present:

        mission_file       (forwarded to MalmoMazeEnv as ``xml``)
        width, height
        millisec_per_tick
        mission_timeout_ms
        step_reward, win_reward, lose_reward
        action_space
        client_port
        time_wait
        max_loop

    All keys except ``mission_file`` are forwarded under their own name.
    Extra keys in ``config`` are ignored.

    Returns:
        The object returned by ``MalmoMazeEnv(...)``.
    """
    # Keys handed to MalmoMazeEnv unchanged, in the order they are looked up.
    same_name_keys = (
        "width",
        "height",
        "millisec_per_tick",
        "mission_timeout_ms",
        "step_reward",
        "win_reward",
        "lose_reward",
        "action_space",
        "client_port",
        "time_wait",
        "max_loop",
    )

    # The mission file is the only entry whose keyword name differs from its key.
    env_kwargs = {"xml": config["mission_file"]}
    for key in same_name_keys:
        env_kwargs[key] = config[key]

    return MalmoMazeEnv(**env_kwargs)

def stop_check(trial_id, result):
    """Stopping criterion passed to ``tune.run(stop=...)``.

    Args:
        trial_id: Identifier of the trial being evaluated (not used).
        result: Mapping of reported metrics; must contain
            ``"episode_reward_mean"``.

    Returns:
        The result of ``result["episode_reward_mean"] >= 85``.
    """
    # Mean episode reward at which training is considered done.
    target_mean_reward = 85
    mean_reward = result["episode_reward_mean"]
    return mean_reward >= target_mean_reward

In [15]:
# Export the schema directory (presumably read by Malmo when parsing mission XML).
os.environ['MALMO_XSD_PATH'] = '/home/ubuntu/minerl/MalmoPlatform/Schemas'

# Parse the YAML experiment configuration.
yml_path = 'configs/mazes.yml'
c = Configuration(config_src=yml_path)

# Each section is indexed with [0]; its settings live under the 'config' key.
c_general = c.get_config('general')[0]['config']
c_tuner = c.get_config('tuner')[0]

# Trainer configuration handed to Tune, and the env settings nested inside it.
run_config = c_tuner['config']
env_config = run_config['env_config']

env = None

# Checkpoints are grouped in a sub-directory named after the configuration tag.
checkpoint_path = os.path.join(c_general['checkpoint_path'], c.tag)
print(checkpoint_path, c.tag, run_config)

Configuration file loaded successfully from path: /home/ubuntu/minerl/configs/mazes.yml
Configuration Tag: mazes-v1
./checkpoints/mazes-v1 mazes-v1 {'log_level': 'WARN', 'env': 'mazes-v1', 'env_config': {'mission_file': 'missions/mazes/maze*.xml', 'client_port': 10000, 'width': 84, 'height': 84, 'millisec_per_tick': 20, 'mission_timeout_ms': 300000, 'time_wait': 0.05, 'max_loop': 50, 'action_space': ['move 1', 'move -1', 'strafe 1', 'strafe -1', 'turn 1', 'turn -1'], 'step_reward': -1, 'win_reward': 85, 'lose_reward': 85}, 'framework': 'tf', 'num_gpus': 0, 'num_workers': 1, 'double_q': True, 'dueling': True, 'explore': True, 'exploration_config': {'type': 'EpsilonGreedy', 'initial_epsilon': 1.0, 'final_epsilon': 0.02, 'epsilon_timesteps': 100}}


In [16]:
# Start a fresh local Ray runtime and register the custom environment.
# Any Ray instance left over from an earlier run of this cell is shut down
# first, before ray.init() is called again.
# NOTE(review): ray.shutdown() is expected to be a no-op when Ray is not
# running, so it should not need to be commented out on a first run; confirm
# against the installed Ray version.
ray.shutdown()
# Register create_env under the configuration tag; the printed run_config
# uses that same tag as its 'env' entry.
tune.register_env(c.tag, create_env)
ray.init()

{'node_ip_address': '172.31.40.185',
 'raylet_ip_address': '172.31.40.185',
 'redis_address': '172.31.40.185:6379',
 'object_store_address': '/tmp/ray/session_2022-12-09_13-48-30_575985_4215/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2022-12-09_13-48-30_575985_4215/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2022-12-09_13-48-30_575985_4215',
 'metrics_export_port': 38695,
 'node_id': 'a86c0d219acc9db24a3aafee2d3b3604da38bf057766675fcb9dc280'}

In [17]:
# Train a DQN agent with the trainer settings in run_config.
# stop_check decides when a trial ends; results are written under the
# configured log path.
tune.run(
    "DQN",
    config=run_config,
    stop=stop_check,
    checkpoint_freq=1,  # save a checkpoint every training iteration
    checkpoint_at_end=True,  # and once more when the trial finishes
    local_dir=c_general['log_path'],
)

Trial name,status,loc
DQN_mazes-v1_2c4c0_00000,PENDING,


[2m[36m(pid=5146)[0m 2022-12-09 13:48:49,067	INFO trainer.py:714 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=5146)[0m 2022-12-09 13:48:49,068	INFO dqn.py:188 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=5146)[0m 2022-12-09 13:48:49,068	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=5146)[0m 2022-12-09 13:48:59,395	INFO trainable.py:109 -- Trainable.setup took 10.336 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
