# Experiment with Tune in the RayTradingEnvironment

- Tune is a Python library for experiment execution and hyperparameter tuning at any scale. You can use it to tune models built with your favorite machine learning framework.
- Tune further integrates with a wide range of additional hyperparameter optimization tools.
- Tune allows you to transparently parallelize across multiple GPUs and multiple nodes.
- [Tune Documentation](https://docs.ray.io/en/latest/tune/index.html)

In [1]:
from ray import tune
import numpy as np
import pandas as pd
import os
from os import environ
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import pprint
import ray
from ray.rllib.agents.ppo import PPOTrainer

from ray_trading_env import RayTradingEnvironment

INFO:ray_trading_env:ray_trading_env logger started.


In [16]:
# Limit how many DataFrame rows pandas renders in cell outputs.
pd.options.display.max_rows = 10

In [2]:
# Optional: run Ray in local mode (e.g. to access the csv file in the working directory).
# local_mode = bool(int(environ.get('RAY_LOCAL', '0')))
# ray.init(local_mode=local_mode)

In [3]:
# Start the Ray runtime for this notebook session.
# ignore_reinit_error=True keeps the cell re-runnable: without it, executing
# this cell a second time in the same kernel raises a RuntimeError because
# Ray is already initialized.
ray.init(ignore_reinit_error=True)



RayContext(dashboard_url='', python_version='3.8.8', ray_version='1.13.0', ray_commit='e4ce38d001dbbe09cd21c497fedd03d692b2be3e', address_info={'node_ip_address': '10.1.150.226', 'raylet_ip_address': '10.1.150.226', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-06-27_17-35-46_176858_49782/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-06-27_17-35-46_176858_49782/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-06-27_17-35-46_176858_49782', 'metrics_export_port': 52568, 'gcs_address': '10.1.150.226:41082', 'address': '10.1.150.226:41082', 'node_id': '2c1a0853f1cdcb35b693c5864074fbad331c34a67cd4e47066d79069'})

In [4]:
# What will happen:
# - Tune will run several RLlib trainers in parallel
# - The number depends on the number of grid searches we do, i.e.
# every possible lr has to be combined with every possible train_batch_size
# - So, here we will have 4 trials
# - The opportunity to run them in parallel depends on the hardware
# - By default, a single PPO trial uses 3 CPUs (2 actors which step
# through two separate environments and one local learner -> the local
# learner is responsible for the training step (update the NN) and after
# the training step it broadcasts the network back to the two actors)

In [5]:
# Base RLlib configuration; the Tune configuration below starts from it.
config = dict(
    env=RayTradingEnvironment,
    env_config=dict(
        config=dict(
            trading_days=252,
            trading_cost_bps=1e-3,
            time_cost_bps=1e-4,
            ticker="AAPL",
            # Tune needs an explicit path to the price data.
            data_path="/home/jovyan/machine-learning-for-trading/AAPL_prices.csv",
            # max_episode_steps=252,
        ),
    ),
    create_env_on_driver=True,
    # `horizon` has to be given when the env itself has no
    # max_number_of_steps-like parameter.
    horizon=252,
)

In [6]:
# Start the Tune configuration from a shallow copy of the PPOTrainer config:
# top-level keys can be changed independently, nested dicts stay shared.
tune_config = dict(config)

In [7]:
# Hyperparameter search space: each grid search receives a list of candidate
# values, here for the learning rate and for the train batch size.
tune_config.update(
    lr=tune.grid_search([0.0001, 0.5]),
    train_batch_size=tune.grid_search([3000, 4000]),
)

In [8]:
# show the Tune configuration, including the grid-search entries for lr and train_batch_size
tune_config

{'env': ray_trading_env.RayTradingEnvironment,
 'env_config': {'config': {'trading_days': 252,
   'trading_cost_bps': 0.001,
   'time_cost_bps': 0.0001,
   'ticker': 'AAPL',
   'data_path': '/home/jovyan/machine-learning-for-trading/AAPL_prices.csv'}},
 'create_env_on_driver': True,
 'horizon': 252,
 'lr': {'grid_search': [0.0001, 0.5]},
 'train_batch_size': {'grid_search': [3000, 4000]}}

In [9]:
# Stopping criteria: keys may be any field of the result dict that a
# trainer's `train()` call reports.
stop = dict(
    training_iteration=5,
    episode_reward_mean=0.2,
)

In [None]:
# Launch the grid-search experiment; "PPO" selects the PPOTrainer.
# Tune creates the Trainers internally, runs them in parallel and destroys
# them afterwards, so no Trainer object is returned.
tune.run(
    "PPO",
    stop=stop,
    config=tune_config,
    checkpoint_freq=2,  # checkpoint every 2 training iterations ...
    checkpoint_at_end=True,  # ... and once more when a trial is done
)

In [17]:
# Stopping criteria for the next experiment: an iteration cap and a
# mean-episode-reward target.
stop = dict(training_iteration=100, episode_reward_mean=60.0)

In [20]:
# Replace the grid-search entries with fixed values and set the number of
# workers and of environments per worker.
tune_config.update(
    {
        "lr": 0.0001,
        "train_batch_size": 3000,
        "num_workers": 5,
        "num_envs_per_worker": 5,
    }
)

In [21]:
# show the updated Tune configuration (fixed lr / train_batch_size, no grid search)
tune_config

{'env': ray_trading_env.RayTradingEnvironment,
 'env_config': {'config': {'trading_days': 252,
   'trading_cost_bps': 0.001,
   'time_cost_bps': 0.0001,
   'ticker': 'AAPL',
   'data_path': '/home/jovyan/machine-learning-for-trading/AAPL_prices.csv'}},
 'create_env_on_driver': True,
 'horizon': 252,
 'lr': 0.0001,
 'train_batch_size': 3000,
 'num_workers': 5,
 'num_envs_per_worker': 5}

In [None]:
# Run the experiment and keep the object returned by Tune for later inspection.
analysis = tune.run(
    "PPO",
    stop=stop,
    config=tune_config,
    checkpoint_freq=5,
    checkpoint_at_end=True,
)

In [30]:
# Select the best trial by mean episode reward (higher is better).
# Metric and mode are passed explicitly: no default metric/mode was given to
# `tune.run`, so the bare `get_best_trial()` call only works while the
# experiment contains exactly one trial.
best_trial = analysis.get_best_trial(metric="episode_reward_mean", mode="max")

In [31]:
# checkpoint of the best trial (the recorded output points at checkpoint-100)
best_trial.checkpoint

Checkpoint(persistent, /home/jovyan/ray_results/PPO/PPO_RayTradingEnvironment_f9977_00000_0_2022-06-27_17-45-46/checkpoint_000100/checkpoint-100)

In [35]:
# tabular view of the experiment results: reported metrics plus the config/* columns
analysis.dataframe()

Unnamed: 0,episode_reward_max,episode_reward_min,episode_reward_mean,episode_len_mean,episodes_this_iter,num_healthy_workers,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_trained,...,info/learner/default_policy/learner_stats/entropy_coeff,config/create_env_on_driver,config/env,config/env_config,config/horizon,config/lr,config/num_envs_per_worker,config/num_workers,config/train_batch_size,logdir
0,1.707665,-0.421143,0.348181,252.0,0,5,300000,300000,300000,300000,...,0.0,True,<class 'ray_trading_env.RayTradingEnvironment'>,{'config': {'data_path': '/home/jovyan/machine...,252,0.0001,5,5,3000,/home/jovyan/ray_results/PPO/PPO_RayTradingEnv...


In [36]:
# experiment-level stats; the recorded output contains `start_time` and `timestamp`
analysis.stats()

{'start_time': 1656351946.7440944, 'timestamp': 1656352094.4915183}