# Example Custom TF Model for RayTradingEnv

- RLlib supports custom TensorFlow (Keras) and PyTorch models.
- This example uses a sample custom model from the RLlib tutorials.

In [14]:
from ray import tune
import numpy as np
import pandas as pd
import os
import gym
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import pprint
import ray
from ray.rllib.agents.ppo import PPOTrainer
from ray_trading_env import RayTradingEnvironment

In [3]:
# Start (or reuse) the local Ray runtime. `ignore_reinit_error=True` keeps
# this cell re-runnable: without it, executing the cell a second time in the
# same kernel raises because Ray is already initialized.
ray.init(ignore_reinit_error=True)



RayContext(dashboard_url='', python_version='3.8.8', ray_version='1.13.0', ray_commit='e4ce38d001dbbe09cd21c497fedd03d692b2be3e', address_info={'node_ip_address': '10.1.150.226', 'raylet_ip_address': '10.1.150.226', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-06-27_18-55-53_599559_56084/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-06-27_18-55-53_599559_56084/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-06-27_18-55-53_599559_56084', 'metrics_export_port': 55784, 'gcs_address': '10.1.150.226:47068', 'address': '10.1.150.226:47068', 'node_id': '64285f3596a3491064a381176173629dd6e63b8f53441762dc456aef'})

In [5]:
# Base RLlib configuration (environment left at its default parameters).
# `horizon` caps the episode length; it has to be given explicitly when the
# env exposes no max_number_of_steps-like parameter of its own.
config = dict(
    env=RayTradingEnvironment,
    create_env_on_driver=True,
    horizon=252,
)

In [9]:
# Stopping criteria for Tune: 10 training iterations or a mean episode
# reward of 1.
# NOTE(review): the `tune.run` call further down passes its own inline
# `stop` dict, so this variable is not referenced there -- confirm which
# criteria are intended.
stop = dict(
    training_iteration=10,
    episode_reward_mean=1,
)

In [22]:
# Implement Custom Model

In [23]:
# Note: this is an example model from Rllib tutorials
# https://github.com/sven1977/rllib_tutorials/blob/main/ray_summit_2021/tutorial_notebook.ipynb

In [24]:
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils.framework import try_import_tf

tf1, tf, tf_version = try_import_tf()

class MyKerasModel(TFModelV2):
    """Custom Keras model for policy gradient algorithms.

    One shared hidden layer (16 ReLU units) feeds two linear heads: the
    action logits (``num_outputs`` units) and a single-unit value estimate.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        """Build a one-hidden-layer MLP with a logits head and a value head.

        Args:
            obs_space: Observation space; its ``shape`` sizes the input layer.
            action_space: Action space, forwarded to ``TFModelV2``.
            num_outputs: Number of units in the action-logits layer.
            model_config: Model config dict, forwarded to ``TFModelV2``.
            name: Model name, forwarded to ``TFModelV2``.
        """
        super(MyKerasModel, self).__init__(obs_space, action_space,
                                           num_outputs, model_config, name)

        # Value-head output of the most recent forward pass. Initialised to
        # None so that `value_function()` can report a clear error when it is
        # called before `forward()` (instead of an AttributeError).
        self.cur_value = None

        # Keras Input layer.
        self.inputs = tf.keras.layers.Input(
            shape=obs_space.shape, name="observations")

        # Hidden layer (shared by action logits outputs and value output).
        layer_1 = tf.keras.layers.Dense(
            16,
            name="layer1",
            activation=tf.nn.relu)(self.inputs)

        # Action logits output.
        logits = tf.keras.layers.Dense(
            num_outputs,
            name="out",
            activation=None)(layer_1)

        # "Value"-branch (single node output).
        # Used by several RLlib algorithms (e.g. PPO) to calculate an
        # observation's value.
        value_out = tf.keras.layers.Dense(
            1,
            name="value",
            activation=None)(layer_1)

        # The actual Keras model: one input, two outputs (logits, value).
        self.base_model = tf.keras.Model(self.inputs,
                                         [logits, value_out])

    def forward(self, input_dict, state, seq_lens):
        """Run the forward pass and cache the value-head output.

        Args:
            input_dict: Dict whose ``"obs"`` entry is fed to the Keras model.
            state: Passed through unchanged.
            seq_lens: Unused by this model.

        Returns:
            Tuple ``(logits, state)``.
        """
        # Both heads are computed in one pass; the value output is stored
        # for when `value_function` is called.
        logits, self.cur_value = self.base_model(input_dict["obs"])
        return logits, state

    def value_function(self):
        """Return the value output cached by the last `forward()` call.

        The cached tensor is flattened to one dimension.

        Raises:
            AssertionError: If `forward()` has not been called yet.
        """
        assert self.cur_value is not None, "Must call `forward()` first!"
        return tf.reshape(self.cur_value, [-1])

In [None]:
# update config dict with "custom_model": <MyModel>

In [25]:
# Register the custom model in the config before re-running the experiment.
# `custom_model_config` stays empty here (an entry such as
# "layers": [128, 128] is left disabled).
config["model"] = {
    "custom_model": MyKerasModel,
    "custom_model_config": {},
}

In [26]:
# Display the config to confirm the "model" entry now points at the custom model.
config

{'env': ray_trading_env.RayTradingEnvironment,
 'create_env_on_driver': True,
 'horizon': 252,
 'model': {'custom_model': __main__.MyKerasModel, 'custom_model_config': {}}}

In [27]:
# run Tune with custom model

In [28]:
# Train PPO with the custom model for 5 training iterations.
# For torch users: config=dict(config, **{"framework": "torch"})
tune.run(
    "PPO",
    stop=dict(training_iteration=5),
    config=config,
)

[2m[36m(PPOTrainer pid=57367)[0m INFO:ray_trading_env:ray_trading_env logger started.
[2m[36m(PPOTrainer pid=57367)[0m 2022-06-27 19:07:13,165	INFO trainer.py:2332 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
[2m[36m(PPOTrainer pid=57367)[0m 2022-06-27 19:07:13,166	INFO ppo.py:414 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPOTrainer pid=57367)[0m 2022-06-27 19:07:13,166	INFO trainer.py:903 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(PPOTrainer pid=57367)[0m INFO:ray_trading_env:got data for AAPL...
[2m[36m(PPOTrainer pid=57367)[0m   logger.warn(


Trial name,status,loc
PPO_RayTradingEnvironment_58882_00000,RUNNING,10.1.150.226:57367


[2m[36m(RolloutWorker pid=57400)[0m INFO:ray_trading_env:ray_trading_env logger started.
[2m[36m(RolloutWorker pid=57401)[0m INFO:ray_trading_env:ray_trading_env logger started.
[2m[36m(RolloutWorker pid=57400)[0m INFO:ray_trading_env:got data for AAPL...
[2m[36m(RolloutWorker pid=57400)[0m   logger.warn(
[2m[36m(RolloutWorker pid=57401)[0m INFO:ray_trading_env:got data for AAPL...
[2m[36m(RolloutWorker pid=57401)[0m   logger.warn(


Result for PPO_RayTradingEnvironment_58882_00000:
  agent_timesteps_total: 4000
  counters:
    num_agent_steps_sampled: 4000
    num_agent_steps_trained: 4000
    num_env_steps_sampled: 4000
    num_env_steps_trained: 4000
  custom_metrics: {}
  date: 2022-06-27_19-07-18
  done: false
  episode_len_mean: 252.0
  episode_media: {}
  episode_reward_max: 0.5642506575628538
  episode_reward_mean: -0.16439253446926522
  episode_reward_min: -0.6806104443722844
  episodes_this_iter: 14
  episodes_total: 14
  experiment_id: 0446e3ff69ca4bf89b20ba27250a3f95
  hostname: jupyter-fewald
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.20000000298023224
          cur_lr: 4.999999873689376e-05
          entropy: 0.9452173113822937
          entropy_coeff: 0.0
          kl: 0.0008030488970689476
          model: {}
          policy_loss: -0.0009901912417262793
          total_loss: 0.4790833294391632
          vf_explained_var: -0

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_RayTradingEnvironment_58882_00000,RUNNING,10.1.150.226:57367,1,4.81615,4000,-0.164393,0.564251,-0.68061,252


Result for PPO_RayTradingEnvironment_58882_00000:
  agent_timesteps_total: 16000
  counters:
    num_agent_steps_sampled: 16000
    num_agent_steps_trained: 16000
    num_env_steps_sampled: 16000
    num_env_steps_trained: 16000
  custom_metrics: {}
  date: 2022-06-27_19-07-25
  done: false
  episode_len_mean: 252.0
  episode_media: {}
  episode_reward_max: 0.6297300329195069
  episode_reward_mean: -0.2990044221292842
  episode_reward_min: -1.8460572648061357
  episodes_this_iter: 16
  episodes_total: 62
  experiment_id: 0446e3ff69ca4bf89b20ba27250a3f95
  hostname: jupyter-fewald
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.02500000037252903
          cur_lr: 4.999999873689376e-05
          entropy: 0.9188064336776733
          entropy_coeff: 0.0
          kl: 0.0007729936623945832
          model: {}
          policy_loss: -0.0070674181915819645
          total_loss: 0.13114511966705322
          vf_explained_va

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_RayTradingEnvironment_58882_00000,RUNNING,10.1.150.226:57367,4,11.1671,16000,-0.299004,0.62973,-1.84606,252


Result for PPO_RayTradingEnvironment_58882_00000:
  agent_timesteps_total: 20000
  counters:
    num_agent_steps_sampled: 20000
    num_agent_steps_trained: 20000
    num_env_steps_sampled: 20000
    num_env_steps_trained: 20000
  custom_metrics: {}
  date: 2022-06-27_19-07-27
  done: true
  episode_len_mean: 252.0
  episode_media: {}
  episode_reward_max: 0.6297300329195069
  episode_reward_mean: -0.31896347714922946
  episode_reward_min: -1.8460572648061357
  episodes_this_iter: 16
  episodes_total: 78
  experiment_id: 0446e3ff69ca4bf89b20ba27250a3f95
  hostname: jupyter-fewald
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          cur_kl_coeff: 0.012500000186264515
          cur_lr: 4.999999873689376e-05
          entropy: 0.9446378350257874
          entropy_coeff: 0.0
          kl: 0.000565099238883704
          model: {}
          policy_loss: 0.00014079088578000665
          total_loss: 0.08317603915929794
          vf_explained_va

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_RayTradingEnvironment_58882_00000,TERMINATED,10.1.150.226:57367,5,13.2159,20000,-0.318963,0.62973,-1.84606,252


2022-06-27 19:07:27,877	INFO tune.py:747 -- Total run time: 17.39 seconds (16.67 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fdb382c4a00>