# Example Custom PyTorch Model for RayTradingEnv

- RLlib supports custom TensorFlow (Keras) models as well as custom PyTorch models
- This example uses a sample custom PyTorch model taken from the RLlib tutorials

In [1]:
from ray import tune
import numpy as np
import pandas as pd
import os
import gym
import matplotlib.pyplot as plt
# Apply the 'seaborn' Matplotlib style sheet to all figures created afterwards.
# NOTE(review): newer Matplotlib releases renamed this style to 'seaborn-v0_8'
# and deprecated the old name — confirm against the installed version.
plt.style.use('seaborn')
import pprint
import ray
from ray.rllib.agents.ppo import PPOTrainer
from ray_trading_env import RayTradingEnvironment

INFO:ray_trading_env:ray_trading_env logger started.


In [2]:
# Start (or connect to) the local Ray runtime.
# `ignore_reinit_error=True` keeps this cell re-runnable: without it, executing
# the cell a second time in the same kernel raises because Ray is already
# initialised.
ray.init(ignore_reinit_error=True)



RayContext(dashboard_url='', python_version='3.8.8', ray_version='1.13.0', ray_commit='e4ce38d001dbbe09cd21c497fedd03d692b2be3e', address_info={'node_ip_address': '10.1.150.226', 'raylet_ip_address': '10.1.150.226', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-06-27_19-08-18_191563_57503/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-06-27_19-08-18_191563_57503/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-06-27_19-08-18_191563_57503', 'metrics_export_port': 38393, 'gcs_address': '10.1.150.226:47157', 'address': '10.1.150.226:47157', 'node_id': '012b348b79661ce6d8968c6b5f89375d38d8a36b7fcdfeb506fd2e9a'})

In [8]:
# Base RLlib trainer settings. No env-specific options are given, so the
# trading environment is created with its default parameters.
config = dict(
    env=RayTradingEnvironment,
    create_env_on_driver=True,
    # Episode length cap (presumably one trading year of daily steps — TODO confirm).
    horizon=252,
    # Run the algorithm with the PyTorch backend.
    framework="torch",
)

In [9]:
# Stopping criteria intended for Tune.
# NOTE(review): this dict is not handed to the `tune.run(...)` call further
# down, which supplies its own inline `stop` argument — confirm which set of
# criteria is intended.
stop = dict(training_iteration=10, episode_reward_mean=1)

In [10]:
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.framework import try_import_tf, try_import_torch

# Fetch the TensorFlow handles through RLlib's import helper.
# NOTE(review): none of `tf1`, `tf` or `tf_version` is used by the cells
# visible below (the custom model is Torch-based); this call may be a leftover.
tf1, tf, tf_version = try_import_tf()

In [11]:
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.framework import try_import_torch

# Obtain the `torch` module and its `nn` namespace through RLlib's import
# helper; both names are used by the model class defined below.
torch, nn = try_import_torch()


class MyTorchModel(TorchModelV2, nn.Module):
    """Small custom RLlib Torch model: one shared layer feeding two heads.

    The observation is passed through a single shared linear layer whose
    output feeds both an action-logits head (``num_outputs`` units) and a
    single-unit value head.

    NOTE(review): no activation is applied after the shared layer, so both
    heads are linear functions of the observation. Adding a nonlinearity
    would change training results and is left as a deliberate decision.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name, hidden_size=16):
        """Build the shared layer, the logits head and the value head.

        Args:
            obs_space: Observation space. Only ``obs_space.shape[0]`` is read,
                so a flat (1-D) observation is assumed.
            action_space: Action space, forwarded to ``TorchModelV2``.
            num_outputs: Number of units of the logits head.
            model_config: Model config dict, forwarded to ``TorchModelV2``.
            name: Model name, forwarded to ``TorchModelV2``.
            hidden_size: Width of the shared layer. Defaults to 16, the value
                that used to be hard-coded. RLlib forwards the entries of
                ``custom_model_config`` to this constructor as keyword
                arguments, so ``{"hidden_size": 64}`` there selects a wider
                layer.

        Raises:
            ValueError: If ``hidden_size`` is smaller than 1.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)

        hidden_size = int(hidden_size)
        if hidden_size < 1:
            raise ValueError(
                "hidden_size must be a positive integer, got {}".format(
                    hidden_size))

        # Place the layers on the GPU when one is visible, else on the CPU.
        # NOTE(review): RLlib presumably relocates the whole module to the
        # policy's device afterwards — confirm before relying on this.
        self.device = torch.device("cuda"
                                   if torch.cuda.is_available() else "cpu")

        # Hidden layer (shared by action logits outputs and value output).
        self.layer_1 = nn.Linear(obs_space.shape[0],
                                 hidden_size).to(self.device)

        # Action logits output.
        self.layer_out = nn.Linear(hidden_size, num_outputs).to(self.device)

        # "Value"-branch (single node output).
        # Used by several RLlib algorithms (e.g. PPO) to calculate an
        # observation's value.
        self.value_branch = nn.Linear(hidden_size, 1).to(self.device)
        # Cached by `forward()` and handed out by `value_function()`.
        self.cur_value = None

    def forward(self, input_dict, state, seq_lens):
        """Compute the action logits and cache the value estimate.

        Args:
            input_dict: Mapping holding the observation batch under ``"obs"``.
            state: Passed through unchanged.
            seq_lens: Not used by this model.

        Returns:
            A ``(logits, state)`` tuple.
        """
        # Linear layers need a floating-point input; `.float()` is a no-op
        # for tensors that are already float32.
        obs = input_dict["obs"].float()

        # Pass inputs through the shared layer, then the logits head.
        layer_1_out = self.layer_1(obs)
        logits = self.layer_out(layer_1_out)

        # Calculate the "value" of the observation and store it for
        # when `value_function` is called. `squeeze(1)` drops the size-1
        # output dimension of the value head.
        self.cur_value = self.value_branch(layer_1_out).squeeze(1)

        return logits, state

    def value_function(self):
        """Return the value estimate cached by the most recent `forward()`.

        Raises:
            AssertionError: If `forward()` has not been called yet.
        """
        assert self.cur_value is not None, "Must call `forward()` first!"
        return self.cur_value



In [12]:
# Plug the custom Torch model class into the trainer config.
config["model"] = {
    "custom_model": MyTorchModel,
    # No extra model options are supplied for now.
    "custom_model_config": {},
}

In [13]:
# Show the assembled config as the cell output to confirm that the custom
# model entry is present.
config

{'env': ray_trading_env.RayTradingEnvironment,
 'create_env_on_driver': True,
 'horizon': 252,
 'framework': 'torch',
 'model': {'custom_model': __main__.MyTorchModel, 'custom_model_config': {}}}

In [14]:
# Train PPO through Tune with the config assembled above (which already
# selects the torch framework), stopping after five training iterations.
tune.run(
    "PPO",
    config=config,
    stop=dict(training_iteration=5),
)

[2m[36m(PPOTrainer pid=58451)[0m INFO:ray_trading_env:ray_trading_env logger started.
[2m[36m(PPOTrainer pid=58451)[0m 2022-06-27 19:12:51,955	INFO ppo.py:414 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(PPOTrainer pid=58451)[0m 2022-06-27 19:12:51,956	INFO trainer.py:903 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(PPOTrainer pid=58451)[0m INFO:ray_trading_env:got data for AAPL...
[2m[36m(PPOTrainer pid=58451)[0m   logger.warn(


Trial name,status,loc
PPO_RayTradingEnvironment_22415_00000,RUNNING,10.1.150.226:58451


[2m[36m(RolloutWorker pid=58486)[0m INFO:ray_trading_env:ray_trading_env logger started.
[2m[36m(RolloutWorker pid=58485)[0m INFO:ray_trading_env:ray_trading_env logger started.
[2m[36m(RolloutWorker pid=58486)[0m INFO:ray_trading_env:got data for AAPL...
[2m[36m(RolloutWorker pid=58485)[0m INFO:ray_trading_env:got data for AAPL...
[2m[36m(RolloutWorker pid=58486)[0m   logger.warn(
[2m[36m(RolloutWorker pid=58485)[0m   logger.warn(


Trial name,status,loc
PPO_RayTradingEnvironment_22415_00000,RUNNING,10.1.150.226:58451


Result for PPO_RayTradingEnvironment_22415_00000:
  agent_timesteps_total: 4000
  counters:
    num_agent_steps_sampled: 4000
    num_agent_steps_trained: 4000
    num_env_steps_sampled: 4000
    num_env_steps_trained: 4000
  custom_metrics: {}
  date: 2022-06-27_19-12-57
  done: false
  episode_len_mean: 252.0
  episode_media: {}
  episode_reward_max: 0.19450103561436083
  episode_reward_mean: -0.4151397341002601
  episode_reward_min: -0.9863045778310642
  episodes_this_iter: 14
  episodes_total: 14
  experiment_id: caa23fb1a9534bc38ba90c04332b197e
  hostname: jupyter-fewald
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.20000000000000004
          cur_lr: 5.0000000000000016e-05
          entropy: 1.034536043674715
          entropy_coeff: 0.0
          kl: 0.00040081199754314314
          policy_loss: -0.005791019055471625
          total_loss: 0.06494033922972058
          vf_exp

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_RayTradingEnvironment_22415_00000,RUNNING,10.1.150.226:58451,3,10.9169,12000,-0.280662,0.493881,-1.17008,252


Result for PPO_RayTradingEnvironment_22415_00000:
  agent_timesteps_total: 20000
  counters:
    num_agent_steps_sampled: 20000
    num_agent_steps_trained: 20000
    num_env_steps_sampled: 20000
    num_env_steps_trained: 20000
  custom_metrics: {}
  date: 2022-06-27_19-13-09
  done: true
  episode_len_mean: 252.0
  episode_media: {}
  episode_reward_max: 0.4938807844966161
  episode_reward_mean: -0.29657268018874905
  episode_reward_min: -1.1700809068137037
  episodes_this_iter: 16
  episodes_total: 78
  experiment_id: caa23fb1a9534bc38ba90c04332b197e
  hostname: jupyter-fewald
  info:
    learner:
      default_policy:
        custom_metrics: {}
        learner_stats:
          allreduce_latency: 0.0
          cur_kl_coeff: 0.012500000000000002
          cur_lr: 5.0000000000000016e-05
          entropy: 0.9753258644893605
          entropy_coeff: 0.0
          kl: 0.00032299408262023336
          policy_loss: -0.004573205544022463
          total_loss: 0.016017934673976515
         

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_RayTradingEnvironment_22415_00000,TERMINATED,10.1.150.226:58451,5,16.7699,20000,-0.296573,0.493881,-1.17008,252


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_RayTradingEnvironment_22415_00000,TERMINATED,10.1.150.226:58451,5,16.7699,20000,-0.296573,0.493881,-1.17008,252


2022-06-27 19:13:09,866	INFO tune.py:747 -- Total run time: 21.02 seconds (20.25 seconds for the tuning loop).


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f2330034760>