In [14]:
# ## install finrl library
# !pip install wrds
# !pip install quantstats
# !pip install torch_geometric
# !pip install swig
# !pip install -q condacolab
# !pip install shimmy
# import condacolab
# condacolab.install()
# !apt-get update -y -qq && apt-get install -y -qq cmake libopenmpi-dev python3-dev zlib1g-dev libgl1-mesa-glx swig
# !pip install git+https://github.com/flpymonkey/FinRL_Online_Portfolio_Benchmarks.git

In [15]:
from stable_baselines3 import PPO

from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.callbacks import CheckpointCallback

from stable_baselines3.common.policies import ActorCriticPolicy

from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union, Callable

from gym import spaces

import torch as th
from torch import nn
import numpy as np

from stable_baselines3.common.torch_layers import (
    BaseFeaturesExtractor,
    CombinedExtractor,
    FlattenExtractor,
    MlpExtractor,
    NatureCNN,
    create_mlp,
)

from stable_baselines3.common.type_aliases import Schedule


# Registry of the algorithm names accepted by DRLStableAgent.get_model,
# mapped to the Stable-Baselines3 class that implements each of them.
MODELS = {"ppo": PPO}

# We tweak the ActorCriticPolicy to apply softmax normalization to the output actions
class CustomActorCriticPolicy(ActorCriticPolicy):
    """Actor-critic policy that passes its emitted actions through a softmax.

    The constructor forwards every argument unchanged to ``ActorCriticPolicy``
    and then registers an ``nn.Softmax(dim=-1)`` module. ``forward`` and
    ``predict`` apply that module to the sampled actions, so each action vector
    is normalized along its last axis before it is returned (``predict`` then
    still runs the usual unscale/clip step on the normalized values).

    NOTE(review): ``forward`` returns the softmaxed actions together with the
    log-probability of the *pre-softmax* sample. If the training algorithm
    stores the returned actions and later re-scores them under the policy
    distribution (which is what SB3's on-policy algorithms appear to do --
    confirm), the stored log-probability and the re-computed one refer to
    different points. That would be consistent with very large ``approx_kl`` /
    ``clip_fraction`` values during training.
    """

    def __init__(
        self,
        observation_space: spaces.Space,
        action_space: spaces.Space,
        lr_schedule: Schedule,
        net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None,
        activation_fn: Type[nn.Module] = nn.Tanh,
        ortho_init: bool = True,
        use_sde: bool = False,
        log_std_init: float = 0.0,
        full_std: bool = True,
        use_expln: bool = False,
        squash_output: bool = False,
        features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor,
        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
        share_features_extractor: bool = True,
        normalize_images: bool = True,
        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """Build the base policy, then attach the softmax used on the actions.

        All parameters are passed straight through to
        ``ActorCriticPolicy.__init__``; this class adds no arguments of its own.
        """

        # Pass all initialization variables to the base class
        super().__init__(
            observation_space=observation_space,
            action_space=action_space,
            lr_schedule=lr_schedule,
            net_arch=net_arch,
            activation_fn=activation_fn,
            ortho_init=ortho_init,
            use_sde=use_sde,
            log_std_init=log_std_init,
            full_std=full_std,
            use_expln=use_expln,
            squash_output=squash_output,
            features_extractor_class=features_extractor_class,
            features_extractor_kwargs=features_extractor_kwargs,
            share_features_extractor=share_features_extractor,
            normalize_images=normalize_images,
            optimizer_class=optimizer_class,
            optimizer_kwargs=optimizer_kwargs,
            **kwargs
        )

        # Set up the softmax function
        # (normalizes over the last axis; created after the base constructor has run)
        self.softmax = nn.Sequential(nn.Softmax(dim=-1))

    def _build_mlp_extractor(self) -> None:
        """
        Create the policy and value networks.
        Part of the layers can be shared.

        NOTE(review): this looks equivalent to the base-class implementation in
        recent SB3 releases -- confirm; if so the override is redundant.
        """
        # Note: If net_arch is None and some features extractor is used,
        #       net_arch here is an empty list and mlp_extractor does not
        #       really contain any layers (acts like an identity module).
        self.mlp_extractor = MlpExtractor(
            self.features_dim,
            net_arch=self.net_arch,
            activation_fn=self.activation_fn,
            device=self.device,
        )

    def forward(self, obs: th.Tensor, deterministic: bool = False) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
        """Run actor and critic on a batch of observations.

        :param obs: batch of observations
        :param deterministic: forwarded to the action distribution's ``get_actions``
        :return: ``(actions, values, log_prob)``. ``actions`` are reshaped to the
            action-space shape and softmaxed over the last axis; ``log_prob`` is
            the log-probability of the sample *before* the softmax was applied.
        """
        # Preprocess the observation if needed
        features = self.extract_features(obs)
        if self.share_features_extractor:
            latent_pi, latent_vf = self.mlp_extractor(features)
        else:
            pi_features, vf_features = features
            latent_pi = self.mlp_extractor.forward_actor(pi_features)
            latent_vf = self.mlp_extractor.forward_critic(vf_features)
        # Evaluate the values for the given observations
        values = self.value_net(latent_vf)
        distribution = self._get_action_dist_from_latent(latent_pi)
        actions = distribution.get_actions(deterministic=deterministic)
        # log_prob is taken on the raw sample; the softmax below is applied afterwards
        log_prob = distribution.log_prob(actions)
        actions = actions.reshape((-1, *self.action_space.shape))  # type: ignore[misc]

        # APPLY SOFTMAX to the actions
        # NOTE(review): the returned actions no longer match the sample that
        # log_prob was computed for -- see the class docstring.
        actions = self.softmax(actions)

        return actions, values, log_prob

    def predict(
        self,
        observation: Union[np.ndarray, Dict[str, np.ndarray]],
        state: Optional[Tuple[np.ndarray, ...]] = None,
        episode_start: Optional[np.ndarray] = None,
        deterministic: bool = False,
    ) -> Tuple[np.ndarray, Optional[Tuple[np.ndarray, ...]]]:
        """
        Get the policy action from an observation (and optional hidden state).
        Includes sugar-coating to handle different observations (e.g. normalizing images).

        The action produced by ``_predict`` is passed through the softmax before
        it is converted to numpy, reshaped and unscaled/clipped.

        :param observation: the input observation
        :param state: The last hidden states (can be None, used in recurrent policies)
        :param episode_start: The last masks (can be None, used in recurrent policies)
            this correspond to beginning of episodes,
            where the hidden states of the RNN must be reset.
        :param deterministic: Whether or not to return deterministic actions.
        :return: the model's action and the next hidden state
            (used in recurrent policies)
        :raises ValueError: if ``observation`` is a 2-tuple whose second item is a
            dict (the Gym ``(obs, info)`` reset result)
        """
        # Switch to eval mode (this affects batch norm / dropout)
        self.set_training_mode(False)

        # Check for common mistake that the user does not mix Gym/VecEnv API
        # Tuple obs are not supported by SB3, so we can safely do that check
        if isinstance(observation, tuple) and len(observation) == 2 and isinstance(observation[1], dict):
            raise ValueError(
                "You have passed a tuple to the predict() function instead of a Numpy array or a Dict. "
                "You are probably mixing Gym API with SB3 VecEnv API: `obs, info = env.reset()` (Gym) "
                "vs `obs = vec_env.reset()` (SB3 VecEnv). "
                "See related issue https://github.com/DLR-RM/stable-baselines3/issues/1694 "
                "and documentation for more information: https://stable-baselines3.readthedocs.io/en/master/guide/vec_envs.html#vecenv-api-vs-gym-api"
            )

        obs_tensor, vectorized_env = self.obs_to_tensor(observation)

        with th.no_grad():
            actions = self._predict(obs_tensor, deterministic=deterministic)

            # APPLY SOFTMAX to the actions
            actions = self.softmax(actions)

        # Convert to numpy, and reshape to the original action shape
        actions = actions.cpu().numpy().reshape((-1, *self.action_space.shape))  # type: ignore[misc, assignment]

        # NOTE(review): `spaces` is imported from `gym` in this file; if the
        # action space is a gymnasium space this isinstance check may never be
        # true and the unscale/clip step would be skipped -- confirm.
        if isinstance(self.action_space, spaces.Box):
            if self.squash_output:
                # Rescale to proper domain when using squashing
                actions = self.unscale_action(actions)  # type: ignore[assignment, arg-type]
            else:
                # Actions could be on arbitrary scale, so clip the actions to avoid
                # out of bound error (e.g. if sampling from a Gaussian distribution)
                actions = np.clip(actions, self.action_space.low, self.action_space.high)  # type: ignore[assignment, arg-type]

        # Remove batch dimension if needed
        if not vectorized_env:
            assert isinstance(actions, np.ndarray)
            actions = actions.squeeze(axis=0)

        # `state` is returned exactly as it was passed in
        return actions, state  # type: ignore[return-value]
    
    
# We default to this agent using our custom actor critic policy
class DRLStableAgent:
    """Helper for building, training and evaluating Stable-Baselines3 models
    on a portfolio optimization environment.

    Models are created with ``CustomActorCriticPolicy`` unless the caller
    supplies another policy through ``model_kwargs["policy"]``.

    Attributes:
        env: Environment instance handed to the model constructor.
    """

    def __init__(self, env):
        """Agent initialization.

        Args:
            env: Gym environment to be used in training.
        """
        self.env = env

    def get_model(
        self, model_name, device="cpu", model_kwargs=None, policy_kwargs=None
    ):
        """Sets up a DRL model.

        Args:
            model_name: Name of the model; must be a key of ``MODELS``.
            device: Device used to instantiate neural networks.
            model_kwargs: Arguments to be passed to the model class. An optional
                ``"policy"`` entry replaces the default custom policy.
            policy_kwargs: Arguments to be passed to the policy class.

        Note:
            model_kwargs and policy_kwargs are dictionaries. The keys must be strings
            with the same names as the class arguments. Example for model_kwargs::

            { "learning_rate": 0.01, "n_steps": 2048 }

            Neither dictionary is modified; ``device`` and ``policy_kwargs``
            always override same-named entries of ``model_kwargs``.

        Returns:
            An instance of the model.

        Raises:
            NotImplementedError: If ``model_name`` is not a key of ``MODELS``.
        """
        if model_name not in MODELS:
            raise NotImplementedError("The model requested was not implemented.")

        model_class = MODELS[model_name]

        # Work on shallow copies: callers typically pass notebook-level constants
        # and must not find "device"/"policy_kwargs" injected into them afterwards.
        model_kwargs = {} if model_kwargs is None else dict(model_kwargs)
        policy_kwargs = {} if policy_kwargs is None else dict(policy_kwargs)

        # add device settings
        model_kwargs["device"] = device

        # add policy_kwargs inside model_kwargs
        model_kwargs["policy_kwargs"] = policy_kwargs

        # Default to the custom policy which applies softmax normalization.
        # Popping avoids passing `policy` twice when the caller supplies one.
        policy = model_kwargs.pop("policy", CustomActorCriticPolicy)

        return model_class(env=self.env, policy=policy, **model_kwargs)

    @staticmethod
    def _episode_length(env):
        """Return the number of distinct values in the env's ``_df['date']`` column."""
        return len(env._df['date'].unique())

    @staticmethod
    def train_model(model, env, tb_log_name=None, episodes=1):
        """Trains portfolio optimization model.

        Args:
            model: Instance of the model.
            env: Environment whose ``_df['date']`` column determines the number
                of time steps counted as one episode.
            tb_log_name: Name of the TensorBoard run. When None, the argument is
                not forwarded and the algorithm's own default name is used.
            episodes: Number of episodes.

        Returns:
            An instance of the trained model.
        """
        max_steps = DRLStableAgent._episode_length(env)

        print("Max number of time steps in an episode: ", max_steps)

        checkpoint_callback = CheckpointCallback(
            save_freq=1000000,
            save_path="./results/",
            name_prefix="model_checkpoint",
            save_replay_buffer=True,
            save_vecnormalize=True,
        )

        learn_kwargs = {
            "total_timesteps": max_steps * episodes,
            "callback": checkpoint_callback,
            "progress_bar": True,
        }
        # Forwarding None would override the algorithm's default run name with
        # a non-string value, so only pass the name when one was given.
        if tb_log_name is not None:
            learn_kwargs["tb_log_name"] = tb_log_name

        model.learn(**learn_kwargs)
        return model

    @staticmethod
    def DRL_prediction(model, env, deterministic=True, verbose=False):
        """Roll the model through one episode of ``env`` and report its memory.

        Args:
            model: Object exposing ``predict(obs, deterministic=...)``.
            env: Environment exposing ``get_sb_env()``, ``_df``,
                ``_asset_memory`` and ``_date_memory``.
            deterministic: Forwarded to ``model.predict``.
            verbose: When True, print the step index, observation and action
                at every step.

        Returns:
            Tuple ``(validation_assets, validation_dates)``: the values of
            ``env._asset_memory["final"]`` and ``env._date_memory`` as read
            immediately before the last executed step (both None if no step ran).
        """
        test_env, _ = env.get_sb_env()

        # Start from the observation of the reset performed here, so the first
        # prediction is guaranteed to match the environment's current state.
        test_obs = test_env.reset()
        max_steps = DRLStableAgent._episode_length(env)

        validation_assets = None
        validation_dates = None

        for i in range(max_steps):
            action, _states = model.predict(test_obs, deterministic=deterministic)

            if verbose:
                print("Step: ", str(i))
                print("Observations: ")
                print(test_obs)
                print("Actions: ")
                print(action)

            # Pull out the latest assets and dates *before* stepping.
            # NOTE(review): presumably because the vectorized env resets the
            # wrapped env once the episode ends -- confirm.
            validation_assets = env._asset_memory["final"]
            validation_dates = env._date_memory

            test_obs, rewards, dones, info = test_env.step(action)

            if dones[0]:
                print("hit end!")
                break

        return validation_assets, validation_dates

In [16]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MaxAbsScaler
from stable_baselines3.common.logger import configure

from finrl.config import TRAINED_MODEL_DIR, RESULTS_DIR
from finrl.main import check_and_make_directories
from finrl.meta.preprocessor.preprocessors import GroupByScaler
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader


# --- Experiment configuration -------------------------------------------------
TIME_WINDOW = 25
COMMISSION_FEE_PERCENT = 0.001
INITIAL_CASH = 1000000

TRAIN_START_DATE = '2009-04-01'
TRAIN_END_DATE = '2021-12-31'
TEST_START_DATE = '2022-01-01'
TEST_END_DATE = '2024-09-01'

TEST_TICKER = [
    "MSFT",
    "V",
    "AAPL",
    "BA",
    "INTC",
    "WMT",
]

# File name (relative to TRAINED_MODEL_DIR) of the saved PPO model
TRAINED_PPO = "/agent_opt_ppo_update_test"

GRAPH_TITLE = "PPO Trained 2009-2021, test softmax"


# --- Download -----------------------------------------------------------------
# One download covering both the training and the test period.
df = YahooDownloader(start_date = TRAIN_START_DATE,
                     end_date = TEST_END_DATE,
                     ticker_list = TEST_TICKER).fetch_data()

# --- Clean --------------------------------------------------------------------
# Missing values and infinities of either sign are replaced by 0.
processed = df.fillna(0)
processed_test = processed.replace([np.inf, -np.inf], 0)

# --- Normalize ----------------------------------------------------------------
# NOTE(review): the scaler is fitted on the full date range, so the training
# rows are scaled with statistics that also cover the test period -- confirm
# this look-ahead is acceptable for the experiment.
portfolio_norm_df = GroupByScaler(by="tic", scaler=MaxAbsScaler).fit_transform(processed_test)

# --- Split --------------------------------------------------------------------
df_train = portfolio_norm_df[(portfolio_norm_df["date"] >= TRAIN_START_DATE) & (portfolio_norm_df["date"] <= TRAIN_END_DATE)]
df_test = portfolio_norm_df[(portfolio_norm_df["date"] >= TEST_START_DATE) & (portfolio_norm_df["date"] < TEST_END_DATE)]

# TODO use the start and end date here

# Row count per ticker in the training split (displayed as the cell output)
df_train.groupby("tic").count()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Shape of DataFrame:  (23286, 8)


  X.loc[select_mask, self.columns] = self.scalers[value].transform(
  X.loc[select_mask, self.columns] = self.scalers[value].transform(


Unnamed: 0_level_0,date,open,high,low,close,volume,day
tic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAPL,3212,3212,3212,3212,3212,3212,3212
BA,3212,3212,3212,3212,3212,3212,3212
INTC,3212,3212,3212,3212,3212,3212,3212
MSFT,3212,3212,3212,3212,3212,3212,3212
V,3212,3212,3212,3212,3212,3212,3212
WMT,3212,3212,3212,3212,3212,3212,3212


In [17]:
import torch

# Use the first CUDA GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [18]:
# Fit for the portfolio optimization model

from finrl.meta.env_portfolio_optimization.env_portfolio_optimization import PortfolioOptimizationEnv


from sklearn.preprocessing import MaxAbsScaler
from finrl.meta.preprocessor.preprocessors import GroupByScaler

from finrl.meta.preprocessor.preprocessors import data_split

from datetime import datetime, timedelta


# Training environment built on the training split. The frame is passed with
# normalize_df=None, i.e. no normalization is requested from the environment.
_train_env_options = {
    "initial_amount": INITIAL_CASH,
    "comission_fee_pct": COMMISSION_FEE_PERCENT,
    # "time_window": TIME_WINDOW,
    "features": ["close", "high", "low"],
    "normalize_df": None,
    "reward_scaling": 1e-4,
}
environment_ppo = PortfolioOptimizationEnv(df_train, **_train_env_options)

In [19]:



# Hyper-parameters forwarded to the PPO constructor.
PPO_PARAMS = dict(
    n_steps=2048,
    batch_size=64,
    ent_coef=0.01,
    learning_rate=0.00025,  # TODO tried raising the lr which caused vanishing problem
    clip_range=0.2,
    # gae_lambda=0.001,
)

# NOTE(review): defined but not used -- get_model below is called with
# policy_kwargs=None, so this setting has no effect. Confirm that is intended.
PPO_policy_params = dict(log_std_init=1)

# Lower clip_range makes the stocks flatter, very conservative policy

# TODO try playing around with the number of epochs? n_epochs
# TODO try playing around more with the entropy term, make sure agent does enough exploration during training
# TODO try playing around more with the clip param here

agent_ppo = DRLStableAgent(env=environment_ppo)
model_ppo = agent_ppo.get_model(
    "ppo",
    device,
    model_kwargs=PPO_PARAMS,
    policy_kwargs=None,
)

# Route the model's training logs to stdout, a CSV file and TensorBoard
tmp_path = f"{RESULTS_DIR}/ppo"
new_logger_ppo = configure(tmp_path, ["stdout", "csv", "tensorboard"])
model_ppo.set_logger(new_logger_ppo)

Logging to results/ppo




In [None]:
from finrl.config import TRAINED_MODEL_DIR

# Train for 10 episodes' worth of time steps of the training environment
model_ppo = DRLStableAgent.train_model(model_ppo, env=environment_ppo, episodes=10)

environment_ppo.reset()

# Save under the shared TRAINED_PPO name so the path written here and the path
# read back by PPO.load(TRAINED_MODEL_DIR + TRAINED_PPO) cannot drift apart.
model_ppo.save(TRAINED_MODEL_DIR + TRAINED_PPO)

Output()

Max number of time steps in an episode:  3212


-----------------------------
| time/              |      |
|    fps             | 106  |
|    iterations      | 1    |
|    time_elapsed    | 19   |
|    total_timesteps | 2048 |
-----------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 3.21e+03  |
|    ep_rew_mean          | -2.9e-05  |
| time/                   |           |
|    fps                  | 97        |
|    iterations           | 2         |
|    time_elapsed         | 41        |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 24.740677 |
|    clip_fraction        | 0.958     |
|    clip_range           | 0.2       |
|    entropy_loss         | -10       |
|    explained_variance   | 0.945     |
|    learning_rate        | 0.00025   |
|    loss                 | 0.122     |
|    n_updates            | 10        |
|    policy_gradient_loss | 36.5      |
|    std                  | 1         |
|    value_loss           | 8.26e-05  |
---------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 3.21e+03  |
|    ep_rew_mean          | -2.9e-05  |
| time/                   |           |
|    fps                  | 99        |
|    iterations           | 3         |
|    time_elapsed         | 61        |
|    total_timesteps      | 6144      |
| train/                  |           |
|    approx_kl            | 173.17297 |
|    clip_fraction        | 0.987     |
|    clip_range           | 0.2       |
|    entropy_loss         | -9.94     |
|    explained_variance   | 0.922     |
|    learning_rate        | 0.00025   |
|    loss                 | -0.244    |
|    n_updates            | 20        |
|    policy_gradient_loss | -0.0702   |
|    std                  | 1         |
|    value_loss           | 0.000278  |
---------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 3.21e+03  |
|    ep_rew_mean          | -1.25e-05 |
| time/                   |           |
|    fps                  | 92        |
|    iterations           | 4         |
|    time_elapsed         | 88        |
|    total_timesteps      | 8192      |
| train/                  |           |
|    approx_kl            | 48.026875 |
|    clip_fraction        | 0.949     |
|    clip_range           | 0.2       |
|    entropy_loss         | -9.99     |
|    explained_variance   | 0.925     |
|    learning_rate        | 0.00025   |
|    loss                 | -0.147    |
|    n_updates            | 30        |
|    policy_gradient_loss | 1.98      |
|    std                  | 1.01      |
|    value_loss           | 0.000101  |
---------------------------------------


In [None]:
from stable_baselines3 import PPO
from finrl.config import TRAINED_MODEL_DIR


# Load the trained model saved under TRAINED_MODEL_DIR + TRAINED_PPO.
# Alternatives kept for reference:
#   trained_ppo_opt = PPO.load(TRAINED_MODEL_DIR + "/agent_opt_ppo_10_27")
#   trained_ppo_opt = model_ppo
trained_ppo_opt = PPO.load(f"{TRAINED_MODEL_DIR}{TRAINED_PPO}")


In [None]:
# In-sample evaluation: replay the trained model on the training environment.
agent_ppo_test = DRLStableAgent(env=environment_ppo)

PPO_results = {"train": {}}

values, dates = DRLStableAgent.DRL_prediction(trained_ppo_opt, environment_ppo, verbose=True)

# The results are read from the environment's terminal memory, not from the
# values returned by DRL_prediction.
PPO_results["train"]["value"] = environment_ppo._terminal_asset_memory["final"]
PPO_results["train"]["date"] = environment_ppo._terminal_date_memory

# Assemble a frame with one row per date: net worth plus the date itself
df_ppo_opt = pd.DataFrame(PPO_results["train"]["value"], columns=['ppo_opt_net_worth'])
df_ppo_date = pd.DataFrame(PPO_results["train"]["date"], columns=['Date'])
if len(df_ppo_opt) != len(df_ppo_date):
    raise ValueError("DataFrames do not have the same number of rows.")
df_ppo_opt['Date'] = df_ppo_date['Date']

print(df_ppo_opt)

# First date covered by the evaluation
print(df_ppo_opt.loc[0, 'Date'])


Step:  0
Observations: 
[[[[0.01397609]
   [0.06240008]
   [0.15314019]
   [0.0307046 ]
   [0.0420142 ]
   [0.16277684]]

  [[0.01640963]
   [0.07981883]
   [0.22109972]
   [0.04133661]
   [0.04811658]
   [0.22826038]]

  [[0.01591813]
   [0.07771644]
   [0.21720399]
   [0.03914223]
   [0.04656999]
   [0.2240007 ]]]]
Actions: 
[[0.14301108 0.15419316 0.14354204 0.13286671 0.13929768 0.14511268
  0.14197667]]
Step:  1
Observations: 
[[[[0.01449301]
   [0.06549895]
   [0.15996681]
   [0.0306728 ]
   [0.04302026]
   [0.1653038 ]]

  [[0.01727528]
   [0.08515504]
   [0.23091355]
   [0.04246824]
   [0.04907032]
   [0.23480058]]

  [[0.0168206 ]
   [0.08214635]
   [0.22671224]
   [0.04090772]
   [0.04786384]
   [0.23265985]]]]
Actions: 
[[0.14301287 0.15440646 0.14355491 0.1326916  0.13922955 0.14515112
  0.1419534 ]]
Step:  2
Observations: 
[[[[0.01491478]
   [0.06636169]
   [0.16251403]
   [0.02981415]
   [0.04649499]
   [0.16579692]]

  [[0.01748303]
   [0.08546893]
   [0.23033626]
   [0.



Step:  7
Observations: 
[[[[0.0154587 ]
   [0.0654109 ]
   [0.16281962]
   [0.03114983]
   [0.04729208]
   [0.1588014 ]]

  [[0.01821319]
   [0.08403399]
   [0.23221244]
   [0.04238283]
   [0.0527564 ]
   [0.22365648]]

  [[0.0182333 ]
   [0.08135124]
   [0.23116924]
   [0.04170435]
   [0.05036471]
   [0.2213767 ]]]]
Actions: 
[[0.14301316 0.15438485 0.14355977 0.13271326 0.13922623 0.14514813
  0.1419546 ]]
Step:  8
Observations: 
[[[[0.01521309]
   [0.06521724]
   [0.16312529]
   [0.03076821]
   [0.0451407 ]
   [0.15753795]]

  [[0.01809125]
   [0.08407883]
   [0.23654206]
   [0.04174228]
   [0.05234396]
   [0.2207306 ]]

  [[0.01796516]
   [0.08246439]
   [0.23473482]
   [0.04090772]
   [0.0504776 ]
   [0.22107059]]]]
Actions: 
[[0.14301275 0.15439498 0.14356118 0.1327046  0.13922004 0.145152
  0.1419545 ]]
Step:  9
Observations: 
[[[[0.01512695]
   [0.0661504 ]
   [0.15915167]
   [0.02994136]
   [0.04574431]
   [0.15806179]]

  [[0.01780219]
   [0.08528957]
   [0.22629528]
   [0.04

In [None]:
# Out-of-sample evaluation environment built on the test split.
_test_env_options = {
    "initial_amount": INITIAL_CASH,
    "comission_fee_pct": COMMISSION_FEE_PERCENT,
    # "time_window": TIME_WINDOW,
    "features": ["close", "high", "low"],
    "normalize_df": None,
    "reward_scaling": 1e-4,
}
environment_ppo_test = PortfolioOptimizationEnv(df_test, **_test_env_options)

# Note: this rebinds PPO_results, so a "train" entry created earlier is dropped.
PPO_results = {"test": {}}

values, dates = DRLStableAgent.DRL_prediction(trained_ppo_opt, environment_ppo_test, verbose=False)

# The results are read from the environment's terminal memory, not from the
# values returned by DRL_prediction.
PPO_results["test"]["value"] = environment_ppo_test._terminal_asset_memory["final"]
PPO_results["test"]["date"] = environment_ppo_test._terminal_date_memory

# Assemble a frame with one row per date: net worth plus the date itself
df_ppo_opt = pd.DataFrame(PPO_results["test"]["value"], columns=['ppo_opt_net_worth'])
df_ppo_date = pd.DataFrame(PPO_results["test"]["date"], columns=['Date'])
if len(df_ppo_opt) != len(df_ppo_date):
    raise ValueError("DataFrames do not have the same number of rows.")
df_ppo_opt['Date'] = df_ppo_date['Date']

print(df_ppo_opt)

# First date covered by the evaluation
print(df_ppo_opt.loc[0, 'Date'])



Initial portfolio value:1000000
Final portfolio value: 1119027.875
Final accumulative portfolio value: 1.119027875
Maximum DrawDown: -0.24772787500000004
Sharpe ratio: 0.32685380549302967
Total commission cost: 5438.612459130818
hit end!
     ppo_opt_net_worth       Date
0         1.000000e+06 2022-01-03
1         9.975320e+05 2022-01-04
2         9.896571e+05 2022-01-05
3         9.845290e+05 2022-01-06
4         9.857133e+05 2022-01-07
..                 ...        ...
664       1.095416e+06 2024-08-26
665       1.097094e+06 2024-08-27
666       1.088372e+06 2024-08-28
667       1.100659e+06 2024-08-29
668       1.119028e+06 2024-08-30

[669 rows x 2 columns]
2022-01-03 00:00:00
