In [1]:
import pandas as pd
from stable_baselines3.common.logger import configure

from finrl.agents.stablebaselines3.models import DRLAgent as StockDRLAgent
from finrl.agents.portfolio_optimization.models import DRLAgent as OptDRLAgent

from finrl.config import TRAINED_MODEL_DIR, RESULTS_DIR
from finrl.main import check_and_make_directories
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv

import numpy as np


TIME_WINDOW = 25
COMMISSION_FEE_PERCENT = 0.001
INITIAL_CASH = 1000000


TRAIN_START_DATE = '2009-04-01'
TRAIN_END_DATE = '2021-12-31'


TEST_TICKER = [
   "MSFT",
    "V",
    "AAPL",
    "BA",
    "INTC",
    "WMT",
]

TRAINED_PPO = "/agent_opt_ppo_update"

GRAPH_TITLE = "PPO Trained 2009-2021, Two Million Time Steps Checkpoint"



from finrl.meta.preprocessor.yahoodownloader import YahooDownloader



df = YahooDownloader(start_date = TRAIN_START_DATE,
                     end_date = TRAIN_END_DATE,
                     ticker_list = TEST_TICKER).fetch_data()

processed = df.copy()
processed = processed.fillna(0)
processed_test = processed.replace(np.inf,0)


from sklearn.preprocessing import MaxAbsScaler
from finrl.meta.preprocessor.preprocessors import GroupByScaler

portfolio_norm_df = GroupByScaler(by="tic", scaler=MaxAbsScaler).fit_transform(processed_test)
portfolio_norm_df

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Shape of DataFrame:  (19266, 8)


  X.loc[select_mask, self.columns] = self.scalers[value].transform(
  X.loc[select_mask, self.columns] = self.scalers[value].transform(


Unnamed: 0,date,open,high,low,close,volume,tic,day
0,2009-04-01,0.020525,0.021374,0.020783,0.018456,0.313329,AAPL,0.50
1,2009-04-01,0.077397,0.079819,0.077716,0.062400,0.089997,BA,0.50
2,2009-04-01,0.216569,0.221100,0.217204,0.153140,0.377145,INTC,0.50
3,2009-04-01,0.052899,0.055366,0.053127,0.042764,0.302015,MSFT,0.50
4,2009-04-01,0.054739,0.055408,0.054015,0.049621,0.130785,V,0.50
...,...,...,...,...,...,...,...,...
19261,2021-12-30,0.455147,0.461873,0.460256,0.471090,0.067943,BA,0.75
19262,2021-12-30,0.761290,0.752490,0.767345,0.772947,0.092257,INTC,0.75
19263,2021-12-30,0.992136,0.981297,0.990123,0.988954,0.050090,MSFT,0.75
19264,2021-12-30,0.871706,0.867337,0.874829,0.870964,0.011296,V,0.75


In [2]:
from __future__ import annotations

import math

import gym
import matplotlib
import numpy as np
import pandas as pd
from gym import spaces
from gym.utils import seeding
import copy

matplotlib.use("Agg")
import matplotlib.pyplot as plt
from stable_baselines3.common.vec_env import DummyVecEnv
from pathlib import Path

try:
    import quantstats as qs
except ModuleNotFoundError:
    raise ModuleNotFoundError(
        """QuantStats module not found, environment can't plot results and calculate indicadors.
        This module is not installed with FinRL. Install by running one of the options:
        pip install quantstats --upgrade --no-cache-dir
        conda install -c ranaroussi quantstats
        """
    )


class PortfolioOptimizationEnv(gym.Env):
    """A portfolio allocation environment for OpenAI gym.

    This environment simulates the interactions between an agent and the financial market
    based on data provided by a dataframe. The dataframe contains the time series of
    features defined by the user (such as closing, high and low prices) and must have
    a time and a tic column with a list of datetimes and ticker symbols respectively.
    An example of dataframe is shown below::

            date        high            low             close           tic
        0   2020-12-23  0.157414        0.127420        0.136394        ADA-USD
        1   2020-12-23  34.381519       30.074295       31.097898       BNB-USD
        2   2020-12-23  24024.490234    22802.646484    23241.345703    BTC-USD
        3   2020-12-23  0.004735        0.003640        0.003768        DOGE-USD
        4   2020-12-23  637.122803      560.364258      583.714600      ETH-USD
        ... ...         ...             ...             ...             ...

    Based on this dataframe, the environment will create an observation space that can
    be a Dict or a Box. The Box observation space is a three-dimensional array of shape
    (f, n, t), where f is the number of features, n is the number of stocks in the
    portfolio and t is the user-defined time window. If the environment is created with
    the parameter return_last_action set to True, the observation space is a Dict with
    the following keys::

        {
        "state": three-dimensional Box (f, n, t) representing the time series,
        "last_action": one-dimensional Box (n+1,) representing the portfolio weights
        }

    Note that the action space of this environment is an one-dimensional Box with size
    n + 1 because the portfolio weights must contains the weights related to all the
    stocks in the portfolio and to the remaining cash.

    Attributes:
        action_space: Action space.
        observation_space: Observation space.
        episode_length: Number of timesteps of an episode.
        portfolio_size: Number of stocks in the portfolio.
    """

    metadata = {"render.modes": ["human"]}

    def __init__(
        self,
        df,
        initial_amount,
        order_df=True,
        return_last_action=False,
        normalize_df="by_previous_time",
        reward_scaling=1,
        comission_fee_model="trf",
        comission_fee_pct=0,
        features=["close", "high", "low"],
        valuation_feature="close",
        time_column="date",
        time_format="%Y-%m-%d",
        tic_column="tic",
        tics_in_portfolio="all",
        time_window=1,
        cwd="./",
        new_gym_api=False,
    ):
        """Initializes environment's instance.

        Args:
            df: Dataframe with market information over a period of time.
            initial_amount: Initial amount of cash available to be invested.
            order_df: If True input dataframe is ordered by time.
            return_last_action: If True, observations also return the last performed
                action. Note that, in that case, the observation space is a Dict.
            normalize_df: Defines the normalization method applied to input dataframe.
                Possible values are "by_previous_time", "by_fist_time_window_value",
                "by_COLUMN_NAME" (where COLUMN_NAME must be changed to a real column
                name) and a custom function. If None no normalization is done.
            reward_scaling: A scaling factor to multiply the reward function. This
                factor can help training.
            comission_fee_model: Model used to simulate comission fee. Possible values
                are "trf" (for transaction remainder factor model) and "wvm" (for weights
                vector modifier model). If None, commission fees are not considered.
            comission_fee_pct: Percentage to be used in comission fee. It must be a value
                between 0 and 1.
            features: List of features to be considered in the observation space. The
                items of the list must be names of columns of the input dataframe.
            valuation_feature: Feature to be considered in the portfolio value calculation.
            time_column: Name of the dataframe's column that contain the datetimes that
                index the dataframe.
            time_format: Formatting string of time column.
            tic_name: Name of the dataframe's column that contain ticker symbols.
            tics_in_portfolio: List of ticker symbols to be considered as part of the
                portfolio. If "all", all tickers of input data are considered.
            time_window: Size of time window.
            cwd: Local repository in which resulting graphs will be saved.
            new_gym_api: If True, the environment will use the new gym api standard for
                step and reset methods.
        """
        self._time_window = time_window
        self._time_index = time_window - 1
        self._time_column = time_column
        self._time_format = time_format
        self._tic_column = tic_column
        self._df = df
        self._initial_amount = initial_amount
        self._return_last_action = return_last_action
        self._reward_scaling = reward_scaling
        self._comission_fee_pct = comission_fee_pct
        self._comission_fee_model = comission_fee_model
        self._features = features
        self._valuation_feature = valuation_feature
        self._cwd = Path(cwd)
        self._new_gym_api = new_gym_api

        # results file
        self._results_file = self._cwd / "results" / "rl"
        self._results_file.mkdir(parents=True, exist_ok=True)

        # initialize price variation
        self._df_price_variation = None

        # preprocess data
        self._preprocess_data(order_df, normalize_df, tics_in_portfolio)

        # dims and spaces
        self._tic_list = self._df[self._tic_column].unique()
        self.portfolio_size = (
            len(self._tic_list)
            if tics_in_portfolio == "all"
            else len(tics_in_portfolio)
        )
        action_space = 1 + self.portfolio_size

        # sort datetimes and define episode length
        self._sorted_times = sorted(set(self._df[time_column]))
        self.episode_length = len(self._sorted_times) - time_window + 1

        # define action space
        self.action_space = spaces.Box(low=0, high=1, shape=(action_space,))

        # define observation state
        if self._return_last_action:
            # if  last action must be returned, a dict observation
            # is defined
            self.observation_space = spaces.Dict(
                {
                    "state": spaces.Box(
                        low=-np.inf,
                        high=np.inf,
                        shape=(
                            len(self._features),
                            len(self._tic_list),
                            self._time_window,
                        ),
                    ),
                    "last_action": spaces.Box(low=0, high=1, shape=(action_space,)),
                }
            )
        else:
            # if information about last action is not relevant,
            # a 3D observation space is defined
            self.observation_space = spaces.Box(
                low=-np.inf,
                high=np.inf,
                shape=(len(self._features), len(self._tic_list), self._time_window),
            )

        self._reset_memory()

        self._portfolio_value = self._initial_amount
        self._terminal = False

        # Use this to save the state in the last terminal state in case the environment resets 
        self._terminal_action_memory = None
        self._terminal_asset_memory = None
        self._terminal_date_memory = None

    def step(self, actions):
        """Performs a simulation step.

        Args:
            actions: An unidimensional array containing the new portfolio
                weights.

        Note:
            If the environment was created with "return_last_action" set to
            True, the next state returned will be a Dict. If it's set to False,
            the next state will be a Box. You can check the observation state
            through the attribute "observation_space".

        Returns:
            If "new_gym_api" is set to True, the following tuple is returned:
            (state, reward, terminal, truncated, info). If it's set to False,
            the following tuple is returned: (state, reward, terminal, info).

            state: Next simulation state.
            reward: Reward related to the last performed action.
            terminal: If True, the environment is in a terminal state.
            truncated: If True, the environment has passed it's simulation
                time limit. Currently, it's always False.
            info: A dictionary containing informations about the last state.
        """
        self._terminal = self._time_index >= len(self._sorted_times) - 1

        if self._terminal:
            metrics_df = pd.DataFrame(
                {
                    "date": self._date_memory,
                    "returns": self._portfolio_return_memory,
                    "rewards": self._portfolio_reward_memory,
                    "portfolio_values": self._asset_memory["final"],
                }
            )
            metrics_df.set_index("date", inplace=True)

            plt.plot(metrics_df["portfolio_values"], "r")
            plt.title("Portfolio Value Over Time")
            plt.xlabel("Time")
            plt.ylabel("Portfolio value")
            plt.savefig(self._results_file / "portfolio_value.png")
            plt.close()

            plt.plot(self._portfolio_reward_memory, "r")
            plt.title("Reward Over Time")
            plt.xlabel("Time")
            plt.ylabel("Reward")
            plt.savefig(self._results_file / "reward.png")
            plt.close()

            plt.plot(self._actions_memory)
            plt.title("Actions performed")
            plt.xlabel("Time")
            plt.ylabel("Weight")
            plt.savefig(self._results_file / "actions.png")
            plt.close()

            print("=================================")
            print("Initial portfolio value:{}".format(self._asset_memory["final"][0]))
            print(f"Final portfolio value: {self._portfolio_value}")
            print(
                "Final accumulative portfolio value: {}".format(
                    self._portfolio_value / self._asset_memory["final"][0]
                )
            )
            print(
                "Maximum DrawDown: {}".format(
                    qs.stats.max_drawdown(metrics_df["portfolio_values"])
                )
            )
            print("Sharpe ratio: {}".format(qs.stats.sharpe(metrics_df["returns"])))
            print("=================================")

            qs.plots.snapshot(
                metrics_df["returns"],
                show=False,
                savefig=self._results_file / "portfolio_summary.png",
            )

            # Save the asset memory in the terminal state before the environment is reset
            self._terminal_asset_memory = copy.deepcopy(self._asset_memory)
            self._terminal_date_memory = copy.deepcopy(self._date_memory)
            self._terminal_action_memory = copy.deepcopy(self._actions_memory)

            file_path = './action_dump/actions' + str(len(self._terminal_action_memory)) + '.csv'
            np.savetxt(file_path, self._terminal_action_memory, delimiter=',', fmt='%.6f')

            if self._new_gym_api:
                return self._state, self._reward, self._terminal, False, self._info
            return self._state, self._reward, self._terminal, self._info

        else:

            print("HERE actions")
            print(actions)
            
            # transform action to numpy array (if it's a list)
            actions = np.array(actions, dtype=np.float32)

            print("HERE actions")
            # print(actions)

            # if necessary, normalize weights
            if math.isclose(np.sum(actions), 1, abs_tol=1e-6) and np.min(actions) >= 0:
                weights = actions
            else:
                action_sum = np.sum(actions)
                weights = actions / action_sum
                if not action_sum: # TODO fix this
                    weights = np.zeros(len(weights))
                    weights[0] = 1
            print("HERE2  weights")        
            print(weights)
                # print(weights)
                # weights = self._softmax_normalization(actions)

            # save initial portfolio weights for this time step
            self._actions_memory.append(weights)

            # get last step final weights and portfolio_value
            last_weights = self._final_weights[-1]

            # load next state
            self._time_index += 1
            self._state, self._info = self._get_state_and_info_from_time_index(
                self._time_index
            )

            # if using weights vector modifier, we need to modify weights vector
            if self._comission_fee_model == "wvm":
                delta_weights = weights - last_weights
                delta_assets = delta_weights[1:]  # disconsider
                # calculate fees considering weights modification
                fees = np.sum(np.abs(delta_assets * self._portfolio_value))
                if fees > weights[0] * self._portfolio_value:
                    weights = last_weights
                    # maybe add negative reward
                else:
                    portfolio = weights * self._portfolio_value
                    portfolio[0] -= fees
                    self._portfolio_value = np.sum(portfolio)  # new portfolio value
                    weights = portfolio / self._portfolio_value  # new weights
            elif self._comission_fee_model == "trf":
                last_mu = 1
                mu = 1 - 2 * self._comission_fee_pct + self._comission_fee_pct**2
                while abs(mu - last_mu) > 1e-10:
                    last_mu = mu
                    mu = (
                        1
                        - self._comission_fee_pct * weights[0]
                        - (2 * self._comission_fee_pct - self._comission_fee_pct**2)
                        * np.sum(np.maximum(last_weights[1:] - mu * weights[1:], 0))
                    ) / (1 - self._comission_fee_pct * weights[0])
                self._info["trf_mu"] = mu
                self._portfolio_value = mu * self._portfolio_value

            # save initial portfolio value of this time step
            self._asset_memory["initial"].append(self._portfolio_value)

            # time passes and time variation changes the portfolio distribution
            portfolio = self._portfolio_value * (weights * self._price_variation)

            # calculate new portfolio value and weights
            self._portfolio_value = np.sum(portfolio)
            weights = portfolio / self._portfolio_value

            # save final portfolio value and weights of this time step
            self._asset_memory["final"].append(self._portfolio_value)
            self._final_weights.append(weights)

            # save date memory
            self._date_memory.append(self._info["end_time"])

            # define portfolio return
            rate_of_return = (
                self._asset_memory["final"][-1] / self._asset_memory["final"][-2]
            )
            portfolio_return = rate_of_return - 1
            portfolio_reward = np.log(rate_of_return)

            # save portfolio return memory
            self._portfolio_return_memory.append(portfolio_return)
            self._portfolio_reward_memory.append(portfolio_reward)

            # Define portfolio return
            self._reward = portfolio_reward
            self._reward = self._reward * self._reward_scaling
        
        if self._new_gym_api:
            return self._state, self._reward, self._terminal, False, self._info
        return self._state, self._reward, self._terminal, self._info

    def reset(self):
        """Resets the environment and returns it to its initial state (the
        fist date of the dataframe).

        Note:
            If the environment was created with "return_last_action" set to
            True, the initial state will be a Dict. If it's set to False,
            the initial state will be a Box. You can check the observation
            state through the attribute "observation_space".

        Returns:
            If "new_gym_api" is set to True, the following tuple is returned:
            (state, info). If it's set to False, only the initial state is
            returned.

            state: Initial state.
            info: Initial state info.
        """
        # time_index must start a little bit in the future to implement lookback
        self._time_index = self._time_window - 1
        self._reset_memory()

        self._state, self._info = self._get_state_and_info_from_time_index(
            self._time_index
        )
        self._portfolio_value = self._initial_amount
        self._terminal = False

        if self._new_gym_api:
            return self._state, self._info
        return self._state

    def _get_state_and_info_from_time_index(self, time_index):
        """Gets state and information given a time index. It also updates "data"
        attribute with information about the current simulation step.

        Args:
            time_index: An integer that represents the index of a specific datetime.
                The initial datetime of the dataframe is given by 0.

        Note:
            If the environment was created with "return_last_action" set to
            True, the returned state will be a Dict. If it's set to False,
            the returned state will be a Box. You can check the observation
            state through the attribute "observation_space".

        Returns:
            A tuple with the following form: (state, info).

            state: The state of the current time index. It can be a Box or a Dict.
            info: A dictionary with some informations about the current simulation
                step. The dict has the following keys::

                {
                "tics": List of ticker symbols,
                "start_time": Start time of current time window,
                "start_time_index": Index of start time of current time window,
                "end_time": End time of current time window,
                "end_time_index": Index of end time of current time window,
                "data": Data related to the current time window,
                "price_variation": Price variation of current time step
                }
        """
        # returns state in form (channels, tics, timesteps)
        end_time = self._sorted_times[time_index]
        start_time = self._sorted_times[time_index - (self._time_window - 1)]

        # define data to be used in this time step
        self._data = self._df[
            (self._df[self._time_column] >= start_time)
            & (self._df[self._time_column] <= end_time)
        ][[self._time_column, self._tic_column] + self._features]

        # define price variation of this time_step
        self._price_variation = self._df_price_variation[
            self._df_price_variation[self._time_column] == end_time
        ][self._valuation_feature].to_numpy()
        self._price_variation = np.insert(self._price_variation, 0, 1)

        # define state to be returned
        state = None
        for tic in self._tic_list:
            tic_data = self._data[self._data[self._tic_column] == tic]
            tic_data = tic_data[self._features].to_numpy().T
            tic_data = tic_data[..., np.newaxis]
            state = tic_data if state is None else np.append(state, tic_data, axis=2)
        state = state.transpose((0, 2, 1))
        info = {
            "tics": self._tic_list,
            "start_time": start_time,
            "start_time_index": time_index - (self._time_window - 1),
            "end_time": end_time,
            "end_time_index": time_index,
            "data": self._data,
            "price_variation": self._price_variation,
        }
        return self._standardize_state(state), info

    def render(self, mode="human"):
        """Renders the environment.

        Returns:
            Observation of current simulation step.
        """
        return self._state

    def _softmax_normalization(self, actions):
        """Normalizes the action vector using softmax function.

        Returns:
            Normalized action vector (portfolio vector).
        """
        numerator = np.exp(actions)
        denominator = np.sum(np.exp(actions))
        softmax_output = numerator / denominator
        return softmax_output

    def enumerate_portfolio(self):
        """Enumerates the current porfolio by showing the ticker symbols
        of all the investments considered in the portfolio.
        """
        print("Index: 0. Tic: Cash")
        for index, tic in enumerate(self._tic_list):
            print(f"Index: {index + 1}. Tic: {tic}")

    def _preprocess_data(self, order, normalize, tics_in_portfolio):
        """Orders and normalizes the environment's dataframe.

        Args:
            order: If true, the dataframe will be ordered by ticker list
                and datetime.
            normalize: Defines the normalization method applied to the dataframe.
                Possible values are "by_previous_time", "by_fist_time_window_value",
                "by_COLUMN_NAME" (where COLUMN_NAME must be changed to a real column
                name) and a custom function. If None no normalization is done.
            tics_in_portfolio: List of ticker symbols to be considered as part of the
                portfolio. If "all", all tickers of input data are considered.
        """
        # order time dataframe by tic and time
        if order:
            self._df = self._df.sort_values(by=[self._tic_column, self._time_column])
        # defining price variation after ordering dataframe
        self._df_price_variation = self._temporal_variation_df()
        # select only stocks in portfolio
        if tics_in_portfolio != "all":
            self._df_price_variation = self._df_price_variation[
                self._df_price_variation[self._tic_column].isin(tics_in_portfolio)
            ]
        # apply normalization
        if normalize:
            self._normalize_dataframe(normalize)
        # transform str to datetime
        self._df[self._time_column] = pd.to_datetime(self._df[self._time_column])
        self._df_price_variation[self._time_column] = pd.to_datetime(
            self._df_price_variation[self._time_column]
        )
        # transform numeric variables to float32 (compatibility with pytorch)
        self._df[self._features] = self._df[self._features].astype("float32")
        self._df_price_variation[self._features] = self._df_price_variation[
            self._features
        ].astype("float32")

    def _reset_memory(self):
        """Resets the environment's memory."""
        date_time = self._sorted_times[self._time_index]
        # memorize portfolio value each step
        self._asset_memory = {
            "initial": [self._initial_amount],
            "final": [self._initial_amount],
        }
        # memorize portfolio return and reward each step
        self._portfolio_return_memory = [0]
        self._portfolio_reward_memory = [0]
        # initial action: all money is allocated in cash
        self._actions_memory = [
            np.array([1] + [0] * self.portfolio_size, dtype=np.float32)
        ]
        # memorize portfolio weights at the ending of time step
        self._final_weights = [
            np.array([1] + [0] * self.portfolio_size, dtype=np.float32)
        ]
        # memorize datetimes
        self._date_memory = [date_time]

    def _standardize_state(self, state):
        """Standardize the state given the observation space. If "return_last_action"
        is set to False, a three-dimensional box is returned. If it's set to True, a
        dictionary is returned. The dictionary follows the standard below::

            {
            "state": Three-dimensional box representing the current state,
            "last_action": One-dimensional box representing the last action
            }
        """
        last_action = self._actions_memory[-1]
        if self._return_last_action:
            return {"state": state, "last_action": last_action}
        else:
            return state

    def _normalize_dataframe(self, normalize):
        """ "Normalizes the environment's dataframe.

        Args:
            normalize: Defines the normalization method applied to the dataframe.
                Possible values are "by_previous_time", "by_fist_time_window_value",
                "by_COLUMN_NAME" (where COLUMN_NAME must be changed to a real column
                name) and a custom function. If None no normalization is done.

        Note:
            If a custom function is used in the normalization, it must have an
            argument representing the environment's dataframe.
        """
        if type(normalize) == str:
            if normalize == "by_fist_time_window_value":
                print(
                    "Normalizing {} by first time window value...".format(
                        self._features
                    )
                )
                self._df = self._temporal_variation_df(self._time_window - 1)
            elif normalize == "by_previous_time":
                print(f"Normalizing {self._features} by previous time...")
                self._df = self._temporal_variation_df()
            elif normalize.startswith("by_"):
                normalizer_column = normalize[3:]
                print(f"Normalizing {self._features} by {normalizer_column}")
                for column in self._features:
                    self._df[column] = self._df[column] / self._df[normalizer_column]
        elif callable(normalize):
            print("Applying custom normalization function...")
            self._df = normalize(self._df)
        else:
            print("No normalization was performed.")

    def _temporal_variation_df(self, periods=1):
        """Calculates the temporal variation dataframe. For each feature, this
        dataframe contains the rate of the current feature's value and the last
        feature's value given a period. It's used to normalize the dataframe.

        Args:
            periods: Periods (in time indexes) to calculate temporal variation.

        Returns:
            Temporal variation dataframe.
        """
        df_temporal_variation = self._df.copy()
        prev_columns = []
        for column in self._features:
            prev_column = f"prev_{column}"
            prev_columns.append(prev_column)
            df_temporal_variation[prev_column] = df_temporal_variation.groupby(
                self._tic_column
            )[column].shift(periods=periods)
            df_temporal_variation[column] = (
                df_temporal_variation[column] / df_temporal_variation[prev_column]
            )
        df_temporal_variation = (
            df_temporal_variation.drop(columns=prev_columns)
            .fillna(1)
            .reset_index(drop=True)
        )
        return df_temporal_variation

    def _seed(self, seed=None):
        """Seeds the sources of randomness of this environment to guarantee
        reproducibility.

        Args:
            seed: Seed value to be applied.

        Returns:
            Seed value applied.
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def get_sb_env(self, env_number=1):
        """Generates an environment compatible with Stable Baselines 3. The
        generated environment is a vectorized version of the current one.

        Returns:
            A tuple with the generated environment and an initial observation.
        """
        e = DummyVecEnv([lambda: self] * env_number)
        obs = e.reset()
        return e, obs

In [3]:
from stable_baselines3 import PPO

from stable_baselines3.common.callbacks import CheckpointCallback


MODELS = {
    "ppo": PPO, 

}

class DRLStableAgent:
    """Implementation for DRL algorithms for portfolio optimization.

    Note:
        During testing, the agent is optimized through online learning.
        The parameters of the policy is updated repeatedly after a constant
        period of time. To disable it, set learning rate to 0.

    Attributes:
        env: Gym environment class.
    """

    def __init__(self, env):
        """Agent initialization.

        Args:
            env: Gym environment to be used in training.
        """
        self.env = env

    def get_model(
        self, model_name, device="cpu", model_kwargs=None, policy_kwargs=None
    ):
        """Setups DRL model.

        Args:
            model_name: Name of the model according to MODELS list.
            device: Device used to instantiate neural networks.
            model_kwargs: Arguments to be passed to model class.
            policy_kwargs: Arguments to be passed to policy class.

        Note:
            model_kwargs and policy_kwargs are dictionaries. The keys must be strings
            with the same names as the class arguments. Example for model_kwargs::

            { "lr": 0.01, "policy": EIIE }

        Returns:
            An instance of the model.
        """
        if model_name not in MODELS:
            raise NotImplementedError("The model requested was not implemented.")

        model = MODELS[model_name]
        model_kwargs = {} if model_kwargs is None else model_kwargs
        policy_kwargs = {} if policy_kwargs is None else policy_kwargs

        # add device settings
        model_kwargs["device"] = device
        #policy_kwargs["device"] = device

        # add policy_kwargs inside model_kwargs
        model_kwargs["policy_kwargs"] = policy_kwargs

        # Default to use the MlpPolicy
        return model(env=self.env, policy="MlpPolicy", **model_kwargs)

    @staticmethod
    def train_model(model, env, tb_log_name=None, episodes=1):
        """Trains portfolio optimization model.

        Args:
            model: Instance of the model.
            episoded: Number of episodes.

        Returns:
            An instance of the trained model.
        """
        max_steps = len(env._df['date'].unique())

        print("Max number of time steps in an episode: ", max_steps)

        checkpoint_callback = CheckpointCallback(
            save_freq=1000000,
            save_path="./results/",
            name_prefix="model_checkpoint",
            save_replay_buffer=True,
            save_vecnormalize=True,
        )

        model.learn(
            total_timesteps = max_steps * episodes,
            callback=checkpoint_callback,
            tb_log_name=tb_log_name,
            progress_bar=False
        )
        return model

    @staticmethod
    def DRL_prediction(model, env, deterministic=True, verbose=False):
        """make a prediction and get results"""
        test_env, test_obs = env.get_sb_env()
        
        test_env.reset()
        max_steps = len(env._df['date'].unique())

        validation_assets = None
        validation_dates = None

        for i in range(max_steps):
            action, _states = model.predict(test_obs, deterministic=deterministic)

            if(verbose):
                print("Step: ", str(i))
                print("Observations: ")
                print(test_obs)
                print("Actions: ")
                print(action)

            # Pull out the latest assets and dates
            validation_assets = env._asset_memory["final"]
            validation_dates = env._date_memory

            test_obs, rewards, dones, info = test_env.step(action)

            if dones[0]:
                print("hit end!")
                break
        
        return validation_assets, validation_dates

In [4]:
df_train = portfolio_norm_df[(portfolio_norm_df["date"] >= TRAIN_START_DATE) & (portfolio_norm_df["date"] <= TRAIN_END_DATE)]
# df_2021 = portfolio_norm_df[(portfolio_norm_df["date"] >= TEST_START_DATE) & (portfolio_norm_df["date"] <= "2021-12-31")]
# df_2022 = portfolio_norm_df[(portfolio_norm_df["date"] >= "2022-01-01") & (portfolio_norm_df["date"] <= "2022-12-31")]
# df_2023 = portfolio_norm_df[(portfolio_norm_df["date"] >= "2023-01-01") & (portfolio_norm_df["date"] < TEST_END_DATE)]

# TODO use the start and end date here

df_train.groupby("tic").count()

Unnamed: 0_level_0,date,open,high,low,close,volume,day
tic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAPL,3211,3211,3211,3211,3211,3211,3211
BA,3211,3211,3211,3211,3211,3211,3211
INTC,3211,3211,3211,3211,3211,3211,3211
MSFT,3211,3211,3211,3211,3211,3211,3211
V,3211,3211,3211,3211,3211,3211,3211
WMT,3211,3211,3211,3211,3211,3211,3211


In [5]:
import torch
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [6]:
# Fit for the portfolio optimization model

from sklearn.preprocessing import MaxAbsScaler
from finrl.meta.preprocessor.preprocessors import GroupByScaler

from finrl.meta.preprocessor.preprocessors import data_split

from datetime import datetime, timedelta


environment_ppo = PortfolioOptimizationEnv(
    portfolio_norm_df,
    initial_amount=INITIAL_CASH,
    comission_fee_pct=COMMISSION_FEE_PERCENT,
    # time_window=TIME_WINDOW,
    features=["close", "high", "low"],
    normalize_df=None,
    reward_scaling=1e-4,
)

In [7]:
agent_ppo = DRLStableAgent(env = environment_ppo)
PPO_PARAMS = {
    "n_steps": 2048,
    "batch_size": 64,
    "ent_coef": 0.01,
    "learning_rate": 0.00025, # TODO tried raising the lr which caused vanishing problem
    "clip_range": 0.1,
    # "gae_lambda": 0.001,
}

POLICY_PARAMS = {
    "log_std_init": 1
}

# Lower clip_range makes the stocks flatter, very conservative policy

# TODO try playing around with the number of epochs? n_epochs
# TODO try playing around more with the entropy term, make sure agent does enough exploration during training
# TODO try playing around more with the clip papram here


model_ppo = agent_ppo.get_model("ppo", device, model_kwargs=PPO_PARAMS, policy_kwargs=POLICY_PARAMS)

# set up logger
tmp_path = RESULTS_DIR + '/ppo'
new_logger_ppo = configure(tmp_path, ["stdout", "csv", "tensorboard"])
# Set new logger
model_ppo.set_logger(new_logger_ppo)

Logging to results/ppo




In [8]:
model_ppo = DRLStableAgent.train_model(model_ppo, env=environment_ppo, episodes=5)

from finrl.config import TRAINED_MODEL_DIR

environment_ppo.reset()

model_ppo.save(TRAINED_MODEL_DIR + "/agent_opt_ppo_update")

  weights = actions / action_sum


Max number of time steps in an episode:  3211
HERE actions
HERE2  weights
[1. 0. 0. 0. 0. 0. 0.]
HERE actions
HERE2  weights
[0.         0.21363473 0.2357753  0.         0.         0.275295
 0.275295  ]
HERE actions
HERE2  weights
[0.05267202 0.473664   0.         0.473664   0.         0.
 0.        ]
HERE actions
HERE2  weights
[0.25 0.   0.   0.25 0.25 0.25 0.  ]
HERE actions
HERE2  weights
[0.         0.15592505 0.19700223 0.21569091 0.21569091 0.
 0.21569091]
HERE actions
HERE2  weights
[0.25 0.25 0.25 0.   0.   0.25 0.  ]
HERE actions
HERE2  weights
[0.         0.         0.19734548 0.2675515  0.2675515  0.
 0.2675515 ]
HERE actions
HERE2  weights
[0. 0. 0. 0. 0. 1. 0.]
HERE actions
HERE2  weights
[0.2 0.2 0.2 0.2 0.  0.2 0. ]
HERE actions
HERE2  weights
[0.25 0.25 0.25 0.   0.   0.   0.25]
HERE actions
HERE2  weights
[0.45090568 0.         0.         0.         0.5490943  0.
 0.        ]
HERE actions
HERE2  weights
[0.32811558 0.         0.32811558 0.32811558 0.         0.
 0.015

  weights = actions / action_sum


HERE actions
HERE2  weights
[0.  0.  0.5 0.  0.  0.5 0. ]
HERE actions
HERE2  weights
[0.5 0.  0.  0.5 0.  0.  0. ]
HERE actions
HERE2  weights
[0.23653685 0.23653685 0.23653685 0.13717817 0.         0.1532113
 0.        ]
HERE actions
HERE2  weights
[0.         0.33333334 0.         0.         0.33333334 0.33333334
 0.        ]
HERE actions
HERE2  weights
[0.33333334 0.33333334 0.         0.33333334 0.         0.
 0.        ]
HERE actions
HERE2  weights
[0.  0.  0.5 0.  0.  0.5 0. ]
HERE actions
HERE2  weights
[0.         0.33333334 0.33333334 0.33333334 0.         0.
 0.        ]
HERE actions
HERE2  weights
[0.05272838 0.         0.2483924  0.2020944  0.         0.2483924
 0.2483924 ]
HERE actions
HERE2  weights
[0.15606129 0.19036418 0.11036243 0.22317058 0.22317058 0.
 0.09687092]
HERE actions
HERE2  weights
[0.   0.25 0.25 0.   0.25 0.   0.25]
HERE actions
HERE2  weights
[0.         0.         0.02470408 0.         0.         0.5071745
 0.46812147]
HERE actions
HERE2  weights
[0.0

  weights = actions / action_sum


HERE actions
HERE2  weights
[0.2 0.  0.2 0.2 0.2 0.2 0. ]
HERE actions
HERE2  weights
[0.23924989 0.         0.         0.24781331 0.17037576 0.342561
 0.        ]
HERE actions
HERE2  weights
[0.25 0.   0.   0.   0.25 0.25 0.25]
HERE actions
HERE2  weights
[0.33333334 0.         0.         0.33333334 0.         0.33333334
 0.        ]
HERE actions
HERE2  weights
[0.        0.        0.8622928 0.1377072 0.        0.        0.       ]
HERE actions
HERE2  weights
[0.04827574 0.4423504  0.50937384 0.         0.         0.
 0.        ]
HERE actions
HERE2  weights
[0.25 0.   0.25 0.   0.25 0.25 0.  ]
HERE actions
HERE2  weights
[0.         0.24816564 0.00733742 0.         0.24816564 0.24816564
 0.24816564]
HERE actions
HERE2  weights
[1. 0. 0. 0. 0. 0. 0.]
HERE actions
HERE2  weights
[0.12832023 0.17433596 0.17433596 0.17433596 0.17433596 0.17433596
 0.        ]
HERE actions
HERE2  weights
[0.11467867 0.         0.         0.44266066 0.44266066 0.
 0.        ]
HERE actions
HERE2  weights
[0.

  weights = actions / action_sum


HERE actions
HERE2  weights
[0.20112279 0.         0.20112279 0.19550881 0.20112279 0.20112279
 0.        ]
HERE actions
HERE2  weights
[0.  0.2 0.2 0.2 0.  0.2 0.2]
HERE actions
HERE2  weights
[0.24278806 0.24278806 0.24278806 0.         0.         0.24278806
 0.02884781]
HERE actions
HERE2  weights
[0.1948891  0.         0.2514973  0.05061902 0.2514973  0.
 0.2514973 ]
HERE actions
HERE2  weights
[0.   0.25 0.25 0.   0.25 0.25 0.  ]
HERE actions
HERE2  weights
[0.33333334 0.         0.33333334 0.33333334 0.         0.
 0.        ]
HERE actions
HERE2  weights
[0.         0.31227344 0.31227344 0.06317962 0.         0.31227344
 0.        ]
HERE actions
HERE2  weights
[0.        0.        0.5915519 0.        0.        0.        0.4084481]
HERE actions
HERE2  weights
[0.5818579  0.         0.         0.         0.41814214 0.
 0.        ]
HERE actions
HERE2  weights
[0.  0.  0.  0.5 0.  0.5 0. ]
HERE actions
HERE2  weights
[0.        0.        0.        0.7838302 0.2161698 0.        0.    

  weights = actions / action_sum


HERE2  weights
[1. 0. 0. 0. 0. 0. 0.]
HERE actions
HERE2  weights
[0.         0.1613261  0.         0.27955797 0.27955797 0.27955797
 0.        ]
HERE actions
HERE2  weights
[0.25424483 0.26795128 0.         0.26795128 0.         0.20985259
 0.        ]
HERE actions
HERE2  weights
[0.         0.11456522 0.         0.16036578 0.         0.36253452
 0.36253452]
HERE actions
HERE2  weights
[0.25 0.25 0.25 0.   0.   0.25 0.  ]
HERE actions
HERE2  weights
[0.16253908 0.         0.27915365 0.         0.27915365 0.
 0.27915365]
HERE actions
HERE2  weights
[0.5 0.  0.  0.  0.  0.5 0. ]
HERE actions
HERE2  weights
[1. 0. 0. 0. 0. 0. 0.]
HERE actions
HERE2  weights
[0.  0.5 0.5 0.  0.  0.  0. ]
HERE actions
HERE2  weights
[0.         0.4906351  0.         0.         0.         0.01872975
 0.4906351 ]
HERE actions
HERE2  weights
[0.  0.5 0.  0.  0.  0.  0.5]
HERE actions
HERE2  weights
[0.         0.37354767 0.05838798 0.19451667 0.37354767 0.
 0.        ]
HERE actions
HERE2  weights
[0.17356339 

  weights = actions / action_sum


HERE actions
HERE2  weights
[0.25 0.25 0.   0.   0.   0.25 0.25]
HERE actions
HERE2  weights
[0.18809348 0.18809348 0.18809348 0.         0.18809348 0.05953263
 0.18809348]
HERE actions
HERE2  weights
[0.         0.33333334 0.33333334 0.33333334 0.         0.
 0.        ]
HERE actions
HERE2  weights
[0.29349422 0.00911015 0.29349422 0.         0.20975061 0.19415084
 0.        ]
HERE actions
HERE2  weights
[0.         0.29462752 0.29462752 0.         0.         0.29462752
 0.11611749]
HERE actions
HERE2  weights
[0.36583462 0.         0.36583462 0.26052755 0.00780316 0.
 0.        ]
HERE actions
HERE2  weights
[0.28423148 0.         0.14730552 0.28423148 0.         0.28423148
 0.        ]
HERE actions
HERE2  weights
[0. 0. 0. 0. 0. 0. 1.]
HERE actions
HERE2  weights
[0.  0.5 0.  0.  0.  0.  0.5]
HERE actions
HERE2  weights
[0.2041136  0.2041136  0.18354557 0.2041136  0.         0.2041136
 0.        ]
HERE actions
HERE2  weights
[0.24596411 0.24596411 0.24596411 0.24596411 0.01614351 0.


  weights = actions / action_sum


HERE actions
HERE2  weights
[0.         0.30147386 0.3492631  0.         0.3492631  0.
 0.        ]
HERE actions
HERE2  weights
[0.16391441 0.2786952  0.         0.         0.         0.2786952
 0.2786952 ]
HERE actions
HERE2  weights
[0.2569106  0.2569106  0.22926824 0.         0.2569106  0.
 0.        ]
HERE actions
HERE2  weights
[0.         0.296417   0.         0.26704246 0.         0.2843196
 0.15222092]
HERE actions
HERE2  weights
[0.27662015 0.27662015 0.         0.17013961 0.27662015 0.
 0.        ]
HERE actions
HERE2  weights
[0.         0.21486135 0.         0.21486135 0.21486135 0.14055459
 0.21486135]
HERE actions
HERE2  weights
[0.25 0.   0.25 0.   0.25 0.25 0.  ]
HERE actions
HERE2  weights
[0.23783287 0.         0.         0.23783287 0.23783287 0.04866849
 0.23783287]
HERE actions
HERE2  weights
[0.        0.        0.1990629 0.        0.        0.        0.8009371]
HERE actions
HERE2  weights
[0.2 0.2 0.2 0.  0.2 0.2 0. ]
HERE actions
HERE2  weights
[0.33333334 0.     

  weights = actions / action_sum


HERE actions
HERE2  weights
[0.14343414 0.         0.28552195 0.28552195 0.28552195 0.
 0.        ]
HERE actions
HERE2  weights
[0.3678741 0.2642518 0.        0.        0.        0.        0.3678741]
HERE actions
HERE2  weights
[0.  0.2 0.  0.2 0.2 0.2 0.2]
HERE actions
HERE2  weights
[0.18631488 0.40684253 0.         0.         0.         0.40684253
 0.        ]
HERE actions
HERE2  weights
[0.2916871  0.14366528 0.         0.27296054 0.2916871  0.
 0.        ]
HERE actions
HERE2  weights
[0.33333334 0.33333334 0.33333334 0.         0.         0.
 0.        ]
HERE actions
HERE2  weights
[0.36922246 0.         0.         0.02778306 0.36922246 0.23377202
 0.        ]
HERE actions
HERE2  weights
[0.33333334 0.33333334 0.33333334 0.         0.         0.
 0.        ]
HERE actions
HERE2  weights
[0.25 0.25 0.   0.   0.25 0.25 0.  ]
HERE actions
HERE2  weights
[0.2493532  0.25028455 0.         0.2071894  0.04288834 0.25028455
 0.        ]
HERE actions
HERE2  weights
[0.         0.18883595 0.

  weights = actions / action_sum


HERE actions
HERE2  weights
[0.11639883 0.02628049 0.21433015 0.21433015 0.21433015 0.
 0.21433015]
HERE actions
HERE2  weights
[0.25 0.   0.   0.25 0.25 0.25 0.  ]
HERE actions
HERE2  weights
[0. 0. 0. 1. 0. 0. 0.]
HERE actions
HERE2  weights
[0.         0.         0.41042522 0.         0.         0.41042522
 0.1791496 ]
HERE actions
HERE2  weights
[0.20049256 0.05902239 0.24682836 0.         0.24682836 0.24682836
 0.        ]
HERE actions
HERE2  weights
[0.         0.         0.1405473  0.         0.         0.5449577
 0.31449503]
HERE actions
HERE2  weights
[0.         0.04189239 0.47905383 0.         0.         0.47905383
 0.        ]
HERE actions
HERE2  weights
[0.11292455 0.         0.22341105 0.22341105 0.         0.22341105
 0.21684226]
HERE actions
HERE2  weights
[0.         0.46314877 0.         0.46314877 0.07370247 0.
 0.        ]
HERE actions
HERE2  weights
[0.32417727 0.         0.         0.33791137 0.33791137 0.
 0.        ]
HERE actions
HERE2  weights
[0.14589569 0.170

  weights = actions / action_sum


HERE actions
HERE2  weights
[0.3862428  0.         0.         0.         0.22751443 0.3862428
 0.        ]
HERE actions
HERE2  weights
[0.12072607 0.         0.         0.2930913  0.2930913  0.2930913
 0.        ]
HERE actions
HERE2  weights
[0.25 0.25 0.   0.   0.25 0.25 0.  ]
HERE actions
HERE2  weights
[0.         0.33333334 0.         0.         0.         0.33333334
 0.33333334]
HERE actions
HERE2  weights
[0.28586057 0.28586057 0.         0.         0.14241831 0.28586057
 0.        ]
HERE actions
HERE2  weights
[0. 0. 0. 0. 1. 0. 0.]
HERE actions
HERE2  weights
[0.10666307 0.         0.27596295 0.27596295 0.27596295 0.02696319
 0.0384849 ]
HERE actions
HERE2  weights
[0.         0.         0.         0.07415627 0.         0.
 0.9258437 ]
HERE actions
HERE2  weights
[0.17620862 0.17620862 0.17620862 0.11895692 0.         0.17620862
 0.17620862]
HERE actions
HERE2  weights
[0.         0.         0.         0.32273105 0.33863446 0.
 0.33863446]
HERE actions
HERE2  weights
[0.       

  weights = actions / action_sum


HERE actions
HERE2  weights
[0.         0.         0.02184509 0.48907748 0.         0.48907748
 0.        ]
HERE actions
HERE2  weights
[0.32939658 0.         0.25111496 0.         0.09009189 0.
 0.32939658]
HERE actions
HERE2  weights
[0.33333334 0.         0.33333334 0.         0.33333334 0.
 0.        ]
HERE actions
HERE2  weights
[0.5 0.5 0.  0.  0.  0.  0. ]
HERE actions
HERE2  weights
[0.2555866  0.         0.         0.09985663 0.2555866  0.13338356
 0.2555866 ]
HERE actions
HERE2  weights
[0.         0.03166961 0.         0.         0.48416522 0.48416522
 0.        ]
HERE actions
HERE2  weights
[0.         0.         0.         0.33333334 0.33333334 0.33333334
 0.        ]
HERE actions
HERE2  weights
[0.         0.28092757 0.         0.3209685  0.3209685  0.
 0.07713538]
HERE actions
HERE2  weights
[0.         0.33333334 0.33333334 0.33333334 0.         0.
 0.        ]
HERE actions
HERE2  weights
[0.         0.33333334 0.         0.         0.33333334 0.
 0.33333334]
HERE actio

  weights = actions / action_sum
  weights = actions / action_sum


HERE actions
HERE2  weights
[0.         0.         0.         0.7435148  0.25648522 0.
 0.        ]
HERE actions
HERE2  weights
[0.  0.5 0.  0.  0.  0.5 0. ]
HERE actions
HERE2  weights
[0.25 0.   0.   0.25 0.   0.25 0.25]
HERE actions
HERE2  weights
[0.         0.35828653 0.         0.2834269  0.         0.
 0.35828653]
HERE actions
HERE2  weights
[0.         0.         0.07631537 0.11171857 0.28233963 0.
 0.5296264 ]
HERE actions
HERE2  weights
[0.25945187 0.25945187 0.         0.25945187 0.         0.
 0.22164442]
HERE actions
HERE2  weights
[0.16666667 0.16666667 0.16666667 0.16666667 0.16666667 0.
 0.16666667]
HERE actions
HERE2  weights
[0.30795375 0.30795375 0.30795375 0.         0.         0.07613872
 0.        ]
HERE actions
HERE2  weights
[0.26271746 0.26271746 0.         0.         0.2118477  0.26271746
 0.        ]
HERE actions
HERE2  weights
[0.         0.14204714 0.45952994 0.         0.11383079 0.
 0.2845921 ]
HERE actions
HERE2  weights
[0.25 0.25 0.   0.25 0.   0.   0.

  weights = actions / action_sum


HERE2  weights
[1. 0. 0. 0. 0. 0. 0.]
HERE actions
HERE2  weights
[0.7325102  0.         0.26748976 0.         0.         0.
 0.        ]
HERE actions
HERE2  weights
[0.3125257  0.16319837 0.         0.         0.38039684 0.14387912
 0.        ]
HERE actions
HERE2  weights
[0.         0.07649508 0.30783498 0.         0.30783498 0.
 0.30783498]
HERE actions
HERE2  weights
[0.20706101 0.17864951 0.20706101 0.         0.20706101 0.
 0.20016746]
HERE actions
HERE2  weights
[0.  0.  0.  0.5 0.  0.  0.5]
HERE actions
HERE2  weights
[0.25 0.   0.25 0.   0.25 0.   0.25]
HERE actions
HERE2  weights
[0.         0.         0.27446237 0.3627688  0.3627688  0.
 0.        ]
HERE actions
HERE2  weights
[0.  0.  0.  0.  0.  0.5 0.5]
HERE actions
HERE2  weights
[0.   0.25 0.25 0.   0.25 0.25 0.  ]
HERE actions
HERE2  weights
[0.29078758 0.         0.3546062  0.3546062  0.         0.
 0.        ]
HERE actions
HERE2  weights
[0.  0.  0.  0.5 0.  0.5 0. ]
HERE actions
HERE2  weights
[0.         0.        

  weights = actions / action_sum


HERE actions
HERE2  weights
[0.  0.2 0.2 0.  0.2 0.2 0.2]
HERE actions
HERE2  weights
[0.5 0.  0.  0.  0.5 0.  0. ]
HERE actions
HERE2  weights
[0. 1. 0. 0. 0. 0. 0.]
HERE actions
HERE2  weights
[0.         0.         0.         0.         0.4069007  0.4069007
 0.18619856]
HERE actions
HERE2  weights
[0.         0.3203757  0.3203757  0.3203757  0.00098484 0.03788804
 0.        ]
HERE actions
HERE2  weights
[0.30879283 0.07705621 0.30879283 0.12034191 0.         0.
 0.18501624]
HERE actions
HERE2  weights
[0. 0. 1. 0. 0. 0. 0.]
HERE actions
HERE2  weights
[0. 0. 0. 0. 1. 0. 0.]
HERE actions
HERE2  weights
[0.         0.         0.         0.9134933  0.08650672 0.
 0.        ]
HERE actions
HERE2  weights
[0.2201785 0.119286  0.        0.2201785 0.        0.2201785 0.2201785]
HERE actions
HERE2  weights
[0.         0.20712903 0.         0.         0.32820648 0.
 0.46466446]
HERE actions
HERE2  weights
[0.5222707 0.        0.        0.        0.        0.        0.4777293]
HERE actions
HER

  weights = actions / action_sum


HERE actions
HERE2  weights
[0. 0. 0. 0. 0. 1. 0.]
HERE actions
HERE2  weights
[0.19076726 0.         0.19076726 0.19076726 0.19076726 0.04616372
 0.19076726]
HERE actions
HERE2  weights
[0.        0.        0.3195919 0.        0.6804081 0.        0.       ]
HERE actions
HERE2  weights
[0.5 0.  0.  0.5 0.  0.  0. ]
HERE actions
HERE2  weights
[0.2111514  0.         0.2111514  0.15719916 0.03924377 0.2111514
 0.17010291]
HERE actions
HERE2  weights
[0.25103772 0.25103772 0.         0.         0.         0.24688688
 0.25103772]
HERE actions
HERE2  weights
[0.24141043 0.04634627 0.         0.24141043 0.         0.22942238
 0.24141043]
HERE actions
HERE2  weights
[0.33333334 0.         0.         0.         0.33333334 0.
 0.33333334]
HERE actions
HERE2  weights
[0.         0.254488   0.254488   0.254488   0.23653603 0.
 0.        ]
HERE actions
HERE2  weights
[0.24357612 0.24357612 0.         0.02569551 0.24357612 0.24357612
 0.        ]
HERE actions
HERE2  weights
[0.         0.         0

  weights = actions / action_sum


HERE actions
HERE2  weights
[0.   0.   0.25 0.25 0.   0.25 0.25]
HERE actions
HERE2  weights
[0.         0.         0.40675887 0.         0.18648218 0.
 0.40675887]
HERE actions
HERE2  weights
[0.         0.61424845 0.38575152 0.         0.         0.
 0.        ]
HERE actions
HERE2  weights
[0.21093033 0.09830034 0.05797841 0.         0.21093033 0.21093033
 0.21093033]
HERE actions
HERE2  weights
[0.         0.         0.40623012 0.40623012 0.00251765 0.18502212
 0.        ]
HERE actions
HERE2  weights
[0.16513503 0.16697298 0.         0.16697298 0.16697298 0.16697298
 0.16697298]
HERE actions
HERE2  weights
[0.46480232 0.         0.         0.         0.         0.0703954
 0.46480232]
HERE actions
HERE2  weights
[0.         0.         0.28013277 0.28013277 0.         0.28013277
 0.15960175]
HERE actions
HERE2  weights
[0.5 0.  0.  0.  0.5 0.  0. ]
HERE actions
HERE2  weights
[0.27262053 0.         0.27262053 0.         0.         0.18213837
 0.27262053]
HERE actions
HERE2  weights
[0

  weights = actions / action_sum


KeyboardInterrupt: 

In [None]:
from stable_baselines3 import PPO
from finrl.config import TRAINED_MODEL_DIR


# Load the trained models
# trained_ppo_opt = PPO.load(TRAINED_MODEL_DIR + "/agent_opt_ppo_10_27") 


trained_ppo_opt = PPO.load(TRAINED_MODEL_DIR + "/agent_opt_ppo_update") 

In [None]:


PPO_results = {
    "train": {},
}

values, dates = DRLStableAgent.DRL_prediction(trained_ppo_opt, environment_ppo, verbose=True)
PPO_results["train"]["value"] = environment_ppo._terminal_asset_memory["final"]
PPO_results["train"]["date"] = environment_ppo._terminal_date_memory


# Write this out to a csv file, with date and net worth
df_ppo_opt = pd.DataFrame(PPO_results["train"]["value"], columns=['ppo_opt_net_worth'])
df_ppo_date = pd.DataFrame(PPO_results["train"]["date"], columns=['Date'])
if len(df_ppo_opt) == len(df_ppo_date):
    df_ppo_opt['Date'] = df_ppo_date['Date']
else:
    raise ValueError("DataFrames do not have the same number of rows.")

print(df_ppo_opt)


print(df_ppo_opt.loc[0, 'Date'])
