# Asset Portfolio Management using Deep Reinforcement Learning
---

## 6.0 Deep Reinforcement Learning Portfolio without transaction costs

### 6.1 Import Packages

In [58]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pylab as plt
matplotlib.use('Agg')
import datetime
from pypfopt import EfficientFrontier, objective_functions
from pypfopt import risk_models
from pypfopt import expected_returns
from gym.utils import seeding
import gym
from gym import spaces
matplotlib.use('Agg')
from stable_baselines3.common.vec_env import DummyVecEnv

In [60]:
#pip install shimmy

In [62]:
#pip install stockstats

In [64]:
#pip install exchange-calendars

In [66]:
#pip install wrds

### 6.2 Load Data

In [69]:
# Read the train and test CSV files from the local ./datasets folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./datasets/train_data.csv', './datasets/test_data.csv')
)

In [71]:
# Column names handed to the environment through `env_kwargs["tech_indicator_list"]`.
tech_indicator_list = [
    "open",
    "high",
    "low",
    "close",
    "volume",
    "atr",
    "sma",
    "rsi",
    "oil_chg",
    "yield_chg",
]

In [73]:
#train_data

### 6.3 Implement DRL

In [76]:
import importlib
import my_env
from my_env import StockPortfolioEnv
import gym

In [78]:
import my_models
from my_models import DRLAgent

In [80]:
# Number of distinct tickers in the training data; `state_space` mirrors it.
# NOTE(review): `state_space` is not referenced by the `env_kwargs` cell, which
# hard-codes 15 instead — confirm which value the environment expects.
state_space = stock_dimension = len(train_data["tic"].unique())


In [82]:
# Equal-weight allocation: one entry of 1/N for each of the N tickers.
weights_initial = [1 / stock_dimension for _ in range(stock_dimension)]

In [84]:
# Keyword arguments shared by every StockPortfolioEnv built in this notebook
# (they are unpacked with ** when the train and trade environments are created).
env_kwargs = dict(
    hmax=500,
    initial_amount=1000000,
    transaction_cost_pct=0,  # this section models the portfolio without transaction costs
    # NOTE(review): hard-coded to 15 although `state_space = stock_dimension` is
    # computed in an earlier cell — confirm which value the environment expects.
    state_space=15,
    stock_dim=stock_dimension,
    tech_indicator_list=tech_indicator_list,
    action_space=stock_dimension,
    reward_scaling=1,
    initial_weights=[1 / stock_dimension] * stock_dimension,  # equal weights
)

In [86]:
#print(train_data.dtypes)

In [88]:
#train_data

In [90]:
#train_data['date'] = pd.to_datetime(train_data['date'], errors='coerce')
#train_data['date'] = train_data['date'].map(pd.Timestamp.timestamp).astype('float64')
#train_data['tic'] = train_data['tic'].astype('category').cat.codes.astype('float64')
#train_data['obv'] = train_data['obv'].astype('float64')
#train_data['volume'] = train_data['volume'].astype('float64')
#print(train_data.dtypes)

In [92]:
# Training environment: the training data plus the shared environment settings.
e_train_gym = StockPortfolioEnv(
    df=train_data,
    **env_kwargs,
)

In [94]:
# `get_sb_env()` returns a pair; only the environment (first item) is used here.
# The second item gets an explicit throwaway name so IPython's `_` is not shadowed.
env_train, _train_obs = e_train_gym.get_sb_env()
print(type(env_train))

<class 'stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv'>




In [96]:
# First sub-environment held by the vectorized wrapper, kept for inspection/debugging
# (presumably the StockPortfolioEnv created above — verify if the wrapper changes).
real_env = env_train.envs[0]
#print("Shape of self.data.close:", real_env.data.close.shape)
#print("Shape of self.action space:", real_env.action_space.shape)
#print("Shape of self.state", real_env.state.shape)
#print("Length of date_memory:", len(real_env.date_memory))
#print(real_env.data['date'].iloc[0])  # Should print the first date
#print("Length of actions_memory:", len(real_env.actions_memory))
#actions = real_env.actions_memory[-1]  # Replace this with how you're storing actions
#weights = real_env.softmax_normalization(actions)
#print("Shape of weights:", weights.shape)
#print("Weights:", weights)

In [98]:
#real_env.data

In [100]:
#real_env.df

In [102]:
#real_env.data.close

#### 6.3.1 Model: A2C

In [105]:
# initialize
agent = DRLAgent(env = env_train)

#A2C_PARAMS = {"n_steps": 5, "ent_coef": 0.005, "learning_rate": 0.0002}
A2C_PARAMS = {
    "n_steps": 10,  # Number of steps to roll out the environment
    "gamma": 0.95,  # Discount factor
    "learning_rate": 0.0002,  # Learning rate for the optimizer
    "ent_coef": 0.01,  # Entropy coefficient for exploration
    "vf_coef": 0.5,  # Value function coefficient
    "max_grad_norm": 0.3,  # Maximum norm for gradient clipping
}
model_a2c = agent.get_model(model_name="a2c",model_kwargs = A2C_PARAMS)

{'n_steps': 10, 'gamma': 0.95, 'learning_rate': 0.0002, 'ent_coef': 0.01, 'vf_coef': 0.5, 'max_grad_norm': 0.3}
Using cpu device


In [107]:
from stable_baselines3.common.callbacks import BaseCallback
import matplotlib.pyplot as plt
import os

class PortfolioMonitorCallback(BaseCallback):
    """
    Track the environment's per-step reward and its running total during training.

    After every environment step the callback reads the ``reward`` attribute of
    the first sub-environment of the model's vectorized env and stores both the
    value and the running sum in memory. When training ends, each series is
    plotted and saved as a PNG file inside ``save_dir``.

    Parameters
    ----------
    verbose : int
        Verbosity level forwarded to ``BaseCallback``.
    save_dir : str
        Directory that receives ``portfolio_rewards.png`` and
        ``cumulative_rewards.png``. Defaults to ``"results"`` (the previously
        hard-coded location) and is created if it does not exist.
    """
    def __init__(self, verbose=0, save_dir="results"):
        super().__init__(verbose)
        self.save_dir = save_dir
        self.portfolio_rewards = []  # To store step-by-step rewards
        self.cumulative_rewards = []  # To store cumulative rewards
        self.cumulative_reward = 0.0  # Running total for cumulative rewards
        os.makedirs(self.save_dir, exist_ok=True)  # Ensure the output folder exists

    def _on_step(self) -> bool:
        """Record the latest reward and the updated running total; never stops training."""
        # self.model.env is a VecEnv, so the actual env is self.model.env.envs[0]
        env = self.model.env.envs[0]

        step_reward = env.reward
        self.portfolio_rewards.append(step_reward)

        self.cumulative_reward += step_reward
        self.cumulative_rewards.append(self.cumulative_reward)

        return True  # Return False to stop training early if needed

    def _save_curve(self, values, label, ylabel, filename, **plot_kwargs) -> None:
        """Plot ``values`` against the time step and save the figure as ``save_dir/filename``."""
        # Re-create the folder in case it was removed while training was running.
        os.makedirs(self.save_dir, exist_ok=True)
        fig, ax = plt.subplots(figsize=(8, 4))
        try:
            ax.plot(values, label=label, color='green', **plot_kwargs)
            ax.set_xlabel("Time Step")
            ax.set_ylabel(ylabel)
            ax.legend()
            fig.savefig(os.path.join(self.save_dir, filename))
        finally:
            # Always release the figure, even if saving fails.
            plt.close(fig)

    def _on_training_end(self) -> None:
        """Save the reward curve and the cumulative-reward curve to disk."""
        self._save_curve(
            self.portfolio_rewards,
            label="Portfolio Reward",
            ylabel="Reward",
            filename="portfolio_rewards.png",
            linewidth=0.1,
        )
        self._save_curve(
            self.cumulative_rewards,
            label="Cumulative Reward",
            ylabel="Cumulative Reward",
            filename="cumulative_rewards.png",
        )

In [109]:
# Build the reward-tracking callback and a fresh A2C model, then train.
monitor_callback = PortfolioMonitorCallback()
model_a2c = agent.get_model(model_name="a2c", model_kwargs=A2C_PARAMS)

trained_a2c = agent.train_model(
    callback=monitor_callback,
    model=model_a2c,
    tb_log_name='a2c',
    total_timesteps=100000,
    log_interval=10,
)

{'n_steps': 10, 'gamma': 0.95, 'learning_rate': 0.0002, 'ent_coef': 0.01, 'vf_coef': 0.5, 'max_grad_norm': 0.3}
Using cpu device
Logging to tensorboard_log/a2c\a2c_415
------------------------------------
| time/                 |          |
|    fps                | 235      |
|    iterations         | 10       |
|    time_elapsed       | 0        |
|    total_timesteps    | 100      |
| train/                |          |
|    entropy_loss       | -14.2    |
|    explained_variance | -267     |
|    learning_rate      | 0.0002   |
|    n_updates          | 9        |
|    policy_loss        | -1.54    |
|    std                | 1        |
|    value_loss         | 0.141    |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 247      |
|    iterations         | 20       |
|    time_elapsed       | 0        |
|    total_timesteps    | 200      |
| train/                |          |
|    entropy_loss 

In [16]:
%load_ext tensorboard
%tensorboard --logdir tensorboard_log
#%tensorboard --logdir tensorboard_log/a2c --host localhost --port 6007
#%tensorboard --logdir=tensorboard_log/a2c --host 0.0.0.0 --port 6008

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 23920), started 4 days, 4:36:46 ago. (Use '!kill 23920' to kill it.)

### 6.4 Backtesting

In [113]:
#A2C
e_trade_gym = StockPortfolioEnv(df = test_data, **env_kwargs)
env_trade, obs_trade = e_trade_gym.get_sb_env()

a2c_test_daily_return, a2c_test_weights = DRLAgent.DRL_prediction(model=trained_a2c,
                        test_data = test_data,
                        test_env = env_trade,
                        test_obs = obs_trade)



In [114]:
# `DataFrame.to_csv` does not create missing parent folders, so make sure the
# output directory exists before writing (same approach the callback uses for results/).
os.makedirs('weights', exist_ok=True)
a2c_test_weights.to_csv('weights/a2c_test_weights.csv')

In [115]:
a2c_test_weights

tic,BOREO,ELISA,ICP1V,MEKKO,NDA-FI,NESTE,OLVAS,SAMPO,UPM,YIT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-01-03,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
2023-01-04,0.230318,0.000000,0.230318,0.002210,0.230318,0.000000,0.230318,0.000000,0.076519,0.000000
2023-01-05,0.267989,0.091622,0.000000,0.000000,0.267989,0.104412,0.000000,0.000000,0.000000,0.267989
2023-01-09,0.036414,0.183312,0.000000,0.074548,0.000000,0.183312,0.000000,0.155788,0.183312,0.183312
2023-01-10,0.256263,0.000000,0.236219,0.256263,0.000000,0.242695,0.008560,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
2024-12-19,0.200000,0.000000,0.200000,0.200000,0.200000,0.000000,0.000000,0.000000,0.000000,0.200000
2024-12-20,0.000000,0.334561,0.000000,0.000000,0.330877,0.000000,0.334561,0.000000,0.000000,0.000000
2024-12-23,0.250000,0.250000,0.250000,0.000000,0.250000,0.000000,0.000000,0.000000,0.000000,0.000000
2024-12-27,0.226167,0.226167,0.000000,0.000000,0.226167,0.000000,0.095331,0.226167,0.000000,0.000000


### 6.5 Save the Portfolios

In [117]:
# Write the A2C daily test returns to the datasets folder, omitting the index column.
a2c_test_daily_return.to_csv(
    'datasets/a2c_test_daily_return.csv',
    index=False,
)