# Deep Reinforcement Learning for Portfolio Optimization


This experiment demonstrates the application of deep reinforcement learning (DRL) techniques for portfolio optimization.

By leveraging state-of-the-art DRL algorithms, we aim to create a robust trading strategy that dynamically adjusts portfolio allocations to maximize returns while minimizing risks.

The workflow includes:

- Data preprocessing
- Feature engineering
- Environment setup
- Training of DRL agents
- Backtesting
- Benchmarking against traditional strategies


## Dependencies


In [39]:
! pip install pandas numpy matplotlib \
               stable-baselines3 \
               PyPortfolioOpt \
               pandas_market_calendars quantstats gymnasium \
               git+https://github.com/AI4Finance-Foundation/FinRL.git -q

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

from stable_baselines3 import A2C, PPO, SAC, DDPG
from stable_baselines3.common.noise import NormalActionNoise

from finrl import config
from finrl import config_tickers
from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl.meta.env_portfolio_allocation.env_portfolio import StockPortfolioEnv
from finrl.agents.stablebaselines3.models import DRLAgent
from finrl.plot import backtest_stats, get_daily_return, get_baseline, convert_daily_return_to_pyfolio_ts

from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt import risk_models

%matplotlib inline

## Data loading and pre-processing


In [None]:
def download_data(tickers, start_date, end_date):
    """Fetch price data for ``tickers`` through FinRL's ``YahooDownloader``.

    Args:
        tickers: Ticker symbols, forwarded as the downloader's ``ticker_list``.
        start_date: Start of the requested date range.
        end_date: End of the requested date range.

    Returns:
        Whatever ``YahooDownloader.fetch_data()`` returns for this request.
    """
    # Announce the range first so long downloads are easy to identify in the log.
    print(f"Downloading {start_date} → {end_date}")
    downloader = YahooDownloader(
        ticker_list=tickers,
        start_date=start_date,
        end_date=end_date,
    )
    return downloader.fetch_data()

In [42]:
def preprocess_data(df):
    """Run FinRL feature engineering on ``df``.

    The ``FeatureEngineer`` is created with both technical indicators and the
    turbulence index switched on; its ``preprocess_data`` result is returned
    unchanged.
    """
    engineer = FeatureEngineer(use_turbulence=True, use_technical_indicator=True)
    enriched = engineer.preprocess_data(df)
    return enriched

In [None]:
# Universe of 30 ticker symbols to download.
# NOTE(review): these look like Dow Jones Industrial Average constituents —
# confirm the list is still current before re-running.
tickers = [
    "AXP",
    "AMGN",
    "AAPL",
    "BA",
    "CAT",
    "CSCO",
    "CVX",
    "GS",
    "HD",
    "HON",
    "IBM",
    "INTC",
    "JNJ",
    "KO",
    "JPM",
    "MCD",
    "MMM",
    "MRK",
    "MSFT",
    "NKE",
    "PG",
    "TRV",
    "UNH",
    "V",
    "VZ",
    "WBA",
    "WMT",
    "DIS",
    "RTX",
    "CRM",
]

# The history starts at a fixed date and ends at yesterday's date, computed
# from the wall clock, so the dataset changes from one run to the next.
start_date = "2010-01-01"
end_date = (datetime.now() - pd.Timedelta(days=1)).strftime("%Y-%m-%d")

# Download the raw prices, then run feature engineering on them.
df = download_data(tickers, start_date, end_date)
df_feat = preprocess_data(df)

Downloading 2010-01-01 → 2025-04-24


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

Shape of DataFrame:  (115500, 8)
Successfully added technical indicators
Successfully added turbulence index


## Covariance & Returns for State


In [None]:
def compute_covariance_and_returns(df_feat, lookback=252):
    """Attach a rolling covariance matrix and return window to every row.

    For each date that has at least ``lookback`` earlier dates, the close
    prices of that date and the ``lookback`` dates before it are pivoted to a
    date x ticker table, converted to percentage changes, and summarised as a
    covariance matrix.

    Args:
        df_feat: Long-format frame with ``date``, ``tic`` and ``close`` columns.
        lookback: Number of preceding dates included in each window.

    Returns:
        ``df_feat`` left-joined on ``date`` with two extra columns:
        ``cov_list`` (covariance matrix as an array) and ``return_list``
        (the window's return frame). Rows without a window are dropped.
    """
    ordered = df_feat.sort_values(["date", "tic"], ignore_index=True)
    # Label each row with the ordinal of its date so that label-based slicing
    # below selects whole dates rather than individual rows.
    ordered.index = ordered.date.factorize()[0]
    unique_dates = ordered.date.unique()

    covariances = []
    window_returns = []
    for end in range(lookback, len(unique_dates)):
        start = end - lookback
        # .loc slicing is inclusive at both ends: lookback + 1 dates per window.
        window = ordered.loc[start:end]
        prices = window.pivot_table(index="date", columns="tic", values="close")
        returns = prices.pct_change().dropna()
        covariances.append(returns.cov().values)
        window_returns.append(returns)

    per_date = pd.DataFrame(
        {
            "date": unique_dates[lookback:],
            "cov_list": covariances,
            "return_list": window_returns,
        }
    )
    merged = pd.merge(df_feat, per_date, on="date", how="left")
    # Dates inside the initial warm-up period have no covariance entry.
    return merged.dropna(subset=["cov_list"])

In [45]:
# Add per-date covariance matrices and return windows (default lookback of 252
# dates); rows from the initial warm-up period are dropped by the helper.
df_all = compute_covariance_and_returns(df_feat)

## Train/Trade split


In [46]:
def split_data(df_all, train_dates, test_dates):
    """Split ``df_all`` into training and testing frames with ``data_split``.

    Args:
        df_all: Full feature frame.
        train_dates: Positional arguments for ``data_split`` after the frame
            (used here as a ``(start, end)`` pair).
        test_dates: Same, for the test period.

    Returns:
        ``(train, test)`` tuple of the two ``data_split`` results.
    """
    return tuple(data_split(df_all, *span) for span in (train_dates, test_dates))

In [None]:
# Train on 2010-2020; everything from 2021 up to end_date is held out for
# backtesting.
train_dates = ("2010-01-01", "2020-12-31")
test_dates = ("2021-01-01", end_date)

train, test = split_data(df_all, train_dates, test_dates)

## Environment setup


In [None]:
def configure_environment(train, test, fe):
    """Create Stable-Baselines3-ready portfolio environments for both splits.

    Args:
        train: Training frame; its distinct ``tic`` values set the asset count.
        test: Backtest frame.
        fe: Object exposing ``tech_indicator_list`` (a ``FeatureEngineer`` in
            this notebook).

    Returns:
        ``(env_train_sb3, env_test_sb3, env_kwargs)``: the first element of
        each environment's ``get_sb_env()`` result, plus the keyword arguments
        shared by both ``StockPortfolioEnv`` instances.
    """
    n_assets = len(train.tic.unique())
    env_kwargs = {
        "stock_dim": n_assets,
        "hmax": 100,
        "initial_amount": 1e6,
        "transaction_cost_pct": 0.001,
        "reward_scaling": 1e-4,
        # State and action spaces are both sized by the number of assets.
        "state_space": n_assets,
        "action_space": n_assets,
        "tech_indicator_list": fe.tech_indicator_list,
        # TODO: turbulence_threshold=fe.turbulence_threshold,
    }

    # Build both raw environments first, then wrap them, in train/test order.
    raw_envs = [StockPortfolioEnv(df=frame, **env_kwargs) for frame in (train, test)]
    env_train_sb3, env_test_sb3 = (env.get_sb_env()[0] for env in raw_envs)
    return env_train_sb3, env_test_sb3, env_kwargs

In [None]:
# A default FeatureEngineer is passed only so configure_environment can read
# its tech_indicator_list.
env_train_sb3, env_test_sb3, env_kwargs = configure_environment(
    train, test, FeatureEngineer()
)

## Training


In [None]:
def train_rl_agents(env_train_sb3, timesteps=200_000):
    """Train A2C, PPO, SAC and DDPG agents on the same environment.

    Each agent uses the ``"MlpPolicy"`` policy, is trained for ``timesteps``
    steps, and is saved to ``results/<name>_model``.

    Args:
        env_train_sb3: Training environment accepted by Stable-Baselines3.
        timesteps: ``total_timesteps`` passed to every ``learn`` call.

    Returns:
        Dict mapping algorithm name to its trained model, in training order.
    """
    action_dim = env_train_sb3.action_space.shape[-1]
    # Only DDPG is given explicit exploration noise.
    ddpg_noise = NormalActionNoise(
        mean=np.zeros(action_dim), sigma=0.1 * np.ones(action_dim)
    )
    algorithms = {
        "A2C": (A2C, {}),
        "PPO": (PPO, {}),
        "SAC": (SAC, {}),
        "DDPG": (DDPG, {"action_noise": ddpg_noise}),
    }

    trained = {}
    for label, (algo_cls, extra_kwargs) in algorithms.items():
        print(f"Training {label}...")
        agent = algo_cls("MlpPolicy", env_train_sb3, verbose=0, **extra_kwargs)
        agent.learn(total_timesteps=timesteps)
        trained[label] = agent
        agent.save(f"results/{label}_model")
    return trained

In [None]:
# Train all four agents with the default number of timesteps (200,000 each).
models = train_rl_agents(env_train_sb3)

Training A2C...
begin_total_asset:1000000.0
end_total_asset:4526705.276504569
Sharpe:  0.9645611665181906
begin_total_asset:1000000.0
end_total_asset:4407884.594115716
Sharpe:  0.9424184680629185
begin_total_asset:1000000.0
end_total_asset:4618795.128355286
Sharpe:  0.9788109678546886
begin_total_asset:1000000.0
end_total_asset:4522837.477134737
Sharpe:  0.9680127672757749
begin_total_asset:1000000.0
end_total_asset:4404655.045164579
Sharpe:  0.9520750906668861
begin_total_asset:1000000.0
end_total_asset:4477620.515757265
Sharpe:  0.9639195996256185
begin_total_asset:1000000.0
end_total_asset:4205063.396328809
Sharpe:  0.925079494373844
begin_total_asset:1000000.0
end_total_asset:4369285.600857628
Sharpe:  0.9413500024323063
begin_total_asset:1000000.0
end_total_asset:4416370.7868353035
Sharpe:  0.9514361508324057
begin_total_asset:1000000.0
end_total_asset:4643768.856934177
Sharpe:  0.9722558411801913
begin_total_asset:1000000.0
end_total_asset:4638371.719404037
Sharpe:  0.97485915345



begin_total_asset:1000000.0
end_total_asset:4444225.941176385
Sharpe:  0.9314760401079535
begin_total_asset:1000000.0
end_total_asset:4400402.850950147
Sharpe:  0.9237713552415274
begin_total_asset:1000000.0
end_total_asset:4400713.978236635
Sharpe:  0.9238320980324328
begin_total_asset:1000000.0
end_total_asset:4398623.729228515
Sharpe:  0.9235843769664085
begin_total_asset:1000000.0
end_total_asset:4398742.537807043
Sharpe:  0.923596632617525
begin_total_asset:1000000.0
end_total_asset:4399726.331278161
Sharpe:  0.9238441911012124
begin_total_asset:1000000.0
end_total_asset:4395442.785470747
Sharpe:  0.9233167139114684
begin_total_asset:1000000.0
end_total_asset:4401461.62684177
Sharpe:  0.9241649186746232


## Backtesting


In [None]:
def backtest_rl_strategies(models, env_test, env_kwargs):
    """Run every trained model over the test environment and collect results.

    Args:
        models: Dict mapping strategy name to a trained model.
        env_test: Test environment. Either the raw FinRL environment or the
            vectorised wrapper returned by its ``get_sb_env()``.
        env_kwargs: Environment settings; ``"initial_amount"`` scales the
            cumulative returns into an account value.

    Returns:
        Dict mapping strategy name to ``{"df": ..., "stats": ...}``, where
        ``df`` is the prediction frame with an added ``account_value`` column
        and ``stats`` is the ``backtest_stats`` output for it.
    """
    # Imported once here rather than on every loop iteration.
    from finrl.agents.stablebaselines3.models import DRLAgent

    # DRL_prediction calls ``get_sb_env()`` on the environment it is given, so
    # it needs the raw FinRL env. When handed the vectorised wrapper instead,
    # recover the wrapped env from its ``envs`` list.
    raw_env = env_test
    if not hasattr(raw_env, "get_sb_env"):
        raw_env = env_test.envs[0]

    initial_amount = env_kwargs["initial_amount"]
    results = {}
    for name, model in models.items():
        print(f"Backtesting {name}...")
        df_ret, _ = DRLAgent.DRL_prediction(model=model, environment=raw_env)
        # Compound the daily returns into a portfolio value series.
        df_ret["account_value"] = (df_ret["daily_return"] + 1).cumprod() * initial_amount
        stats = backtest_stats(df_ret, value_col_name="account_value")
        results[name] = {"df": df_ret, "stats": stats}
    return results

In [None]:
# Backtest every trained agent on the held-out test environment.
results = backtest_rl_strategies(models, env_test_sb3, env_kwargs)

## Benchmarks


In [None]:
def compute_mpt_benchmark(test, env_kwargs):
    """Backtest a daily-rebalanced minimum-volatility portfolio.

    On each test date the portfolio is re-optimised with PyPortfolioOpt's
    ``EfficientFrontier.min_volatility`` using that date's ``cov_list``
    matrix (long-only, weights in [0, 1]), fully invested at that date's
    close, and valued at the next date's close.

    Args:
        test: Test frame with ``date``, ``close`` and ``cov_list`` columns.
        env_kwargs: Environment settings; ``"initial_amount"`` is the starting
            portfolio value.

    Returns:
        ``{"df": ..., "stats": ...}`` where ``df`` has ``date``,
        ``account_value`` and ``daily_return`` columns and ``stats`` is the
        ``backtest_stats`` output for it.
    """
    dates_test = test.date.unique()
    min_vals = [env_kwargs["initial_amount"]]
    for today, tomorrow in zip(dates_test[:-1], dates_test[1:]):
        curr = test[test.date == today]
        nxt = test[test.date == tomorrow]
        # Every row of a given date carries the same covariance matrix.
        covm = np.array(curr.cov_list.values[0])
        ef = EfficientFrontier(None, covm, weight_bounds=(0, 1))
        ef.min_volatility()
        weights = np.array(list(ef.clean_weights().values()))
        # NOTE(review): assumes rows within a date follow the same ticker
        # order as the covariance matrix columns — confirm against the split.
        shares = weights * min_vals[-1] / curr.close.values
        min_vals.append(np.dot(shares, nxt.close.values))
    min_df = pd.DataFrame({"date": dates_test, "account_value": min_vals})
    # Expose daily returns like the other strategy frames do, so code that
    # reads ``daily_return`` (e.g. plotting) also works for this benchmark.
    min_df["daily_return"] = min_df["account_value"].pct_change().fillna(0)
    stats_mpt = backtest_stats(min_df, value_col_name="account_value")
    return {"df": min_df, "stats": stats_mpt}

In [None]:
def compute_benchmarks(test, env_kwargs):
    """Build the equal-weight and SPY buy-and-hold benchmarks.

    Args:
        test: Test frame with ``date``, ``tic`` and ``close`` columns.
        env_kwargs: Environment settings; ``"initial_amount"`` scales the
            cumulative returns into an account value.

    Returns:
        Dict with ``"EW"`` and ``"SPY"`` entries, each holding
        ``{"df": ..., "stats": ...}``. Both frames have ``date``,
        ``daily_return`` and ``account_value`` columns.
    """
    initial_amount = env_kwargs["initial_amount"]

    # Equal weight: each ticker's day-over-day return, averaged across tickers
    # per date. The returns must be taken along the date axis; differencing
    # the closes of different tickers within one date is not a return.
    closes = test.pivot_table(index="date", columns="tic", values="close")
    ew_daily = closes.pct_change().fillna(0).mean(axis=1)
    ew_df = ew_daily.reset_index(name="daily_return")
    ew_df["account_value"] = (ew_df.daily_return + 1).cumprod() * initial_amount
    stats_ew = backtest_stats(ew_df, value_col_name="account_value")

    spy = get_baseline("SPY", test.date.min(), test.date.max())

    # Returns come from the close series only: the baseline frame also holds
    # non-numeric columns, so a whole-frame pct_change is not usable.
    # NOTE(review): relies on get_baseline returning ``date`` and ``close``
    # columns — confirm against the installed FinRL version.
    spy_ret = spy.set_index("date")["close"].pct_change().dropna()
    spy_df = pd.DataFrame({"date": spy_ret.index, "daily_return": spy_ret.values})
    spy_df["account_value"] = (spy_df.daily_return + 1).cumprod() * initial_amount
    stats_spy = backtest_stats(spy_df, value_col_name="account_value")

    return {
        "EW": {"df": ew_df, "stats": stats_ew},
        "SPY": {"df": spy_df, "stats": stats_spy},
    }

In [None]:
# Add the minimum-volatility, equal-weight and SPY benchmarks to the RL
# results so all strategies can be compared side by side.
results["MPT"] = compute_mpt_benchmark(test, env_kwargs)
benchmarks = compute_benchmarks(test, env_kwargs)
results.update(benchmarks)

## Performance Summary


In [None]:
def plot_performance(results):
    """Plot the cumulative return of every strategy on one chart.

    Args:
        results: Dict mapping strategy name to a dict whose ``"df"`` frame has
            a ``date`` column and either ``daily_return`` or ``account_value``.
    """
    plt.figure(figsize=(12, 6))
    for label, entry in results.items():
        frame = entry["df"]
        if "daily_return" in frame.columns:
            cum = (frame["daily_return"] + 1).cumprod() - 1
        else:
            # Some frames carry only the portfolio value; derive the
            # cumulative return from it instead of raising a KeyError.
            values = frame["account_value"]
            cum = values / values.iloc[0] - 1
        # Convert dates so every series shares one real time axis, whatever
        # type its ``date`` column holds.
        plt.plot(pd.to_datetime(frame["date"]), cum, label=label)
    plt.legend()
    plt.title("Cumulative Returns Comparison")
    plt.xlabel("Date")
    plt.ylabel("Cumulative Return")
    plt.show()

In [None]:
# Compare the cumulative returns of all collected strategies on one chart.
plot_performance(results)