In [5]:
from optionsbacktrader import OptionsBroker, OptionsTradingEnvironment
from stable_baselines3 import PPO, SAC, DDPG

import matplotlib.pyplot as plt

In [2]:
# Build the broker, feed it the sample SPX history at daily fidelity, and
# wrap it in the trading environment used by every training cell below.
historical_db = './Sample_SPX_20151001_to_20151030.sqlite3'

broker = OptionsBroker()
broker.load_historical_data(historical_db, fidelity='day')

env = OptionsTradingEnvironment(broker)



In [None]:
# Train a PPO agent on the options environment, then replay one deterministic
# episode and plot the reward returned at each step against the number of
# days elapsed since the start of the data set.
policy_kwargs = dict(net_arch=[1024, 768, 512, 256])

# One rollout covers every trading day the broker has loaded.
rollout_steps = len(broker.get_trading_days())

model = PPO('MlpPolicy',
            env,
            verbose=1,
            n_steps=rollout_steps,
            # Match the mini-batch to the rollout length. PPO's default
            # batch_size of 64 is larger than a short rollout, which makes
            # stable-baselines3 warn about a truncated mini-batch. With a
            # single env and a rollout of at most 64 steps the update is
            # unchanged: each epoch still uses one batch holding the whole
            # rollout.
            batch_size=rollout_steps,
            policy_kwargs=policy_kwargs,
            tensorboard_log='./ppo_tensorboard')
model.learn(total_timesteps=20000)

rewards = []
obs = env.reset()

# Hard cap of 1000 steps so the replay terminates even if `done` never fires.
for _step in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)

    # x-coordinate for the plot: days between the data start and the broker's
    # current date after this step.
    day_delta = broker.current_date - broker.data_start_date
    rewards.append((day_delta.days, reward))

    env.render()

    if done:
        break

# Unzip the (day, reward) pairs into the two series to plot.
elapsed_days, step_rewards = zip(*rewards)

fig, ax = plt.subplots()
ax.plot(elapsed_days, step_rewards)
ax.set_title('Total Return Over Time')
ax.set_xlabel('Time (Days)')
ax.set_ylabel('Total Return')
plt.show()

Using cpu device
Wrapping the env in a DummyVecEnv.
Logging to ./ppo_tensorboard/PPO_3
---------------------------
| time/              |    |
|    fps             | 3  |
|    iterations      | 1  |
|    time_elapsed    | 6  |
|    total_timesteps | 22 |
---------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1            |
|    iterations           | 2            |
|    time_elapsed         | 32           |
|    total_timesteps      | 44           |
| train/                  |              |
|    approx_kl            | -0.026287187 |
|    clip_fraction        | 0.227        |
|    clip_range           | 0.2          |
|    entropy_loss         | -8.51        |
|    explained_variance   | -4.38e+04    |
|    learning_rate        | 0.0003       |
|    loss                 | 2.23e+04     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0374      |
|    std                  | 1

In [None]:
# Train a DDPG agent on the options environment, then replay one deterministic
# episode and plot the reward returned at each step against the number of
# days elapsed since the start of the data set.
policy_kwargs = dict(net_arch=[1024, 768, 512, 256])

# DDPG is an off-policy algorithm and its constructor has no `n_steps`
# parameter (that belongs to on-policy algorithms such as PPO). Passing it
# raised a TypeError before training could start, so it is omitted here.
model = DDPG('MlpPolicy',
             env,
             verbose=1,
             policy_kwargs=policy_kwargs,
             tensorboard_log='./ddpg_tensorboard')
model.learn(total_timesteps=20000)

rewards = []
obs = env.reset()

# Hard cap of 1000 steps so the replay terminates even if `done` never fires.
for _step in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)

    # x-coordinate for the plot: days between the data start and the broker's
    # current date after this step.
    day_delta = broker.current_date - broker.data_start_date
    rewards.append((day_delta.days, reward))

    env.render()

    if done:
        break

# Unzip the (day, reward) pairs into the two series to plot.
elapsed_days, step_rewards = zip(*rewards)

fig, ax = plt.subplots()
ax.plot(elapsed_days, step_rewards)
ax.set_title('Total Return Over Time')
ax.set_xlabel('Time (Days)')
ax.set_ylabel('Total Return')
plt.show()

In [None]:
# Train a SAC agent on the options environment, then replay one deterministic
# episode and plot the reward returned at each step against the number of
# days elapsed since the start of the data set.
policy_kwargs = dict(net_arch=[1024, 768, 512, 256])

# SAC is an off-policy algorithm and its constructor has no `n_steps`
# parameter (that belongs to on-policy algorithms such as PPO). Passing it
# raised a TypeError before training could start, so it is omitted here.
model = SAC('MlpPolicy',
            env,
            verbose=1,
            policy_kwargs=policy_kwargs,
            tensorboard_log='./sac_tensorboard')
model.learn(total_timesteps=20000)

rewards = []
obs = env.reset()

# Hard cap of 1000 steps so the replay terminates even if `done` never fires.
for _step in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)

    # x-coordinate for the plot: days between the data start and the broker's
    # current date after this step.
    day_delta = broker.current_date - broker.data_start_date
    rewards.append((day_delta.days, reward))

    env.render()

    if done:
        break

# Unzip the (day, reward) pairs into the two series to plot.
elapsed_days, step_rewards = zip(*rewards)

fig, ax = plt.subplots()
ax.plot(elapsed_days, step_rewards)
ax.set_title('Total Return Over Time')
ax.set_xlabel('Time (Days)')
ax.set_ylabel('Total Return')
plt.show()

In [None]:
# Quick smoke test: train PPO for only 100 timesteps, replay one deterministic
# episode with rendering, then release the environment.
policy_kwargs = dict(net_arch=[1024, 768, 512, 256])

# One rollout covers every trading day the broker has loaded.
rollout_steps = len(broker.get_trading_days())

model = PPO('MlpPolicy',
            env,
            verbose=1,
            n_steps=rollout_steps,
            # Match the mini-batch to the rollout length. PPO's default
            # batch_size of 64 is larger than a short rollout, which makes
            # stable-baselines3 warn about a truncated mini-batch. With a
            # single env and a rollout of at most 64 steps the update is
            # unchanged.
            batch_size=rollout_steps,
            policy_kwargs=policy_kwargs)
model.learn(total_timesteps=100)

obs = env.reset()

# Hard cap of 1000 steps so the replay terminates even if `done` never fires.
for _step in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        break

print('DONE')

env.close()