In [7]:
import gym
import pandas as pd
from stable_baselines3 import PPO

# Load the CSV data
data = pd.read_csv('../predictor/dataset/AAPL_intraday.csv')

# drop the date column
data = data.drop('Date', axis=1)


# Define the Gym environment
class StockTradingEnv(gym.Env):
    def __init__(self, data):
        super(StockTradingEnv, self).__init__()
        self.data = data
        self.action_space = gym.spaces.Discrete(3)  # Buy, Sell, Hold
        self.observation_space = gym.spaces.Box(
            low=0, high=1, shape=(len(data.columns),))

    def reset(self):
        # Reset the environment
        self.current_step = 0
        self.account_balance = 10000
        self.shares_held = 0
        self.net_worth = self.account_balance
        self.max_steps = len(self.data) - 1
        return self._next_observation()

    def _next_observation(self):
        # Get the next observation
        obs = self.data.iloc[self.current_step].values
        return obs

    def step(self, action):
        # Execute the action and return the next observation, reward, done, and info
        if action == 0:  # Buy
            self.shares_held += 1
            self.account_balance -= self.data.iloc[self.current_step]['Close']
        elif action == 1:  # Sell
            self.shares_held -= 1
            self.account_balance += self.data.iloc[self.current_step]['Close']
        self.net_worth = self.account_balance + self.shares_held * \
            self.data.iloc[self.current_step]['Close']
        self.current_step += 1
        done = self.current_step >= self.max_steps
        obs = self._next_observation()
        reward = self.net_worth - self.account_balance
        return obs, reward, done, {}

In [8]:

# Create the environment
env = StockTradingEnv(data)

# Create the RL agent
model = PPO("MlpPolicy", env, verbose=1)

# Train the agent
model.learn(total_timesteps=10000)

# Save the trained model
model.save("trader_model")

# Load the trained model
model = PPO.load("trader_model")

# Test the agent
obs = env.reset()
done = False
while not done:
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    print(f"Action: {action}, Reward: {reward}, Done: {done}")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




-----------------------------
| time/              |      |
|    fps             | 438  |
|    iterations      | 1    |
|    time_elapsed    | 4    |
|    total_timesteps | 2048 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 462           |
|    iterations           | 2             |
|    time_elapsed         | 8             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 7.4497075e-07 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.1          |
|    explained_variance   | 1.13e-06      |
|    learning_rate        | 0.0003        |
|    loss                 | 6.67e+09      |
|    n_updates            | 10            |
|    policy_gradient_loss | -2.98e-05     |
|    value_loss           | 1.23e+10      |
------------------------------------------

In [9]:
# render results
env.render()

NotImplementedError: 