In [1]:
import gym
from gym import spaces
import numpy as np
import pandas as pd
from stable_baselines3 import PPO

In [2]:
class EnhancedStockTradingEnv(gym.Env):
    """A stock trading environment for OpenAI gym"""
    metadata = {'render.modes': ['human']}

    def __init__(self, df):
        super(EnhancedStockTradingEnv, self).__init__()

        self.df = df
        self.reward_range = (0, 1)
        
        # Assuming 'Symbols' represents different assets, and you might want to trade multiple assets
        # Adjust according to your exact dataset and requirements
        self.n_assets = len(df['Symbols'].unique())
        
        # Action space: [sell(0), hold(1), buy(2)] for each asset
        self.action_space = spaces.MultiDiscrete([3] * self.n_assets)
        
        # Observation space: Include all features for each asset
        self.feature_names = ['Open', 'Adj Close', 'High', 'Low', 'Volume', 'RSI', 'MACD', 'post utility']
        self.n_features = len(self.feature_names)
        
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.n_assets *
                                                                             self.n_features,), dtype=np.float32)
        
        self.current_step = 0
        self.done = False

        self.initial_account_balance = 10000
        self.current_account_balance = self.initial_account_balance
        self.positions = np.zeros(self.n_assets)
        self.total_profit = 0

    def _next_observation(self):
        obs = np.zeros(self.n_assets * self.n_features)
        for i, symbol in enumerate(self.df['Symbols'].unique()):
            # Filter df by symbol and step
            symbol_df = self.df[self.df['Symbols'] == symbol].reset_index(drop=True)
            if self.current_step < len(symbol_df):
                step_data = symbol_df.loc[self.current_step, self.feature_names]
                obs[i*self.n_features:(i+1)*self.n_features] = step_data.values
        return obs

    def step(self, action):
        prev_total_value = self._calculate_total_portfolio_value()
        self.done = self.current_step >= (len(self.df) // self.n_assets) - 1
        current_prices = self._get_current_prices()

        transaction_cost = 0.001  # Example transaction cost: 0.1%

        for i, act in enumerate(action):
            # Current price of the asset
            current_price = current_prices[i]

            if act == 0:  # Sell
                if self.positions[i] > 0:  # Check if we hold the asset
                    # Update account balance, consider transaction costs
                    self.current_account_balance += (1 - transaction_cost) * current_price * self.positions[i]
                    self.positions[i] = 0
            elif act == 2 and current_price > 0:
                # Calculate max possible shares to buy
                max_shares = self.current_account_balance / (current_price * (1 + transaction_cost))
                # Buy a fractional share or multiple, depending on your strategy
                shares_to_buy = max_shares  # For simplicity, spend all balance on this asset
                self.current_account_balance -= shares_to_buy * current_price * (1 + transaction_cost)
                self.positions[i] += shares_to_buy

        self.current_step += 1

        # Update total profit based on the portfolio value change
        current_total_value = self._calculate_total_portfolio_value()
        self.total_profit += current_total_value - prev_total_value

        next_obs = self._next_observation()
        reward = self.total_profit  # You might want to refine this
        info = {'current_balance': self.current_account_balance, 'total_profit': self.total_profit}

        return next_obs, reward, self.done, info

    def _calculate_total_portfolio_value(self):
        """Calculate the total value of the portfolio: cash + assets"""
        current_prices = self._get_current_prices()
        assets_value = np.dot(self.positions, current_prices)
        total_value = self.current_account_balance + assets_value
        return total_value

    def _get_current_prices(self):
        """Get the current prices of assets"""
        current_prices = np.zeros(self.n_assets)
        for i, symbol in enumerate(self.df['Symbols'].unique()):
            # Filter df by symbol and step
            symbol_df = self.df[self.df['Symbols'] == symbol].reset_index(drop=True)
            if self.current_step < len(symbol_df):
                current_prices[i] = symbol_df.loc[self.current_step, 'Adj Close']
        return current_prices


    def reset(self):
        self.current_step = 0
        self.done = False
        self.positions = np.zeros(self.n_assets)
        self.current_account_balance = self.initial_account_balance
        self.total_profit = 0
        return self._next_observation()

    def render(self, mode='human', close=False):
        print(f'Step: {self.current_step}, Total Profit: {self.total_profit}')

In [6]:
df = pd.read_csv('stocks_data/stocks_panel_data_with_utlity.csv', index_col=0)

In [7]:
df.sort_values('Date', inplace=True)
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,Open,Adj Close,High,Low,Volume,Date,RSI,MACD,Symbols,post utility
0,11.505,12.890,14.414,11.500,93857400.0,2022-03-24,0.000000,0.000000,AFKS,0.000000
1,1186.500,1209.500,1300.000,1182.500,1958.0,2022-03-24,0.000000,0.000000,TRNFP,-14.556419
2,313.000,329.000,369.900,300.400,1767605.0,2022-03-24,0.000000,0.000000,TATNP,0.000000
3,82.000,89.390,99.800,82.000,20195080.0,2022-03-24,0.000000,0.000000,ALRS,0.000000
4,390.000,424.000,442.800,362.500,3129553.0,2022-03-24,0.000000,0.000000,TATN,0.000000
...,...,...,...,...,...,...,...,...,...,...
26273,18.475,19.005,19.110,18.475,110298200.0,2024-03-01,67.703863,0.945649,SVCB,-4944.711108
26274,18.500,18.439,18.569,18.278,26059700.0,2024-03-01,54.357798,0.305021,AFKS,316.776831
26275,1609.000,1597.000,1616.000,1575.000,923957.0,2024-03-01,51.936620,22.220096,TRNFP,-490.039163
26276,1349.800,1353.600,1355.000,1347.000,458343.0,2024-03-01,31.111111,-29.176730,NVTK,0.000000


In [8]:
env = EnhancedStockTradingEnv(df)

model = PPO("MlpPolicy", env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [None]:
model.learn(total_timesteps=10000)

----------------------------------
| rollout/           |           |
|    ep_len_mean     | 469       |
|    ep_rew_mean     | -9.11e+05 |
| time/              |           |
|    fps             | 4         |
|    iterations      | 1         |
|    time_elapsed    | 449       |
|    total_timesteps | 2048      |
----------------------------------


In [None]:
obs = env.reset()
done = False
while not done:
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, done, info = env.step(action)
    env.render()