In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym # openai gym

import pandas as pd
import numpy as np

from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

import os
from dotenv import load_dotenv
from data import get_data, p2df

In [3]:
# # Generate some fake data
# dates = pd.date_range(start='2022-01-01', end='2023-01-31', freq='D')
# open_prices = np.random.randint(low=100, high=150, size=len(dates))
# high_prices = open_prices + np.random.randint(low=0, high=50, size=len(dates))
# low_prices = open_prices - np.random.randint(low=0, high=50, size=len(dates))
# close_prices = open_prices + np.random.randint(low=-10, high=10, size=len(dates))
# volume = np.random.randint(low=1000, high=10000, size=len(dates))

# # Create a DataFrame with the data
# df = pd.DataFrame({
#     'date': dates,
#     'open': open_prices,
#     'high': high_prices,
#     'low': low_prices,
#     'close': close_prices,
#     'volume': volume
# })


In [4]:
# FOR THIS WE WILL USE THE POLYGON API
load_dotenv()
POLYGON_API_KEY = os.getenv('POLYGON_API_KEY')

In [5]:
# # load some data, we will use TSLA
data = get_data(POLYGON_API_KEY, ticker="TSLA", multiplier=1, timespan="hour", from_="2021-01-09", to="2023-02-10", limit=50000)

In [8]:
# df = pd.read_csv('data/BTC-2017min.csv')

# # convert date to datetime
# df['date'] = pd.to_datetime(df['date'])

# # lets sort the data with date and reset index
# df = df.sort_values('date')
# df = df.reset_index(drop=True)

# # grabbing the values we care about
# df = df[['open', 'high', 'low', 'close', 'Volume USD']]

# # rename Volume USD to volume
# df = df.rename(columns={'Volume USD': 'volume'})
# df

In [9]:
# turn our data into a dataframe
df = p2df(data, convert_timestamp=True) # convert timestamp to datetime

In [11]:
df = df[['open', 'high', 'low', 'close', 'volume']]

In [12]:
def normalize_data(data, cols2norm=['open', 'high', 'low', 'volume'], target='close'):
    target = data[target].values # not normalized
    data_2_normalize = data[cols2norm].values
    scaler = MinMaxScaler(feature_range=(0,1))
    scaler = scaler.fit(data_2_normalize)
    data_normalized = scaler.transform(data_2_normalize)
    return data_normalized, target

In [89]:
class TradingEnvironment(gym.Env):
    def __init__(self, data, target, initial_capital=100000, start_date=0, end_date=None):
        # initialize variables
        self.data = data
        self.target = target
        self.initial_capital = initial_capital
        self.start_date = start_date
        self.end_date = end_date if end_date is not None else len(data) - 1
        self.reset()

    def reset(self):
        # reset variables
        self.current_date = self.start_date
        self.current_price = self.target[self.current_date]
        self.current_capital = self.initial_capital
        self.current_shares = 0
        self.done = False
        self.history = []
        self.profit_losses = []
        self.purchase_price = None

        return self.get_state()

    def step(self, action, buy_fraction=0):
        assert action in [0, 1, 2]

        # Simulate price movement based on historical data
        if self.current_date < self.end_date:
            self.current_date += 1
            self.current_price = self.target[self.current_date]

        # Calculate the reward based on the action taken
        reward = 0
        if action == 0:  # Buy
            if buy_fraction > 0:
                buy_amount = self.current_capital * buy_fraction
                buy_shares = int(buy_amount // self.current_price)
                if buy_shares > 0 and buy_shares * self.current_price <= self.current_capital:
                    self.purchase_price = self.current_price  # record purchase price
                    self.current_shares += buy_shares
                    self.current_capital -= buy_shares * self.current_price
        elif action == 1:  # Sell
            if self.current_shares > 0:
                profit_loss = self.current_shares * (self.current_price - self.purchase_price)
                if profit_loss >= 0:
                    reward = profit_loss / self.purchase_price  # positive reward for profit
                else:
                    reward = profit_loss / self.purchase_price * 2  # negative reward for loss NOTE: This penalizes negative returns more than positive returns
                self.current_capital += self.current_shares * self.current_price
                self.current_shares = 0
                self.purchase_price = None
                self.profit_losses.append(profit_loss)
        else:  # Hold
            pass

        # Check if done
        if self.current_date >= self.end_date:
            self.done = True

        # Update the history
        self.history.append((self.current_date, self.current_price, self.current_shares, self.current_capital))

        return self.get_state(), reward, self.done, {}
    
    def get_state(self):
        # Return the current state
        features = self.data[self.current_date] # numpy array
        cur_state =  np.array([self.current_price, self.current_shares, self.current_capital])
        return np.concatenate((features, cur_state))


In [109]:
import time

class Logger:
    def __init__(self, file_path, flush_interval=1000, flush_time=60):
        self.file_path = file_path
        self.buffer = []
        self.flush_interval = flush_interval
        self.flush_time = flush_time
        self.last_flush_time = time.time()

    def log(self, data):
        self.buffer.append(data)
        if len(self.buffer) >= self.flush_interval or time.time() - self.last_flush_time >= self.flush_time:
            self.flush()

    def flush(self):
        with open(self.file_path, 'a') as f:
            for data in self.buffer:
                f.write(data + '\n')
        self.buffer = []
        self.last_flush_time = time.time()

In [90]:
# Define the agent's policy
class Policy(nn.Module):
    def __init__(self, act_fn=nn.Mish):
        super(Policy, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(7, 64),
            act_fn(),
            nn.Linear(64, 128),
            act_fn(),
            nn.Linear(128, 3),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        return self.layers(x)

class BuyPolicy(nn.Module):
    def __init__(self, act_fn=nn.Mish):
        super(BuyPolicy, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(7, 64),
            act_fn(),
            nn.Linear(64, 128),
            act_fn(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.layers(x)

In [91]:
data, target = normalize_data(df, target='close')

In [119]:
# define the en
env = TradingEnvironment(data, target=target, initial_capital=10000, start_date=0)

# Define the agent's policies
policy = Policy()
buy_policy = BuyPolicy()

# Define the optimizers
optimizer = optim.Adam(policy.parameters(), lr=0.01)
buy_optimizer = optim.Adam(buy_policy.parameters(), lr=0.01)

gamma = 0.99
eps_clip = 0.2

# Define the logger
logger_fname = 'log.txt'
if os.path.exists(logger_fname):
    os.remove(logger_fname)
logger = Logger('log.txt')

In [118]:
# Train the agent using the PPO algorithm
states_dict = {}
for i in range(10000):
    state = env.reset()
    done = False
    while not done:
        state = torch.tensor(state, dtype=torch.float32)
        dist = torch.distributions.Categorical(logits=policy(state))
        action = dist.sample()
        log_prob = dist.log_prob(action)

        with torch.no_grad():
            buy_fraction = buy_policy(state).item()

        next_state, reward, done, _ = env.step(action.item(), buy_fraction)

        with torch.no_grad():
            next_state = torch.tensor(next_state, dtype=torch.float32)
            dist = torch.distributions.Categorical(logits=policy(next_state))
            next_action = dist.sample()
            next_log_prob = dist.log_prob(next_action)
            v_next = policy(next_state)[1]

        advantage = reward + gamma * (1 - done) * v_next - policy(state)[1]
        critic_loss = F.smooth_l1_loss(policy(state)[1], reward + gamma * (1 - done) * v_next)

        ratio = torch.exp(log_prob - next_log_prob)
        surr1 = ratio * advantage
        surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
        actor_loss = -torch.min(surr1, surr2).mean() - 0.01 * dist.entropy().mean()

        buy_loss = -torch.log(buy_policy(state)[0] + 1e-5) * buy_fraction

        loss = actor_loss + critic_loss + buy_loss
        optimizer.zero_grad()
        buy_optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        buy_optimizer.step()

        state = next_state.numpy()
        
        logger.log(f"Episode: {i}, Action: {action}, Reward: {reward}, Loss: {loss.item()}, Working Capital: {env.current_capital}, Current Shares: {env.current_shares}, Total Capital: {env.current_capital + env.current_shares * env.current_price}, current price: {env.current_price}, buy fraction: {buy_fraction}")

    if (i+1) % 10 == 0:
        print(f"Episode: {i}, Total Capital: {env.current_capital + env.current_shares * env.current_price}")

    states_dict[i] = {"state": state, "total_capital": env.current_capital + env.current_shares * env.current_price}


Episode: 9, Total Capital: 1266.1192999999994
