In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import math
import numpy as np
from common.memory import ReplayBuffer
from common.model import MLP
import torch.nn.functional as F

In [None]:
from typing import Any, List, Tuple
import numpy as np
import pandas as pd
import random
import gym
import time
from gym import spaces
from sklearn import preprocessing

# Deep Q-Network (DQN) Algorithm

In [None]:
class DQN:
    """Deep Q-Network agent with epsilon-greedy acting and replay-buffer training.

    Two networks are built with ``MLP``: ``policy_net`` is optimised by
    :meth:`update`, ``target_net`` only supplies bootstrap targets.

    NOTE(review): no method of this class refreshes ``target_net`` after
    construction/``load``; presumably the training loop syncs it periodically
    -- confirm against the caller.
    """

    def __init__(self, state_dim, action_dim, cfg):
        """Build the networks, optimiser and replay memory.

        Args:
            state_dim: Size of the flat state vector fed to the networks.
            action_dim: Number of discrete actions.
            cfg: Config object providing ``device``, ``gamma``,
                ``epsilon_start``, ``epsilon_end``, ``epsilon_decay``,
                ``batch_size``, ``hidden_dim``, ``lr`` and ``memory_capacity``.
        """
        self.action_dim = action_dim  # total number of actions
        self.device = cfg.device  # computing device, cpu or gpu
        self.gamma = cfg.gamma  # discount factor for rewards
        # Parameters for epsilon-greedy policy
        self.frame_idx = 0  # counter used for epsilon decay
        # Exponential decay from epsilon_start towards epsilon_end.
        self.epsilon = lambda frame_idx: cfg.epsilon_end + \
            (cfg.epsilon_start - cfg.epsilon_end) * \
            math.exp(-1. * frame_idx / cfg.epsilon_decay)
        self.batch_size = cfg.batch_size
        self.policy_net = MLP(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(self.device)
        self.target_net = MLP(state_dim, action_dim, hidden_dim=cfg.hidden_dim).to(self.device)
        # Start with identical networks (load_state_dict also copies buffers).
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
        self.memory = ReplayBuffer(cfg.memory_capacity)

    def _to_state_tensor(self, states):
        """Stack a sequence of states into a float tensor of shape (batch, features).

        Every state is flattened first, so ``(1, n)``, ``(n, 1)`` and ``(n,)``
        states all become one row of ``n`` features.
        """
        flat = np.stack([np.asarray(s, dtype=np.float32).reshape(-1) for s in states])
        return torch.as_tensor(flat, device=self.device)

    def choose_action(self, state):
        """Pick an action epsilon-greedily and advance the decay counter.

        Returns:
            int: the greedy action from :meth:`predict`, or a uniformly random
            action with probability ``epsilon(frame_idx)``.
        """
        self.frame_idx += 1
        if random.random() > self.epsilon(self.frame_idx):
            action = self.predict(state)
        else:
            action = random.randrange(self.action_dim)
        return action

    def predict(self, state):
        """Return the index of the highest Q-value for a single state.

        The state is flattened to one row, so both ``(1, n)`` observations and
        plain ``(n,)`` vectors are accepted. Ties resolve to the first maximal
        action.
        """
        # No gradient computation required
        with torch.no_grad():
            state_tensor = self._to_state_tensor([state])
            q_values = self.policy_net(state_tensor)  # shape (1, action_dim)
            action = int(q_values.argmax(dim=1).item())
        return action

    def update(self):
        """Run one gradient step on a batch sampled from the replay memory.

        Does nothing until the memory holds at least ``batch_size``
        transitions. The loss is the mean squared error between
        ``Q_policy(s, a)`` and ``r + gamma * max_a' Q_target(s', a') * (1 - done)``.
        """
        if len(self.memory) < self.batch_size:
            return
        # Randomly sample transitions from replay memory
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
            self.batch_size)

        # Both state batches get the same (batch, features) layout.
        state_batch = self._to_state_tensor(state_batch)
        next_state_batch = self._to_state_tensor(next_state_batch)
        action_batch = torch.tensor(
            action_batch, device=self.device, dtype=torch.int64).unsqueeze(1)  # e.g., tensor([[1],...,[0]])
        reward_batch = torch.tensor(
            reward_batch, device=self.device, dtype=torch.float)  # tensor([1., 1.,...,1])
        done_batch = torch.as_tensor(
            np.asarray(done_batch, dtype=np.float32), device=self.device)

        # Q(s_t, a_t) of the actions actually taken, shape (batch, 1).
        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)

        # max_a' Q_target(s_{t+1}, a') for EVERY sample, shape (batch,).
        # (Indexing the network output with [0] first would reuse the first
        # sample's value as the target of the whole batch.)
        with torch.no_grad():
            next_q_values = self.target_net(next_state_batch).max(1)[0]
        # For terminal transitions (done == 1) the target is just the reward.
        expected_q_values = reward_batch + \
            self.gamma * next_q_values * (1 - done_batch)

        # Calculate mean squared error loss
        loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))

        # Optimize model
        self.optimizer.zero_grad()  # Clear old gradients
        loss.backward()  # Compute gradients through backpropagation
        # Clip gradients to prevent exploding (optional)
        # for param in self.policy_net.parameters():
        #     param.grad.data.clamp_(-1, 1)
        self.optimizer.step()  # Update model parameters

    def save(self, path):
        """Save the target network under ``path`` (used as a string prefix).

        Writes ``dqn_checkpoint.pth`` (state dict) and ``full_dqn_model.pth``
        (the whole module object).
        """
        torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
        torch.save(self.target_net, path+'full_dqn_model.pth')

    def load(self, path):
        """Load ``dqn_checkpoint.pth`` from ``path`` into both networks."""
        # map_location lets a checkpoint saved on another device be restored here.
        state_dict = torch.load(path+'dqn_checkpoint.pth', map_location=self.device)
        self.target_net.load_state_dict(state_dict)
        self.policy_net.load_state_dict(self.target_net.state_dict())


# Stock Market Environment

In [None]:
class StockLearningEnv(gym.Env):
    """Gym environment for trading a single stock with three discrete actions.

    The agent's action ``0 / 1 / 2`` is shifted to ``-1 / 0 / +1`` and traded
    as ``action * 1000`` shares at the current ``close`` (negative = sell,
    positive = buy). An observation is a forward window of ``window_size``
    rows of the columns listed by :attr:`state`, flattened to shape
    ``(1, state_space)``. The reward is derived from the ``landmark`` column
    (see :attr:`reward`).

    The input frame must provide the columns ``date``, ``tic``, ``close``,
    ``landmark`` and every column named by :attr:`state`; the
    ``"standardization"`` mode additionally reads ``open``, ``high``, ``low``
    and ``volume``.
    """

    def __init__(
            self,
            df: pd.DataFrame,
            buy_cost_pct: float = 3e-3,
            sell_cost_pct: float = 3e-3,
            print_verbosity: int = 10,
            initial_amount: float = 1e6,
            patient: bool = False,
            currency: str = "￥",
            is_train: bool = True,
    ) -> None:
        """Store the data and configuration and define the gym spaces.

        Args:
            df: Price/indicator data, one row per date (see class docstring).
            buy_cost_pct: Transaction cost rate applied to money spent on buys.
            sell_cost_pct: Transaction cost rate applied to sale proceeds.
            print_verbosity: A progress line is logged every this many steps.
            initial_amount: Starting cash, also the base of the gain/loss figure.
            patient: Stored on the instance; not read inside this class.
            currency: Prefix used when printing money amounts.
            is_train: When true, the episode's transaction record is written
                to ``train_record/`` at the end of the episode.
        """
        self.df = df
        self.dates = df['date']
        self.date_index = 0
        self.starting_point = 0
        self.df = self.df.set_index('date')
        self.assets = df['tic']
        self.patient = patient
        self.currency = currency
        self.is_train = is_train
        self.initial_amount = initial_amount
        self.print_verbosity = print_verbosity
        self.buy_cost_pct = buy_cost_pct
        self.sell_cost_pct = sell_cost_pct
        self.window_size = 20
        self.state_list = self.state
        self.state_space = len(self.state_list) * self.window_size
        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.state_space,)
        )
        self.seed()
        self.episode = -1
        self.episode_history = []
        self.printed_header = False
        self.max_total_assets = 0
        # "Hold" until the first step(), so `reward` is readable right away.
        self.action = 0
        self.account_information = {
            "cash": [],
            "asset_value": [],
            "total_assets": [],
            "reward": []
        }
        self.rolling_window = True
        # Normalization mode: 'div_self' / 'div_close' / 'standardization'.
        self.normalization = 'div_self'
        # NOTE(review): do_normalization() is not defined in this class, so an
        # unconditional call raises AttributeError. It is invoked only when a
        # subclass (or a later patch) supplies it; otherwise the normalized
        # "*_" feature columns must already exist in `df` -- confirm.
        normalize = getattr(self, "do_normalization", None)
        if callable(normalize):
            normalize()

    def render(self, mode="human"):
        """Rendering is not implemented."""
        pass

    def reset(self) -> np.ndarray:
        """Start a new episode at the first date.

        Returns:
            An all-zero observation of shape ``(1, state_space)``.
        """
        self.seed()
        self.max_total_assets = self.initial_amount
        self.starting_point = 0
        self.date_index = self.starting_point
        self.episode += 1
        self.action = 0
        self.actions_memory = []
        self.transaction_memory = []
        self.state_memory = []
        # Cash starts at the configured amount (not a fixed 1e6).
        self.coh_memory = [self.initial_amount]
        self.holdings_memory = [0]
        self.account_information = {
            "cash": [],
            "asset_value": [],
            "total_assets": [],
            "reward": []
        }
        init_state = np.zeros(self.state_space)
        self.state_memory.append(init_state)
        return np.array([init_state])

    def step(
            self, actions: np.ndarray
    ) -> Tuple[list, float, bool, dict]:
        """Apply one action and advance to the next date.

        Args:
            actions: Discrete action in ``{0, 1, 2}``; shifted by -1 to give
                the trade direction.

        Returns:
            ``(state, reward, done, info)``. On the terminal call the last
            stored state is returned and the action argument is ignored.
        """
        self.log_header()
        if (self.current_step + 1) % self.print_verbosity == 0:
            self.log_step(reason="update")

        # The next observation needs `window_size` rows starting at
        # date_index + 1, so the episode ends once fewer than that remain
        # (this also covers the very last date).
        if self.date_index >= len(self.dates) - self.window_size:
            # save evaluate information on last step for each episode
            if self.is_train:
                records = self.save_transaction_information()
                if records is not None:
                    save_path = f"train_record/train_action{self.episode}.csv"
                    records.to_csv(save_path)
            return self.return_terminal(reward=self.reward)

        self.action = actions - 1
        transactions = self.action * 1000
        begin_cash = self.cash_on_hand
        asset_value = np.dot(self.holdings, self.closings)  # value of the shares held
        reward = self.reward
        # save account_information
        self.account_information["cash"].append(begin_cash)
        self.account_information["asset_value"].append(asset_value)
        self.account_information["total_assets"].append(begin_cash + asset_value)
        self.account_information["reward"].append(reward)
        self.actions_memory.append(self.action)
        self.transaction_memory.append(transactions)

        # Sells first (adds cash), then buys; costs are charged on both legs.
        sells = -np.clip(transactions, -np.inf, 0)
        proceeds = np.dot(sells, self.closings)
        costs = proceeds * self.sell_cost_pct
        coh = begin_cash + proceeds  # calculate current cash
        buys = np.clip(transactions, 0, np.inf)
        spend = np.dot(buys, self.closings)
        costs += spend * self.buy_cost_pct
        coh = coh - spend - costs
        holdings_updated = self.holdings + transactions
        self.date_index += 1

        window_start = self.df.index[self.date_index]
        window_end = self.df.index[self.date_index + self.window_size - 1]
        # do standardization in a sliding window
        if self.normalization == "standardization":
            temp = self.df.loc[window_start:window_end,
                               ["open", "close", "high", "low", "volume"]]
            scaler = preprocessing.StandardScaler().fit(temp)
            self.df.loc[window_start:window_end,
                        ["open_", "close_", "high_", "low_", "volume_"]] = scaler.transform(temp)
        state = self.df.loc[window_start:window_end, self.state_list]
        state = state.values.reshape(1, self.state_space)
        self.state_memory.append(state)
        self.coh_memory.append(coh)
        self.holdings_memory.append(holdings_updated)
        return state, reward, False, {}

    def seed(self, seed: Any = None) -> None:
        """Seed Python's ``random`` module.

        When ``seed`` is None the current time in milliseconds is used.
        """
        if seed is None:
            seed = int(round(time.time() * 1000))
        random.seed(seed)

    def log_step(
            self, reason: str, terminal_reward: float = None
    ) -> None:
        """Print one progress row and append it to ``episode_history``.

        Nothing is logged while no account record exists yet. When
        ``terminal_reward`` is None the most recently recorded reward is shown.
        """
        if not self.account_information["total_assets"]:
            return
        if terminal_reward is None:
            terminal_reward = self.account_information["reward"][-1]
        assets = self.account_information["total_assets"][-1]
        gl_pct = assets / self.initial_amount  # GAINLOSS_PCT

        rec = [
            self.episode,
            self.date_index - self.starting_point,
            reason,
            f"{self.currency}{'{:0,.0f}'.format(float(self.account_information['cash'][-1]))}",
            f"{self.currency}{'{:0,.0f}'.format(float(assets))}",
            f"{terminal_reward * 100:0.5f}%",
            f"{(gl_pct - 1) * 100:0.5f}%",
        ]
        self.episode_history.append(rec)
        print(self.template.format(*rec))

    def return_terminal(
            self, reason: str = "Last Date", reward: int = 0
    ) -> Tuple[list, int, bool, dict]:
        """Log the end of the episode and return the terminal step tuple."""
        state = self.state_memory[-1]
        self.log_step(reason=reason, terminal_reward=reward)
        return state, reward, True, {}

    def log_header(self) -> None:
        """Print the column header once and define the row template."""
        if not self.printed_header:
            self.template = "{0:4}|{1:4}|{2:15}|{3:15}|{4:15}|{5:10}|{6:10}"
            # 0, 1, 2, ... are the positional field indices
            # 4, 4, 15, ... are the minimum field widths
            print(
                self.template.format(
                    "EPISODE",
                    "STEPS",
                    "TERMINAL_REASON",
                    "CASH",
                    "TOT_ASSETS",
                    "TERMINAL_REWARD",
                    "GAINLOSS_PCT",
                )
            )
            self.printed_header = True

    def _recorded_closes(self) -> pd.Series:
        """Close prices of the dates on which account records were taken.

        Record ``k`` is appended in ``step()`` at position
        ``starting_point + k``, before the date index advances.
        """
        n_records = len(self.account_information["cash"])
        start = self.starting_point
        return self.df["close"].iloc[start:start + n_records]

    def save_transaction_information(self) -> pd.DataFrame:
        """Build the per-step record of the current episode.

        Returns:
            A date-indexed frame with close, episode, actions, transactions,
            total assets, per-step reward and the buy-and-hold baseline, or
            ``None`` when no step has been taken yet.
        """
        if self.current_step == 0:
            return None
        action_df = pd.DataFrame(
            {
                "close": self._recorded_closes(),
                "episode": self.episode,
                "actions": self.actions_memory,
                "transactions": self.transaction_memory,
                "total_assets": self.account_information["total_assets"],
                "reward": self.account_information["reward"],
                "assets_baseline": self.assets_baseline()
            })
        return action_df

    # buy and hold
    def assets_baseline(self):
        """Value of a buy-and-hold position over the recorded dates.

        The position is the number of shares (rounded down to a multiple of
        100) that the first recorded total assets buy at the first close; it
        is then valued at each recorded date's close, so it reflects only the
        stock's own price changes.
        """
        first_close = self.df["close"].iloc[0]
        shares = int(self.account_information["total_assets"][0] / (first_close * 100)) * 100
        return self._recorded_closes() * shares

    @property
    def current_step(self) -> int:
        """Number of dates advanced since the episode's starting point."""
        return self.date_index - self.starting_point

    @property
    def cash_on_hand(self) -> float:
        """Most recently recorded cash balance."""
        return self.coh_memory[-1]

    @property
    def holdings(self) -> float:
        """Most recently recorded number of shares held."""
        return self.holdings_memory[-1]

    @property
    def closings(self) -> float:
        """Close price at the current date."""
        return self.df.loc[self.df.index[self.date_index], "close"]

    @property
    def state(self):
        """Names of the feature columns that make up an observation."""
        return ['kdjk', 'kdjd', 'kdjj', 'rsi_6', 'rsi_12', 'rsi_24',
                "open_", "close_", "high_", "low_", "volume_"]

    @property
    def reward(self) -> float:
        """Landmark-based reward of the current action at the current date.

        Holding (action 0) yields 0. Otherwise the spread between the next
        peak and the next valley is divided by the distance from the current
        close to the nearer-in-time of the two (plus a small epsilon), negated
        when the action differs from the current date's landmark signal
        (``'-'`` -> 0, ``'v'`` -> -1, ``'^'`` -> +1), and squashed with ``tanh``.
        When no later valley or no later peak exists, the reward is 0.
        """
        epsilon = 0.001
        signal_dict = {'-': 0, "v": -1, "^": 1}

        if self.action == 0:
            return np.float64(0.0)

        current_date = self.df.index[self.date_index]
        signal = signal_dict.get(self.df.loc[current_date, 'landmark'])

        # Next valley / peak strictly after the current date.
        future = self.df[self.df.index > current_date]
        valleys = future[future.landmark == 'v']
        peaks = future[future.landmark == '^']
        if valleys.empty or peaks.empty:
            # Nothing ahead to measure against (tail of the data).
            return np.float64(0.0)

        y_current = self.closings
        # NOTE(review): position 4 of the date-indexed frame is presumably the
        # close price -- confirm against the CSV column order.
        y_valley = valleys.iloc[0, 4]
        y_peak = peaks.iloc[0, 4]

        if peaks.index[0] > valleys.index[0]:
            reward = (y_peak - y_valley) / (y_peak - y_current + epsilon)
        else:
            reward = (y_peak - y_valley) / (y_current - y_valley + epsilon)
        if signal != self.action:
            reward = -reward
        return np.tanh(reward)

In [None]:
# Smoke test: load the preprocessed price data, build the environment and
# print its `reward` property as evaluated right after construction
# (date_index is 0 and no reset()/step() call precedes the read).
df = pd.read_csv(r"apple_preprocessed_data.csv")
s = StockLearningEnv(df)
print(s.reward)