In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt

from datetime import datetime
import itertools
import argparse
import re
import os
import gym
import pickle

from sklearn.preprocessing import StandardScaler
from google.colab import drive
# Mount Google Drive (Colab only); the model and reward folders configured
# later in this notebook are paths under this mount point.
drive.mount('/content/new_drive')



In [None]:
def identify_market_periods(data, window=30, threshold=0.02):
    """
    Label each observation as 'crisis' or 'stable' from rolling volatility.

    Volatility is the rolling standard deviation of the period-over-period
    percentage change. Rows whose volatility is NaN (the leading rows that do
    not yet have a full window of returns) compare as False against the
    threshold and are therefore labelled 'stable'.

    :param data: Historical price data; a DataFrame (one column per asset)
        or a single-asset Series
    :param window: Rolling window size, in rows
    :param threshold: Volatility above which a period is labelled 'crisis'
    :return: Labels with the same index (and columns / name) as ``data``;
        a DataFrame for DataFrame input, a Series for Series input
    """
    returns = data.pct_change()
    volatility = returns.rolling(window=window).std()
    labels = np.where(volatility > threshold, 'crisis', 'stable')

    # np.where returns a bare ndarray; re-attach the original labels.
    # A Series has no `.columns`, so it needs its own branch.
    if isinstance(data, pd.Series):
        return pd.Series(labels, index=data.index, name=data.name)
    return pd.DataFrame(labels, index=data.index, columns=data.columns)


In [None]:
def get_data(tickers, start, end):
    """
    Download price history for ``tickers`` from Yahoo Finance.

    :param tickers: Ticker symbol(s) passed through to ``yf.download``
    :param start: Start date passed through to ``yf.download``
    :param end: End date passed through to ``yf.download``
    :return: The 'Adj Close' prices when the download contains them,
        otherwise the 'Close' prices
    """
    data = yf.download(tickers, start=start, end=end)
    # NOTE(review): recent yfinance releases adjust prices by default and then
    # omit the 'Adj Close' column, in which case 'Close' is already adjusted.
    # Confirm against the installed yfinance version.
    price_field = 'Adj Close' if 'Adj Close' in data.columns else 'Close'
    return data[price_field]

In [None]:
def get_scaler(env):
    """
    Fit a scikit-learn StandardScaler on states visited by random actions.

    The environment is reset first so the function works regardless of where
    a previous episode left it, then stepped with randomly sampled actions
    for at most ``env.n_step`` steps or until the episode ends.

    :param env: Environment exposing ``reset()``, ``step(action)``,
        ``n_step`` and ``action_space.sample()``
    :return: StandardScaler fitted on the observed states
    """
    # Start from the first time step; without this, calling the function on
    # an environment that has already been stepped would index past the end
    # of its price history.
    env.reset()

    states = []
    for _ in range(env.n_step):
        action = env.action_space.sample()  # random action
        state, reward, done, info = env.step(action)
        states.append(state)
        if done:
            break

    scaler = StandardScaler()
    scaler.fit(states)
    return scaler


In [None]:
def maybe_make_dir(directory):
  """
  Create ``directory`` (including missing parents) if it does not exist.

  :param directory: Path of the directory to create
  :raises FileExistsError: if the path exists but is not a directory
  """
  # exist_ok=True avoids the check-then-create race of testing
  # os.path.exists() before calling os.makedirs().
  os.makedirs(directory, exist_ok=True)

In [None]:
def buy_and_hold(data, initial_investment):
    """
    Value of an equal-weight buy-and-hold portfolio at the end of ``data``.

    The initial investment is split evenly across all columns, converted to
    (fractional) share counts at the first row's prices, and valued at the
    last row's prices.

    :param data: Price history, one column per stock, rows in time order
    :param initial_investment: Total amount invested on the first day
    :return: Portfolio value on the last day
    """
    entry_prices = data.iloc[0]
    exit_prices = data.iloc[-1]

    # Same budget for every stock, spent entirely at the entry price.
    budget_per_stock = initial_investment / data.shape[1]
    holdings = budget_per_stock / entry_prices

    return (holdings * exit_prices).sum()


In [None]:
def test_agent(agent, env, scaler):
    state = env.reset()
    state = scaler.transform([state])
    done = False
    while not done:
        action_values = agent.act(state)
        action = np.clip(action_values, env.action_space.low, env.action_space.high)
        next_state, reward, done, info = env.step(action)
        next_state = scaler.transform([next_state])
        state = next_state
    return info['cur_val']


In [None]:
class LinearModel:
    """Linear regression model ``X.dot(W) + b`` trained by SGD with momentum."""

    def __init__(self, input_dim, n_action):
        """
        :param input_dim: Number of input features (rows of ``W``)
        :param n_action: Number of outputs (columns of ``W``, length of ``b``)
        """
        # Random normal weights scaled by 1/sqrt(input_dim); zero bias.
        self.W = np.random.randn(input_dim, n_action) / np.sqrt(input_dim)
        self.b = np.zeros(n_action)

        # momentum terms
        self.vW = 0
        self.vb = 0

        # Mean squared error recorded by each sgd() call.
        self.losses = []

    def predict(self, X):
        """Return ``X.dot(W) + b`` for a 2-D input ``X``."""
        assert(len(X.shape) == 2)
        return X.dot(self.W) + self.b

    def sgd(self, X, Y, learning_rate=0.01, momentum=0.9):
        """
        Take one momentum-SGD step on the mean squared error.

        :param X: 2-D input array
        :param Y: Target array matching the shape of ``predict(X)``
        :param learning_rate: Step size applied to the gradients
        :param momentum: Decay factor applied to the previous velocity
        """
        assert(len(X.shape) == 2)
        # Total number of target values; normalises the gradients.
        num_values = np.prod(Y.shape)

        Yhat = self.predict(X)
        gW = 2 * X.T.dot(Yhat - Y) / num_values
        gb = 2 * (Yhat - Y).sum(axis=0) / num_values

        self.vW = momentum * self.vW - learning_rate * gW
        self.vb = momentum * self.vb - learning_rate * gb

        self.W += self.vW
        self.b += self.vb

        # Loss of the prediction made before this update.
        mse = np.mean((Yhat - Y)**2)
        self.losses.append(mse)

    def load_weights(self, filepath):
        """Load ``W`` and ``b`` from an ``.npz`` file written by save_weights."""
        # np.load returns an NpzFile that keeps the file open; the context
        # manager closes it once both arrays have been read.
        with np.load(filepath) as npz:
            self.W = npz['W']
            self.b = npz['b']

    def save_weights(self, filepath):
        """Save ``W`` and ``b`` to ``filepath`` in NumPy ``.npz`` format."""
        np.savez(filepath, W=self.W, b=self.b)


In [None]:
class MultiStockEnv:
    """
    Multi-stock trading environment.

    State (length ``2 * n_stock + 2``): shares owned per stock, current price
    per stock, cash in hand, and a crisis flag (1 if any stock is labelled
    'crisis' at the current step, else 0).

    Action: one value in [-1, 1] per stock; negative sells, positive buys.
    """
    def __init__(self, data, initial_investment=20000, market_periods=None):
        """
        :param data: Price history DataFrame, rows are time steps and columns
            are stocks
        :param initial_investment: Starting cash
        :param market_periods: Optional labels aligned row-for-row with
            ``data``; when None the crisis flag in the state is always 0
        """
        self.stock_price_history = data
        self.market_periods = market_periods
        self.n_step, self.n_stock = self.stock_price_history.shape

        self.initial_investment = initial_investment
        self.cur_step = None
        self.stock_owned = None
        self.stock_price = None
        self.cash_in_hand = None

        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(self.n_stock,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=0, high=np.inf, shape=(self.n_stock * 2 + 2,), dtype=np.float32)

        self.reset()

    def reset(self):
        """Return to the first time step with no shares and full cash; return the state."""
        self.cur_step = 0
        self.stock_owned = np.zeros(self.n_stock)
        self.stock_price = self.stock_price_history.iloc[self.cur_step].values
        self.cash_in_hand = self.initial_investment
        return self._get_obs()

    def step(self, action):
        """
        Advance one time step and trade.

        :param action: One value per stock; clipped to the action bounds
        :return: ``(state, reward, done, info)`` where reward is the change in
            portfolio value and ``info['cur_val']`` is the new portfolio value
        """
        action = np.clip(action, self.action_space.low, self.action_space.high)
        # After clipping this can only fail for values that do not compare
        # (e.g. NaN), which clip leaves unchanged.
        for i, val in enumerate(action):
            assert self.action_space.low[i] <= val <= self.action_space.high[i], f"Action {action} is not valid for the action space {self.action_space}"

        # Convert each fraction of the initial investment into a whole share
        # count. Note this uses the price from before the step advances; the
        # trade itself executes at the new price below.
        action = (action * self.initial_investment // self.stock_price).astype(int)

        prev_val = self._get_val()
        self.cur_step += 1
        self.stock_price = self.stock_price_history.iloc[self.cur_step].values
        self._trade(action)
        cur_val = self._get_val()

        reward = cur_val - prev_val
        done = self.cur_step == self.n_step - 1
        info = {'cur_val': cur_val}
        return self._get_obs(), reward, done, info

    def _get_obs(self):
        """Build the state vector for the current step."""
        obs = np.empty(self.n_stock * 2 + 2)
        obs[:self.n_stock] = self.stock_owned
        obs[self.n_stock:2*self.n_stock] = self.stock_price
        obs[-2] = self.cash_in_hand
        if self.market_periods is None:
            # No labels supplied (the constructor default): report "no crisis"
            # instead of failing on None.iloc.
            obs[-1] = 0
        else:
            obs[-1] = 1 if 'crisis' in self.market_periods.iloc[self.cur_step].values else 0
        return obs

    def _get_val(self):
        """Portfolio value: holdings at current prices plus cash."""
        return self.stock_owned.dot(self.stock_price) + self.cash_in_hand

    def _trade(self, action):
        """
        Execute share orders at the current prices.

        :param action: Integer share counts per stock; negative sells,
            positive buys. Sells are capped at the shares owned and buys at
            what the remaining cash affords. All sells run before any buy.
        """
        sell_index = np.where(action < 0)[0]
        buy_index = np.where(action > 0)[0]

        for i in sell_index:
            self.cash_in_hand += self.stock_price[i] * min(self.stock_owned[i], -action[i])
            self.stock_owned[i] -= min(self.stock_owned[i], -action[i])

        for i in buy_index:
            num_shares_to_buy = min(self.cash_in_hand // self.stock_price[i], action[i])
            self.stock_owned[i] += num_shares_to_buy
            self.cash_in_hand -= self.stock_price[i] * num_shares_to_buy


In [None]:
class DQNAgent(object):
    """Epsilon-greedy agent that learns action values with a LinearModel."""

    def __init__(self, state_size, action_size):
        """
        :param state_size: Length of the state vector (model input size)
        :param action_size: Number of action components (model output size)
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.95           # discount factor
        self.epsilon = 1.0          # current exploration probability
        self.epsilon_min = 0.01     # decay stops once epsilon is at or below this
        self.epsilon_decay = 0.995  # multiplicative decay per train() call
        self.model = LinearModel(state_size, action_size)

    def act(self, state):
        """
        Choose an action for ``state``.

        With probability ``epsilon`` returns a uniform random vector in
        [-1, 1]; otherwise returns the model's prediction for the first row of
        ``state``, clipped to [-1, 1].

        :param state: 2-D array holding a single state row
        :return: 1-D array of length ``action_size``
        """
        if np.random.rand() <= self.epsilon:
            return np.random.uniform(low=-1, high=1, size=self.action_size)
        act_values = self.model.predict(state)
        return np.clip(act_values[0], -1, 1)

    def train(self, state, action, reward, next_state, done):
        """
        Perform one learning step and decay epsilon.

        The target for the component with the largest action value is
        ``reward`` when ``done``, otherwise ``reward + gamma * max`` of the
        model's prediction for ``next_state``. All other components keep the
        model's current prediction, so they contribute no error.

        :param state: 2-D array holding a single state row
        :param action: Action vector that was taken
        :param reward: Reward received
        :param next_state: 2-D array holding the resulting state row
        :param done: Whether the episode ended with this transition
        """
        if done:
            target = reward
        else:
            # Use a scalar maximum: the previous axis=1 reduction produced a
            # shape-(1,) array, and storing that into a single array element
            # relies on array-to-scalar conversion deprecated in NumPy 1.25.
            next_values = self.model.predict(next_state)[0]
            target = reward + self.gamma * float(np.max(next_values))

        target_full = self.model.predict(state)
        action_idx = np.argmax(action)
        target_full[0, action_idx] = target

        self.model.sgd(state, target_full)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load model weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file ``name``."""
        self.model.save_weights(name)


In [None]:
def play_one_episode(agent, env, is_train, scaler=None):
    """
    Run one full episode, optionally training the agent after every step.

    :param agent: Object with ``act(state)`` and
        ``train(state, action, reward, next_state, done)``
    :param env: Environment with ``reset()``, ``step(action)`` and
        ``action_space.low`` / ``action_space.high``
    :param is_train: The agent is trained only when this equals the string
        'train'
    :param scaler: Fitted scaler with a ``transform`` method. When omitted,
        the notebook-level variable ``scaler`` is used, which keeps existing
        three-argument calls working.
    :return: ``info['cur_val']`` reported by the final step
    :raises NameError: if no scaler is passed and no global ``scaler`` exists
    """
    if scaler is None:
        # Backward compatibility: this function used to read the global
        # `scaler` implicitly. Prefer passing it explicitly.
        try:
            scaler = globals()['scaler']
        except KeyError:
            raise NameError("name 'scaler' is not defined") from None

    state = env.reset()
    state = scaler.transform([state])
    done = False

    while not done:
        action_values = agent.act(state)
        action = np.clip(action_values, env.action_space.low, env.action_space.high)
        next_state, reward, done, info = env.step(action)
        next_state = scaler.transform([next_state])
        if is_train == 'train':
            # The agent is trained on its raw (unclipped) action values.
            agent.train(state, action_values, reward, next_state, done)
        state = next_state

    return info['cur_val']


In [None]:
if __name__ == '__main__':
    # --- Configuration -----------------------------------------------------
    models_folder = '/content/new_drive/MyDrive/UCL/ML/linear_rl_trader_models'
    rewards_folder = '/content/new_drive/MyDrive/UCL/ML/linear_rl_trader_rewards'
    num_episodes = 2000
    initial_investment = 20000

    maybe_make_dir(models_folder)
    maybe_make_dir(rewards_folder)

    tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN']
    start_date = '2010-01-01'
    end_date = '2020-12-31'

    # --- Data --------------------------------------------------------------
    # Use the shared helper rather than repeating the download inline.
    data = get_data(tickers, start_date, end_date)

    market_periods = identify_market_periods(data)

    print(data.head())
    print(market_periods.head())
    print(f"Data shape: {data.shape}, Market periods shape: {market_periods.shape}")

    # First half of the history is used for training, second half for testing.
    n_timesteps, n_stocks = data.shape
    n_train = n_timesteps // 2
    train_data = data.iloc[:n_train]
    test_data = data.iloc[n_train:]
    train_periods = market_periods.iloc[:n_train]
    test_periods = market_periods.iloc[n_train:]

    # --- Training ----------------------------------------------------------
    env = MultiStockEnv(train_data, initial_investment, train_periods)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    agent = DQNAgent(state_size, action_size)
    # `scaler` is also read as a global by play_one_episode.
    scaler = get_scaler(env)

    portfolio_value = []

    for e in range(num_episodes):
        t0 = datetime.now()
        val = play_one_episode(agent, env, 'train')
        dt = datetime.now() - t0
        print(f"episode: {e + 1}/{num_episodes}, episode end value: {val:.2f}, duration: {dt}")
        portfolio_value.append(val)

    # Persist the trained weights and the fitted scaler, then show the
    # per-update training loss.
    agent.save(f'{models_folder}/linear.npz')
    with open(f'{models_folder}/scaler.pkl', 'wb') as f:
        pickle.dump(scaler, f)
    plt.plot(agent.model.losses)
    plt.show()

    # End-of-episode portfolio values for every training episode.
    np.save(f'{rewards_folder}/train.npy', portfolio_value)

    # --- Validation: compare with the Buy and Hold strategy ----------------
    buy_and_hold_value = buy_and_hold(test_data, initial_investment)
    print(f"Buy and Hold strategy final portfolio value: {buy_and_hold_value:.2f}")

    # Test the trained agent on the held-out second half of the data
    test_env = MultiStockEnv(test_data, initial_investment, test_periods)
    final_portfolio_value = test_agent(agent, test_env, scaler)
    print(f"Trained agent final portfolio value: {final_portfolio_value:.2f}")

    # Compare the results
    if final_portfolio_value > buy_and_hold_value:
        print("The trained agent outperformed the Buy and Hold strategy.")
    else:
        print("The Buy and Hold strategy outperformed the trained agent.")
    

In [None]:
# Show the portfolio value at the end of the last training episode.
# Relies on `portfolio_value`, the list filled by the training loop in the
# previous cell, so that cell must have been run first.
print(f"Final portfolio value: {portfolio_value[-1]}")