<a href="https://colab.research.google.com/github/geekpradd/Reinforcement-Learning-Stock-Trader/blob/master/Stock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%matplotlib inline
import gym
from gym import spaces
from matplotlib import pyplot as plt
import time
from tqdm import tqdm_notebook
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import random
from tensorflow.keras.layers import Dense, Concatenate, Lambda
from tensorflow.keras import Input
from tensorflow import convert_to_tensor as convert
# COLAB = False
# if not COLAB:
#     import os
#     os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
path_base = '/content/drive/My Drive/Stock/'

In [0]:
class StockEnv(gym.Env):
    """Gym environment that simulates trading a fixed set of stocks.

    Every step moves one row forward in each stock's data frame, samples a
    trade price uniformly between that row's Low and High, and applies the
    requested share deltas when they are affordable.

    The observation has shape ``(1, num_stocks*5 + 1)``: for each stock its
    Open/High/Low/Close divided by the highest High seen so far in the
    episode, then the share holdings divided by ``shares_normal``, then the
    cash balance divided by ``balance_normal``.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, df, params, train = True):
        """Store the data frames and the scaling/brokerage parameters.

        Args:
            df: list with one data frame per stock; each is read through
                ``.loc[step, column]`` for 'Open', 'High', 'Low', 'Close'.
            params: dict with the keys 'num_stocks', 'min_brokerage',
                'brokerage_rate', 'balance_normal', 'shares_normal' and
                'volume_normal'.
            train: when truthy, episodes start at a random row; otherwise
                they start at row 0.
        """
        super(StockEnv,self).__init__()

        self.num_stocks = params['num_stocks']
        self.min_brokerage = params['min_brokerage']
        self.brokerage_rate = params['brokerage_rate']
        self.balance_normal = params['balance_normal']
        self.shares_normal = params['shares_normal']
        self.volume_normal = params['volume_normal']
        self.dfs = df
        self.state_dimensions = self.num_stocks*5+1
        self.train = train

        assert len(df) == self.num_stocks, "Size of database not equal to number of stocks"

        # An episode can never run past the shortest price history.
        self.max_steps = min([len(d.loc[:,'Open']) for d in self.dfs])
        self.action_space = spaces.Box(low = -1, high = 1, shape =  (1, self.num_stocks), dtype = np.float32)
        self.observation_space = spaces.Box(low = 0, high = 1, shape = (1, self.state_dimensions), dtype = np.float32)

    def reset(self, intial_balance = 10000, shares_held = None):
        """Begin a new episode and return ``(observation, info)``.

        Args:
            intial_balance: starting cash (the parameter name is kept as
                spelled so existing keyword callers keep working).
            shares_held: optional starting holdings, one value per stock;
                defaults to zero shares of everything.
        """
        if self.train:
            self.current_step = np.random.randint(0, self.max_steps)
        else:
            self.current_step = 0
        self.balance = intial_balance
        if shares_held is None:
            self.shares_held = np.zeros((1, self.num_stocks))
        else:
            # Copy so that later trades never write into the caller's array.
            self.shares_held = np.array(shares_held, dtype=float).reshape((1, self.num_stocks))
        self.current_price = self.get_price()
        self.highest_price = 0
        self.net_worth = self.balance + np.sum(self.shares_held*self.current_price)
        self.initial_worth = self.net_worth
        self.max_net_worth = self.net_worth
        self.set_high()
        self.done = False
        return self.observe()

    def get_price(self):
        """Sample one price per stock uniformly in [Low, High] of the current row."""
        return np.array([np.random.uniform(df.loc[self.current_step,"Low"], df.loc[self.current_step,"High"]) for df in self.dfs]).reshape((1, self.num_stocks))

    def set_high(self):
        """Fold the current row's High into the per-stock running maximum."""
        high = np.array([df.loc[self.current_step, 'High'] for df in self.dfs]).reshape((1, self.num_stocks))
        self.highest_price = np.maximum(self.highest_price, high)

    def validate(self, action):
        """Check whether a trade (in shares, one entry per stock) is feasible.

        Returns:
            ``(True, cash_delta)`` when the trade can be executed, where
            ``cash_delta`` is the change in cash including brokerage, or
            ``(False, 0)`` when it would sell more shares than are held or
            leave a negative balance.
        """
        action = np.asarray(action, dtype=float).reshape(-1)
        cash_delta = 0
        for i in range(self.num_stocks):
            if action[i] < 0 and self.shares_held[0][i] < -action[i]:
                return False, 0
            # No order is placed for a zero trade, so no (minimum) fee is due.
            if action[i] != 0:
                cash_delta -= self.broke(self.current_price[0][i]*abs(action[i]))

        cash_delta -= np.sum(self.current_price[0]*action)
        if cash_delta + self.balance < 0:
            return False, 0
        return True, cash_delta

    def _build_info(self):
        """Snapshot of the bookkeeping values handed back to the caller."""
        return {
            'current_step' : self.current_step,
            'current_price': self.current_price,
            'highest_price': self.highest_price,
            'net_worth' : self.net_worth,
            'max_net_worth': self.max_net_worth,
            'shares_held' : self.shares_held,
            'balance' : self.balance,
        }

    def observe(self):
        """Build and return ``(observation, info)`` for the current step.

        A new array is allocated on every call so that observations kept by
        the caller (for example in a replay buffer) are not overwritten by
        later steps.
        """
        frame = np.zeros((1, self.state_dimensions))
        for i, df in enumerate(self.dfs):
            ohlc = np.array([df.loc[self.current_step, column] for column in ('Open', 'High', 'Low', 'Close')])
            frame[0, 4*i:4*i+4] = ohlc/self.highest_price[0, i]
        frame[0, self.num_stocks*4:self.num_stocks*5] = self.shares_held[0]/self.shares_normal
        frame[0, 5*self.num_stocks] = self.balance/self.balance_normal
        self.frame = frame
        self.info = self._build_info()
        return self.frame, self.info

    def broke(self, amount):
        """Brokerage for a trade of the given value, floored at ``min_brokerage``."""
        return max(amount * self.brokerage_rate, self.min_brokerage)

    def update(self, reward):
        """Add ``reward`` to the net worth and refresh the running maximum."""
        self.net_worth += reward
        self.max_net_worth = max(self.max_net_worth, self.net_worth)

    def take_action(self, action):
        """Execute one trade and return ``(reward, executed)``.

        ``action`` holds one value per stock and is multiplied by
        ``shares_normal`` to obtain share counts. An infeasible trade is
        rejected with a fixed reward of -5000.
        """
        # Scale a private copy: scaling in place would change the caller's array.
        action = np.array(action, dtype=float).reshape(-1) * self.shares_normal
        self.current_price = self.get_price()
        # The running high depends only on the data, not on trade validity.
        self.set_high()
        is_valid, cash_delta = self.validate(action)
        if not is_valid:
            return -5000, False
        self.balance += cash_delta
        # Rebind instead of += so earlier info dicts keep their own holdings.
        self.shares_held = self.shares_held + action
        reward = self.balance + np.sum(self.shares_held * self.current_price) - self.net_worth
        self.update(reward)
        return reward, True

    def step(self, action):
        """Advance one row and return ``(observation, reward, done, info)``.

        The episode ends when the data runs out or the net worth drops to
        10% of its initial value; once finished, a zero observation and a
        zero reward are returned.
        """
        self.current_step += 1
        if self.current_step >= self.max_steps or self.done:
            self.done = True
            return np.zeros((1, self.state_dimensions)), 0, self.done, self.info
        reward, status = self.take_action(action)
        self.done = self.net_worth <= self.initial_worth*0.1
        obs, info = self.observe()
        return obs, reward, self.done, info

    def render(self, mode='human', close = False):
        """Print the current step, net worth and profit."""
        profit = self.net_worth - self.initial_worth
        print('Step: {}'.format(self.current_step))
        print('Net Worth: {}'.format(self.net_worth))
        print('Profit: {}'.format(profit))
        
def create_stock_env(locations, train=True):
    """Load one CSV per stock and wrap them in a ``StockEnv``.

    Args:
        locations: paths of the CSV files, one per stock; each must have a
            'Date' column.
        train: forwarded to ``StockEnv`` (random episode start when truthy).

    Returns:
        A ``StockEnv`` trading as many stocks as there are files.
    """
    # StockEnv reads rows with .loc[step, ...], i.e. by index label, so the
    # index has to be rebuilt after sorting for the date order to take effect.
    dfs = [
        pd.read_csv(location).sort_values('Date').reset_index(drop=True)
        for location in locations
    ]
    params = {
        'num_stocks' : len(dfs),
        'min_brokerage' : 30.0,
        'brokerage_rate' : 0.001,
        'balance_normal' : 1000000,
        'shares_normal' : 10000,
        'volume_normal' : 2147483647,
    }
    return StockEnv(dfs, params, train)

In [0]:
class ReplayMemory:
    """Fixed-capacity ring buffer with uniform sampling without replacement."""

    def __init__(self, max_size):
        """Allocate ``max_size`` empty slots."""
        self.max_size = max_size
        self.buffer = [None for _ in range(max_size)]
        self.size = 0
        self.index = 0

    def append(self, obj):
        """Store ``obj`` in the next slot, overwriting the oldest entry once full."""
        slot = self.index
        self.buffer[slot] = obj
        if self.size < self.max_size:
            self.size += 1
        self.index = (slot + 1) % self.max_size

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored entries chosen at random."""
        chosen = random.sample(range(self.size), batch_size)
        return [self.buffer[position] for position in chosen]
        
class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated noise: each sample drifts from the previous one towards ``mu``."""

    def __init__(self, mu, sigma=0.3, theta=.15, dt=1e-2, x0=None):
        """Store the process parameters and set the state to its start value."""
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        """Advance the process by one step of size ``dt`` and return the new state."""
        drift = self.theta * (self.mu - self.x_prev) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        sample = self.x_prev + drift + diffusion
        self.x_prev = sample
        return sample

    def reset(self):
        """Return the state to ``x0``, or to zeros shaped like ``mu`` when ``x0`` is None."""
        if self.x0 is not None:
            self.x_prev = self.x0
        else:
            self.x_prev = np.zeros_like(self.mu)

    def __repr__(self):
        template = 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'
        return template.format(self.mu, self.sigma)

In [0]:
class Actor:
    """Policy network mapping a state to an action bounded by ``output_range``."""

    def __init__(self, params):
        """Read the network dimensions from ``params`` and build the Keras model."""
        self.state_dimensions = params["state_dimensions"]
        self.action_dimensions = params["action_dimensions"]
        self.output_range = params["output_range"]
        self.actor = self.build_model()

    def build_model(self):
        """Create the network: two ReLU layers, a tanh head, then scaling."""
        state_in = Input(shape=(self.state_dimensions, ))
        hidden = state_in
        for units in (60, 16):
            hidden = Dense(units, activation = 'relu')(hidden)
        squashed = Dense(self.action_dimensions, activation = 'tanh')(hidden)
        # tanh yields values in [-1, 1]; stretch them to the configured range.
        scaled = Lambda(lambda t: t*self.output_range)(squashed)
        return keras.Model(inputs = state_in, outputs = scaled)

    def get_action(self, state):
        """Run the policy on ``state`` after converting it to a tensor."""
        state_tensor = convert(state)
        return self.actor(state_tensor)

    def save(self):
        """Write the actor weights to ``path_base``."""
        target_file = path_base + 'actor.h5'
        self.actor.save_weights(target_file)

    def load(self):
        """Read the actor weights from ``path_base`` and report success."""
        source_file = path_base + 'actor.h5'
        self.actor.load_weights(source_file)
        print('Successfully Loaded')

class Critic:
    """Pair of Q-value networks (online and target) scoring state/action pairs."""

    def __init__(self, params):
        """Build both networks and start them from identical weights.

        Args:
            params: dict with 'state_dimensions', 'action_dimensions',
                'critic_optimizer' and 'tau'.
        """
        self.state_dimensions = params["state_dimensions"]
        self.action_dimensions = params["action_dimensions"]
        # Stored only; build_model() compiles with its own Adam instance.
        self.optimizer = params["critic_optimizer"]
        self.tau = params['tau']
        self.critic_online = self.build_model()
        self.critic_target = self.build_model()
        self.critic_online.set_weights(self.critic_target.get_weights())

    def build_model(self):
        """Create and compile a network taking ``[state, action]`` and returning one value."""
        input_a = Input(shape = (self.state_dimensions, ))
        input_b = Input(shape = (self.action_dimensions, ))
        merged = Concatenate(axis = -1)([input_a, input_b])
        x = Dense(60, activation = 'relu')(merged)
        x = Dense(16, activation = 'relu')(x)
        output = Dense(1)(x)
        model = keras.Model(inputs=[input_a, input_b], outputs = output)
        model.compile(loss='mse', optimizer = keras.optimizers.Adam(learning_rate = 0.001))
        return model

    def save(self):
        """Save both networks under ``path_base``."""
        self.critic_online.save(path_base + 'critic_online.h5')
        self.critic_target.save(path_base + 'critic_target.h5')

    def load(self):
        """Replace both networks with the models stored under ``path_base``."""
        self.critic_online = keras.models.load_model(path_base + 'critic_online.h5')
        self.critic_target = keras.models.load_model(path_base + 'critic_target.h5')

    def get_qvalues(self, state_array, action_array, online=True):
        """Evaluate the online (default) or target network on array inputs."""
        network = self.critic_online if online else self.critic_target
        return network([convert(state_array), convert(action_array)])

    def call(self, state_tensor, action_tensor):
        """Evaluate the online network on inputs that are already tensors."""
        return self.critic_online([state_tensor, action_tensor])

    def merge(self):
        """Move the target weights a fraction ``tau`` towards the online weights.

        The blend is done per weight array: the arrays have different shapes,
        so they cannot be wrapped in a single ``np.array`` (NumPy raises on
        such ragged input from version 1.24 on).
        """
        online_weights = self.critic_online.get_weights()
        target_weights = self.critic_target.get_weights()
        self.critic_target.set_weights([
            self.tau*online + (1-self.tau)*target
            for online, target in zip(online_weights, target_weights)
        ])

In [0]:
class Agent:
    """Actor-critic agent with a replay buffer and exploration noise.

    In training mode every step stores a transition, adds
    Ornstein-Uhlenbeck noise to the chosen action and runs one learning
    update; otherwise the actor's clipped output is used unchanged.
    """

    def __init__(self, params, train = True, resume = True):
        """Create the networks and buffer.

        Args:
            params: dict with 'buffer_size', 'state_dimensions',
                'action_dimensions', 'discount', 'output_range',
                'save_frequency', 'batch_size', 'actor_optimizer', plus the
                keys read by ``Actor`` and ``Critic``.
            train: enable exploration noise, replay storage and learning.
            resume: load previously saved weights right after construction.
        """
        self.train = train
        self.actor = Actor(params)
        self.critic = Critic(params)
        self.buffer = ReplayMemory(params["buffer_size"])
        self.state_dimensions = params["state_dimensions"]
        self.action_dimensions = params["action_dimensions"]
        self.discount = params["discount"]
        self.action_range = params["output_range"]
        self.save_frequency = params["save_frequency"]
        self.batch_size = params["batch_size"]
        self.optimizer = params["actor_optimizer"]
        self.num_steps = 0
        self.noise_func =  OrnsteinUhlenbeckActionNoise(mu=np.zeros(params["action_dimensions"]))
        if resume:
            self.load()

    def _as_state(self, observation):
        """Return a private ``(1, state_dimensions)`` copy of ``observation``.

        A plain reshape can hand back a view of the caller's array; copying
        keeps stored transitions intact if the caller reuses its buffer.
        """
        return np.array(observation, dtype=np.float32).reshape((1, self.state_dimensions))

    def _select_action(self, state):
        """Actor output for ``state``, with noise in training mode, clipped to range."""
        action = np.asarray(self.actor.get_action(state)[0])
        if self.train:
            action = action + self.noise_func()
        return self.clip_action(action)

    def agent_start(self, observation):
        """Choose the first action of an episode; nothing is stored or learned."""
        state = self._as_state(observation)
        action = self._select_action(state)

        self.prev_state = state
        self.prev_action = action
        return action

    def clip_action(self, action):
        """Clip ``action`` element-wise to ``[-action_range, action_range]``."""
        action = np.clip(action, -self.action_range, self.action_range)
        return action

    def agent_step(self, reward, observation):
        """Record the last transition (training only) and choose the next action.

        Args:
            reward: reward received for the previous action.
            observation: state reached after the previous action.
        """
        state = self._as_state(observation)
        if self.train:
            self.buffer.append((self.prev_state, self.prev_action, reward, state))
        action = self._select_action(state)
        if self.train:
            self.run()
        self.prev_action = action
        self.prev_state = state
        return self.prev_action

    def save(self):
        """Persist the actor and critic."""
        self.actor.save()
        self.critic.save()

    def load(self):
        """Restore the actor and critic."""
        self.actor.load()
        self.critic.load()

    def run(self):
        """Perform one learning update from a sampled batch of transitions."""
        self.num_steps += 1
        size = min(self.batch_size, self.buffer.size)
        batch = self.buffer.sample(size)

        prev_states = np.array([x[0] for x in batch], dtype=np.float32).reshape((-1, self.state_dimensions))
        prev_actions = np.array([x[1] for x in batch], dtype=np.float32).reshape((-1, self.action_dimensions))
        rewards = np.array([x[2] for x in batch], dtype=np.float32).reshape((-1, 1))
        states = np.array([x[3] for x in batch], dtype=np.float32).reshape((-1, self.state_dimensions))

        # Critic: regress Q(s, a) on the one-step target r + discount * Q'(s', mu(s')).
        next_actions = self.actor.get_action(states)
        next_q = np.asarray(self.critic.get_qvalues(states, next_actions, False))
        targets = rewards + self.discount*next_q
        self.critic.critic_online.fit([prev_states, prev_actions], targets, epochs = 1, verbose=0)

        # Actor: ascend the batch mean of Q(s, mu(s)) by minimising its negative.
        state_tensor = convert(prev_states)
        actor_weights = self.actor.actor.trainable_weights
        with tf.GradientTape() as tape:
            policy_actions = self.actor.actor(state_tensor)
            actor_loss = -tf.reduce_mean(self.critic.call(state_tensor, policy_actions))
        gradient_actor = tape.gradient(actor_loss, actor_weights)

        self.optimizer.apply_gradients(zip(gradient_actor, actor_weights))
        self.critic.merge()

        if self.num_steps % self.save_frequency == 0:
            self.save()

In [0]:
# Hyper-parameters shared by Agent, Actor and Critic.
AGENT_PARAMS = {
    # Actor outputs are scaled to, and actions clipped to, +/- this value.
    "output_range": 1,
    # Must equal StockEnv.state_dimensions (num_stocks*5 + 1, i.e. 11 for 2 stocks).
    "state_dimensions": 11,
    "action_dimensions": 2,
    # Stored by Critic, which nevertheless compiles with its own Adam instance.
    "critic_optimizer": tf.keras.optimizers.Adam(learning_rate = 0.001),
    "actor_optimizer": tf.keras.optimizers.Adam(learning_rate = 0.0001),
    "batch_size": 64,
    "buffer_size": 100000,
    "discount": 0.99,
    # Fraction of the online critic blended into the target critic per update.
    "tau": 0.001,
    # Agent.run() saves the networks every this many updates.
    "save_frequency": 5000,
}

In [0]:
# One price-history CSV per traded stock.
files = [
    '/content/drive/My Drive/AAPL.csv',
    '/content/drive/My Drive/MSFT.csv',
]
tf.keras.backend.set_floatx('float32')
# Training environment (random episode start) and a freshly initialised agent.
env = create_stock_env(files, train=True)
agent = Agent(AGENT_PARAMS, train=True, resume=False)

In [0]:
ITERATIONS = 5000
# The logs keep room for 10 runs although only one run is executed below;
# the first axis is the run index.
profit = np.zeros((10,ITERATIONS))
action = np.zeros((10,ITERATIONS+1,AGENT_PARAMS["action_dimensions"]))
shares = np.zeros((10,ITERATIONS+1,AGENT_PARAMS["action_dimensions"]))
for run in range(1):
    prev_profit = 0
    y, info = env.reset()
    action[run, 0, :] = agent.agent_start(y)
    for i in tqdm_notebook(range(ITERATIONS)):
        shares[run, i, :] = info['shares_held']
        # Pass a copy so that nothing the environment does to its argument
        # can alter the logged action.
        y, reward, done, info = env.step(action[run, i].copy())
        if done:
            break
        action[run, i+1, :] = agent.agent_step(reward, y)
        # Cumulative profit up to and including this step.
        prev_profit += reward
        profit[run, i] = prev_profit

In [0]:
# Test
env_t = create_stock_env(files,0)
tf.keras.backend.set_floatx('float32')
agent_t = Agent(AGENT_PARAMS, True)
ITERATIONS = 5000
profit_t = np.zeros((1,ITERATIONS))
action_t = np.zeros((1,ITERATIONS+1,AGENT_PARAMS["action_dimensions"]))
shares_t = np.zeros((1,ITERATIONS+1,AGENT_PARAMS["action_dimensions"]))
for iter in range(1):
    prev_profit = 0
    y, info = env_t.reset()
    action_t[iter,0,:] = agent_t.agent_start(y)
    for i in tqdm_notebook(range(ITERATIONS)):
        shares_t[iter,i,:] = info['shares_held']
        y, reward, done, info = env_t.step(action_t[iter,i])
        action_t[iter,i+1,:] = agent_t.agent_step(reward, y)
        profit_t[iter][i] += reward + prev_profit
        prev_profit += reward
        if done:
            print ("Terminated because broke")
            break

In [0]:
# Cumulative profit of the first training run, one point per step.
plt.figure(figsize=(15, 10))
plt.plot(profit[0])
# print((env.dfs[0].loc[5000, "Open"]*10000)/env.dfs[0].loc[0, "Open"])
plt.grid(True)
plt.show()

In [0]:
# First action component over the steps of the first training run.
plt.figure(figsize = [15, 10])
plt.plot(action[0,:,0])
# Only one series is drawn, so only one legend label is supplied.
plt.legend(['action 1'])

plt.grid(True)
plt.show()

In [0]:
# Holdings of the first stock during training. The training loop only fills
# run index 0, so that is the run plotted (index 9 is never written).
plt.figure(figsize = [15, 10])
plt.plot(shares[0,:,0])
plt.legend(['iteration 1'])

plt.grid(True)
plt.show()

In [0]:
# Holdings of the second stock during training. The training loop only fills
# run index 0, so that is the run plotted (index 9 is never written).
plt.figure(figsize = [15, 10])
plt.plot(shares[0,:,1])
plt.legend(['iteration 1'])

plt.grid(True)
plt.show()

In [0]:
# matplotlib draws one line per column, so put steps on the rows: each run
# then becomes one curve (a 1-D profit log is treated as a single run).
plt.plot(np.atleast_2d(profit).T)

In [0]:
# Builtin min()/max() cannot order the rows of a 2-D array, so summarise the
# first run explicitly (a 1-D profit log is used as it is).
run_profit = profit[0] if profit.ndim > 1 else profit
print(run_profit.min(), run_profit.max(), run_profit[-1])

In [0]:
env = create_stock_env(files, train=False)
tf.keras.backend.set_floatx('float32')
# Agent's second positional parameter is `train`, not `resume`; spell both out
# so evaluation runs without exploration noise, learning or checkpoint writes.
agent = Agent(AGENT_PARAMS, train=False, resume=True)
ITERATIONS = 5000
profit = np.zeros(ITERATIONS)
action = np.zeros((ITERATIONS+1,AGENT_PARAMS["action_dimensions"]))
shares = np.zeros((ITERATIONS+1,AGENT_PARAMS["action_dimensions"]))
balance = np.zeros(ITERATIONS+1)
prev_profit = 0
y, info = env.reset()
action[0,:] = agent.agent_start(y)
# tqdm_notebook is the progress bar this notebook imports; plain `tqdm` is not.
for i in tqdm_notebook(range(ITERATIONS)):
    shares[i,:] = info['shares_held']
    # Read the cash balance from the environment itself rather than relying
    # on a 'balance' entry being present in info.
    balance[i] = env.balance
    # Pass a copy so the environment cannot alter the logged action.
    y, reward, done, info = env.step(action[i].copy())
    prev_profit += reward
    profit[i] = prev_profit
    if done:
        break
    # Only ask for another action while the episode is still running.
    action[i+1,:] = agent.agent_step(reward, y)

In [0]:
# Second action component over the evaluation steps.
plt.figure(figsize = [15, 10])
plt.plot(action[:,1])
# One series is drawn; label it after the component that is plotted.
plt.legend(['action 2'])

plt.grid(True)
plt.show()

In [0]:
# First action component over the evaluation steps.
plt.figure(figsize = [15, 10])
plt.plot(action[:,0])
# Only one series is drawn, so only one legend label is supplied.
plt.legend(['action 1'])

plt.grid(True)
plt.show()

In [0]:

# Cumulative profit over the evaluation steps.
plt.figure(figsize = [15, 10])
plt.plot(profit)
# Only one series is drawn, so only one legend label is supplied.
plt.legend(['iteration 1'])

plt.grid(True)
plt.show()

In [0]:

# Holdings of the second stock over the evaluation steps.
plt.figure(figsize = [15, 10])
plt.plot(shares[:,1])
# Only one series is drawn, so only one legend label is supplied.
plt.legend(['iteration 1'])

plt.grid(True)
plt.show()

In [0]:
# Holdings of the first stock over the evaluation steps.
plt.figure(figsize = [15, 10])
plt.plot(shares[:,0])
# Only one series is drawn, so only one legend label is supplied.
plt.legend(['iteration 1'])

plt.grid(True)
plt.show()

In [0]:

# Cash balance over the evaluation steps.
plt.figure(figsize = [15, 10])
plt.plot(balance)
# Only one series is drawn, so only one legend label is supplied.
plt.legend(['iteration 1'])
plt.grid(True)
plt.show()