In [1]:
import gym
from tqdm.notebook import tqdm
COLAB = False
if not COLAB:
    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.layers import *
from tensorflow.keras import Input
from utils import ReplayBuffer, OrnsteinUhlenbeckActionNoise
path_base = "models/"
RESUME = False

In [2]:
import gym
from gym import spaces
import numpy as np
import pandas as pd
import json
import datetime as dt

MAX_Money = 100000
class StockEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    
    def __init__(self,dfs, train, number=1, **kwargs):
        super(StockEnv,self).__init__()
        self.train = train
        self.MAX_shares = 2147483647
        self.Min_Brokerage = 30
        self.count = number
        self.Brokerage_rate = 0.001
        
        if "balance" in kwargs.keys():
            Max_Money = kwargs["balance"]
        if "Max_Shares" in kwargs.keys():
            self.MAX_shares = kwargs["Shares"]
        if "Broke_limit" in kwargs.keys():
            self.Min_Brokerage = kwargs["Broke_limit"]
        if "Broke_rate" in kwargs.keys():
            self.Brokerage_rate = kwargs["Broke_rate"]
        
        self.dfs = dfs
        self.action_space = spaces.Box(low = np.array([-1]), high = np.array([1]), dtype = np.float16)
        lower = [0]*number
        higher = [1]*number
        self.observation_space = spaces.Box(low=np.array(lower),high=np.array(higher),dtype=np.float32)
    
    def _get_prices(self):
#         print ("Day {0}".format(self.df.loc[self.current_step,"Date"]))
#         print ("low: {0} high: {1}".format(self.df.loc[self.current_step,"Open"],self.df.loc[self.current_step,"Close"]))
        return np.array([np.random.uniform(df.loc[self.current_step,"Open"], df.loc[self.current_step,"Close"]) for df in self.dfs])
    
    def _observe(self, prices):
        frame = prices
        frame = frame / self.highest_price
        info = {
            'balance' : self.balance,
            'highest_price': self.highest_price,
            'current_price': self.current_prices,
            #'time': self.df.loc[self.current_step,'time_stamp'],
            'shares_held': self.shares_held,
            'max_worth': self.max_net_worth,
            'broke_limit': self.Min_Brokerage,
            'broke_rate': self.Brokerage_rate
        }
        
        return frame, info
        
    def reset(self,balance = MAX_Money,**kwargs):
        if "balance" in kwargs.keys():
            Max_Money = kwargs["balance"]
        if "Max_Shares" in kwargs.keys():
            self.MAX_shares = kwargs["Shares"]
        if "Broke_limit" in kwargs.keys():
            self.Min_Brokerage = kwargs["Broke_limit"]
        if "Broke_rate" in kwargs.keys():
            self.Brokerage_rate = kwargs["Broke_rate"]
        
        if self.train:
            self.current_step = np.random.randint(0,len(self.dfs[0].loc[:,'Open'].values)-1)
        else:
            self.current_step = 0
       
        self.balance = balance
        self.shares_held = np.array([0]*self.count)
        self.current_prices = self._get_prices() 
        self.net_worth = self.balance + sum(self.shares_held*self.current_prices)
        self.initial_worth = self.net_worth
        self.max_net_worth = self.net_worth
        self.highest_price = np.max(self.current_prices)
        frame,_ =  self._observe(self.current_prices)
        return frame
    
    def _broke(self,amount):
        return max(amount * self.Brokerage_rate,self.Min_Brokerage)
    
    def _take_action(self, action_vector):
        self.current_prices = self._get_prices()
        self.highest_price = max(self.highest_price,np.max(self.current_prices))
        action_vector  = action_vector*self.MAX_shares
        for i in range(self.count):
            if action_vector[i] < 0:
                # sell
                action_vector[i] = -1*action_vector[i]
                if action_vector[i] > self.shares_held[i]:
                    action_vector[i] = self.shares_held[i]
                amount_gained = action_vector[i]*self.current_prices[i]
                broke = self._broke(amount_gained)
                amount_gained -= broke
                if self.balance + amount_gained < 0:
                    a1 = np.floor(self.balance/((self.Brokerage_rate-1)*self.current_prices[i]))
                    action = np.floor(-(self.balance-self.Min_Brokerage)/self.current_prices[i])
                    if self._broke(a1*self.current_prices[i]) == a1*self.current_prices[i]*self.Brokerage_rate:
                        action_vector[i] = max(a1,action_vector[i])
                    action_vector[i] = max(action_vector[i],0)
                    amount_gained = action_vector[i]*self.current_price
                    amount_gained -= self._broke(amount_gained)
                self.balance +=amount_gained
                self.shares_held[i] = self.shares_held[i]-action_vector[i]
            elif action_vector[i]>0:
                #buy
                amount_required = self.current_prices[i]*action_vector[i] + self._broke(self.current_prices[i]*action_vector[i])
                if amount_required > self.balance:
                    a1 = np.floor(self.balance/((self.Brokerage_rate+1)*self.current_prices[i]))
                    action_vector[i] = np.floor((self.balance-self.Min_Brokerage)/self.current_prices[i])
                    if self._broke(a1*self.current_prices[i]) == a1*self.current_prices[i]*self.Brokerage_rate:
                        action_vector[i] = max(a1,action_vector[i])
                    action_vector[i] = max(action_vector[i],0)
                    amount_required = action_vector[i]*self.current_prices[i]
                    amount_required -= self._broke(amount_required)
                self.balance -= amount_required
                self.shares_held[i] += action_vector[i]
        reward = self.balance + sum(self.shares_held* self.current_prices) - self.net_worth
        self.net_worth = self.balance + sum(self.shares_held* self.current_prices)
        if self.net_worth > self.max_net_worth:
            self.max_net_worth = self.net_worth
        return reward, self.current_prices
            
    def step(self, action):
        reward, prices = self._take_action(action)
        self.current_step+=1
        if self.current_step > len(self.dfs[0].loc[:,'Open'].values)-1:
            self.current_step = 0
        
        done = self.net_worth<=0
        obs, info = self._observe(prices)
        
        return obs, reward, done, info
    
    def render(self, mode='human', close = False):
        profit = self.net_worth - self.initial_worth
        print(f'Step: {self.current_step}')
        print(f'Net Worth:{self.net_worth}')
        print(f'Profit: {profit}')


def create_stock_env(locations, train=True):
    dfs = [pd.read_csv(location) for location in locations]
    for df in dfs:
        (df.sort_values("Date"))
    return StockEnv(dfs, train, len(locations)), dfs[0].shape[0]



In [3]:
class Actor:
    def __init__(self, params):
        self.output_range = params["output_range"]
        self.hidden_layers = params["actor_hidden_layers"]
        self.state_dimensions = params["state_dimensions"]
        self.action_dimensions = params["action_dimensions"]
        self.actor = self.model()
        
    def model(self):
        inputs = Input(shape=(1, self.state_dimensions))
        x = Lambda(lambda x: x)(inputs)
        for layer in self.hidden_layers:
            x = Dense(layer, activation='relu')(x)
        x = Dense(self.action_dimensions, activation='tanh')(x)
        x = Lambda(lambda x: x*self.output_range)(x)
        model = tf.keras.Model(inputs = inputs, outputs = x)
        return model
    
    def get_action(self, state):
        state_tensor = tf.Variable(shape = state.shape, initial_value = state)
        return (self.actor(state_tensor)).numpy()

    def save_weights(self):
        self.actor.save_weights(path_base + "actor.h5")
               
    def load_weights(self):
        self.actor.load_weights(path_base + "actor.h5")
        
    
class Critic:
    def __init__(self, params):
        self.hidden_layers = params["critic_hidden_layers"]
        self.state_dimensions = params["state_dimensions"]
        self.action_dimensions = params["action_dimensions"]
        self.optimizer = params["critic_optimizer"]
        self.critic_online = self.model()
        self.critic_target = self.model()


    def model(self):
        input_a = Input(shape = (1, self.state_dimensions))
        input_b = Input(shape = (1, self.action_dimensions))
        x = concatenate([input_a, input_b], axis=-1)
        for layer in self.hidden_layers:
            x = Dense(layer, activation='relu')(x)
        x = Dense(1, activation='linear')(x)
        model = tf.keras.Model(inputs=[input_a, input_b], outputs = x)
        model.compile(loss='mse', optimizer=self.optimizer)
        return model
    
    def save_weights(self):
        self.critic_online.save_weights(path_base + "critic_online.h5")
        self.critic_target.save_weights(path_base +  "critic_target.h5")
               
    def load_weights(self):
        self.critic_online.load_weights(path_base + "critic_online.h5")
        self.critic_target.load_weights(path_base + "critic_target.h5")

    def get_qvalues(self, state_array, action_array, online=True):
        state_tensor = tf.Variable(shape = state_array.shape, initial_value = state_array)
        action_tensor = tf.Variable(shape = action_array.shape, initial_value = action_array)
        return (self.critic_online([state_tensor, action_tensor]).numpy() if online else self.critic_target([state_tensor, action_tensor]).numpy())
    
    def call(self, state_tensor, action_tensor, online = True):
        return (self.critic_online([state_tensor, action_tensor]) if online else self.critic_target([state_tensor, action_tensor]))
    def merge_networks(self, tau):
        self.critic_target.set_weights(tau*np.array(self.critic_online.get_weights())
                                                                    + (1-tau)*np.array(self.critic_target.get_weights()))
        

In [20]:
class Agent:
    def __init__(self, params, test=False):
        self.test = test
        self.actor = Actor(params)
        self.critic = Critic(params)
        self.buffer = ReplayBuffer(params["buffer_size"])
        self.state_dimensions = params["state_dimensions"]
        self.action_dimensions = params["action_dimensions"]
        self.discount = params["discount"]
        self.action_range = params["output_range"]
        self.save_frequency = params["save_frequency"]
        self.batch_size = params["batch_size"]
        self.optimizer = params["actor_optimizer"]
        self.tau = params["tau"]
        self.step = 0
        self.noise_func =  OrnsteinUhlenbeckActionNoise(mu=np.zeros(params["action_dimensions"]))
        if RESUME:
            self.load_networks()
        
    def agent_start(self, observation):
        observation = np.reshape(observation, (1, self.state_dimensions))
        act = np.squeeze(self.actor.get_action(observation))
        if not self.test:
            for i in range(act.shape[0]):
                act[i] = np.squeeze(self.clip_action((act[i] + self.noise_func())[0]))
        else:
            for i in range(act.shape[0]):
                act[i] = np.squeeze(self.clip_action(act[i]))
        self.prev_state = observation
        self.prev_action = act
        return act

    def clip_action(self, action):
        if abs(action) > self.action_range:
            action *= abs(self.action_range)/abs(action)
        
        return action

    def agent_step(self, reward, observation):
        observation = np.reshape(observation, (1, self.state_dimensions))
        if not self.test:
            relay = (self.prev_state, self.prev_action, reward, observation)
            self.buffer.add(relay)
        self.prev_state = observation
        act = np.squeeze(self.actor.get_action(observation))
        if not self.test:
            self.prev_action = self.clip_action(act + self.noise_func())
            self.train(self.batch_size)
        else:
            self.prev_action = [self.clip_action(act)]
        
        return self.prev_action 
    
    def save_networks(self):
        self.actor.save_weights()
        self.critic.save_weights()

    def load_networks(self):
        self.actor.load_weights()
        self.critic.load_weights()


    def train(self, sample_size):
        self.step += 1
        batch, batch_size = self.buffer.sample(sample_size)

        state_array = np.array([ element[3] for element in batch])
        action_array = self.actor.get_action(state_array)
        prev_state_array = np.array([ element[0] for element in batch])
        prev_action_array = np.array([ [[element[2]]] for element in batch])
        output = self.critic.get_qvalues(state_array, action_array, False)
        output = np.array([element[2] + self.discount*out[0] for element, out in zip(batch, output)])
        self.critic.critic_online.fit([state_array, action_array], output, verbose=0)

        prev_state_tensor = tf.Variable(shape = prev_state_array.shape, initial_value = prev_state_array)
        prev_action_tensor = tf.Variable(shape = prev_action_array.shape, initial_value = prev_action_array)

        with tf.GradientTape(persistent=True) as g:
            g.watch(prev_action_tensor) 
            g.watch(prev_state_tensor)
            value = self.critic.call(prev_state_tensor, prev_action_tensor)
            action = self.actor.actor(prev_state_tensor)
            
        gradient = -tf.squeeze(g.gradient(value, prev_action_tensor))
        gradient = tf.cast(gradient, tf.float32)
        gradient_actor = g.gradient(action, self.actor.actor.trainable_weights, gradient)
        gradient_actor = list(map(lambda x: tf.math.divide(x, batch_size), gradient_actor))
        self.optimizer.apply_gradients(zip(gradient_actor, self.actor.actor.trainable_weights))
        self.critic.merge_networks(self.tau)

        if self.step%self.save_frequency == 0:
            self.save_networks()



In [21]:
AGENT_PARAMS = {
	"output_range": 1,
	"actor_hidden_layers": [60, 16],
	"critic_hidden_layers": [60, 16],
	"state_dimensions": 29,
	"action_dimensions": 29,
	"critic_optimizer": tf.keras.optimizers.Adam(learning_rate = 0.001),
	"actor_optimizer": tf.keras.optimizers.Adam(learning_rate = 0.0001),
	"batch_size": 64,
	"buffer_size":1000000,
	"discount": 0.99,
	"tau": 0.001,
	"save_frequency": 100
}

In [22]:
import os
files = ["train/" + f for f in os.listdir("train/")]

In [23]:
env, stamps = create_stock_env(files)

In [24]:
y = env.reset()

In [25]:
y

array([0.34131135, 0.42253686, 0.23074003, 0.49295835, 0.23768464,
       0.49309801, 0.50289128, 0.45724847, 0.97002253, 0.31276553,
       0.72543937, 0.45306388, 0.23746844, 0.48913126, 0.15434905,
       0.66211881, 0.5361211 , 0.17378914, 0.59805951, 0.2758236 ,
       0.30583717, 0.44894854, 0.53689521, 1.        , 0.12496449,
       0.32904595, 0.50755846, 0.4530461 , 0.26231207])

In [26]:
env.step(y)

(array([0.34053008, 0.42214879, 0.23102212, 0.49124809, 0.23773338,
        0.493377  , 0.50198586, 0.45811289, 0.97118697, 0.30899051,
        0.72549735, 0.45374968, 0.2383658 , 0.48890162, 0.15507515,
        0.66051056, 0.52666711, 0.17403264, 0.60927722, 0.27638189,
        0.3060475 , 0.44998448, 0.54177366, 0.99905866, 0.12496449,
        0.32891922, 0.50763107, 0.4491711 , 0.26166008]),
 939.9137519145443,
 False,
 {'balance': 71.25027630259115,
  'highest_price': 184.5324148206363,
  'current_price': array([ 62.83883768,  77.90013554,  42.63106939,  90.65119689,
          43.86951473,  91.04404938,  92.63266222,  84.53667734,
         179.21547706,  57.01876513, 133.87777871,  83.73152481,
          43.98621644,  90.21819709,  28.61639171, 121.88560903,
          97.18715337,  32.11466331, 112.43139676,  51.00141849,
          56.47568476,  83.03672233,  99.97480106, 184.35870616,
          23.059999  ,  60.69625728,  93.6743867 ,  82.88662841,
          48.28476587]),
  'shar

In [27]:
agent = Agent(AGENT_PARAMS)
ITERATIONS = 20000
pbar = tqdm(desc="Iteration: ", total=ITERATIONS)
action = agent.agent_start(env.reset())
observation, reward, done, info = env.step(action)
profit = np.zeros(ITERATIONS)
prev_profit = 0
for _ in range(ITERATIONS):
    action = agent.agent_step(reward, observation)
    observation, reward, done, info = env.step(action)
    profit[i] += reward + prev_profit
    prev_profit += reward
    pbar.update(1)

HBox(children=(IntProgress(value=0, description='Iteration: ', max=20000, style=ProgressStyle(description_widt…

AttributeError: 'StockEnv' object has no attribute 'current_price'