# DQN Trading Agent

## Read database

In [1]:
#TO READ THE DATABASE
import h5py

DB_FILE = "../data/dataset_1h_1000.hdf5"
DB = h5py.File(DB_FILE, "r")

for item in DB:
    print(item)

bitcoin_usd
etherium_usd
ripple_usd


In [13]:
# Show the values of the first timestep
print(DB["bitcoin_usd"][:,1:])

[[ 14793.          14694.          14860.          14669.            786.84789462]
 [ 14400.          14789.          14923.          14347.           4207.24393896]
 [ 14395.          14400.          14678.          13755.           9933.78908875]
 ..., 
 [  9689.           9799.           9799.           9647.59358788
    2605.26284098]
 [  9628.3          9689.           9700.           9625.            885.36753194]
 [  9663.6          9628.           9676.1          9625.            946.37556537]]


## Define packages

In [3]:
import random
import numpy as np

## Define DQN Agent

In [4]:
from collections import deque
from keras.models import Model 
from keras.layers import Input, Dense, Flatten, Reshape
from keras.optimizers import Adam
from keras import backend as K
import keras

K.set_image_dim_ordering('tf')

class DQNAgent:
    def __init__(self, frame_size, frame_parameter_number, aux_number, action_size):
        self.frame_size = frame_size
        self.frame_parameter_number =  frame_parameter_number
        self.aux_number = aux_number
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        # Main input (the time frame)
        main_input = Input(shape=(frame_parameter_number,self.frame_size))
        main = Flatten()(main_input)
        
        # Additional input (other inputs)
        aux_input = Input(shape=(aux_number,))
        
        x = keras.layers.concatenate([main, aux_input])
        x = Dense(64, activation='relu')(x)
        x = Dense(64, activation='relu')(x)
        x = Dense(32, activation='relu')(x)
        out = Dense(self.action_size, activation='linear')(x)
        
        model = Model(inputs=[main_input, aux_input], outputs=out)
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        print(model.summary())
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            # The agent acts randomly
            return random.randrange(self.action_size)
        
        # Predict the reward value based on the given state
        act_values = self.model.predict(state)
        
        # Pick the action based on the predicted reward
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        # Sample minibatch from the memory
        minibatch = random.sample(self.memory, batch_size)
        
        # Extract informations from each memory
        for state, action, reward, next_state, done in minibatch:
            
            # if done, make our target reward
            target = reward
            if not done:
                # predict the future discounted reward
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
                
            # make the agent to approximately map
            # the current state to future discounted reward
            # We'll call that target_f
            target_f = self.model.predict(state)
            target_f[0][action] = target
            
            # Train the Neural Net with the state and target_f
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

Using TensorFlow backend.


## Define Trading Game

In [5]:
class TradingGame:
    def __init__(self, start_capital, frame_size, buy_step_size, database):
        self.start_capital = start_capital
        self.frame_size = frame_size
        self.buy_step_size = buy_step_size
        
        self.database = np.asarray(database)
        
        self.capital = start_capital
        self.liquid = start_capital
        self.timestep = 0
        self.time_frame = self.database[0:self.frame_size]
        
        self.position_price = 0.0
        self.position_count = 0
        self.state = None
    
    ### Reset function
    def reset(self):
        # Set everything to the initial start value
        self.capital = self.start_capital
        self.liquid = self.start_capital
        self.timestep = 0
        self.time_frame = self.database[0:self.frame_size]
        
        self.position_price = 0.0
        self.position_count = 0
        
        # state => (time_frame, capital, position)
        frame = np.reshape(self.time_frame, [1, 6, frame_size])
        aux = np.reshape([self.liquid, self.capital, self.position_count, self.position_price], [1, 4])
        self.state = [frame, aux]
        
        return self.state
    
    
    ### Next frame function
    def get_next_frame(self):
        self.timestep += 1
        
        if (self.frame_size+self.timestep) < self.database.shape[0]:
            next_frame = self.database[self.timestep:self.frame_size+self.timestep]
            done = False
        else:
            next_frame = self.time_frame
            done = True
            
        return next_frame, done
    
    def buy_positions(self, new_price, count):
        # check if the position count is positive
        if self.position_count >= 0:
            # check if there is enough liquid money
            if self.liquid > (count*new_price):
                self.position_price = ((count*new_price)+(self.position_count*self.position_price))/(self.position_count+count)
                self.position_count += count
                self.liquid -= (count*new_price)
                reward = 0
            else:
                reward = -100 * count
                
        elif (self.position_count <= -count):
            self.position_count += count
            self.liquid += (count*new_price)
            reward = count*(self.position_price - new_price)
            
        else:
            number_bought = self.position_count + count
            number_sold = count - number_bought
            
            if self.liquid > ((number_bought*new_price)-(number_sold*new_price)):
                self.position_price = new_price
                self.position_count += count
                self.liquid -= (number_bought*new_price)
                self.liquid += (number_sold*new_price)
                reward = number_sold*(self.position_price - new_price)
            else:
                reward = -100 * (number_bought-number_sold)
        
        if self.position_count == 0:
            self.position_price = 0
            
        return reward
    
    def sell_positions(self, new_price, count):
        # check if the position count is negative
        if self.position_count <= 0:
            # check if there is enough liquid money
            if self.liquid > (count*new_price):
                self.position_price = ((count*new_price)+((-1)*self.position_count*self.position_price))/((-1)*self.position_count+count)
                self.position_count -= count
                self.liquid -= (count*new_price)
                reward = 0
            else:
                reward = -100 * count
                
        elif (self.position_count >= count):
            self.position_count -= count
            self.liquid += (count*new_price)
            reward = count*(new_price - self.position_price)
            
        else:
            number_sold = count - self.position_count
            number_bought = count - number_sold
            
            if self.liquid > ((number_bought*new_price)-(number_sold*new_price)):
                self.position_price = new_price
                self.position_count += count
                self.liquid -= (number_bought*new_price)
                self.liquid += (number_sold*new_price)
                reward = number_sold*(self.position_price - new_price)
            else:
                reward = -100 * (number_bought-number_sold)
        
        if self.position_count == 0:
            self.position_price = 0
            
        return reward
    
    ### Reward function
    def calc_reward(self, action, next_frame):
        # Get new price
        new_price = next_frame[self.frame_size - 1, 2]
        
        # Check the action and calculate the reward
        #hold
        if action == 0: 
            reward = 0
            
        #buy one
        elif action == 1: 
            reward = self.buy_positions(new_price, 1)
                
        #buy five
        elif action == 2: 
            reward = self.buy_positions(new_price, self.buy_step_size)
                
        #sell one
        elif action == 3: 
            reward = self.sell_positions(new_price, 1)
                
        #sell five        
        elif action == 4: 
            reward = self.sell_positions(new_price, self.buy_step_size)
            
        return reward
    
    ### Next step function
    def step(self, action):
        # Get next time frame
        next_frame, done = self.get_next_frame()
        
        # Get reward
        reward = self.calc_reward(action, next_frame)
        
        # check if done
        self.capital = self.liquid + self.position_count * self.position_price
        if self.capital <= 0:
            done = True
            
        # Next state
        next_frame = np.reshape(next_frame, [1, 6, frame_size])
        next_aux = np.reshape([self.liquid, self.capital, self.position_count, self.position_price], [1, 4])
        next_state = [next_frame, next_aux]
        
        # update state
        self.state = next_state
        
        return next_state, reward, done
    
    def get_score(self):
        return self.capital

## Train the agent

### Set parameters

In [6]:
# Parameter for Agent
frame_size = 100 # Time frame
frame_parameter_number = 6
aux_number = 4 # Additional parameters
action_size = 5 # hold, buy+1, buy+5, sell-1, sell-5
buy_step_size = 3

# Parameter for Trading Game
database = DB["bitcoin_usd"][1:6]
batch_size = 64
EPISODES = 1000
CAPITAL = 100000
#POSITION = 0

### Training

In [7]:
# Create agent
agent = DQNAgent(frame_size, frame_parameter_number, aux_number, action_size)

# Initialize Environment
env = TradingGame(CAPITAL, frame_size, buy_step_size, database)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 6, 100)        0                                            
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 600)           0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 4)             0                                            
____________________________________________________________________________________________________
concatenate_1 (Concatenate)      (None, 604)           0                                            
___________________________________________________________________________________________

In [8]:
#Episodes to train
for e in range(EPISODES):
    
    #Create initial state from time frame
    state = env.reset()
    #state = np.asarray(state)
    #state = np.reshape(state, [1, 6, frame_size,])

    #time frames
    for time_frame in range(frame_size, 1000):
        
        #Agent takes action
        action = agent.act(state)

        #Calc reward
        next_state, reward, done = env.step(action)
        #print(next_state)
        #next_state = np.reshape(next_state, [1, 6, frame_size])
        
        #Remember action
        agent.remember(state, action, reward, next_state, done)
        
        #Override state with next state
        state = next_state
        
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, env.get_score(), agent.epsilon))
            break
       
    #Replay
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)
    # if e % 10 == 0:
    #     agent.save("./save/cartpole-dqn.h5")
        
    

episode: 0/1000, score: -65096.88041056285, e: 1.0
episode: 1/1000, score: -14942.691174700012, e: 0.99
episode: 2/1000, score: -44966.0, e: 0.99
episode: 3/1000, score: -16192.850539510022, e: 0.99
episode: 4/1000, score: -53487.16044541332, e: 0.98
episode: 5/1000, score: -13600.5723154, e: 0.98
episode: 6/1000, score: -15907.049254429992, e: 0.97
episode: 7/1000, score: -18378.21668828001, e: 0.97
episode: 8/1000, score: 350911.48173313844, e: 0.96
episode: 9/1000, score: 490035.41448619775, e: 0.96
episode: 10/1000, score: 397360.15790782927, e: 0.95
episode: 11/1000, score: -17693.355627120007, e: 0.95
episode: 12/1000, score: -54190.706513059995, e: 0.94
episode: 13/1000, score: -31244.867226443253, e: 0.94
episode: 14/1000, score: -21920.939142632007, e: 0.93
episode: 15/1000, score: -71846.0, e: 0.93
episode: 16/1000, score: -15938.0, e: 0.92
episode: 17/1000, score: 349053.99953820015, e: 0.92
episode: 18/1000, score: -76498.54409129001, e: 0.91
episode: 19/1000, score: -50796

episode: 159/1000, score: 133046.33411364135, e: 0.45
episode: 160/1000, score: 140887.99855607998, e: 0.45
episode: 161/1000, score: 109674.06919632078, e: 0.45
episode: 162/1000, score: -20247.283019399998, e: 0.44
episode: 163/1000, score: 91184.81415676, e: 0.44
episode: 164/1000, score: 114322.74201688948, e: 0.44
episode: 165/1000, score: 84049.5227739746, e: 0.44
episode: 166/1000, score: -45479.0, e: 0.44
episode: 167/1000, score: 114267.85347606253, e: 0.43
episode: 168/1000, score: 107514.34047763274, e: 0.43
episode: 169/1000, score: 178895.41401095194, e: 0.43
episode: 170/1000, score: 104519.39687399109, e: 0.43
episode: 171/1000, score: 114498.89656072749, e: 0.42
episode: 172/1000, score: 95719.52831522629, e: 0.42
episode: 173/1000, score: 151909.95062760243, e: 0.42
episode: 174/1000, score: 146250.7113504796, e: 0.42
episode: 175/1000, score: 101290.41105928094, e: 0.42
episode: 176/1000, score: 134652.70098934334, e: 0.41
episode: 177/1000, score: 154495.25391182504,

episode: 314/1000, score: 75567.27843909556, e: 0.21
episode: 315/1000, score: 127587.31076671128, e: 0.21
episode: 316/1000, score: 118119.72119518406, e: 0.21
episode: 317/1000, score: 81028.34872677477, e: 0.2
episode: 318/1000, score: 103704.25569777413, e: 0.2
episode: 319/1000, score: 93095.83044746147, e: 0.2
episode: 320/1000, score: 76565.99713461801, e: 0.2
episode: 321/1000, score: 92761.64565635851, e: 0.2
episode: 322/1000, score: 74282.71567490722, e: 0.2
episode: 323/1000, score: 73640.16614920883, e: 0.2
episode: 324/1000, score: 79919.0886538135, e: 0.2
episode: 325/1000, score: 72782.55125761149, e: 0.2
episode: 326/1000, score: 99946.0585535941, e: 0.2
episode: 327/1000, score: 73113.26438757342, e: 0.19
episode: 328/1000, score: 78505.45966998779, e: 0.19
episode: 329/1000, score: 77222.25519118199, e: 0.19
episode: 330/1000, score: 75193.30929475595, e: 0.19
episode: 331/1000, score: 122703.62302283282, e: 0.19
episode: 332/1000, score: 122866.71098789474, e: 0.19


episode: 470/1000, score: 68692.28498336714, e: 0.095
episode: 471/1000, score: 73944.12526040158, e: 0.094
episode: 472/1000, score: 82276.82456694811, e: 0.094
episode: 473/1000, score: 84760.53561844327, e: 0.093
episode: 474/1000, score: 75622.21092970119, e: 0.093
episode: 475/1000, score: 75460.45401766902, e: 0.092
episode: 476/1000, score: 72074.80013398378, e: 0.092
episode: 477/1000, score: 74524.32838277875, e: 0.092
episode: 478/1000, score: 78579.55150883201, e: 0.091
episode: 479/1000, score: 79207.11869957992, e: 0.091
episode: 480/1000, score: 76396.59733313893, e: 0.09
episode: 481/1000, score: 91009.68687028313, e: 0.09
episode: 482/1000, score: 80972.40143935385, e: 0.089
episode: 483/1000, score: 75229.38505682505, e: 0.089
episode: 484/1000, score: 81341.30958655447, e: 0.088
episode: 485/1000, score: 74968.08814365517, e: 0.088
episode: 486/1000, score: 79934.56751885888, e: 0.088
episode: 487/1000, score: 110055.92799593428, e: 0.087
episode: 488/1000, score: 834

episode: 623/1000, score: 97670.55757985282, e: 0.044
episode: 624/1000, score: 82129.88931571099, e: 0.044
episode: 625/1000, score: 84643.09984894283, e: 0.044
episode: 626/1000, score: 81735.27414962105, e: 0.043
episode: 627/1000, score: 88874.58044125306, e: 0.043
episode: 628/1000, score: 86840.60989444744, e: 0.043
episode: 629/1000, score: 74391.0750700402, e: 0.043
episode: 630/1000, score: 74456.13158400533, e: 0.043
episode: 631/1000, score: 151651.04668040958, e: 0.042
episode: 632/1000, score: 91755.52195755388, e: 0.042
episode: 633/1000, score: 76380.12515453229, e: 0.042
episode: 634/1000, score: 96441.21678606921, e: 0.042
episode: 635/1000, score: 114013.54215534033, e: 0.041
episode: 636/1000, score: 87338.91372296352, e: 0.041
episode: 637/1000, score: 89903.27872279128, e: 0.041
episode: 638/1000, score: 80806.64337663191, e: 0.041
episode: 639/1000, score: 83952.48193596797, e: 0.041
episode: 640/1000, score: 89566.33757000329, e: 0.04
episode: 641/1000, score: 10

episode: 776/1000, score: 82296.38593857792, e: 0.02
episode: 777/1000, score: 87146.04664723032, e: 0.02
episode: 778/1000, score: 80002.70458011937, e: 0.02
episode: 779/1000, score: 96616.51187005415, e: 0.02
episode: 780/1000, score: 89752.85223699044, e: 0.02
episode: 781/1000, score: 88161.60444793245, e: 0.02
episode: 782/1000, score: 110455.32494078901, e: 0.02
episode: 783/1000, score: 114491.18050960297, e: 0.02
episode: 784/1000, score: 85723.0932210842, e: 0.02
episode: 785/1000, score: 88792.50949009342, e: 0.02
episode: 786/1000, score: 110154.10133324635, e: 0.019
episode: 787/1000, score: 93578.86012604795, e: 0.019
episode: 788/1000, score: 110417.63358905236, e: 0.019
episode: 789/1000, score: 91592.74108038706, e: 0.019
episode: 790/1000, score: 106441.65334039225, e: 0.019
episode: 791/1000, score: 91958.8169098975, e: 0.019
episode: 792/1000, score: 103519.34693877552, e: 0.019
episode: 793/1000, score: 92747.9562536504, e: 0.019
episode: 794/1000, score: 107866.17

episode: 928/1000, score: 108707.35714285714, e: 0.01
episode: 929/1000, score: 116708.39917695473, e: 0.01
episode: 930/1000, score: 112042.4170860574, e: 0.01
episode: 931/1000, score: 80385.65827273137, e: 0.01
episode: 932/1000, score: 99429.0612244898, e: 0.01
episode: 933/1000, score: 115322.69429404414, e: 0.01
episode: 934/1000, score: 92841.3302969001, e: 0.01
episode: 935/1000, score: 113306.35714285714, e: 0.01
episode: 936/1000, score: 100134.58490190524, e: 0.01
episode: 937/1000, score: 99590.91836734692, e: 0.01
episode: 938/1000, score: 106559.29761904762, e: 0.01
episode: 939/1000, score: 94750.10575950496, e: 0.01
episode: 940/1000, score: 83898.49271137026, e: 0.01
episode: 941/1000, score: 95329.70208841554, e: 0.01
episode: 942/1000, score: 113789.71428571429, e: 0.01
episode: 943/1000, score: 75620.86454219493, e: 0.01
episode: 944/1000, score: 99356.79092044981, e: 0.01
episode: 945/1000, score: 103881.18687964999, e: 0.01
episode: 946/1000, score: 80039.25230762

## Testing

In [None]:
# Testing parameters
TEST_EPISODES = 10
TEST_CAPITAL = 100000
TEST_DATABASE = DB["etherium_usd"]
TEST_FRAME_SIZE = 100
TEST_BUY_STEP_SIZE = 3

In [None]:
# Initialize Environment
test_env = TradingGame(CAPITAL, TEST_FRAME_SIZE, TEST_BUY_STEP_SIZE, TEST_DATABASE)

In [None]:
#Episodes to train
for e in range(TEST_EPISODES):
    
    #Create initial state from time frame
    state = test_env.reset()

    #time frames
    for time_frame in range(frame_size, 1000):
        
        #Agent takes action
        action = agent.act(state)

        #Calc reward
        next_state, reward, done = test_env.step(action)
        
        #Override state with next state
        state = next_state
        
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, env.get_score(), agent.epsilon))
            break