In [1]:
import random
import math
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display
import pybithumb

In [2]:
# Hyperparameters
EPISOPES = 1000
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
GAMMA = 0.9
LR = 0.001
BATCH_SIZE = 50

connect_key = "d5c7f4458a58322ac7573f9f8193d4f2"
secret_key = "aacd7c9c31a4bbcf30d5088a1b22e338"
bithumb = pybithumb.Bithumb(connect_key, secret_key)
sample = bithumb.get_candlestick("ETH", "KRW", "30m")

cut = int(len(sample.index)*0.7)
train_sample = sample.iloc[:cut, :]
test_sample = sample.iloc[cut:, :]

In [3]:
class TradeBook:
    def __init__(self, sample):
        self.acc_rtn = 1.
        self.bought = False
        self.block = 0
        self.sample = self.preprosess(sample)

    def preprosess(self, sample):
        sample = sample[['open', 'high', 'low', 'close', 'volume']]
        sample['rtn'] = sample['open']/sample.shift(1)['open']
        sample['open'] = sample['open'].pct_change()
        sample['high'] = sample['high'].pct_change().shift(1)
        sample['low'] = sample['low'].pct_change().shift(1)
        sample['close'] = sample['close'].pct_change().shift(1)
        sample['volume'] = sample['volume'].pct_change().shift(1)
        sample = sample.dropna()
        #sample = sample.iloc[-48*7:, :]
        return sample.to_numpy()

    def step(self, action):
        actions = ['buy', 'sell']
        action = actions[action]

        # update current status
        if self.bought and action == "sell":
            self.bought = False
        elif not self.bought and action == "buy":
            self.bought = True
        else:
            pass

        # get reward
        if self.bought and action == "buy":
            reward = self.sample[self.block][5]
        elif self.bought and action == "sell":
            reward = self.sample[self.block][5] - 0.004
        elif not self.bought and action == "buy":
            reward = 1. - 0.004
        else:
            reward = 1.
        
        self.acc_rtn *= reward

        self.block += 1

        done = self.is_done()
        return self.sample[self.block-1][:5], reward, done
    
    def is_done(self):
        if self.acc_rtn <= 0.8:
            return True
        elif self.block == len(self.sample):
            return True
        else:
            return False
    
    def reset(self):
        self.acc_rtn = 1.
        self.bought = False
        self.block = 0
        return self.sample[self.block][:5]



In [4]:
class DQNAgent:
    def __init__(self):
        self.model = nn.Sequential(
            nn.Linear(5, 64),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(64, 256),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(64, 2)
        )
        self.optimizer = optim.Adam(self.model.parameters(), LR)
        self.steps_done = 0
        self.memory = deque(maxlen=100)
    
    def memorize(self, state, action, reward, next_state):
        self.memory.append((state, action, torch.FloatTensor([reward]), torch.FloatTensor([next_state])))
    
    def act(self, state):
        eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1.*self.steps_done / EPS_DECAY)
        self.steps_done += 1
        if random.random() > eps_threshold:
            return self.model(state).data.max(1)[1].view(1, 1)
        else:
            return torch.LongTensor([[random.randrange(2)]])
    
    def learn(self):
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        states, actions, rewards, next_states = zip(*batch)
        
        states = torch.cat(states)
        actions = torch.cat(actions)
        rewards = torch.cat(rewards)
        next_states = torch.cat(next_states)

        current_q = self.model(states).gather(1, actions)
        max_next_q = self.model(next_states).detach().max(1)[0]

        expected_q = rewards + (GAMMA*max_next_q)

        #loss = F.mse_loss(current_q.squeeze(), expected_q)
        loss = F.smooth_l1_loss(current_q.squeeze(), expected_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [5]:
env = TradeBook(train_sample)
agent = DQNAgent()
score_history = []
for e in range(1, EPISOPES+1):
    state = env.reset()
    steps = 0

    while True:
        state = torch.FloatTensor([state])
        action = agent.act(state)
        next_state, reward, done = env.step(action.item())

        agent.memorize(state, action, reward, next_state)
        agent.learn()

        state = next_state
        steps += 1 

        if done:
            print(f"EPISODE: {e}\t ACCUMULATED RETURNS: {env.acc_rtn}")
            score_history.append(env.acc_rtn)
            break

EPISODE: 1	 ACCUMULATED RETURNS: 1.217876985407098
EPISODE: 2	 ACCUMULATED RETURNS: 1.0984566346543
EPISODE: 3	 ACCUMULATED RETURNS: 1.3189032445670528
EPISODE: 4	 ACCUMULATED RETURNS: 1.107893576079098
EPISODE: 5	 ACCUMULATED RETURNS: 1.2711624142638478
EPISODE: 6	 ACCUMULATED RETURNS: 1.1385980946256924
EPISODE: 7	 ACCUMULATED RETURNS: 1.2680479011949355
EPISODE: 8	 ACCUMULATED RETURNS: 1.2265661743511442
EPISODE: 9	 ACCUMULATED RETURNS: 1.2550331054100727
EPISODE: 10	 ACCUMULATED RETURNS: 1.1552487106217868
EPISODE: 11	 ACCUMULATED RETURNS: 1.2334430880766656
EPISODE: 12	 ACCUMULATED RETURNS: 1.8261765402533088
EPISODE: 13	 ACCUMULATED RETURNS: 1.1222763271256766
EPISODE: 14	 ACCUMULATED RETURNS: 1.2458554328752958
EPISODE: 15	 ACCUMULATED RETURNS: 1.3576112164859868
EPISODE: 16	 ACCUMULATED RETURNS: 1.0990983925661508
EPISODE: 17	 ACCUMULATED RETURNS: 1.245014898069071
EPISODE: 18	 ACCUMULATED RETURNS: 1.4058473066598434
EPISODE: 19	 ACCUMULATED RETURNS: 1.2821855468418673
EPISODE:

KeyboardInterrupt: 

In [None]:
# test
env = TradeBook(test_sample)
state = env.reset()
steps = 0

while True:
    state = torch.FloatTensor([state])
    action = agent.act(state)
    next_state, reward, done = env.step(action.item())

    agent.memorize(state, action, reward, next_state)
    agent.learn()

    state = next_state
    steps += 1

    if done:
        print(f"ACCUMULATED RETURNS for test: {env.acc_rtn}")
        break

In [None]:
plt.plot(score_history)
plt.ylabel('returns')
plt.show()