In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import random
from collections import deque

import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

import pybithumb

In [2]:
# Global configuration: compute device plus training hyper-parameters.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
EPOCHS, BATCH_SIZE = 50, 200
print(f"using {DEVICE}")

using cuda


In [3]:
# preprocessing
def preprocess(sample):
    """Append price/volume ratio features to an OHLCV frame and drop incomplete rows.

    Parameters
    ----------
    sample : pandas.DataFrame
        Frame providing 'open', 'high', 'low', 'close' and 'volume' columns.
        It is left untouched; the features are built on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns followed by 15 ratio columns:
        open/high/low/close relative to the (previous) close, volume relative
        to the previous volume, and close and volume relative to their
        5/10/20/60/120-row rolling means. Rows containing any NaN are dropped,
        which removes at least the first 119 rows because the 120-row rolling
        mean is undefined there.
    """
    # Work on a copy: assigning columns directly on the argument would silently
    # widen the caller's frame and make re-running a cell non-idempotent.
    out = sample.copy()

    close = out['close']
    volume = out['volume']
    prev_close = close.shift(1)

    out['open_lclose_ratio'] = out['open'] / prev_close
    out['high_close_ratio'] = out['high'] / close
    out['low_close_ratio'] = out['low'] / close
    out['close_lclose_ratio'] = close / prev_close
    out['volume_lvolume_ratio'] = volume / volume.shift(1)

    # Ratio of the current value to its trailing moving average. The loop order
    # (all close windows, then all volume windows) keeps the column order stable.
    for name, series in (('close', close), ('volume', volume)):
        for window in (5, 10, 20, 60, 120):
            out[f'{name}_ma{window}_ratio'] = series / series.rolling(window=window).mean()

    return out.dropna()

In [4]:
# import datasets
samples = dict()
coins = ["BTC", "ETH", "EOS", "XLM", "QTUM"]

# Credentials are read from the environment so they never end up in the
# notebook or its version history. Any key pair that was ever committed in
# plain text should be treated as compromised and revoked on the exchange.
connect_key = os.environ.get("BITHUMB_CONNECT_KEY")
secret_key = os.environ.get("BITHUMB_SECRET_KEY")
if not connect_key or not secret_key:
    raise RuntimeError(
        "Set the BITHUMB_CONNECT_KEY and BITHUMB_SECRET_KEY environment "
        "variables before running this notebook."
    )
bithumb = pybithumb.Bithumb(connect_key, secret_key)

In [5]:
# Preprocess samples
# For every coin: download hourly KRW candles, derive the ratio features and
# cut the resulting frame into N_BLOCKS consecutive, (near-)equal-sized blocks.
# Result: samples[coin] is a list of N_BLOCKS DataFrames in chronological order
# of the rows returned by the API.
N_BLOCKS = 4
samples = dict()

for coin in coins:
    candles = bithumb.get_candlestick(coin, "KRW", chart_intervals="1h")
    # NOTE(review): pybithumb appears to return None instead of raising when a
    # request fails; fail here with a readable message rather than with a
    # TypeError from len(None) below.
    if candles is None:
        raise RuntimeError(f"failed to fetch 1h candlesticks for {coin}")
    print(len(candles))

    prepared = preprocess(candles)
    n_rows = len(prepared.index)

    # Integer arithmetic gives the same cut points as int(k * 0.25 * n_rows)
    # without going through floats; the last block always ends at n_rows.
    samples[coin] = [
        prepared.iloc[k * n_rows // N_BLOCKS:(k + 1) * n_rows // N_BLOCKS]
        for k in range(N_BLOCKS)
    ]

4344
4344
4344
4344
4344


In [6]:
# Labelling / modelling parameters for the per-block experiment below.
COIN = "ETH"
LOOKBACK = 10            # rows between the two closes compared by the label
LONG_THRESHOLD = 1.005   # close ratio above which a row is labelled "long" (0)
RAW_COLUMNS = ['open', 'high', 'low', 'close', 'volume']

for i in range(4):
    # get features
    sample = samples[COIN][i].copy()

    # make target
    # NOTE(review): the label compares the close of a row with the close
    # LOOKBACK rows *earlier*, i.e. it is a trailing return. The features are
    # the previous row's ratios (shift(1)), several of which are trailing
    # moving-average ratios over overlapping windows, so much of the label is
    # already encoded in the inputs. If the goal is to predict a future move,
    # the label should be built with shift(-LOOKBACK) instead -- confirm the
    # intent before trusting the scores printed below.
    rtn = sample['close'] / sample['close'].shift(LOOKBACK)
    # The first LOOKBACK rows have no reference close (rtn is NaN); the
    # comparison is False there, so they are labelled 1 ("hold") and counted
    # as such, exactly like every other row that does not clear the threshold.
    is_long = rtn > LONG_THRESHOLD
    sample['target'] = np.where(is_long, 0.0, 1.0)
    skyrocket = int(is_long.sum())
    stalemate = int((~is_long).sum())
    print(f"[block {i+1}] long: {skyrocket}, hold: {stalemate}")

    # Features of row t are the ratios observed at row t-1; the first row has
    # no predecessor and is removed by dropna().
    features = sample.shift(1).drop(RAW_COLUMNS + ['target'], axis=1).dropna()
    target = sample['target'].iloc[1:]
    # Guard the implicit assumption that dropna() removed only the first row.
    assert features.index.equals(target.index), "features and target are misaligned"

    # Chronological split (shuffle=False): the last 20% of the block is the test set.
    X_train, X_test, y_train, y_test = train_test_split(features, target, train_size=0.8, test_size=0.2, random_state=42, shuffle=False)

    tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42)
    tpot.fit(X_train, y_train)
    print(f"[block {i+1}] tpot test score: {tpot.score(X_test, y_test)}")
    tpot.export(f"sample{i+1}_tpot_bestfit.py")

    # random_state seeds TPOT's pipeline search so this run is repeatable like
    # the one above.
    clf = TPOTClassifier(config_dict="TPOT NN", template="Selector-Transformer-PytorchLRClassifier", generations=5, population_size=10, verbosity=2, random_state=42)
    clf.fit(X_train, y_train)
    print(f"block [{i+1}] clf test score: {clf.score(X_test, y_test)}")
    clf.export(f"sample{i+1}_neuralnet_bestfit.py")
    

[block 1] long: 494, hold: 562


Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8282051282051281

Generation 2 - Current best internal CV score: 0.8282051282051281

Generation 3 - Current best internal CV score: 0.8282051282051281

Generation 4 - Current best internal CV score: 0.8305790363482671

Generation 5 - Current best internal CV score: 0.830607213299521

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.1, max_depth=2, min_child_weight=11, n_estimators=100, n_jobs=1, subsample=0.4, verbosity=0)
[block 1] tpot test score: 0.8862559241706162


Optimization Progress:   0%|          | 0/60 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8306001690617075

Generation 2 - Current best internal CV score: 0.832952944491406

Generation 3 - Current best internal CV score: 0.832952944491406

Generation 4 - Current best internal CV score: 0.832952944491406

Generation 5 - Current best internal CV score: 0.8341222879684418

Best pipeline: PytorchLRClassifier(RobustScaler(SelectFwe(input_matrix, alpha=0.011)), batch_size=16, learning_rate=0.1, num_epochs=5, weight_decay=0)
block [1] clf test score: 0.8436018957345972
[block 2] long: 535, hold: 521


Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8116089039165961

Generation 2 - Current best internal CV score: 0.8116089039165961

Generation 3 - Current best internal CV score: 0.8128134685826993

Generation 4 - Current best internal CV score: 0.8128134685826993

Generation 5 - Current best internal CV score: 0.8151873767258382

Best pipeline: ExtraTreesClassifier(BernoulliNB(RFE(input_matrix, criterion=gini, max_features=0.1, n_estimators=100, step=0.4), alpha=10.0, fit_prior=False), bootstrap=False, criterion=entropy, max_features=0.9000000000000001, min_samples_leaf=12, min_samples_split=14, n_estimators=100)
[block 2] tpot test score: 0.8530805687203792


Optimization Progress:   0%|          | 0/60 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8139335023950409

Generation 2 - Current best internal CV score: 0.8139335023950409

Generation 3 - Current best internal CV score: 0.817483798253029

Generation 4 - Current best internal CV score: 0.817483798253029

Generation 5 - Current best internal CV score: 0.817483798253029

Best pipeline: PytorchLRClassifier(FastICA(SelectFwe(input_matrix, alpha=0.031), tol=0.1), batch_size=8, learning_rate=0.1, num_epochs=15, weight_decay=0)
block [2] clf test score: 0.8672985781990521
[block 3] long: 465, hold: 591


Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7961327134404057

Generation 2 - Current best internal CV score: 0.8032403493941956

Generation 3 - Current best internal CV score: 0.8032403493941956

Generation 4 - Current best internal CV score: 0.8103691180614258

Generation 5 - Current best internal CV score: 0.8115736827275288

Best pipeline: XGBClassifier(MultinomialNB(BernoulliNB(input_matrix, alpha=10.0, fit_prior=False), alpha=10.0, fit_prior=False), learning_rate=0.5, max_depth=9, min_child_weight=13, n_estimators=100, n_jobs=1, subsample=0.45, verbosity=0)
[block 3] tpot test score: 0.7725118483412322


Optimization Progress:   0%|          | 0/60 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8056142575373345

Generation 2 - Current best internal CV score: 0.8056142575373345

Generation 3 - Current best internal CV score: 0.8103832065370528

Generation 4 - Current best internal CV score: 0.8139194139194139

Generation 5 - Current best internal CV score: 0.8139194139194139

Best pipeline: PytorchLRClassifier(RobustScaler(SelectPercentile(input_matrix, percentile=22)), batch_size=32, learning_rate=0.01, num_epochs=15, weight_decay=0.0001)
block [3] clf test score: 0.8720379146919431
[block 4] long: 599, hold: 458


Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8294096928712313

Generation 2 - Current best internal CV score: 0.8294237813468583

Generation 3 - Current best internal CV score: 0.8294237813468583

Generation 4 - Current best internal CV score: 0.8294237813468583

Generation 5 - Current best internal CV score: 0.8294237813468583

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=100, n_jobs=1, subsample=0.05, verbosity=0)
[block 4] tpot test score: 0.8443396226415094


Optimization Progress:   0%|          | 0/60 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8021484925331078

Generation 2 - Current best internal CV score: 0.8021484925331078

Generation 3 - Current best internal CV score: 0.8056706114398422

Generation 4 - Current best internal CV score: 0.8140321217244294

Generation 5 - Current best internal CV score: 0.8187376725838265

Best pipeline: PytorchLRClassifier(RobustScaler(VarianceThreshold(input_matrix, threshold=0.0001)), batch_size=8, learning_rate=0.01, num_epochs=10, weight_decay=0.0001)
block [4] clf test score: 0.8490566037735849
