In [1]:
import datetime
import itertools
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy.stats import norm, rankdata
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from tqdm import tqdm

# Local checkouts: each path must be on sys.path before its package is imported.
sys.path.insert(0, '/mnt/afml/ml_finance/mlfinlab')
from mlfinlab.data_structures import imbalance_data_structures as imbar, standard_data_structures as bar
import mlfinlab as ml

sys.path.insert(0, '/mnt/afml/ml_finance/finance_ml')
from finance_ml import sampling, features

In [2]:
def load_parq(fname):
    """Load a trade parquet file into a frame with a unique, sorted TIMESTAMP index.

    Rows sharing a timestamp are collapsed into one: every column keeps the
    value of the first such row (in file order), except ``V`` and ``DV``,
    which are replaced by their per-timestamp sums.

    Parameters
    ----------
    fname : str
        Path of a parquet file containing at least the columns
        ``TIMESTAMP``, ``V`` and ``DV``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct timestamp, indexed by ``TIMESTAMP`` in
        ascending order.
    """
    df = pq.read_table(fname).to_pandas().set_index('TIMESTAMP')

    # A stable sort keeps rows with equal timestamps in file order, so
    # "first" below really means the first record read for that timestamp.
    df = df.sort_index(kind='mergesort')

    # Aggregate only the two columns that are used; summing every column is
    # wasted work and fails on column types that do not support sum.
    totals = df.groupby(level=0)[['V', 'DV']].sum()

    # Explicit copy so the assignments below write to an independent frame
    # instead of a slice of the sorted one.
    df = df.loc[~df.index.duplicated(keep='first')].copy()
    df['V'] = totals['V']
    df['DV'] = totals['DV']
    return df

In [3]:
# Load the trade records (A233740, 2018 per the file name); load_parq returns
# them indexed by a de-duplicated TIMESTAMP.
fname = 'dataset/TRADE_A233740_2018.parq'
df = load_parq(fname)

## Get dollar bar

In [4]:
# On-disk caches: the tick export handed to the bar builder, and the bars it
# produces.
fname = 'dataset/TRADE_A233740_2018.csv'
bar_fname = 'dataset/DBAR_A233740_2018.csv'

# get_dollar_bars is given a CSV path, so export the ticks once using the
# date_time / price / volume column names.
if not os.path.exists(fname):
    df_csv = (
        df.reset_index()[['TIMESTAMP', 'PRICE', 'V']]
        .rename(columns={'TIMESTAMP': 'date_time', 'PRICE': 'price', 'V': 'volume'})
        .astype({'price': 'float'})
    )
    df_csv.to_csv(fname, index=False)

# Build the bars only when no cached copy exists; otherwise read the cache.
if not os.path.exists(bar_fname):
    dbar = bar.get_dollar_bars(fname, threshold=1e8)
    dbar.index = pd.to_datetime(dbar.index)
    dbar.to_csv(bar_fname)
else:
    dbar = pd.read_csv(bar_fname, index_col='date_time')
    dbar.index = pd.to_datetime(dbar.index)

In [5]:
# Row/column counts: raw ticks first, dollar bars second (one per line).
print(df.shape, dbar.shape, sep='\n')

(7647271, 5)
(518545, 5)


## Apply triple barrier

In [6]:
# Volatility estimate on the dollar-bar closes (lookback of 50).
daily_vol = ml.util.get_daily_vol(close=dbar['close'], lookback=50)

# The CUSUM filter is driven by a smoothed volatility level rather than the
# raw series. rolling(10000) uses the default min_periods (= window), so at
# least the first 9,999 values of daily_vol_mean are NaN.
# NOTE(review): confirm how cusum_filter treats those NaN thresholds.
daily_vol_mean = daily_vol.rolling(10000).mean()
cusum_events = ml.filters.cusum_filter(
    dbar['close'],
    daily_vol_mean=daily_vol_mean,
)

# Vertical (time) barrier for every sampled event, num_days=1.
vertical_barriers = ml.labeling.add_vertical_barrier(
    t_events=cusum_events,
    close=dbar['close'],
    num_days=1,
)

Timestamp('2018-01-03 09:00:21.481000')


## Primary - Build Label

In [7]:
# Barrier configuration.
# NOTE(review): pt_sl is presumably [profit-taking, stop-loss] expressed as
# multiples of `target` — confirm against ml.labeling.get_events.
pt_sl = [1, 2]
min_ret = 0.01

triple_barrier_events = ml.labeling.get_events(
    close=dbar['close'],
    t_events=cusum_events,
    pt_sl=pt_sl,
    target=daily_vol,
    min_ret=min_ret,
    num_threads=3,
    vertical_barrier_times=vertical_barriers,
)

2019-06-20 18:06:25.563012 100.0% apply_pt_sl_on_t1 done after 0.02 minutes. Remaining 0.0 minutes.


In [8]:
# Turn the barrier events into labels; the 'bin' column of the result is the
# class used as the target further down.
labels_p = ml.labeling.get_bins(triple_barrier_events, dbar['close'])

In [9]:
# Class balance of the labels. Bracket access avoids relying on attribute
# lookup for a column named like the builtin `bin`.
labels_p['bin'].value_counts()

-1    334
 1    283
 0    136
Name: bin, dtype: int64

## Features

In [10]:
# Work on a copy so the dollar bars themselves are left untouched.
raw_data = dbar.copy()
close = raw_data['close']

# Log returns between consecutive bars.
raw_data['log_ret'] = np.log(close).diff()

# Momentum: simple return over 1, 2 and 5 bars.
for col, periods in (('mom1', 1), ('mom2', 2), ('mom5', 5)):
    raw_data[col] = close.pct_change(periods=periods)

# Volatility: rolling standard deviation of the log returns (full windows only).
for col, span in (('volatility_50', 50), ('volatility_15', 15)):
    raw_data[col] = raw_data['log_ret'].rolling(window=span, min_periods=span, center=False).std()

# Serial correlation of the log returns over a rolling window.
# This is the slow step (about 4 minutes): a Python callback per window.
window_autocorr = 50
for col, lag in (('autocorr_1', 1), ('autocorr_3', 3)):
    rolling_ret = raw_data['log_ret'].rolling(
        window=window_autocorr, min_periods=window_autocorr, center=False
    )
    raw_data[col] = rolling_ret.apply(lambda x: x.autocorr(lag=lag), raw=False)

# Lagged log returns.
for col, lag in (('log_t1', 1), ('log_t2', 2), ('log_t5', 5)):
    raw_data[col] = raw_data['log_ret'].shift(lag)

# Remove look-ahead bias: every column is pushed one bar forward, so the row
# at time t only holds values that were available at the previous bar.
raw_data = raw_data.shift(1)

In [11]:
# Feature matrix: everything in raw_data except the bar columns, keeping only
# rows where every feature is defined.
#
# A new frame is built instead of dropping in place on an alias of raw_data:
# the cell can be re-run without failing on already-removed columns, so there
# is no error to catch and print. errors='ignore' keeps the drop best-effort
# if a bar column is absent.
X = (
    raw_data
    .drop(columns=['open', 'high', 'low', 'close', 'volume'], errors='ignore')
    .dropna()
)

# Target: the label column produced by get_bins.
y = labels_p['bin']

In [12]:
# Keep only timestamps present in the labels AND in the feature matrix.
# Index.join defaults to how='left', which would simply return y.index and
# make X.loc fail for any label whose feature row was removed by dropna();
# intersection gives the genuinely common index.
com_idx = y.index.intersection(X.index).intersection(labels_p.index)
X = X.loc[com_idx]
y = y.loc[com_idx]

In [13]:
# Shape of the aligned feature matrix: (rows, feature columns).
X.shape

(753, 11)

## Probability of Backtest Overfitting (PBO)
### Combinatorial Purged Cross Validation (CPCV)

In [25]:
from finance_ml.model_selection import CPKFold

In [None]:
# Plain (unpurged) 5-fold split, for reference next to the CPCV splitter.
# KFold.split yields positional indices, so rows are selected with .iloc;
# X[...] on a DataFrame would look the integers up as column labels.
kfold = KFold(5)
for train_idx, test_idx in kfold.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [44]:
# Combinatorial purged K-fold splitter.
# NOTE(review): (6,2) is presumably (number of groups, groups held out per
# split), i.e. C(6, 2) = 15 train/test combinations — confirm against CPKFold.
# t1 is the 't1' column of the barrier events; presumably used for purging —
# TODO confirm.
cpcv = CPKFold((6,2), t1=triple_barrier_events['t1'])

In [45]:
# Report the train/test size of every CPCV split, numbered from 1.
# enumerate replaces the hand-maintained counter.
for idx, (train_index, test_index) in enumerate(cpcv.split(X, y), start=1):
    print("S{} train: {}, test {}".format(idx, len(train_index), len(test_index)))

S1 train: 500, test 252
S2 train: 496, test 252
S3 train: 498, test 251
S4 train: 491, test 251
S5 train: 500, test 251
S6 train: 476, test 252
S7 train: 473, test 251
S8 train: 466, test 251
S9 train: 475, test 251
S10 train: 495, test 251
S11 train: 488, test 251
S12 train: 497, test 251
S13 train: 496, test 250
S14 train: 499, test 250
S15 train: 500, test 250


In [46]:
# Random forest scored on every CPCV split: shallow trees, many estimators.
depth = 2
n_estimator = 100
seed = 42  # fixed so the accuracies below are the same on every run

R_train = []
R_test = []
for idx, (train_index, test_index) in enumerate(cpcv.split(X, y), start=1):
    # The splitter yields positional indices, hence .iloc.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    rf = RandomForestClassifier(max_depth=depth, n_estimators=n_estimator,
                                criterion='entropy', random_state=seed)
    rf.fit(X_train, y_train.values.ravel())

    # Mean accuracy on the training rows and on the held-out rows.
    R_train.append(rf.score(X_train, y_train))
    R_test.append(rf.score(X_test, y_test))
    print("S{} train: {}, test: {}".format(idx, R_train[-1], R_test[-1]))

# Average over all splits.
print("train: {}, test: {}".format(np.mean(R_train), np.mean(R_test)))

S1 train: 0.55, test: 0.4166666666666667
S2 train: 0.4899193548387097, test: 0.39285714285714285
S3 train: 0.5020080321285141, test: 0.4342629482071713
S4 train: 0.5112016293279023, test: 0.4262948207171315
S5 train: 0.488, test: 0.4302788844621514
S6 train: 0.5042016806722689, test: 0.38095238095238093
S7 train: 0.5348837209302325, test: 0.4262948207171315
S8 train: 0.5343347639484979, test: 0.4701195219123506
S9 train: 0.5031578947368421, test: 0.398406374501992
S10 train: 0.494949494949495, test: 0.40239043824701193
S11 train: 0.5122950819672131, test: 0.38645418326693226
S12 train: 0.5090543259557344, test: 0.3665338645418327
S13 train: 0.5584677419354839, test: 0.408
S14 train: 0.49899799599198397, test: 0.432
S15 train: 0.552, test: 0.38


train: 0.5162314478255252, test: 0.4101008031366597


In [48]:
# Random forest scored on every CPCV split: slightly deeper trees, few estimators.
depth = 3
n_estimator = 10
seed = 42  # fixed so the accuracies below are the same on every run

R_train = []
R_test = []
for idx, (train_index, test_index) in enumerate(cpcv.split(X, y), start=1):
    # The splitter yields positional indices, hence .iloc.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    rf = RandomForestClassifier(max_depth=depth, n_estimators=n_estimator,
                                criterion='entropy', random_state=seed)
    rf.fit(X_train, y_train.values.ravel())

    # Mean accuracy on the training rows and on the held-out rows.
    R_train.append(rf.score(X_train, y_train))
    R_test.append(rf.score(X_test, y_test))
    print("S{} train: {}, test: {}".format(idx, R_train[-1], R_test[-1]))

# Average over all splits.
print("train: {}, test: {}".format(np.mean(R_train), np.mean(R_test)))

S1 train: 0.558, test: 0.4126984126984127
S2 train: 0.5241935483870968, test: 0.38492063492063494
S3 train: 0.5461847389558233, test: 0.398406374501992
S4 train: 0.5498981670061099, test: 0.42231075697211157
S5 train: 0.542, test: 0.44621513944223107
S6 train: 0.5819327731092437, test: 0.35714285714285715
S7 train: 0.5496828752642706, test: 0.4541832669322709
S8 train: 0.5901287553648069, test: 0.4342629482071713
S9 train: 0.5389473684210526, test: 0.41434262948207173
S10 train: 0.5313131313131313, test: 0.3904382470119522
S11 train: 0.569672131147541, test: 0.3784860557768924
S12 train: 0.5553319919517102, test: 0.3665338645418327
S13 train: 0.5604838709677419, test: 0.412
S14 train: 0.5410821643286573, test: 0.408
S15 train: 0.588, test: 0.384
train: 0.555123434414479, test: 0.40426274584202876
