In [1]:
import datetime
import itertools
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy.stats import norm
from scipy.stats import rankdata
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

sys.path.insert(0, '/mnt/afml/ml_finance/mlfinlab')
from mlfinlab.data_structures import imbalance_data_structures as imbar, standard_data_structures as bar
import mlfinlab as ml

sys.path.insert(0, '/mnt/afml/ml_finance/finance_ml')
from finance_ml import sampling, features

In [2]:
def load_parq(fname):
    """Load a parquet file and collapse it to one row per timestamp.

    The table is indexed by its ``TIMESTAMP`` column and sorted by it. When
    several rows share a timestamp, the first one (in file order) is kept and
    its ``V`` and ``DV`` values are replaced by the sum over all rows with
    that timestamp.

    Parameters
    ----------
    fname : str
        Path of the parquet file. It must contain the columns ``TIMESTAMP``,
        ``V`` and ``DV``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by unique, ascending ``TIMESTAMP`` values.
    """
    df = pq.read_table(fname).to_pandas().set_index('TIMESTAMP')

    # A stable sort keeps rows with equal timestamps in file order, so that
    # `keep='first'` below really selects the first row of each duplicate run.
    df = df.sort_index(kind='mergesort')

    # Only V and DV are aggregated; summing every column is wasted work and
    # is not meaningful for non-numeric columns.
    totals = df.groupby(level=0)[['V', 'DV']].sum()

    # Drop the duplicated timestamps. `.copy()` makes the result an
    # independent frame so the column assignments below are unambiguous.
    df = df.loc[~df.index.duplicated(keep='first')].copy()
    df['V'] = totals['V']
    df['DV'] = totals['DV']
    return df

In [None]:
# Load the 2018 parquet file for A233740 via load_parq, which returns one row
# per timestamp with V / DV summed over duplicated timestamps.
fname = 'dataset/TRADE_A233740_2018.parq'
df = load_parq(fname)

## Get dollar bar

In [None]:
# fname     : CSV export of the loaded trades, used as input file for the bar builder.
# bar_fname : cache of the computed dollar bars.
fname = 'dataset/TRADE_A233740_2018.csv'
bar_fname = 'dataset/DBAR_A233740_2018.csv'

# Export the trades once as (date_time, price, volume) with a float price column.
if not os.path.exists(fname):
    df_csv = (
        df.reset_index()[['TIMESTAMP', 'PRICE', 'V']]
        .rename(columns={'TIMESTAMP': 'date_time', 'PRICE': 'price', 'V': 'volume'})
        .astype({'price': 'float'})
    )
    df_csv.to_csv(fname, index=False)

if os.path.exists(bar_fname):
    # Reuse the cached bars.
    dbar = pd.read_csv(bar_fname, index_col='date_time')
    dbar.index = pd.to_datetime(dbar.index)
else:
    dbar = bar.get_dollar_bars(fname, threshold=1e8)
    # NOTE(review): depending on the mlfinlab version, the bars may come back
    # with `date_time` as an ordinary column instead of the index - confirm.
    # Converting a positional index with pd.to_datetime would silently produce
    # epoch timestamps, so promote the column to the index when it is present.
    if 'date_time' in dbar.columns:
        dbar = dbar.set_index('date_time')
    dbar.index = pd.to_datetime(dbar.index)
    # Name the index so the cache written here can be re-read with
    # index_col='date_time' by the branch above.
    dbar.index.name = 'date_time'
    dbar.to_csv(bar_fname)

In [None]:
# Shape of the loaded trades, then of the dollar bars, one per line.
print(df.shape, dbar.shape, sep='\n')

## Apply triple barrier

In [None]:
# Daily volatility estimate of the dollar-bar close (lookback of 50);
# it is reused later as the `target` when building the events.
daily_vol = ml.util.get_daily_vol(close=dbar['close'], lookback=50)

# Symmetric CUSUM filter: yields the event timestamps.
# The filter is given a smoothed volatility (10000-row rolling mean of daily_vol)
# rather than the raw series.
# NOTE(review): with pandas' default min_periods the first 9999 rows of this
# rolling mean are NaN - confirm how cusum_filter treats a NaN threshold.
daily_vol_mean = daily_vol.rolling(10000).mean()
cusum_events = ml.filters.cusum_filter(dbar['close'], daily_vol_mean=daily_vol_mean)

# Vertical (time) barrier for every event, configured with num_days=1.
vertical_barriers = ml.labeling.add_vertical_barrier(t_events=cusum_events, close=dbar['close'], num_days=1)

## Primary - Build Label

In [None]:
# Build the triple-barrier events from the CUSUM timestamps.
# pt_sl   : presumably the [profit-taking, stop-loss] multipliers applied to
#           `target` - confirm against the mlfinlab documentation.
# min_ret : presumably the minimum target value for an event to be kept - confirm.
pt_sl = [1, 2]
min_ret = 0.01
triple_barrier_events = ml.labeling.get_events(close=dbar['close'],
                                               t_events=cusum_events,
                                               pt_sl=pt_sl,
                                               target=daily_vol,
                                               min_ret=min_ret,
                                               num_threads=3,
                                               vertical_barrier_times=vertical_barriers)

In [None]:
# Label the triple-barrier events from the close series; the `bin` column of
# the result is what the later cells use as the classification target.
labels_p = ml.labeling.get_bins(triple_barrier_events, dbar['close'])

In [None]:
# Class balance of the primary labels.
labels_p['bin'].value_counts()

## Features

In [None]:
# Build the features on a copy so the dollar bars themselves stay untouched.
raw_data = dbar.copy()

# One-bar log return of the close.
raw_data['log_ret'] = np.log(raw_data['close']).diff()

# Momentum: percentage change of the close over 1, 2 and 5 bars.
for n_bars in (1, 2, 5):
    raw_data['mom{}'.format(n_bars)] = raw_data['close'].pct_change(periods=n_bars)

# Volatility: rolling standard deviation of the log returns (full windows only).
for span in (50, 15):
    raw_data['volatility_{}'.format(span)] = (
        raw_data['log_ret']
        .rolling(window=span, min_periods=span, center=False)
        .std()
    )

# Serial correlation of the log returns inside a rolling window
# (takes about 4 minutes).
window_autocorr = 50
for lag in (1, 3):
    raw_data['autocorr_{}'.format(lag)] = (
        raw_data['log_ret']
        .rolling(window=window_autocorr, min_periods=window_autocorr, center=False)
        .apply(lambda window: window.autocorr(lag=lag), raw=False)
    )

# Lagged log returns (t-1, t-2, t-5).
for shift_by in (1, 2, 5):
    raw_data['log_t{}'.format(shift_by)] = raw_data['log_ret'].shift(shift_by)

# Remove look-ahead bias: every column is pushed back by one bar, so a row
# only carries values from the preceding bar.
raw_data = raw_data.shift(1)

In [None]:
# Get features at event dates
X = raw_data

# Drop unwanted columns
try:
    X.drop(['open', 'high', 'low', 'close', 'volume'], axis=1, inplace=True)
except Exception as e:
    print(e)

X.dropna(inplace=True)
y = labels_p['bin']

In [None]:
# Keep only the timestamps that have both a label and a complete feature row.
# Index.join defaults to how='left', which would simply return y's index and
# keep labels that have no feature row; an intersection is what is needed.
# (y is a column of labels_p, so labels_p.index adds nothing to the join.)
com_idx = y.index.intersection(X.index)
X = X.loc[com_idx]
y = y.loc[com_idx]

## Primary - Fit a model

In [None]:
# Probability of backtest overfitting (PBO) on quarterly splits of 2018.
#
# The year is cut into four quarters, identified by their first month
# (1/4/7/10). Each pair of quarters is used once as the training set and the
# remaining two quarters form the test set:
#   train  1/4 , 1/7 , 1/10, 4/7 , 4/10, 7/10
#   test   7/10, 4/10, 4/7 , 1/10, 1/7 , 1/4
quarter_starts = [1, 4, 7, 10]
train_idx_list = [[1,4], [1,7], [1,10], [4,7], [4,10], [7,10]]
N = 30              # number of forests compared on every split
n_estimator = 100
depth = 2


def select_quarters(data, start_months, year=2018):
    """Return the rows of `data` that fall in the given quarters of `year`.

    `data` must be indexed by a DatetimeIndex. A quarter is the three calendar
    months beginning at its start month. Whole months are selected; slicing up
    to day 28 would lose the last days of every quarter.
    """
    stamps = data.index
    mask = np.zeros(len(data), dtype=bool)
    for first_month in start_months:
        mask |= ((stamps.year == year)
                 & (stamps.month >= first_month)
                 & (stamps.month <= first_month + 2))
    return data.loc[mask]


lc_list = []
for train_idx in tqdm(train_idx_list):
    test_idx = [q for q in quarter_starts if q not in train_idx]
    X_train, y_train = select_quarters(X, train_idx), select_quarters(y, train_idx)
    X_test, y_test = select_quarters(X, test_idx), select_quarters(y, test_idx)

    R_train = []
    R_test = []
    # The N candidates differ only by their seed; fixing the seed per candidate
    # makes the cell reproducible and keeps candidate i the same on every split.
    for seed in range(N):
        rf = RandomForestClassifier(max_depth=depth, n_estimators=n_estimator,
                                    criterion='entropy', random_state=seed)
        rf.fit(X_train, y_train.values.ravel())
        R_train.append(rf.score(X_train, y_train))
        R_test.append(rf.score(X_test, y_test))

    # Out-of-sample rank (1..N) of the candidate that was best in-sample.
    best_idx = np.argmax(R_train)
    test_rank = rankdata(R_test, method='ordinal')
    # Relative rank r/(N+1) lies strictly inside (0, 1), so the logit is always
    # finite and no +/-1e9 sentinel can distort the mean and std below.
    wc = test_rank[best_idx] / (N + 1)
    lc_list.append(np.log(wc / (1 - wc)))

# PBO = P(logit <= 0), estimated from a normal fitted to the logits.
m = np.mean(lc_list)
s = np.std(lc_list)
if s > 0:
    z = (0 - m) / s
    pbo = norm.cdf(z)
else:
    # Every split produced the same logit: the distribution is a point mass.
    z = np.nan
    pbo = float(m <= 0)
# str.format does not treat '%' specially, so a single '%' prints one sign.
print("PBO:{:.2f}%".format(100*pbo))

In [None]:
# Probability of backtest overfitting (PBO) on monthly splits of 2018:
# every choice of 6 training months out of 12, tested on the other 6.
N = 30              # number of forests compared on every split
n_estimator = 100
depth = 2
all_months = list(range(1, 13))
month_splits = list(itertools.combinations(all_months, 6))


def select_months(data, months, year=2018):
    """Return the rows of `data` (DatetimeIndex) that fall in `months` of `year`.

    Whole calendar months are selected with a boolean mask, which keeps the
    rows in index order and avoids growing a frame with repeated appends.
    """
    stamps = data.index
    return data.loc[(stamps.year == year) & np.isin(stamps.month, list(months))]


# Start from an empty list so the estimate only reflects the splits of this
# cell, whatever was executed before it.
lc_list = []
for train_idx in tqdm(month_splits):
    test_idx = [month for month in all_months if month not in train_idx]
    X_train, y_train = select_months(X, train_idx), select_months(y, train_idx)
    X_test, y_test = select_months(X, test_idx), select_months(y, test_idx)

    R_train = []
    R_test = []
    # The N candidates differ only by their seed; fixing the seed per candidate
    # makes the cell reproducible and keeps candidate i the same on every split.
    for seed in range(N):
        rf = RandomForestClassifier(max_depth=depth, n_estimators=n_estimator,
                                    criterion='entropy', random_state=seed)
        rf.fit(X_train, y_train.values.ravel())
        R_train.append(rf.score(X_train, y_train))
        R_test.append(rf.score(X_test, y_test))

    # Out-of-sample rank (1..N) of the candidate that was best in-sample.
    best_idx = np.argmax(R_train)
    test_rank = rankdata(R_test, method='ordinal')
    # Relative rank r/(N+1) lies strictly inside (0, 1), so the logit is always
    # finite and no +/-1e9 sentinel can distort the mean and std below.
    wc = test_rank[best_idx] / (N + 1)
    lc_list.append(np.log(wc / (1 - wc)))

# PBO = P(logit <= 0), estimated from a normal fitted to the logits.
m = np.mean(lc_list)
s = np.std(lc_list)
if s > 0:
    z = (0 - m) / s
    pbo = norm.cdf(z)
else:
    # Every split produced the same logit: the distribution is a point mass.
    z = np.nan
    pbo = float(m <= 0)
# str.format does not treat '%' specially, so a single '%' prints one sign.
print("PBO:{:.2f}%".format(100*pbo))