In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os
import sys
import datetime
import matplotlib.pyplot as plt
import pyfolio as pf

sys.path.insert(0, '/mnt/afml/ml_finance/mlfinlab')
from mlfinlab.data_structures import imbalance_data_structures as imbar, standard_data_structures as bar
import mlfinlab as ml

sys.path.insert(0, '/mnt/afml/ml_finance/finance_ml')
from finance_ml import sampling, features

  ' to position notionals.'


In [4]:
def load_parq(fname):
    """Load a trade parquet file and collapse rows that share a timestamp.

    For each duplicated ``TIMESTAMP`` only the first row (in file order) is
    kept; its ``V`` and ``DV`` values are replaced by the sums over all rows
    carrying that timestamp.

    Parameters
    ----------
    fname : str
        Path of a parquet file with at least the columns ``TIMESTAMP``,
        ``V`` and ``DV``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by unique, ascending ``TIMESTAMP``.
    """
    df = pq.read_table(fname).to_pandas().set_index('TIMESTAMP')
    # Stable sort: rows with equal timestamps keep their file order, so
    # "first" below is deterministic (the default quicksort is not stable).
    df = df.sort_index(kind='mergesort')
    # Sum only the two columns that are overwritten below instead of
    # aggregating every column of the frame.
    totals = df.groupby(level=0)[['V', 'DV']].sum()
    # Remove duplicated index entries; copy so the assignments below write
    # to an independent frame rather than a slice of the original.
    df = df.loc[~df.index.duplicated(keep='first')].copy()
    # Assignment aligns on the (now unique) timestamp index.
    df['V'] = totals['V']
    df['DV'] = totals['DV']
    return df

In [5]:
# Raw trade records in parquet form (presumably 2018 trades for ticker
# A233740, judging by the file name -- TODO confirm).
fname = 'dataset/TRADE_A233740_2018.parq'
# De-duplicated, timestamp-indexed trades.
df = load_parq(fname)

In [6]:
# NOTE: `fname` is rebound here from the parquet path to the CSV path.
fname = 'dataset/TRADE_A233740_2018.csv'
bar_fname = 'dataset/DBAR_A233740_2018.csv'
# Export the trades to CSV once; skipped when the file already exists.
if not os.path.exists(fname):
    df_csv = df.reset_index()[['TIMESTAMP', 'PRICE', 'V']]
    # Column names passed on to get_dollar_bars via the CSV file.
    df_csv.columns = ['date_time', 'price', 'volume']
    df_csv['price'] = df_csv['price'].astype('float')
    df_csv.to_csv(fname, index=False)
    
# Dollar bars are cached on disk: reuse them when present, otherwise build
# them from the trade CSV and save the result.
if os.path.exists(bar_fname):
    dbar = pd.read_csv(bar_fname, index_col='date_time')
    dbar.index = pd.to_datetime(dbar.index)
else:
    # NOTE(review): threshold=1e8 is presumably the dollar value per bar --
    # confirm against the mlfinlab version in use.
    dbar = bar.get_dollar_bars(fname, threshold=1e8)
    dbar.index = pd.to_datetime(dbar.index)
    dbar.to_csv(bar_fname)

In [7]:
# Compute daily volatility
daily_vol = ml.util.get_daily_vol(close=dbar['close'], lookback=50)

# Apply Symmetric CUSUM Filter and get timestamps for events
# Note: Only the CUSUM filter needs a point estimate for volatility
# NOTE(review): an integer rolling window of 10000 yields NaN until 10000
# observations are available -- confirm cusum_filter tolerates NaN thresholds.
daily_vol_mean = daily_vol.rolling(10000).mean()
cusum_events = ml.filters.cusum_filter(dbar['close'], daily_vol_mean=daily_vol_mean)

# Compute vertical barrier
# (num_days=1: presumably a one-day time limit per event -- TODO confirm)
vertical_barriers = ml.labeling.add_vertical_barrier(t_events=cusum_events, close=dbar['close'], num_days=1)

Timestamp('2018-01-03 09:00:21.481000')


In [8]:
# Barrier configuration passed to get_events.
# NOTE(review): pt_sl is presumably [profit-taking, stop-loss] multipliers of
# `target`, and min_ret a minimum target return -- confirm against mlfinlab.
pt_sl = [1, 1]
min_ret = 0.005
# Build the event table from the CUSUM event timestamps, using daily_vol as
# the per-event target and the vertical barriers computed earlier.
triple_barrier_events = ml.labeling.get_events(close=dbar['close'],
                                               t_events=cusum_events,
                                               pt_sl=pt_sl,
                                               target=daily_vol,
                                               min_ret=min_ret,
                                               num_threads=3,
                                               vertical_barrier_times=vertical_barriers)



2019-06-07 16:12:17.435514 100.0% apply_pt_sl_on_t1 done after 0.1 minutes. Remaining 0.0 minutes.


In [9]:
# Label each event from the event table and close prices; the 'bin' column of
# the result is used as the classification target in later cells.
labels_p = ml.labeling.get_bins(triple_barrier_events, dbar['close'])

In [10]:
# Work on a copy so the dollar-bar frame itself is never modified
raw_data = dbar.copy()

# Log returns of the close price
raw_data['log_ret'] = np.log(raw_data['close']).diff()

# Momentum: simple close-to-close returns over several horizons
for _col, _periods in (('mom1', 1), ('mom2', 2), ('mom5', 5)):
    raw_data[_col] = raw_data['close'].pct_change(periods=_periods)

# Volatility: rolling standard deviation of log returns (full windows only)
for _col, _win in (('volatility_50', 50), ('volatility_15', 15)):
    raw_data[_col] = raw_data['log_ret'].rolling(
        window=_win, min_periods=_win, center=False
    ).std()

# Serial correlation over a rolling window (slow: a Python callback per window)
window_autocorr = 50

for _col, _lag in (('autocorr_1', 1), ('autocorr_3', 3)):
    raw_data[_col] = raw_data['log_ret'].rolling(
        window=window_autocorr, min_periods=window_autocorr, center=False
    ).apply(lambda x: x.autocorr(lag=_lag), raw=False)

# Lagged log returns
for _col, _lag in (('log_t1', 1), ('log_t2', 2), ('log_t5', 5)):
    raw_data[_col] = raw_data['log_ret'].shift(_lag)

# Shift every column down one bar so a row only holds information that was
# available before that bar (removes look-ahead bias)
raw_data = raw_data.shift(1)

In [24]:
# Get features at event dates
X = raw_data

# Drop unwanted columns
try:
    X.drop(['open', 'high', 'low', 'close', 'volume'], axis=1, inplace=True)
except Exception as e:
    print(e)

y = labels_p.loc[X.index,'bin']

"['open' 'high' 'low' 'close' 'volume'] not found in axis"


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


## Ensemble Model

In [12]:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, classification_report, confusion_matrix, accuracy_score
from sklearn.utils import resample
from sklearn.utils import shuffle

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [13]:
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

from sklearn import datasets
from sklearn.metrics import roc_curve, auc
from itertools import cycle

In [26]:
# Drop rows with missing labels / missing features, then keep only the
# timestamps present in all three objects.
y = y.dropna()
X = X.dropna()
# Index.join defaults to how='left', which would simply return y.index and
# re-introduce rows that X.dropna() removed; an inner join is required here.
com_idx = y.index.join(X.index, how='inner').join(labels_p.index, how='inner')
X = X.loc[com_idx]
y = y.loc[com_idx]
labels_p = labels_p.loc[com_idx]

In [None]:
# Sample weights for the labelled events.
# NOTE(review): `get_num_co_events`, `get_sample_tw` and `t1` are not defined
# or imported in any cell shown here, so this cell raises NameError on a fresh
# kernel. Presumably the functions live in finance_ml.sampling and `t1` is
# triple_barrier_events['t1'] -- confirm before running.
n_co_events = get_num_co_events(dbar['close'].index, t1, 2)
sample_weight = get_sample_tw(t1, n_co_events, triple_barrier_events.index)
# Column assignment aligns sample_weight to labels_p by index.
labels_p['w'] = sample_weight

In [51]:
# Attach each event's 't1' value from the event table (aligned on index); it
# is passed as `t1` to clf_hyper_fit in the tuning cells.
labels_p['t1'] = triple_barrier_events['t1']

## Hyperparameter Tuning

In [83]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import Pipeline

class MyPipeline(Pipeline):
    """Pipeline whose ``fit`` accepts a plain ``sample_weight`` argument.

    The weight is forwarded to the final step as the fit parameter
    ``'<last step name>__sample_weight'``.
    """

    def fit(self, X, y, sample_weight=None, **fit_params):
        """Fit the pipeline, routing ``sample_weight`` to the last step if given."""
        if sample_weight is None:
            return super(MyPipeline, self).fit(X, y, **fit_params)
        last_step_name = self.steps[-1][0]
        routed_params = dict(fit_params)
        routed_params[last_step_name + '__sample_weight'] = sample_weight
        return super(MyPipeline, self).fit(X, y, **routed_params)


def clf_hyper_fit(feat, label, t1, pipe_clf, search_params, scoring=None,
                  n_splits=3, bagging=[0, None, 1.],
                  rnd_search_iter=0, n_jobs=-1, pct_embargo=0., **fit_params):
    """Tune ``pipe_clf`` with purged K-fold CV and optionally bag the winner.

    Parameters
    ----------
    feat : pandas.DataFrame
        Feature matrix.
    label : pandas.Series
        Target labels, aligned with ``feat``.
    t1 : pandas.Series
        Passed to ``PurgedKFold`` as ``t1``.
    pipe_clf : sklearn.pipeline.Pipeline
        Estimator to tune.
    search_params : dict
        Parameter grid (grid search) or distributions (randomized search).
    scoring : str, optional
        Scorer name. Defaults to ``'f1'`` when the labels are exactly
        ``{0, 1}`` and to ``'neg_log_loss'`` otherwise.
    n_splits : int
        Number of CV folds.
    bagging : sequence
        ``[n_estimators, max_samples, max_features]``. Bagging is applied only
        when ``n_estimators > 0``; a ``None`` fraction falls back to ``1.0``.
    rnd_search_iter : int
        ``0`` selects an exhaustive grid search; a positive value selects a
        randomized search with that many sampled candidates.
    n_jobs : int
        Parallelism passed to the search and the bagging ensemble.
    pct_embargo : float
        Passed to ``PurgedKFold`` as ``pct_embargo``.
    **fit_params
        Extra parameters forwarded to ``search.fit``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The best pipeline, or a one-step pipeline wrapping the fitted bagging
        ensemble when bagging is requested.
    """
    # Set default value for scoring
    if scoring is None:
        if set(label.values) == {0, 1}:
            scoring = 'f1'
        else:
            scoring = 'neg_log_loss'
    # Hyper-parameter search on the training data
    # NOTE(review): PurgedKFold is not imported in the cells shown here; it
    # must be brought into scope before this function is called.
    inner_cv = PurgedKFold(n_splits=n_splits, t1=t1, pct_embargo=pct_embargo)
    # NOTE(review): `iid` was removed from scikit-learn's search classes in
    # later releases; it is kept to match the version this notebook targets.
    if rnd_search_iter == 0:
        search = GridSearchCV(estimator=pipe_clf, param_grid=search_params,
                              scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False)
    else:
        # n_iter was previously omitted, so the requested number of
        # iterations was ignored and the library default was used instead.
        search = RandomizedSearchCV(estimator=pipe_clf, param_distributions=search_params,
                                    scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False,
                                    n_iter=rnd_search_iter)
    best_pipe = search.fit(feat, label, **fit_params).best_estimator_
    # Fit the validated model on the entirety of the data
    if bagging[0] > 0:
        # None means "use everything", matching BaggingClassifier's defaults.
        max_samples = 1. if bagging[1] is None else float(bagging[1])
        max_features = 1. if bagging[2] is None else float(bagging[2])
        bag_est = BaggingClassifier(base_estimator=MyPipeline(best_pipe.steps),
                                    n_estimators=int(bagging[0]), max_samples=max_samples,
                                    max_features=max_features, n_jobs=n_jobs)
        # Sample weights are optional: .get() yields None when none were given.
        weight_key = best_pipe.steps[-1][0] + '__sample_weight'
        # Fit the ensemble itself; the previous code refit best_pipe and
        # discarded the BaggingClassifier.
        bag_est.fit(feat, label, sample_weight=fit_params.get(weight_key))
        best_pipe = Pipeline([('bag', bag_est)])
    return best_pipe

In [84]:
from scipy.stats import rv_continuous


class LogUniformGen(rv_continuous):
    """Continuous distribution on ``[a, b]`` whose logarithm is uniform."""

    def _cdf(self, x):
        """Return ``log(x / a) / log(b / a)`` for the instance bounds ``a``, ``b``."""
        lower, upper = self.a, self.b
        log_span = np.log(upper / lower)
        return np.log(x / lower) / log_span
    
def log_uniform(a=1, b=np.exp(1)):
    """Build a ``LogUniformGen`` distribution with bounds ``a`` and ``b``.

    The returned object can be used wherever a scipy distribution is expected,
    e.g. as a value in a randomized-search parameter dictionary.
    """
    distribution = LogUniformGen(a=a, b=b, name='log_uniform')
    return distribution

In [85]:
from finance_ml.datasets import get_cls_data

In [None]:
%%time

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Exhaustive grid search over C and gamma for an RBF-kernel SVC.
# `name` is the pipeline step name and therefore the prefix of each
# '<step>__<param>' key in the grid.
name = 'svc'
params_grid = {name + '__C': [1e-3, 1e-2, 1e-1, 1, 10], name + '__gamma': [1e-2, 1e-1, 1, 10, 100]}
kernel = 'rbf'
# probability=True enables predict_proba, which 'neg_log_loss' scoring needs.
clf = SVC(kernel=kernel, probability=True)
pipe_clf = Pipeline([(name, clf)])
# No extra fit parameters (e.g. sample weights) are passed.
fit_params = dict()

# NOTE: `clf` is rebound from the bare SVC to the tuned pipeline returned here.
# rnd_search_iter=0 selects the grid-search branch; bagging[0]=0 disables bagging.
clf = clf_hyper_fit(X, labels_p['bin'], t1=labels_p['t1'], pipe_clf=pipe_clf, scoring='neg_log_loss',
                    search_params=params_grid, n_splits=3, bagging=[0, None, 1.],
                    rnd_search_iter=0, n_jobs=-1, pct_embargo=0., **fit_params)

In [None]:
# Show the pipeline returned by the grid search
clf

In [None]:
# In-sample score: X and labels_p['bin'] are the same data the search was
# fitted on, so this is not an out-of-sample estimate.
clf.score(X, labels_p['bin'])

In [None]:
%%time

# Randomized search: C and gamma are sampled from log-uniform distributions
# instead of being taken from a fixed grid.
name = 'svc'
params_dist = {name + '__C': log_uniform(a=1e-3, b=1e1),
               name + '__gamma': log_uniform(a=1e-2, b=1e2)}
kernel = 'rbf'
# probability=True enables predict_proba, which 'neg_log_loss' scoring needs.
clf = SVC(kernel=kernel, probability=True)
pipe_clf = Pipeline([(name, clf)])
# No extra fit parameters (e.g. sample weights) are passed.
fit_params = dict()

# Pass params_dist here: the previous code passed params_grid from the
# grid-search cell, so the distributions defined above were never used.
# A non-zero rnd_search_iter selects the randomized-search branch.
clf = clf_hyper_fit(X, labels_p['bin'], t1=labels_p['t1'], pipe_clf=pipe_clf, scoring='neg_log_loss',
                    search_params=params_dist, n_splits=3, bagging=[0, None, 1.],
                    rnd_search_iter=25, n_jobs=-1, pct_embargo=0., **fit_params)

In [None]:
# Show the pipeline returned by the randomized search
clf

In [None]:
# In-sample score: X and labels_p['bin'] are the same data the search was
# fitted on, so this is not an out-of-sample estimate.
clf.score(X, labels_p['bin'])