# Ch06 Ensemble Methods

In [54]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

from adv_finance import bars, labeling, utils, features


In [48]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.utils import shuffle


In [8]:
# Load IPython's autoreload extension and set mode 2, which re-imports all
# modules before each cell executes, so edits to imported source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Plot styling: the styles are applied in list order, so 'bmh' takes
# precedence wherever the two style sheets set the same option.
plt.style.use(['seaborn-talk', 'bmh'])

# Let pandas show up to 100 rows before truncating a displayed frame.
pd.options.display.max_rows = 100


In [16]:
# Read the trade bars and discard rows whose index value repeats an earlier
# row (Index.duplicated() flags every occurrence after the first).
data = (
    pd.read_parquet('/nfs/data/interim_2018/TRADE_A233740_DB.parq')
    .loc[lambda bars_df: ~bars_df.index.duplicated()]
)


In [19]:
# Show the first rows to check the loaded columns and the timestamp index.
data.head()

Unnamed: 0_level_0,open,high,low,close,vol
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-02 10:00:03.348,19800,19980,19800,19980,215968
2018-01-02 10:00:15.776,19980,20015,19980,20005,51025
2018-01-02 10:00:39.990,20005,20065,20005,20045,49957
2018-01-02 10:01:11.518,20045,20075,20040,20050,50140
2018-01-02 10:01:30.166,20050,20080,20045,20080,54775


## Primary Model 


In [None]:
# Compute sides
# Placeholder: add a `side` column to `data`, filled entirely with NaN.
# NOTE(review): the rule that would fill it is commented out below, so the
# column stays all-NaN. `raw_data` is later copied from `data` and only the
# open/high/low/close/vol columns are dropped from the features, so if this
# cell has been run the later `X.dropna()` would discard every row.
# Confirm whether this cell should be executed at all.
data['side'] = np.nan 


# Disabled moving-average crossover rule: +1 where fast_mavg >= slow_mavg,
# -1 otherwise. It needs `fast_mavg` / `slow_mavg` columns, which are not
# created in any cell visible in this notebook.
# long_signals = data['fast_mavg'] >= data['slow_mavg']
# short_signals = data['fast_mavg'] < data['slow_mavg']
# data.loc[long_signals, 'side'] = 1
# data.loc[short_signals, 'side'] = -1


In [33]:
%%time 

# CUSUM Filter 
daily_vol = utils.get_daily_vol(data['close'])
threshold = daily_vol.mean() * 0.2
t_events = labeling.cusum_filter(data['close'], threshold)
v_barriers = labeling.add_vertical_barrier(t_events=t_events, close=data['close'], num_days=1)


# Side Decision by Cusum Filter & Triple Barrier 
pt_sl = [1, 1]
min_ret = 0.01 
t_side_events = labeling.get_events(close=data['close'],
                                            t_events=t_events,
                                            pt_sl=pt_sl,
                                            target=daily_vol,
                                            min_ret=min_ret,
                                            num_threads=8,
                                            vertical_barrier_times=v_barriers,
                                            side_prediction=None)


side_labels = labeling.get_bins(t_side_events, data['close'])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  target = target.loc[t_events]
2019-06-04 04:54:10.680549 100.0 apply_pt_sl_on_t1 done after 0.07 minutes. Remaining 0.0 minutes.


CPU times: user 3.45 s, sys: 69.4 ms, total: 3.52 s
Wall time: 7.93 s


## Meta Model 

In [80]:
%%time
pt_sl = [1, 2]
min_ret = 0.02
t_barrier_events = labeling.get_events(close=data['close'],
                                            t_events=t_events,
                                            pt_sl=pt_sl,
                                            target=daily_vol,
                                            min_ret=min_ret,
                                            num_threads=8,
                                            vertical_barrier_times=v_barriers,
                                           side_prediction=side_labels['bin'])


labels = labeling.get_bins(t_barrier_events, data['close'])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  target = target.loc[t_events]
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  side_ = side_prediction.loc[target.index]


CPU times: user 143 ms, sys: 84.2 ms, total: 227 ms
Wall time: 1.22 s


2019-06-04 05:21:21.881849 100.0 apply_pt_sl_on_t1 done after 0.02 minutes. Remaining 0.0 minutes.


In [81]:
# Number of labeled events per side value.
labels['side'].value_counts()

-1.0    1011
 0.0    1004
 1.0     939
Name: side, dtype: int64

## Results of Primary Model

In [49]:
# Baseline: the primary model is treated as always acting (pred == 1) and is
# scored against the realised labels in labels['bin'].
primary_forecast = pd.DataFrame({'actual': labels['bin'], 'pred': 1})

# Performance Metrics
actual, pred = primary_forecast['actual'], primary_forecast['pred']
print(classification_report(y_true=actual, y_pred=pred))

print('Confusion Matrix')
print(confusion_matrix(actual, pred))

print('')
print('Accuracy')
print(accuracy_score(actual, pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1004
           1       0.66      1.00      0.80      1950

   micro avg       0.66      0.66      0.66      2954
   macro avg       0.33      0.50      0.40      2954
weighted avg       0.44      0.66      0.52      2954

Confusion Matrix
[[   0 1004]
 [   0 1950]]

Accuracy
0.6601218686526743


  'precision', 'predicted', average, warn_for)


## Fit a Meta Model 

In [92]:
# Work on a copy so the feature columns added in the next cell are not
# written onto `data`.
raw_data = data.copy()

In [94]:
%%time

# Log Returns
raw_data['log_ret'] = np.log(raw_data['close']).diff()

# Momentum
raw_data['mom1'] = raw_data['close'].pct_change(periods=1)
raw_data['mom2'] = raw_data['close'].pct_change(periods=2)
raw_data['mom3'] = raw_data['close'].pct_change(periods=3)
raw_data['mom4'] = raw_data['close'].pct_change(periods=4)
raw_data['mom5'] = raw_data['close'].pct_change(periods=5)

# Volatility
raw_data['volatility_50'] = raw_data['log_ret'].rolling(window=50, min_periods=50, center=False).std()
raw_data['volatility_31'] = raw_data['log_ret'].rolling(window=31, min_periods=31, center=False).std()
raw_data['volatility_15'] = raw_data['log_ret'].rolling(window=15, min_periods=15, center=False).std()


# Serial Correlation (Takes about 4 minutes)
window_autocorr = 15
raw_data['autocorr_1'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=1), raw=False)
raw_data['autocorr_2'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=2), raw=False)
raw_data['autocorr_3'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=3), raw=False)
raw_data['autocorr_4'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=4), raw=False)
raw_data['autocorr_5'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=5), raw=False)

# Get the various log -t returns
raw_data['log_t1'] = raw_data['log_ret'].shift(1)
raw_data['log_t2'] = raw_data['log_ret'].shift(2)
raw_data['log_t3'] = raw_data['log_ret'].shift(3)
raw_data['log_t4'] = raw_data['log_ret'].shift(4)
raw_data['log_t5'] = raw_data['log_ret'].shift(5)

# Fractianl Differentiation
raw_data['fracdf'] = features.frac_diff_ffd(raw_data['close'], 0.3, thres=1e-3)


# Remove look ahead bias
# raw_data = raw_data.shift(1)

CPU times: user 1min 52s, sys: 219 ms, total: 1min 52s
Wall time: 1min 52s


In [138]:
# Get features at event dates 
X = raw_data 

# Drop unwanted columns 
try: 
    X.drop(['open', 'high', 'low', 'close', 'vol'], axis=1, inplace=True)

except Exception as e: 
    print(e)

    
y = labels.loc[X.index, 'bin']

"['open' 'high' 'low' 'close' 'vol'] not found in axis"


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


# Ensemble Model

In [None]:
# Keep only rows that have a label and a complete feature vector.
y = y.dropna()
X = X.dropna()

# Common index of y, X and labels_p. Index.join defaults to how='left', which
# returns the left index unchanged, so the join must be 'inner' to obtain the
# rows present in all three objects.
# NOTE(review): `labels_p` is not defined in any cell visible in this
# notebook -- it must be created before this cell can run; confirm its source.
com_idx = (
    y.index
    .join(X.index, how='inner')
    .join(labels_p.index, how='inner')
)

# Restrict all three objects to the shared rows so they stay aligned.
X = X.loc[com_idx]
y = y.loc[com_idx]
labels_p = labels_p.loc[com_idx]



# APPENDIX 

## Snippet 6.3 RF를 설정하는 세 가지 방법

In [None]:
# Three ways of setting up a random forest (Snippet 6.3).
# avg_u is used as `max_samples` for the bagging wrappers below; 0.5 is a
# placeholder value (presumably the average uniqueness of the samples --
# replace with the value computed from the labels).
# NOTE(review): `base_estimator=` and `max_features='auto'` match the
# scikit-learn version this notebook was written against; newer releases
# rename the former to `estimator=` and no longer accept 'auto'.
avg_u = 0.5

# 1) Plain random forest with per-bootstrap class re-weighting.
#    Fixed typo: 'balaned_subsample' is not a valid class_weight and would
#    raise when the model is fitted.
clf0 = RandomForestClassifier(n_estimators=1000, class_weight='balanced_subsample',
                              criterion='entropy')

# 2) Bagging over single decision trees, each bag drawing a fraction avg_u
#    of the samples.
clf1 = DecisionTreeClassifier(criterion='entropy', max_features='auto',
                              class_weight='balanced')
clf1 = BaggingClassifier(base_estimator=clf1, n_estimators=1000, max_samples=avg_u)

# 3) Bagging over one-tree random forests with their own bootstrap disabled,
#    so sample drawing is controlled only by the bagging wrapper.
clf2 = RandomForestClassifier(n_estimators=1, criterion='entropy', bootstrap=False,
                              class_weight='balanced_subsample')
clf2 = BaggingClassifier(base_estimator=clf2, n_estimators=1000, max_samples=avg_u,
                         max_features=1.)

