# Ch07

In [71]:
import pandas as pd 
import numpy as np 
import time


import matplotlib.pyplot as plt

from adv_finance import stats, labeling


In [195]:
# Silence every warning for the rest of the session. This also hides
# deprecation notices from pandas and other libraries.
import warnings
warnings.filterwarnings('ignore')

In [14]:
%load_ext autoreload 
%autoreload 2


plt.style.use('seaborn-talk')
plt.style.use('bmh')
pd.set_option('display.max_rows', 100)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Load the trade bars and keep only the first row for any repeated index value.
# NOTE(review): the path is an absolute location on one machine; consider
# moving it to a configurable data directory.
df = pd.read_parquet('/nfs/data/interim_2018/TRADE_A233740_DB.parq')
df = df.loc[~df.index.duplicated(keep='first')]
# data is a second name for the same de-duplicated frame (no copy is made).
data = df


In [9]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

(66183, 5)

In [58]:
%%time 

# Volatility estimate computed from the close series (presumably daily, going
# by the helper's name; confirm in adv_finance.stats).
daily_vol = stats.get_daily_vol(data['close'])
# Event timestamps from the CUSUM filter; the mean volatility is passed as the
# second argument (presumably the filter threshold; confirm).
t_events = labeling.cusum_filter(data['close'], daily_vol.mean()) 
# Vertical barrier for each event, requested with num_days=1.
v_barriers = labeling.add_vertical_barrier(t_events=t_events, close=data['close'], num_days=1)

CPU times: user 2.99 s, sys: 0 ns, total: 2.99 s
Wall time: 2.99 s


In [65]:
%%time

# Barrier configuration handed to get_events below. pt_sl is presumably the
# profit-taking / stop-loss pair and min_ret the minimum target return;
# confirm against adv_finance.labeling.
pt_sl = [1,1]
min_ret = 0.01

# Build the barrier events for the sampled timestamps, using daily_vol as the
# target and the vertical barriers computed earlier. side_prediction=None
# means no side series is supplied.
t_barrier_events = labeling.get_events(close=data['close'], 
                                      t_events=t_events, 
                                      pt_sl=pt_sl, 
                                      num_threads=16, 
                                      target=daily_vol, 
                                      min_ret=min_ret, 
                                      vertical_barrier_times=v_barriers, 
                                      side_prediction=None)



Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  target = target.loc[t_events]


CPU times: user 76.4 ms, sys: 63.1 ms, total: 139 ms
Wall time: 711 ms


2019-06-09 09:30:31.639098 100.0 apply_pt_sl_on_t1 done after 0.01 minutes. Remaining 0.0 minutes.


In [63]:
# Derive labels for the barrier events from the close series; the 'bin'
# column of the result is the target used by the models below.
labels = labeling.get_bins(t_barrier_events, data['close'])

In [86]:
# Preview the first rows of the loaded bars.
data.head()

Unnamed: 0_level_0,open,high,low,close,vol
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-02 10:00:03.348,19800,19980,19800,19980,215968
2018-01-02 10:00:15.776,19980,20015,19980,20005,51025
2018-01-02 10:00:39.990,20005,20065,20005,20045,49957
2018-01-02 10:01:11.518,20045,20075,20040,20050,50140
2018-01-02 10:01:30.166,20050,20080,20045,20080,54775


In [84]:
# t1 is another name for the vertical-barrier series; it is passed further
# down as the label end times for PurgedKFold.
t1 = v_barriers
# Feature matrix: the rows of the raw bar data at each barrier-event timestamp.
features = data.loc[t_barrier_events.index]

In [None]:
# Drop feature rows that have no label, so every remaining row of features
# has a matching entry in labels['bin'] for the cross-validation cells below.
# (The original referenced an undefined name `y`, which raised NameError.)
features = features.drop(features.index.difference(labels.index))



## RandomForest


In [192]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier

In [197]:
%%time

# Baseline: run plain 10-fold CV (unshuffled folds) ten times, each time with
# a newly constructed random forest, scoring by negative log-loss.
scores = []
for _ in range(10):   
    clf = RandomForestClassifier()
    kfold = KFold(n_splits=10, shuffle=False)
    scores.append(cross_val_score(clf, features, labels['bin'], cv=kfold, scoring='neg_log_loss'))
# Mean and variance taken over every fold of every repetition.
print(np.mean(scores), np.var(scores))

-6.620468987240725 1.9641701628337827
CPU times: user 2.53 s, sys: 0 ns, total: 2.53 s
Wall time: 2.53 s


## PurgedKFold 

In [68]:
from sklearn.model_selection._split import _BaseKFold


In [188]:
def get_train_times(t1, test_times):
    """Remove training observations whose label span overlaps a test span.

    Parameters
    ----------
    t1 : pd.Series
        Training observations. The index is the time an observation starts
        and the value is the time its label ends.
    test_times : pd.Series
        Test spans in the same layout (index = start, value = end).

    Returns
    -------
    pd.Series
        A copy of ``t1`` without the overlapping observations. The input
        series is left untouched.
    """
    trn = t1.copy(deep=True)
    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (index, value) pairs and exists in older releases as well.
    for start, end in test_times.items():
        # Training observation starts inside the test span
        starts_inside = trn[(start <= trn.index) & (trn.index <= end)].index
        # Training observation ends inside the test span
        ends_inside = trn[(start <= trn) & (trn <= end)].index
        # Training observation envelops the whole test span
        envelops = trn[(trn.index <= start) & (end <= trn)].index
        trn = trn.drop(starts_inside.union(ends_inside.union(envelops)))

    return trn
    

class PurgedKFold(_BaseKFold):
    """K-fold splitter for observations whose labels span a time interval.

    Folds are contiguous blocks of rows (no shuffling). For each test fold
    the training set is built from:

    * rows whose label ends at or before the first start time of the fold;
    * rows positioned after the fold's latest label end, skipping a further
      ``int(len(X) * pct_embargo)`` rows (the embargo).

    When ``purging`` is true, ``get_train_times`` is also applied to remove
    any remaining training rows whose span overlaps a test span.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    t1 : pd.Series
        Index is the start time of each observation, value is the time its
        label ends. It must share its index with the ``X`` passed to
        ``split``.
    pct_embargo : float
        Embargo width as a fraction of the number of rows in ``X``.
    purging : bool
        Whether to run the extra overlap purge described above.

    Raises
    ------
    ValueError
        If ``t1`` is not a ``pd.Series``.
    """

    def __init__(self, n_splits=3, t1=None, pct_embargo=0., purging=True):
        if not isinstance(t1, pd.Series):
            raise ValueError('Label through dates must be a pd.Series')

        super(PurgedKFold, self).__init__(n_splits=n_splits, shuffle=False, random_state=None)

        self.t1 = t1
        self.pct_embargo = pct_embargo
        self.purging = purging

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` positional arrays per fold.

        ``y`` and ``groups`` are accepted for API compatibility and ignored.
        Raises ``ValueError`` when the index of ``X`` differs from ``t1``'s.
        """
        # Check the lengths first so a size mismatch reports the same clear
        # message as a label mismatch.
        if len(X.index) != len(self.t1.index) or not (X.index == self.t1.index).all():
            raise ValueError('X and t1 must have the same index')

        indices = np.arange(X.shape[0])

        # Embargo width, in rows
        embg_size = int(X.shape[0] * self.pct_embargo)
        test_ranges = [(i[0], i[-1] + 1) for i in np.array_split(indices, self.n_splits)]
        for st, end in test_ranges:
            # Test data
            test_indices = indices[st:end]

            # Training data prior to the test data
            t0 = self.t1.index[st]
            train_indices = self.t1.index.searchsorted(self.t1[self.t1 <= t0].index)

            # Add training data after the test data. test_indices are
            # positions, so select them with .iloc rather than [].
            max_t1_idx = self.t1.index.searchsorted(self.t1.iloc[test_indices].max())
            if max_t1_idx < X.shape[0]:
                train_indices = np.concatenate((train_indices, indices[max_t1_idx + embg_size:]))

            # Purging: use the series this splitter was built with, not a
            # notebook-level global that happens to be called t1.
            if self.purging:
                train_t1 = self.t1.iloc[train_indices]
                test_t1 = self.t1.iloc[test_indices]
                train_t1 = get_train_times(train_t1, test_t1)
                train_indices = self.t1.index.searchsorted(train_t1.index)

            yield train_indices, test_indices
            


In [189]:
from sklearn.metrics import log_loss, accuracy_score 
from adv_finance.sampling import get_sample_tw, get_num_co_events


def cv_score(clf, X, y, sample_weight=None, scoring='neg_log_loss', t1=None, 
             n_splits=3, cv_gen=None, pct_embargo=0., purging=False): 
    """Cross-validate ``clf`` fold by fold and return one score per fold.

    Parameters
    ----------
    clf : estimator
        Fitted in place on every fold via ``clf.fit``.
    X, y : pd.DataFrame, pd.Series
        Features and target; both are sliced by position with ``.iloc``.
    sample_weight : pd.Series, optional
        Per-row weights, also sliced by position. When given they are passed
        both to ``fit`` and to the scoring function.
    scoring : {'neg_log_loss', 'accuracy'}
        Metric to compute.
    t1, n_splits, pct_embargo, purging
        Used only to build a ``PurgedKFold`` when ``cv_gen`` is None.
    cv_gen : splitter, optional
        Object with a ``split(X=...)`` method yielding (train, test) positions.

    Returns
    -------
    np.ndarray
        The fold scores, in fold order.

    Raises
    ------
    Exception
        If ``scoring`` is not one of the two supported names.
    """
    if scoring not in ('neg_log_loss', 'accuracy'):
        raise Exception('Wrong scoring method')

    splitter = cv_gen
    if splitter is None:
        splitter = PurgedKFold(n_splits=n_splits, t1=t1,
                               pct_embargo=pct_embargo, purging=purging)

    weighted = sample_weight is not None
    fold_scores = []
    for train_pos, test_pos in splitter.split(X=X):
        # Sample weights are optional; forward them only when supplied
        fit_kwargs = {}
        metric_kwargs = {}
        if weighted:
            fit_kwargs['sample_weight'] = sample_weight.iloc[train_pos].values
            metric_kwargs['sample_weight'] = sample_weight.iloc[test_pos].values

        fitted = clf.fit(X=X.iloc[train_pos, :], y=y.iloc[train_pos], **fit_kwargs)

        X_test = X.iloc[test_pos, :]
        if scoring == 'neg_log_loss':
            proba = fitted.predict_proba(X_test)
            fold_score = -log_loss(y.iloc[test_pos], proba, labels=clf.classes_, **metric_kwargs)
        else:
            predicted = fitted.predict(X_test)
            fold_score = accuracy_score(y.iloc[test_pos], predicted, **metric_kwargs)
        fold_scores.append(fold_score)

    return np.array(fold_scores)


In [191]:
%%time 

from sklearn.ensemble import RandomForestClassifier 

clf = RandomForestClassifier()
# Align the label end times with the feature rows. Some feature timestamps
# have no entry in t1, and .loc raises KeyError for missing labels on current
# pandas. reindex() keeps the earlier behaviour and leaves a missing value
# for those rows.
t1_ = t1.reindex(features.index)

# Repeat the embargoed cross-validation 100 times, keeping each run's mean
# fold score.
scores = []
for _ in range(100):
    scores_ = cv_score(clf, features, labels['bin'], pct_embargo=0.01, t1=t1_, purging=False)
    scores.append(np.mean(scores_))

print(np.mean(scores), np.var(scores))

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """


-7.563392057237702 1.7638123497155294
CPU times: user 5.97 s, sys: 0 ns, total: 5.97 s
Wall time: 5.97 s


## With Sample Weights

In [210]:
%%time
# Co-event counts for the barrier-event timestamps, computed by the
# adv_finance.sampling helper from those timestamps and the end times in t1.
n_co_events = get_num_co_events(t_barrier_events.index, t1, num_threads=8)

CPU times: user 11.9 ms, sys: 36 ms, total: 48 ms
Wall time: 347 ms


2019-06-09 16:22:11.767111 100.0 mp_num_co_events done after 0.0 minutes. Remaining 0.0 minutes.


In [214]:
# Sample weights produced by get_sample_tw from the label end times and the
# co-event counts.
sample_weight = get_sample_tw(t1, n_co_events)

In [218]:
# Discard entries whose weight is NaN.
# NOTE(review): cv_score slices sample_weight by position, so confirm it still
# lines up row-for-row with features after this drop.
sample_weight = sample_weight.dropna()

In [219]:
%%time

# Repeated CV with sample weights and a 1% embargo (purging disabled);
# each run contributes its mean fold score.
scores = []
for _ in range(100):
    scores_ = cv_score(clf, features, labels['bin'], sample_weight=sample_weight,
                       pct_embargo=0.01, t1=t1_, purging=False)
    scores.append(np.mean(scores_))
print(np.mean(scores), np.var(scores))

-7.94468863147859 2.3983148799868714
CPU times: user 6.15 s, sys: 0 ns, total: 6.15 s
Wall time: 6.15 s


In [221]:
%%time

# Weighted CV again, this time with the embargo switched off
# (pct_embargo=0.) for comparison with the 1% embargo run.
scores = []
for _ in range(100):
    scores_ = cv_score(clf, features, labels['bin'], sample_weight=sample_weight,
                       pct_embargo=0., t1=t1_, purging=False)
    scores.append(np.mean(scores_))
print(np.mean(scores), np.var(scores))


-9.815097225517478 2.649720007024885
CPU times: user 6.25 s, sys: 0 ns, total: 6.25 s
Wall time: 6.25 s


# APPENDIX