# Cross Validation

## Prepare Data

In [1]:
# import libraries

import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import statsmodels
from statsmodels.tsa.stattools import acf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import coint
from scipy.stats import jarque_bera
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

#import functions from scripts folder

sys.path.append('../../scripts')
from fracdiff import *
from labelling import *
from samp_weights import *
from fin_data_management import * 
from sequential_CV import *
from fetch_yf_data import fetch_data
from AFML_book_scripts import *
from AFML_my_scripts import *

In [2]:
# Load the raw S&P futures tick data.
data = pd.read_csv("../../data/SP_futures_tick_data.csv")
# Fuse the separate date and time strings into a single timestamp column so the
# ticks can be aggregated into dollar bars; strings that fail to parse become NaT.
datetime_str = data['date'] + ' ' + data['time']
data = data.assign(datetime=pd.to_datetime(datetime_str, errors='coerce'))
# The two string columns are redundant once the timestamp exists.
data = data.drop(columns=['date', 'time'])
# Aggregate the ticks into dollar bars of the requested size.
dollars_bars_size = 1_000_000
df = DollarBarsDfVectorized(data, dollar_per_bar=dollars_bars_size)
# First duplicate check, on the index returned by the bar builder.
print(df.index[df.index.duplicated()])
# Index the bars by their end timestamp: later steps need time-indexed series.
df = (
    df.drop(columns='start_date')
      .rename(columns={'end_date': 'datetime'})
      .set_index('datetime')
)
# Keep only the first bar for any repeated timestamp, then check again.
df = df.loc[~df.index.duplicated(keep='first')]
print(df.index[df.index.duplicated()])

Index([], dtype='int64')
DatetimeIndex([], dtype='datetime64[ns]', name='datetime', freq=None)


In [3]:
# Discard the bar columns that are not used as features.
df = df.drop(columns=['open', 'high', 'low'])
# Derived features: rolling mean / standard deviation of the close over
# `window` bars, plus simple bar-to-bar returns.
window = 5
df = df.assign(
    rolling_mean=df['close'].rolling(window).mean(),
    rolling_std=df['close'].rolling(window).std(),
    returns=df['close'].pct_change(),
)


In [4]:
# Display the bar frame with the derived feature columns
# (the first window-1 rolling values and the first return are NaN).
df

Unnamed: 0_level_0,close,volume,dollar_volume,rolling_mean,rolling_std,returns
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2003-07-01 03:16:58.000,971.9,1031,1003316.0,,,
2003-07-01 06:32:13.000,971.3,1030,1001129.9,,,-0.000617
2003-07-01 07:44:30.000,968.1,1032,1001141.2,,,-0.003295
2003-07-01 16:25:20.000,982.2,1024,994588.6,,,0.014565
2003-07-02 01:51:56.000,983.2,1026,1008342.4,975.34,6.881352,0.001018
...,...,...,...,...,...,...
2019-10-08 17:29:47.032,2893.5,341,998716.1,2933.06,53.892374,-0.026839
2019-10-14 03:01:58.578,2966.8,344,1005621.3,2953.08,39.838072,0.025333
2019-10-29 20:10:53.206,3034.3,332,994846.1,2972.88,51.644574,0.022752
2019-11-21 04:07:51.275,3108.8,324,999676.2,2995.34,80.719781,0.024553


In [5]:
# Inputs for the triple-barrier event construction.
# NOTE(review): ptSL is presumably the (profit-taking, stop-loss) multiplier
# pair applied to `target` -- confirm against getEventsMeta.
ptSL = (1, 1)
numDays = 5
close = df['close']
# Minimum-return threshold: the mean bar-to-bar return of the close series.
min_ret = close.pct_change().mean()
target = GetTargetforTBM(close, ema_periods=window)
# Candidate event timestamps: bars at positions 100..4999.
tEvents = close.index[100:5000]

# Vertical barrier: for each event, locate the first bar at or after
# (event time + numDays days); positions that fall past the last bar are
# dropped, and the surviving bar timestamps are keyed by their event time.
t1 = close.index.searchsorted(tEvents + pd.Timedelta(days=numDays))
t1 = t1[t1 < len(close)]
t1 = pd.Series(close.index[t1], index=tEvents[:len(t1)])


In [6]:
# Build the events table for the candidate timestamps from the close series,
# the barrier settings, the per-bar target and the vertical barriers.
events = getEventsMeta(df['close'], tEvents, ptSL, target, min_ret, t1)
events

Unnamed: 0_level_0,t1,trgt,side,hit_first
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2003-07-30 23:11:40,2003-07-31 07:34:51,0.003966,1.0,tp
2003-07-31 05:46:31,2003-07-31 07:34:51,0.003299,1.0,tp
2003-07-31 07:20:38,2003-07-31 07:34:51,0.002720,1.0,tp
2003-07-31 07:34:51,2003-08-01 07:31:01,0.004399,1.0,sl
2003-07-31 07:42:38,2003-08-01 07:31:01,0.003635,1.0,sl
...,...,...,...,...
2007-03-14 05:08:11,2007-03-14 15:56:12,0.002336,1.0,tp
2007-03-14 05:42:47,2007-03-14 15:56:12,0.001920,1.0,tp
2007-03-14 06:16:28,2007-03-14 07:09:46,0.001854,1.0,sl
2007-03-14 06:40:04,2007-03-14 07:09:46,0.001514,1.0,sl


## Purging Observations in the Training Set

In [7]:
# From here on `t1` is the per-event end time taken from the events table
# (it replaces the vertical-barrier series built earlier under the same name).
t1 = events['t1']
# Treat the first 300 events as the test set.
test_events = t1.iloc[:300]

# Training events that remain once the test events are purged.
trn = getTrainTimes(t1, testTimes=test_events)

In [37]:
# Inspect the training events returned by getTrainTimes for the test events above.
trn

datetime
2003-11-03 07:51:03   2003-11-04 02:47:39
2003-11-03 22:10:04   2003-11-04 02:47:39
2003-11-04 02:47:39   2003-11-04 19:33:03
2003-11-04 05:12:19   2003-11-04 19:33:03
2003-11-04 07:31:13   2003-11-04 19:33:03
                              ...        
2007-03-14 05:08:11   2007-03-14 15:56:12
2007-03-14 05:42:47   2007-03-14 15:56:12
2007-03-14 06:16:28   2007-03-14 07:09:46
2007-03-14 06:40:04   2007-03-14 07:09:46
2007-03-14 07:09:46   2007-03-14 07:37:05
Name: t1, Length: 4593, dtype: datetime64[ns]

## Embargo on Training Set

In [8]:
# Embargo lookup over the event timestamps, using a 1% embargo.
mbrg = getEmbargoTimes(t1.index, pctEmbargo=0.01)
# The test window runs from the first event to the event at position 300.
dt0, dt1 = events.index[0], events.index[300]
# One-row Series: test start (dt0) -> embargoed test end (mbrg[dt1]).
testTimes = pd.Series(mbrg[dt1], index=[dt0])
# Training events that remain after purging against the embargoed window.
trn = getTrainTimes(events['t1'], testTimes)

## CV with Overlapping Observations


In [12]:
# Label every event from the events table and the close price series.
labels = getTBMLabels(events, df['close'])

In [10]:
# Restrict the feature frame to the labelled event timestamps.
# NOTE(review): this narrows `df` itself, so every later cell that reads
# `df.close` sees only these rows -- confirm that is intended.
df = df.loc[labels.index]
X, y = df, labels['bin']

# Sanity check: events, features and labels should have matching row counts.
print(f'The shapes of events, feature matrix and labels df are {events.shape}, {df.shape} and {labels.shape}')

The shapes of events, feature matrix and labels df are (4900, 4), (4900, 6) and (4900, 4)


In [11]:
# Three purged folds with a 2% embargo, driven by the event end times.
cv = PurgedKFold(n_splits=3, t1=events['t1'], pctEmbargo=0.02)
# Walk the folds; after the loop the *_train / *_test names hold the last fold.
for train_idx, test_idx in cv.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

  maxT1 = self.t1[test_indices].max()


## Using PurgedKFold Class

In [13]:
# Recompute the event labels from the events table and the current close series.
labels = getTBMLabels(events, df['close'])

In [None]:
# Indicator matrix built from the bar index and the per-label end times.
indicator_matrix = getIndMatrix(barIx=df.close.index, t1=labels.t1)
# Average uniqueness derived from the indicator matrix.
avg_uniq = getAvgUniqueness(indM=indicator_matrix)
# Sequential bootstrap: `phi` holds the positions of the drawn samples.
# Seed the global NumPy RNG first so the draw (and everything derived from it)
# is reproducible on Restart & Run All.
# NOTE(review): assumes seqBootstrap draws from NumPy's global RNG -- confirm.
np.random.seed(42)
phi = seqBootstrap(indicator_matrix, sLength=None)
# Number of concurrent events per bar, used later for the sample weights.
number_of_coevents = mpNumCoEvents(df.close.index, events.t1, events.index)


In [None]:
# Re-align the feature frame with the freshly computed labels.
df = df.loc[labels.index]
X, y = df, labels['bin']

In [None]:
# Bootstrapped design matrix and labels: rows are picked by the positions in `phi`.
X_boot = X.iloc[phi]
y_boot = y.iloc[phi]
# Sample weights are computed once per event, in the original event order ...
weights_all = sampleW(events.t1, number_of_coevents, df.close, events.index)
# ... so they must be resampled exactly like X/y; otherwise row i of X_boot
# would be paired with the weight of an unrelated event.
# NOTE(review): assumes sampleW returns a pandas Series indexed by the event
# timestamps (events.index) -- confirm against samp_weights.
weights_boot = weights_all.loc[X_boot.index]
assert len(weights_boot) == len(X_boot), "sample weights are not aligned with the bootstrapped rows"

print(f'shape of X boot{X_boot.shape} and shape of y_boot {y_boot.shape}')

In [None]:
# RandomForestClassifier is already imported in the first cell.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Run purged cross-validation on the bootstrapped sample.
# The event end times must carry the same index as X_boot / y_boot, so they
# are resampled by label here instead of passing the un-resampled series.
# NOTE(review): the bootstrapped rows are not in chronological order and may
# repeat; if the purging logic relies on a sorted index, sort the sample
# before this step -- confirm against cvScore / PurgedKFold.
scores = cvScore(
    clf,
    X_boot,
    y_boot,
    sample_weight=weights_boot,
    scoring='neg_log_loss',  # or 'accuracy'
    t1=events.t1.loc[X_boot.index],
    cv=3,                    # number of folds
    pctEmbargo=0.01           # 1% embargo
)

print("CV scores:", scores)
print("Mean score:", scores.mean())
