In [5]:
# Standard library modules
import asyncio
import os
from pathlib import Path

# Third-party modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss

# FinancialMachineLearning modules
from FinancialMachineLearning.cross_validation.combinatorial import CombinatorialPurgedKFold
from FinancialMachineLearning.cross_validation.cross_validation import PurgedKFold, cross_val_score, get_train_times
from FinancialMachineLearning.sample_weights.bootstrapping import *

# Claude modules
from claude.feature_storage import FeatureStorage
from claude.train_model import build_model
from claude.validate import _split_data

In [None]:
# Point the feature store at the parquet file below the current working directory.
fs = FeatureStorage(
    str(Path.cwd() / "claude" / "Data" / "financial_features.parquet")
)
# Only the first element of what load_existing_features() returns is kept.
data = fs.load_existing_features()[0]

In [None]:
# Display the object taken from element 0 of fs.load_existing_features().
data

In [None]:
# Build the model object from the loaded features.
# NOTE(review): use_cache=False presumably forces a fresh build instead of reusing
# cached results — confirm in claude.train_model.
model = build_model(features=data, use_cache=False)

In [None]:
# Show the repr of the object returned by build_model.
model

In [None]:
# One cutoff shared by features and labels, so the two splits cannot drift apart
# when the date is edited.
SPLIT_DATE = "2023-01-01"

# NOTE(review): _split_data is a private helper of claude.validate; it presumably
# returns (rows before the cutoff, rows from the cutoff on) — confirm how the
# boundary date itself is handled.
X_train, X_test = _split_data(model.X_clean, SPLIT_DATE)
y_train, y_test = _split_data(model.y_size, SPLIT_DATE)

In [None]:
# Inspect X_train, the first object returned by _split_data for model.X_clean.
X_train

In [None]:
# Inspect X_test, the second object returned by _split_data for model.X_clean.
X_test

In [None]:
# Inspect y_train, the first object returned by _split_data for model.y_size.
y_train

In [None]:
# Inspect y_test, the second object returned by _split_data for model.y_size.
y_test

In [None]:
class SequentialRandomForestClassifier(RandomForestClassifier):
    """Random forest whose bootstrap draw is meant to come from ``seq_bootstrap``.

    The indicator matrix is built from the notebook-level ``model`` object
    (``model.triple_barrier_events``), so that global must exist before
    ``_generate_sample_indices`` is called.

    NOTE(review): in current scikit-learn releases ``_generate_sample_indices`` is a
    module-level helper in ``sklearn.ensemble._forest`` rather than a method looked
    up on the estimator — confirm that this override is actually reached during
    ``fit``; if it is not, this class behaves like a plain RandomForestClassifier.
    """

    def _generate_sample_indices(self, random_state, n_samples):
        """Generate bootstrap sample indices with the sequential bootstrap method.

        Parameters
        ----------
        random_state
            Accepted for signature compatibility only. It is not forwarded to
            ``seq_bootstrap``, so the draw is not controlled by this argument.
        n_samples
            Forwarded as the second argument of ``seq_bootstrap`` (presumably the
            number of indices to draw — confirm against the library).

        Returns
        -------
        The value returned by ``seq_bootstrap``, unchanged.
        """
        # NOTE(review): the matrix is rebuilt on every call and is derived from every
        # row of model.triple_barrier_events, not only the rows passed to fit —
        # verify that the drawn positions line up with the training rows.
        ind_mat = get_indicator_matrix(
            model.triple_barrier_events.index.to_series(),
            model.triple_barrier_events['t1'],
        )
        return seq_bootstrap(ind_mat, n_samples)

In [None]:
forest = SequentialRandomForestClassifier(
    criterion = 'entropy',
    class_weight = 'balanced_subsample',
    random_state = 42,
    n_estimators = 100,
    max_features = 3,  # number of features considered at each split (not a stopping rule)
    min_weight_fraction_leaf = 0.05,  # early stopping: a leaf must hold >= 5% of the total sample weight
    oob_score = True
)

# fit() returns the estimator itself, so `forest_fit` and `forest` name the same
# fitted object. The model.avg_uniq values for the training rows are flattened to
# 1-D because sample_weight must be one weight per row.
forest_fit = forest.fit(
    X = X_train,
    y = y_train,
    sample_weight = model.avg_uniq.loc[X_train.index].to_numpy().ravel()
)

In [None]:
# On a datetime-like index (assumed here — TODO confirm) label slices include both
# endpoints and a bare year matches the whole year, so `.loc[:'2019']` and
# `.loc['2019':]` would both contain the 2019 rows. Take the test block first and
# keep only the remaining rows for training so the two blocks are disjoint.
test_times = model.triple_barrier_events.loc['2019':]
train_times = model.triple_barrier_events.loc[
    ~model.triple_barrier_events.index.isin(test_times.index)
]

In [None]:
# Pass the 't1' columns of the train and test event blocks to get_train_times
# (presumably this drops training events that overlap the test events — confirm).
# NOTE(review): this rebinds `train_times` to the function's result, so re-running
# this cell on its own may fail on train_times['t1'] unless the slicing cell above
# is run again first.
train_times = get_train_times(train_times['t1'], test_times['t1'])

In [None]:
# Preview the first rows of the get_train_times result.
train_times.head()

In [None]:
# Purged K-fold splitter: 10 splits, pct_embargo of 0.01, with the 't1' values of
# the training rows supplied as the per-sample information sets.
purged_k_fold = PurgedKFold(
    n_splits = 10,
    samples_info_sets = model.triple_barrier_events.loc[X_train.index, 't1'],
    pct_embargo = 0.01,
)

In [None]:
# Score the forest with the purged splitter, using log_loss as the scoring function
# and the model.avg_uniq values of the training rows (flattened to 1-D) as weights.
cross_validation_score = cross_val_score(
    classifier = forest,
    X = X_train,
    y = y_train,
    sample_weight = model.avg_uniq.loc[X_train.index].to_numpy().ravel(),
    cv_gen = purged_k_fold,
    scoring = log_loss,
)

In [None]:
# cross_val_score was called with scoring=log_loss and the values are negated before
# plotting (presumably the library returns negative log loss — confirm), so the bars
# are log-loss values, not percentages.
fold_log_loss = -np.asarray(cross_validation_score)
mean_log_loss = np.mean(fold_log_loss)

fig, ax = plt.subplots(figsize = (8, 4))
ax.bar(
    range(len(fold_log_loss)),  # one bar per returned score, whatever n_splits is
    fold_log_loss,
    width = 0.75,
    alpha = 0.7,
    edgecolor = 'none'
)
for spine in ax.spines.values():
    spine.set_visible(False)
ax.grid(False)
ax.yaxis.grid(True, ls = ':', alpha = 0.5)
# Draw on the same Axes object instead of going through the pyplot state machine.
ax.axhline(
    y = mean_log_loss,
    ls = '-.',
    color = 'lightgray',
    alpha = 0.8,
    label = f'mean = {mean_log_loss:.4f}'
)
ax.set_xlabel('Fold')
ax.set_ylabel('Log loss')
ax.legend()
ax.set_title('Purged Cross Validation Scores | Random Forest | Ticker : ES')
plt.show()

In [None]:
# 't1' values of the triple-barrier events, restricted to the training rows.
samples_info_sets = model.triple_barrier_events.loc[X_train.index, 't1']

In [None]:
comb_purge_fold = CombinatorialPurgedKFold(
    n_splits = 5,
    n_test_splits = 2,
    samples_info_sets = samples_info_sets,
    pct_embargo = 0.06
)

# Keep every split's accuracy so the run can be summarised and inspected afterwards
# instead of only being printed.
cpcv_accuracies = []

for train_indices, test_indices in comb_purge_fold.split(X_train, y_train):
    # The splitter yields positional indices, hence .iloc.
    X_train_valid, X_test_valid = X_train.iloc[train_indices], X_train.iloc[test_indices]
    y_train_valid, y_test_valid = y_train.iloc[train_indices], y_train.iloc[test_indices]

    # A fresh default forest per split; the fixed seed keeps each fit repeatable.
    clf = RandomForestClassifier(random_state = 42)
    clf.fit(X_train_valid, y_train_valid)

    y_pred = clf.predict(X_test_valid)
    accuracy = accuracy_score(y_test_valid, y_pred)
    cpcv_accuracies.append(accuracy)
    print(f'Accuracy: {accuracy:.4f}')

# Guard against an empty splitter so np.mean is never called on an empty list.
if cpcv_accuracies:
    print(f'Mean accuracy over {len(cpcv_accuracies)} splits: {np.mean(cpcv_accuracies):.4f}')