In [None]:
import sys
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold
from dotenv import load_dotenv
import dask
from dask.dataframe import from_pandas
from dask.distributed import Client
import dask.distributed
import joblib

# Put the parent of the working directory on sys.path so the `src.*` imports
# below resolve; the membership check keeps re-runs of this cell from adding
# duplicate entries.
src_path = os.path.abspath(os.pardir)
if src_path not in sys.path:
    sys.path.append(src_path)

from src.utils.constants import *
from src.utils.utils import *
from src.visualization.visualize import *

In [None]:
# Start a local Dask cluster (4 workers x 2 threads, 2GB memory limit per worker).
# The client is bound to a name so later cells can reuse it or release the
# workers explicitly with `client.close()`; the bare expression on the last
# line still renders the client summary as the cell output.
client = Client(n_workers=4, threads_per_worker=2, memory_limit='2GB')
client

In [None]:
# Load the ground-truth table and keep only the join key (`origin_time`) and
# the `label` column used as the classification target.
label_columns = ['origin_time', 'label']
ground_truth = pd.read_parquet(GROUND_TRUTH_PATH)[label_columns]

In [None]:
# Read the `<exchange>_<data_type>_pca_data.parquet` file of every exchange for
# both data types; entries are keyed by (data_type, exchange).
imported_data = {}

for exchange in EXCHANGES:
    for data_type in (CANDLES, ORDERBOOKS):
        parquet_file = os.path.join(INTERIM_DATA_PATH, f'{exchange}_{data_type}_pca_data.parquet')
        imported_data[(data_type, exchange)] = pd.read_parquet(parquet_file)

In [None]:
# Inner-join every feature table with the ground-truth labels on `origin_time`
# and store, per (data_type, exchange) key, the joined frame ('full'), the
# feature columns ('X') and the label column ('y').
cols_to_drop = ['origin_time', 'label']
merged_df = {}

for dataset_key, features in imported_data.items():
    labelled = pd.merge(ground_truth[cols_to_drop], features, on='origin_time', how='inner')
    merged_df[dataset_key] = {
        'full': labelled,
        'X': labelled.drop(cols_to_drop, axis=1),
        'y': labelled['label'],
    }

In [None]:
# @dask.delayed
def process_dataset(param_distributions, df):
    """Tune, evaluate and profile a decision tree on a single dataset.

    Steps:
      1. Hold out a test split (``TEST_SIZE``, ``RANDOM_STATE``).
      2. Run nested cross-validation (5 outer / 3 inner stratified folds) of a
         ``RandomizedSearchCV`` over ``param_distributions`` on the full data.
      3. Fit the randomized search on the training split and evaluate its best
         estimator on the held-out test split with ``get_evaluation``.
      4. Sweep ``max_depth`` over ``param_distributions['max_depth']`` while
         keeping every other tuned hyperparameter at its best value, recording
         the cross-validated and held-out accuracy for each depth.

    Args:
        param_distributions: Search space passed to ``RandomizedSearchCV``.
            Must contain a ``'max_depth'`` entry, which is also used as the
            grid of the depth sweep.
        df: Mapping with the feature matrix under ``'X'`` and the target under
            ``'y'``.

    Returns:
        dict with the keys:
            ``best_params``   - best hyperparameters found on the training split.
            ``best_score``    - mean inner-CV accuracy of ``best_params``.
            ``nested_scores`` - accuracy of each outer fold of the nested CV.
            ``evaluation``    - return value of ``get_evaluation(y_test, y_pred)``.
            ``train_scores``  - per depth, mean inner-CV accuracy on the training split.
            ``test_scores``   - per depth, accuracy on the held-out test split.
            ``depths``        - the ``max_depth`` values that were swept.
    """
    X, y = df['X'], df['y']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Nested cross-validation: the outer loop estimates model performance, the
    # inner loop performs the hyperparameter search.
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

    # Base classifier and the randomized search wrapped around it.
    clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
    randomized_search = RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_distributions,
        n_iter=50,
        cv=inner_cv,
        scoring='accuracy',
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )

    with joblib.parallel_backend('dask'):
        # The nested CV runs on the full dataset (including the rows held out
        # above), so it is an estimate separate from the test-set evaluation.
        nested_scores = cross_val_score(randomized_search, X, y, cv=outer_cv, scoring='accuracy')

        # Fit the randomized search on the training split only.
        randomized_search.fit(X_train, y_train)

    # Best parameters, score and estimator from the randomized search.
    best_params = randomized_search.best_params_
    best_score = randomized_search.best_score_
    best_clf = randomized_search.best_estimator_

    # Evaluate the best estimator on the held-out test split.
    y_pred = best_clf.predict(X_test)
    evaluation = get_evaluation(y_test, y_pred)

    # Depth sweep for the learning curves.
    train_scores = []
    test_scores = []
    depths = param_distributions['max_depth']

    with joblib.parallel_backend('dask'):
        for depth in depths:
            # Start from *all* tuned hyperparameters (including e.g. ccp_alpha)
            # and override only the depth, so the curve describes the model
            # that was actually selected. Parameters absent from the search
            # space fall back to the estimator defaults.
            sweep_params = {**best_params, 'max_depth': depth, 'random_state': RANDOM_STATE}
            model = DecisionTreeClassifier(**sweep_params)

            # Cross-validated accuracy on the training split.
            train_cv_results = cross_val_score(model, X_train, y_train, cv=inner_cv, scoring='accuracy')
            train_scores.append(train_cv_results.mean())

            # Accuracy on the held-out test split.
            model.fit(X_train, y_train)
            test_scores.append(model.score(X_test, y_test))

    return {
        'best_params': best_params,
        'best_score': best_score,
        'nested_scores': nested_scores,
        'evaluation': evaluation,
        'train_scores': train_scores,
        'test_scores': test_scores,
        'depths': depths
    }

In [None]:
# Search space sampled by RandomizedSearchCV for the decision tree.
param_distributions = {
    # Split quality measure.
    'criterion': ['gini', 'entropy'],
    # Tree depths 1..19; also reused as the grid of the depth sweep.
    'max_depth': range(1, 20),
    # Minimum number of samples needed to split an internal node.
    'min_samples_split': [2, 5, 10, 20, 50],
    # Minimum number of samples required in a leaf.
    'min_samples_leaf': [5, 10, 20, 30, 50],
    # Upper bound on the number of leaves.
    'max_leaf_nodes': [10, 20, 50, 100],
    # Minimum impurity decrease required for a split.
    'min_impurity_decrease': [0.0, 0.01, 0.05, 0.1],
    # Cost-complexity pruning strength.
    'ccp_alpha': [0.0, 0.01, 0.02, 0.05, 0.1],
}

In [None]:
# tasks = [process_dataset(data_type, exchange, param_distributions, df) for (data_type, exchange), df in data.items()]
# futures = client.compute(tasks)
# results = client.gather(futures)

In [None]:
# for (data_type, exchange), result in zip(data.keys(), results):
#     print(f"Best parameters for {exchange} {data_type}: {result['best_params']}")
#     print(f'{exchange} {data_type} Nested CV Accuracy: {result["nested_scores"].mean():.2f}')
#     print(f'{exchange} {data_type} Test Set Accuracy: {result["evaluation"]["accuracy"]:.2f}')
#     print(f'{exchange} {data_type} Classification Report:')
#     display(result['evaluation']['classification_report'])
#     print(f'{exchange} {data_type} Confusion Matrix:')
#     display(result['evaluation']['confusion_matrix'])

#     plot_tree_learning_curves(exchange, data_type, result['depths'], result['train_scores'], result['test_scores'], 'decision_tree')

#     pd.DataFrame.to_pickle(result, os.path.join(PROCESSED_DATA_PATH, f'{exchange}_{data_type}_decision_tree_results.pkl'))

In [None]:
# Tune, evaluate, plot and persist a decision tree for every
# (data_type, exchange) dataset.
for (data_type, exchange), df in merged_df.items():
    # `df` is the per-dataset {'full', 'X', 'y'} dict; process_dataset indexes
    # it with 'X' and 'y', so it is passed as is. (Wrapping the whole
    # `merged_df` dict with dask's from_pandas fails because it is not a
    # pandas object, and scikit-learn needs in-memory data here anyway.)
    result = process_dataset(param_distributions, df)

    print(f"Results for {exchange} {data_type}:")
    print(f"Best parameters: {result['best_params']}")
    print(f'Nested CV Accuracy: {result["nested_scores"].mean():.2f}')
    print(f'Accuracy: {result["evaluation"]["accuracy"]:.2f}')
    print('Classification Report:')
    display(result['evaluation']['classification_report'])
    print('Confusion Matrix:')
    display(result['evaluation']['confusion_matrix'])

    plot_tree_learning_curves(exchange, data_type, result['depths'], result['train_scores'], result['test_scores'], 'decision_tree')

    # `result` is a plain dict, so use the module-level pickling helper rather
    # than calling the unbound DataFrame method on a non-DataFrame object.
    pd.to_pickle(result, os.path.join(PROCESSED_DATA_PATH, f'{exchange}_{data_type}_decision_tree_results.pkl'))