In [None]:
import sys
sys.dont_write_bytecode = True
import os
from itertools import product
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from src.models.train_model import BTCForecasting

import pickle
import gc

from src.utils.utils import *
from src.utils.constants import *
from src.visualization.visualize import *

In [None]:
# Classifiers
#
# Estimators to train, keyed by the label that appears in the training log line
# and in the model file names looked up by the later cells.

_rfc_balanced_subsample = RandomForestClassifier(
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

classifiers = {'RFC_balanced_subsample': _rfc_balanced_subsample}

In [None]:
# Training params
#
# One settings dict per data view. Keys starting with 'classifier__' are later
# filtered out and forwarded to `btcf.train` as the model parameters; the other
# keys configure the training run itself.
# All views share the same settings except for the exclusive upper bound of the
# 'classifier__min_samples_split' range, which is listed per view below.
# NOTE: the key 'aggresive_elimination' is misspelled, but the training cell
# reads exactly this spelling, so it must stay as is.

_min_samples_split_stop = {'candles': 10, 'orderbook': 20, 'unified': 20}

view_params = {
    view: {
        'use_smoteenn': False,
        'feature_selection': 'sfm',
        'factor': 2,
        'aggresive_elimination': False,
        'n_splits': 3,
        'classifier__max_depth': range(5, 20),
        'classifier__min_samples_leaf': range(1, 2, 1),  # single candidate: 1
        'classifier__min_samples_split': range(2, split_stop, 2),
        # Built inside the comprehension so each view owns its own list: 10, 20, ..., 100.
        'classifier__max_leaf_nodes': list(range(10, 101, 10)),
    }
    for view, split_stop in _min_samples_split_stop.items()
}

In [None]:
# Import data

def _read_interim(stem):
    """Read `<stem>_data.parquet` from INTERIM_DATA_PATH into a DataFrame."""
    return pd.read_parquet(os.path.join(INTERIM_DATA_PATH, f'{stem}_data.parquet'))


# One frame per (exchange, data_type) pair, keyed by that tuple ...
data = {
    (exchange, data_type): _read_interim(f'{exchange}_{data_type}')
    for exchange, data_type in product(EXCHANGES, DATA_TYPES)
}
# ... plus one extra frame stored under the (ALL, UNIFIED) key.
data[(ALL, UNIFIED)] = _read_interim(f'{ALL}_{UNIFIED}')

ground_truth = _read_interim('ground_truth')

In [None]:
# Training
#
# For every classifier and every loaded view: build a BTCForecasting object from
# the view's frame and the ground truth, train it with the view's settings, and
# persist it with `save_model`.

# Fail before any (potentially long) fit starts if a loaded view has no settings,
# instead of hitting the same KeyError half-way through the run.
missing_views = {data_type for _, data_type in data} - set(view_params)
if missing_views:
    raise KeyError(f"No training params configured for data types: {sorted(missing_views)}")

for name, classifier in classifiers.items():
    for (exchange, data_type), df in data.items():

        params = view_params[data_type]

        print(f"\nStarting training {name} model for {exchange} - {data_type}")

        btcf = BTCForecasting(
            df,
            ground_truth,
            n_splits=params['n_splits'],
            smoteenn=params['use_smoteenn'],
            pca_variance_threshold=PCA_VARIANCE_THRESHOLD,
            feature_selection=params['feature_selection'],
        )

        # Only the 'classifier__*' entries are model parameters; the remaining
        # keys of `params` are run settings consumed above and below.
        model_params = {k: v for k, v in params.items() if k.startswith('classifier__')}

        # NOTE(review): `factor` / `aggressive_elimination` look like successive-halving
        # search options — confirm against BTCForecasting.train.
        # The settings key really is spelled 'aggresive_elimination'.
        btcf.train(
            classifier,
            model_params,
            factor=params['factor'],
            aggressive_elimination=params['aggresive_elimination'],
            verbose=3,
        )

        save_model(btcf, name, exchange, data_type)

In [None]:
# Evaluation
#
# Reload each saved model, run its evaluation, persist the evaluated model again
# and write its diagnostic plots.

# Cover every view the training cell fits: the per-exchange views AND the
# combined (ALL, UNIFIED) view, which a plain product(EXCHANGES, DATA_TYPES)
# leaves out. dict.fromkeys drops a duplicate pair (if any) while keeping order.
model_keys = list(dict.fromkeys([*product(EXCHANGES, DATA_TYPES), (ALL, UNIFIED)]))

for classifier_name in classifiers:
    for exchange, data_type in model_keys:
        filename_prefix = f"{classifier_name}_{exchange}_{data_type}"
        model_path = os.path.join(MODELS_DATA_PATH, f"{filename_prefix}.pkl")

        # Combinations that were never trained simply have no file; skip them.
        if not os.path.exists(model_path):
            continue

        # SECURITY: pickle.load can execute arbitrary code. Only load model
        # files that were produced by this project.
        with open(model_path, "rb") as model_file:
            btcf = pickle.load(model_file)

        btcf.evaluate()

        # Save again so the evaluated state is what later cells load from disk.
        save_model(btcf, classifier_name, exchange, data_type)

        print(f"\nBest {classifier_name} model for {exchange} - {data_type}")

        # Plots
        btcf.plot_learn_cm_feat(f"{filename_prefix}_learn_val_feat.png")
        btcf.plot_bias_variance_tradeoff(f"{filename_prefix}_bias_variance_tradeoff.png")

In [None]:
# Results
#
# Collect, for every saved model, its best parameters and scalar metrics
# (-> results.csv), plus the cross-validation score columns that feed the
# comparison table and the t-tests.
# NOTE(review): only product(EXCHANGES, DATA_TYPES) is reported here; the
# (ALL, UNIFIED) model fitted by the training cell is not included — confirm
# whether compute_comparison / perform_ttest should receive it as well.

# Entries of `btcf.results` that are not reported as metric columns.
non_metric_keys = ['best_params', 'conf_matrix', 'cv_results', 'classification_report', 'feature_selection']

eval_results = []
t_test_data = []
comparison_data = []
t_test_metrics = ['mean_test_score', 'mean_train_score', 'std_test_score', 'std_train_score']

for classifier_name in classifiers:
    for exchange, data_type in product(EXCHANGES, DATA_TYPES):

        model_path = os.path.join(MODELS_DATA_PATH, f"{classifier_name}_{exchange}_{data_type}.pkl")
        if not os.path.exists(model_path):
            continue

        # SECURITY: pickle.load can execute arbitrary code. Only load model
        # files that were produced by this project.
        with open(model_path, "rb") as model_file:
            btcf = pickle.load(model_file)

        eval_metrics = [x for x in btcf.results.keys() if x not in non_metric_keys]
        metrics_dict = {metric: btcf.results[metric] for metric in eval_metrics}

        # Identification columns first, then best params, then metrics
        # (later entries win on a key clash).
        eval_record = {
            'classifier': classifier_name,
            'exchange': exchange,
            'data_type': data_type,
            **btcf.results['best_params'],
            **metrics_dict,
        }
        eval_results.append(eval_record)

        t_record = {metric: btcf.results['cv_results'][metric] for metric in t_test_metrics}
        t_record.update({'exchange': exchange, 'data_type': data_type, 'classifier': classifier_name})
        t_test_data.append(t_record)

        comparison_data.append({**eval_record, **t_record})

if not eval_results:
    # Without this guard an empty run fails later with an opaque
    # KeyError('data_type') / "No objects to concatenate".
    print(f"No saved models found in {MODELS_DATA_PATH}; nothing to report.")
else:
    results_df = pd.DataFrame(eval_results)
    results_df.to_csv(os.path.join(REPORTS_PATH, 'results.csv'), index=False)
    display(results_df)

    data_types = results_df['data_type'].unique()
    # A fresh frame is handed to each helper call, so a helper that mutates its
    # argument cannot affect the next call.
    all_comparisons = pd.concat(
        [compute_comparison(pd.DataFrame(comparison_data), data_type) for data_type in data_types],
        ignore_index=True,
    )
    display(all_comparisons)

    ttest_results = pd.concat(
        [perform_ttest(pd.DataFrame(t_test_data), metric) for metric in t_test_metrics],
        ignore_index=True,
    )
    display(ttest_results)