In [1]:
import os
import sys
from itertools import product
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from src.models.train_model import BTCForecasting

import pickle

sys.dont_write_bytecode = True
from src.utils.utils import *
from src.utils.constants import *
from src.visualization.visualize import *

In [2]:
def _read_interim(stem):
    """Load `<stem>_data.parquet` from the interim data directory."""
    return pd.read_parquet(os.path.join(INTERIM_DATA_PATH, f'{stem}_data.parquet'))


# One frame per (exchange, data_type) combination ...
data = {
    pair: _read_interim(f'{pair[0]}_{pair[1]}')
    for pair in product(EXCHANGES, DATA_TYPES)
}
# ... plus the single frame stored under the (ALL, UNIFIED) key.
data[(ALL, UNIFIED)] = _read_interim(f'{ALL}_{UNIFIED}')

# Target frame handed to every BTCForecasting instance in the training cell.
ground_truth = _read_interim('ground_truth')

In [None]:
# Run configuration, consumed by the training cell.
# Forwarded as BTCForecasting(smoteenn=...); presumably toggles SMOTEENN
# resampling - TODO confirm in src.models.train_model.
use_smoteenn = True
# Forwarded as BTCForecasting(rfe=...); when True the training cell also asks
# train_model for half of the dataframe's columns via `features_to_select`.
use_rfe = True
# Forwarded as BTCForecasting(n_splits=...); presumably the number of
# cross-validation splits - TODO confirm.
n_splits = 10

In [4]:
# Training
#
# For every (exchange, data_type) dataset: fit each candidate classifier through
# BTCForecasting, keep the candidate with the highest reported accuracy, and
# pickle that BTCForecasting instance under MODELS_DATA_PATH.

# Best BTCForecasting instance per (exchange, data_type) key
btcfs = {}

# Create the output directory up front so a missing folder does not surface
# only after the first dataset has finished training.
os.makedirs(MODELS_DATA_PATH, exist_ok=True)

for (exchange, data_type), df in data.items():

    # Estimators are rebuilt for every dataset so that each fit starts from a
    # fresh, unfitted instance. The dict values are (estimator, fixed params).
    classifiers = {
        'XGBoost': (xgb.XGBClassifier(random_state=RANDOM_STATE, verbosity=0), {
            'classifier__n_estimators': 100,
            'classifier__learning_rate': 0.1,
            'classifier__max_depth': 6
        }),
        'CatBoost': (CatBoostClassifier(random_state=RANDOM_STATE, verbose=0), {
            'classifier__iterations': 100,
            'classifier__learning_rate': 0.1,
            'classifier__depth': 6
        })
    }

    # Start below every possible score: with a starting value of 0 and a strict
    # `>` comparison, a dataset whose candidates all score 0 would select nothing.
    best_accuracy = float('-inf')
    best_model = None
    best_params = None
    best_model_name = None

    for model_name, (classifier, params) in classifiers.items():
        btcf = BTCForecasting(data=df, ground_truth=ground_truth, n_splits=n_splits, smoteenn=use_smoteenn, rfe=use_rfe)

        print(f"\nTraining models for {exchange} - {data_type} using {model_name}...")

        # With RFE enabled, request half of the dataframe's columns.
        btcf.train_model(classifier, classifier_params=params, features_to_select=len(df.columns) // 2 if use_rfe else None)
        btcf.evaluate_model()

        accuracy = btcf.results['accuracy']
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = btcf
            best_params = btcf.results['best_params']
            best_model_name = model_name

    # Only reachable when no candidate produced a comparable accuracy (e.g. NaN);
    # skip instead of failing on `None.lower()` below.
    if best_model is None:
        print(f"No usable model for {exchange} - {data_type}; nothing saved.")
        continue

    btcfs[(exchange, data_type)] = best_model

    # Save the best model and results
    model_path = os.path.join(MODELS_DATA_PATH, f"{exchange}_{data_type}_{best_model_name.lower()}_model.pkl")

    with open(model_path, "wb") as f:
        pickle.dump(best_model, f)

    print(f"Best model for {exchange} - {data_type}: {best_model_name} with accuracy {best_accuracy}")

# Release the raw frames. Re-running this cell afterwards requires re-running
# the data-loading cell first, because `data` no longer exists.
del data



Training models for BINANCE - candles using RandomForest...
n_iterations: 10
n_required_iterations: 10
n_possible_iterations: 10
min_resources_: 817
max_resources_: 418344
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 672
n_resources: 817
Fitting 10 folds for each of 672 candidates, totalling 6720 fits
----------
iter: 1
n_candidates: 336
n_resources: 1634
Fitting 10 folds for each of 336 candidates, totalling 3360 fits
----------
iter: 2
n_candidates: 168
n_resources: 3268
Fitting 10 folds for each of 168 candidates, totalling 1680 fits
----------
iter: 3
n_candidates: 84
n_resources: 6536
Fitting 10 folds for each of 84 candidates, totalling 840 fits
----------
iter: 4
n_candidates: 42
n_resources: 13072
Fitting 10 folds for each of 42 candidates, totalling 420 fits
----------
iter: 5
n_candidates: 21
n_resources: 26144
Fitting 10 folds for each of 21 candidates, totalling 210 fits
----------
iter: 6
n_candidates: 11
n_resources: 52288
Fitting 10 folds for

Parameters: { "max_leaf_nodes", "min_samples_leaf", "min_samples_split", "verbose" } are not used.

Parameters: { "max_leaf_nodes", "min_samples_leaf", "min_samples_split", "verbose" } are not used.

Parameters: { "max_leaf_nodes", "min_samples_leaf", "min_samples_split", "verbose" } are not used.

Parameters: { "max_leaf_nodes", "min_samples_leaf", "min_samples_split", "verbose" } are not used.

Parameters: { "max_leaf_nodes", "min_samples_leaf", "min_samples_split", "verbose" } are not used.

Parameters: { "max_leaf_nodes", "min_samples_leaf", "min_samples_split", "verbose" } are not used.

Parameters: { "max_leaf_nodes", "min_samples_leaf", "min_samples_split", "verbose" } are not used.

Parameters: { "max_leaf_nodes", "min_samples_leaf", "min_samples_split", "verbose" } are not used.

Parameters: { "max_leaf_nodes", "min_samples_leaf", "min_samples_split", "verbose" } are not used.

Parameters: { "max_leaf_nodes", "min_samples_leaf", "min_samples_split", "verbose" } are not used.



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve, validation_curve
from sklearn.metrics import auc

def compute_curve_metrics(estimator, X, y, cv):
    """Summarise an estimator's learning curve.

    Runs `learning_curve(estimator, X, y, cv=cv)`, averages the per-fold train
    and validation scores for every training-set size, and integrates each
    averaged curve over the training-set sizes with `auc`.

    Returns:
        dict with keys 'train_mean', 'valid_mean' (fold-averaged scores per
        training-set size) and 'auc_train', 'auc_valid' (area under each
        averaged curve, with the training-set sizes as the x axis).
    """
    sizes, fold_train_scores, fold_valid_scores = learning_curve(estimator, X, y, cv=cv)

    # axis=1 collapses the fold dimension, leaving one value per training size.
    metrics = {
        'train_mean': np.mean(fold_train_scores, axis=1),
        'valid_mean': np.mean(fold_valid_scores, axis=1),
    }
    metrics['auc_train'] = auc(sizes, metrics['train_mean'])
    metrics['auc_valid'] = auc(sizes, metrics['valid_mean'])
    return metrics

def compare_models(models, X, y, cv):
    """Compute learning-curve metrics for several fitted model wrappers.

    Args:
        models: either a dict whose values are model wrappers (the keys are
            ignored) or any iterable of model wrappers. Each wrapper must
            expose `.pipeline` (the estimator passed to the learning curve)
            and `.classifier` (used only for its class name).
        X, y: training data forwarded to `compute_curve_metrics`.
        cv: cross-validation splitter forwarded to `compute_curve_metrics`.

    Returns:
        A list with one metrics dict per model, in iteration order; each dict
        is the result of `compute_curve_metrics` plus a 'model' key holding
        the classifier's class name.
    """
    # Iterating `models.items()` yields (key, wrapper) tuples, which have no
    # `.pipeline`; iterate the wrappers themselves instead.
    wrappers = models.values() if isinstance(models, dict) else models

    results = []
    for model in wrappers:
        result = compute_curve_metrics(model.pipeline, X, y, cv)
        result['model'] = model.classifier.__class__.__name__
        results.append(result)

    return results

# Perform comparison
# `btcfs` is a dict keyed by (exchange, data_type), so the training data and CV
# splitter cannot be read off the dict itself. Each stored model was trained on
# its own dataset, so its curve is computed on its own split.
# NOTE(review): assumes every BTCForecasting instance exposes `X_train`,
# `y_train` and `tscv` - confirm against src.models.train_model.
results = []
for (exchange, data_type), btcf in btcfs.items():
    for result in compare_models({(exchange, data_type): btcf}, btcf.X_train, btcf.y_train, btcf.tscv):
        # Prefix the dataset so curves of the same classifier type stay distinguishable.
        result['model'] = f"{exchange} - {data_type} ({result['model']})"
        results.append(result)

# Display results
for result in results:
    print(f"Model: {result['model']}, AUC Train: {result['auc_train']}, AUC Valid: {result['auc_valid']}")

# Visualization
fig, ax = plt.subplots()
for result in results:
    ax.plot(result['train_mean'], label=f"Train {result['model']}")
    ax.plot(result['valid_mean'], label=f"Valid {result['model']}")
# The x axis is the position within the learning-curve size grid, not the
# absolute number of samples.
ax.set_xlabel("Learning-curve step")
ax.set_ylabel("Mean score across folds")
ax.set_title("Learning curves per model")
ax.legend()
plt.show()


In [None]:
# btcfs = {
#     (exchange, data_type): pd.read_pickle(
#         os.path.join(MODELS_DATA_PATH, f"{exchange}_{data_type}_rfc_model.pkl")
#     )
#     for exchange, data_type in product(EXCHANGES, DATA_TYPES)
# }
# btcfs[(ALL, UNIFIED)] = pd.read_pickle(os.path.join(MODELS_DATA_PATH, f'{ALL}_{UNIFIED}_rfc_model.pkl'))

In [None]:
# # Evaluation

# eval_results = []
# for (exchange, data_type), btcf in btcfs.items():
#         print(f"Best RFC model for {exchange} - {data_type}")

#         btcf.evaluate_model()

#         print(f"Parameters: {btcf.results['best_params']}")
#         print(f"Best score: {btcf.results['best_score']}")

#         print(f"Confusion matrix:")
#         plot_confusion_matrix(confusion_matrix=btcf.results['conf_matrix'], display_labels=list(btcf.le.classes_))

#         # Plots
#         if(btcf.rfe):
#                 btcf.plot_feature_importance(f"{exchange}_{data_type}_rfc_feature_importance.png")
#         btcf.plot_learning_curves(f"{exchange}_{data_type}_rfc_learning_curve.png")
#         btcf.plot_validation_curves('max_depth', view_params[data_type]['max_depth_range'], f"{exchange}_{data_type}_rfc_validation_curve_max_depth.png")

In [None]:
# # Evaluation results

# eval_results = []
# for (exchange, data_type), model in btcfs.items():

#     result = {
#         'exchange': exchange,
#         'data_type': data_type,
#         'accuracy': model.results.get('accuracy'),
#         'mean_test_score': model.results['cv_results']['mean_test_score'],
#         'mean_fit_time': model.results['cv_results']['mean_fit_time'],
#         'mean_score_time': model.results['cv_results']['mean_score_time'],
#         'mean_train_score': model.results['cv_results']['mean_train_score']
#     }
#     best_params = model.results.get('best_params')
#     result.update(best_params)
#     eval_results.append(result)

# eval_results_df = pd.DataFrame(eval_results)
# display(eval_results_df)


In [None]:
# # Performing t-tests for each metric

# metrics = ['mean_test_score', 'mean_fit_time', 'mean_score_time', 'mean_train_score']
# ttest_results = pd.concat([perform_ttest(eval_results_df, metric) for metric in metrics], ignore_index=True)

# display(ttest_results)

In [None]:
# # Results comparison

# data_types = eval_results_df['data_type'].unique()
# all_comparisons = pd.concat([compute_comparison(eval_results_df, data_type) for data_type in data_types], ignore_index=True)

# display(all_comparisons)

### General Insights
BINANCE models generally outperform those from HUOBI and OKX, with significant differences in most metrics, suggesting better market conditions for BINANCE. Thus, the focus should be on leveraging BINANCE data for more accurate predictions, as it consistently shows better performance. Improving data pipelines for HUOBI and OKX could also help boost their predictive capabilities. 

The t-tests on train and test scores indicate that some models, especially from HUOBI, may be underfitting, as seen in lower training and testing scores. Conversely, significant differences between train and test scores for BINANCE models suggest potential overfitting, necessitating more robust validation techniques.

Underfitting should be addressed by improving feature engineering for the HUOBI and OKX datasets, and overfitting in the BINANCE models should be mitigated through better cross-validation and regularization.