In [1]:
import sys
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline

sys.dont_write_bytecode = True
from src.utils.utils import *
from src.utils.constants import *
from src.visualization.visualize import *

In [3]:
# Load the pre-computed ground truth. As rendered in this cell's output, it has one row
# per `origin_time` with the `next_change` value and one-hot `label_positive` /
# `label_neutral` / `label_negative` columns.
# NOTE(review): unpickling can execute arbitrary code - only load pickles produced by
# this project's own processing step.
ground_truth = pd.read_pickle(GROUND_TRUTH_PROCESSED_PATH)
# Bare last expression so the notebook renders the frame.
ground_truth


Unnamed: 0,origin_time,next_change,label_positive,label_neutral,label_negative
0,2022-10-01 00:59:00,-1.084448,False,False,True
1,2022-10-01 01:00:00,-0.161160,False,True,False
2,2022-10-01 01:01:00,0.551711,False,True,False
3,2022-10-01 01:02:00,0.035817,False,True,False
4,2022-10-01 01:03:00,0.185375,False,True,False
...,...,...,...,...,...
525535,2023-09-30 23:54:00,-0.001528,False,True,False
525536,2023-09-30 23:55:00,-0.080146,False,True,False
525537,2023-09-30 23:56:00,0.000903,False,True,False
525538,2023-09-30 23:57:00,-0.123311,False,True,False


In [None]:
# Build one modelling frame per exchange: ground truth joined with the candle and
# order-book PCA features on `origin_time`. Inner joins keep only timestamps that are
# present in all three sources.
data = {}

for exchange in EXCHANGES:
    # Read both feature sets first (candles, then order books), as separate frames.
    feature_frames = [
        pd.read_parquet(os.path.join(INTERIM_DATA_PATH, f'{exchange}_{kind}_pca_data.parquet'))
        for kind in (CANDLES, ORDERBOOKS)
    ]

    # Join them onto the ground truth one after the other.
    joined = ground_truth
    for features in feature_frames:
        joined = joined.merge(features, on='origin_time', how='inner')

    data[exchange] = joined
    display(data[exchange].head())

In [None]:
# Derive the feature matrix X and the target y for each exchange.
#
# Fixes relative to the previous version of this cell:
#   * `data` is keyed by exchange only, so `data[(CANDLES, exchange)]` raised KeyError.
#   * The ground truth (see its rendered output above) has no single `label` column,
#     only one-hot `label_*` columns, so selecting `'label'` raised KeyError.
#   * Every frame in `data` already contains the ground-truth columns, so it is not
#     merged with `ground_truth` a second time.
#
# NOTE: X and y are overwritten on every iteration, so after the loop they describe the
# last exchange only.

# All ground-truth columns (timestamp, `next_change`, labels) are excluded from X.
cols_to_drop = list(ground_truth.columns)
label_columns = [col for col in cols_to_drop if col.startswith('label_')]
# 'label_positive' -> 'positive', matching the class names used by the plotting cell.
label_names = {col: col[len('label_'):] for col in label_columns}

for exchange in EXCHANGES:
    df = data[exchange]

    onehot = df[label_columns]
    # idxmax would silently pick the first column for an all-False row, so verify first.
    assert onehot.sum(axis=1).eq(1).all(), f'{exchange}: each row must have exactly one label set'

    X = df.drop(columns=cols_to_drop)
    y = onehot.idxmax(axis=1).map(label_names)


In [None]:
def process_dataset(param_distributions, pipeline, X, y):
    """Tune ``pipeline`` on ``(X, y)`` with a randomized hyper-parameter search.

    Candidates are scored by accuracy using shuffled, stratified K-fold
    cross-validation (``INNER_CV_FOLDS`` splits, seeded with ``RANDOM_STATE``).
    Train scores are recorded alongside the validation scores, and a fit that
    fails raises instead of being assigned a fallback score.

    Args:
        param_distributions: Search space handed to ``RandomizedSearchCV``.
        pipeline: Estimator (or pipeline) to tune.
        X: Training features.
        y: Training labels.

    Returns:
        tuple: ``(cv_results, best_params, best_score, best_clf)`` where
        ``cv_results`` is ``cv_results_`` wrapped in a DataFrame and the other
        three are the search's ``best_params_``, ``best_score_`` and
        ``best_estimator_``.
    """
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        scoring='accuracy',
        cv=StratifiedKFold(n_splits=INNER_CV_FOLDS, shuffle=True, random_state=RANDOM_STATE),
        return_train_score=True,
        error_score='raise',
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    search.fit(X, y)

    return (
        pd.DataFrame(search.cv_results_),
        search.best_params_,
        search.best_score_,
        search.best_estimator_,
    )

In [None]:
# NOTE: this whole cell is commented out, so it does NOT define `pipeline`.
# # Define the SMOTE oversampler and the random undersampler

# smote = SMOTE(random_state=RANDOM_STATE)
# undersample = RandomUnderSampler(random_state=RANDOM_STATE)
# smote_enn = SMOTEENN(smote=smote, random_state=RANDOM_STATE)

# # Create a pipeline
# pipeline = Pipeline(steps=[('smote_enn', smote_enn), ('classifier', RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced_subsample'))])

# # Define the hyperparameter search space
# param_distributions = {
#     'classifier__criterion': ['gini', 'entropy'],
#     'classifier__n_estimators': [200, 300, 400, 500],
#     'classifier__ccp_alpha': [0, 0.001, 0.005, 0.01, 0.02],
#     'classifier__min_samples_split': [10, 20, 30],
#     'classifier__min_samples_leaf': [5, 10, 15],
#     'classifier__max_depth': [5, 10, 20, 30],
# }

In [None]:
# Hyper-parameter search space for the random forest.
#
# The keys are plain `RandomForestClassifier` parameter names (no `classifier__`
# prefix), so they target a bare classifier; they would need the step prefix if the
# estimator were wrapped in a pipeline.
#
# The former 'depth' entry was removed: it is not a RandomForestClassifier parameter
# (the search rejects unknown parameter names) and it duplicated the 'max_depth' grid.
param_distributions = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [200, 300, 400, 500],
    'ccp_alpha': [0, 0.001, 0.005, 0.01, 0.02],
    'min_samples_split': [10, 20, 30],
    'min_samples_leaf': [5, 10, 15],
    'max_depth': [5, 10, 20, 30],
}

In [None]:
# Tune, evaluate and persist one random forest per exchange.
#
# Fixes relative to the previous version of this cell:
#   * `data` is keyed by exchange only, so unpacking its keys as (data_type, exchange)
#     failed; the loop now iterates over exchanges.
#   * The ground truth has one-hot `label_*` columns rather than a single `label`
#     column (see its rendered output above); y is derived from them.
#   * Every ground-truth column (including `next_change` and the labels) is removed
#     from X - leaving them in would leak the target into the features.
#   * No active cell defines `pipeline`, so the estimator is created here.
#   * Artefacts were saved via `results[(exchange, data_type)]` although they are stored
#     under `(data_type, exchange)`, which raised KeyError.
#   * `pd.to_pickle` replaces the unbound `pd.DataFrame.to_pickle`, since the saved
#     objects are dicts and an estimator, not DataFrames.
results = {}
model_data = {}

# Each frame in `data` combines the candle and the order-book PCA features, so one
# combined tag is used in the result keys and in the output file names.
data_type = f'{CANDLES}_{ORDERBOOKS}'

ground_truth_columns = list(ground_truth.columns)
label_columns = [col for col in ground_truth_columns if col.startswith('label_')]
# 'label_positive' -> 'positive', matching the class names used by the plotting cell.
label_names = {col: col[len('label_'):] for col in label_columns}

# `param_distributions` uses un-prefixed parameter names, so a bare classifier is tuned.
# NOTE(review): the resampling pipeline in the commented-out cell is not used here;
# restoring it also requires `classifier__`-prefixed search keys.
estimator = RandomForestClassifier(random_state=RANDOM_STATE)

for exchange, df in data.items():
    key = (data_type, exchange)

    onehot = df[label_columns]
    # idxmax would silently pick the first column for an all-False row, so verify first.
    assert onehot.sum(axis=1).eq(1).all(), f'{exchange}: each row must have exactly one label set'

    X = df.drop(columns=ground_truth_columns)
    y = onehot.idxmax(axis=1).map(label_names)

    # NOTE(review): this is a shuffled, stratified split of minute-level time-series
    # rows; confirm that a chronological split is not required for a fair evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    cv_results, best_params, best_score, best_clf = process_dataset(
        param_distributions, estimator, X_train, y_train
    )

    y_pred = best_clf.predict(X_test)
    evaluation = get_evaluation(y_test, y_pred)

    results[key] = {
        'cv_results': cv_results,
        'best_params': best_params,
        'best_score': best_score,
        'accuracy': evaluation['accuracy'],
        'classification_report': evaluation['classification_report'],
        'confusion_matrix': evaluation['confusion_matrix']
    }
    model_data[key] = {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
        'y_pred': y_pred,
    }

    # Persist under the same key the entries were stored with.
    pd.to_pickle(results[key], os.path.join(PROCESSED_DATA_PATH, f"{exchange}_{data_type}_random_forest_results.pkl"))
    pd.to_pickle(model_data[key], os.path.join(PROCESSED_DATA_PATH, f"{exchange}_{data_type}_random_forest_model_data.pkl"))
    pd.to_pickle(best_clf, os.path.join(MODELS_DATA_PATH, f'{exchange}_{data_type}_random_forest_best_clf.pkl'))

In [None]:
# for (data_type, exchange), df in results.items():
#     # File name must match the pattern used when the classifier was saved: {exchange}_{data_type}_...
#     path = os.path.join(MODELS_DATA_PATH, f'{exchange}_{data_type}_random_forest_best_clf.pkl')

#     best_clf = pd.read_pickle(os.path.join(MODELS_DATA_PATH, f'{exchange}_{data_type}_random_forest_best_clf.pkl'))
#     plot_learning_curve(
#         exchange,
#         data_type,
#         'random_forest',
#         best_clf,
#         model_data[(data_type, exchange)]['X_train'],
#         model_data[(data_type, exchange)]['y_train'],
#         train_sizes=np.linspace(0.1, 1.0, 3),
#         cv=INNER_CV_FOLDS
#     )

In [None]:
# for (data_type, exchange), df in results.items():
#     plot_confusion_matrix(
#         exchange,
#         data_type,
#         'random_forest',
#         df['confusion_matrix'],
#         labels=['positive', 'neutral', 'negative']
#     )

In [None]:
# for (data_type, exchange), df in results.items():
#         # Analyzing cv_results_ to check for overfitting
#     cv_results = df['cv_results']
#     cv_results['score_diff'] = cv_results['mean_train_score'] - cv_results['mean_test_score']
#     overfitting_threshold = 0.1  # Define a threshold for overfitting
#     overfitting = cv_results[cv_results['score_diff'] > overfitting_threshold]

#     if not overfitting.empty:
#             print("Possible overfitting detected in the following parameter combinations:")
#             print(overfitting[['params', 'mean_train_score', 'mean_test_score', 'score_diff']])
#     else:
#             print("No significant overfitting detected.")
#     display(f'{exchange} {data_type} Random Forest:\nbest score {df["best_score"]}')
#     display(pd.DataFrame.from_dict(df['best_params'], orient='index').T)
#     display(df['confusion_matrix'])
#     display(df['classification_report'])
#     display(df['cv_results'])