In [1]:
import sys
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold
from imblearn.over_sampling import SMOTE

# import dask
# from dask.distributed import Client
# from dask.dataframe import from_pandas
# import socket
# from dotenv import load_dotenv

# src_path = os.path.abspath(os.path.join('..'))
# if src_path not in sys.path:
#     sys.path.append(src_path)

sys.dont_write_bytecode = True
from src.utils.utils import *
from src.utils.constants import *
from src.visualization.visualize import *

In [2]:
# load_dotenv()
# scheduler_address = os.getenv("DISTRIBUTED_MAIN_IP")
# client = Client("192.168.1.58:8786")

In [3]:
ground_truth = pd.read_parquet(GROUND_TRUTH_PATH)
ground_truth = ground_truth[['origin_time', 'label']]

In [4]:
data = {}

for exchange in EXCHANGES:
    data[(CANDLES, exchange)] = pd.read_parquet(os.path.join(INTERIM_DATA_PATH, f'{exchange}_{CANDLES}_pca_data.parquet'))
    data[(ORDERBOOKS, exchange)] = pd.read_parquet(os.path.join(INTERIM_DATA_PATH, f'{exchange}_{ORDERBOOKS}_pca_data.parquet'))

In [5]:
# merged_df = {}
# cols_to_drop = ['origin_time', 'label']

# for (data_type, exchange), df in data.items():     
#     merged_df[(data_type, exchange)] = {}
#     merged_df[(data_type, exchange)]['full'] = pd.merge(ground_truth[cols_to_drop], df, on='origin_time', how='inner')
#     merged_df[(data_type, exchange)]['X'] = merged_df[(data_type, exchange)]['full'].drop(cols_to_drop, axis=1)
#     merged_df[(data_type, exchange)]['y'] = merged_df[(data_type, exchange)]['full']['label']

In [6]:
# def process_dataset(param_distributions, df):
#     X_train, X_test, y_train, y_test = train_test_split(df['X'], df['y'], test_size=TEST_SIZE, random_state=RANDOM_STATE)

#     # Nested Cross-Validation: Uses an outer loop for model evaluation and an inner loop for hyperparameter tuning.
#     outer_cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
#     inner_cv = KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
    
#     clf = RandomForestClassifier(random_state=RANDOM_STATE)

#     # Initialize the RandomizedSearchCV object
#     randomized_search = RandomizedSearchCV(estimator=clf, param_distributions=param_distributions, n_iter=50, cv=CV_FOLDS, scoring='accuracy', n_jobs=-1, random_state=RANDOM_STATE)

#     # Perform nested cross-validation
#     nested_scores = cross_val_score(randomized_search, df['X'], df['y'], cv=outer_cv, scoring='accuracy')

#     # Fit the random search to the data
#     randomized_search.fit(X_train, y_train)

#     # Best parameters and score
#     best_params = randomized_search.best_params_
#     best_score = randomized_search.best_score_
#     best_clf = randomized_search.best_estimator_

#     # Predict on the test set
#     y_pred = best_clf.predict(X_test)

#     # Evaluate the model
#     evaluation = get_evaluation(y_test, y_pred)

#     train_scores = []
#     test_scores = []
#     n_estimators_range = param_distributions['n_estimators']

#     for n_estimators in n_estimators_range:
#         model = RandomForestClassifier(
#             criterion=best_params['criterion'],
#             random_state=RANDOM_STATE, 
#             n_estimators=n_estimators,
#             max_depth=best_params['max_depth'],
#             min_samples_split=best_params['min_samples_split'], 
#             min_samples_leaf=best_params['min_samples_leaf']
#         )
        
#         # Cross-validation on the training data
#         train_cv_results = cross_val_score(model, df['X_train'], df['y_train'], cv=inner_cv, scoring='accuracy')
#         train_scores.append(train_cv_results.mean())
        
#         # Evaluate on the test set
#         model.fit(df['X_train'], df['y_train'])
#         test_score = model.score(df['X_test'], df['y_test'])
#         test_scores.append(test_score)

#     return {
#         'best_params': best_params,
#         'best_score': best_score,
#         'nested_scores': nested_scores,
#         'evaluation': evaluation,
#         'train_scores': train_scores,
#         'test_scores': test_scores,
#         'n_estimators_range': n_estimators_range
#     }

In [7]:
def process_dataset(param_distributions, X, y):


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

    smote = SMOTE(random_state=RANDOM_STATE)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    inner_cv = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    
    clf = RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced_subsample', verbose=3)

    randomized_search = RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_distributions,
        cv=inner_cv,
        scoring='accuracy',
        n_jobs=-1,
        random_state=RANDOM_STATE,
        verbose=3,
        error_score='raise',
        return_train_score=True
    )
    
    randomized_search.fit(X_resampled, y_resampled)

    best_params = randomized_search.best_params_
    best_score = randomized_search.best_score_
    best_clf = randomized_search.best_estimator_

    y_pred = best_clf.predict(X_test)
    accuracy, classification_report, confusion_matrix = get_evaluation(y_test, y_pred)

    plot_confusion_matrix(y_test, y_pred, labels=['positive', 'neutral', 'negative'])
    plot_learning_curve(best_clf, X_resampled, y_resampled)

    return {
        'best_params': best_params,
        'best_score': best_score,
        'accuracy': accuracy,
        'classification_report': classification_report,
        'confusion_matrix': confusion_matrix
    }

In [10]:
param_distributions = {
    'criterion': 'gini',
    'n_estimators': [50, 100, 200, 300, 500],
    'ccp_alpha': [0, 0.001, 0.005, 0.010],
    'min_samples_split': [2, 10, 20, 30, 50],
    'min_samples_leaf': [1, 5, 10, 20, 30]
}

In [11]:
results = {}

for (data_type, exchange), df in data.items():

    cols_to_drop = ['origin_time', 'label']
    merged_df = pd.merge(ground_truth[cols_to_drop], df, on='origin_time', how='inner')
    X = merged_df.drop(cols_to_drop, axis=1)
    y = merged_df['label']
    results[(data_type, exchange)] = process_dataset(param_distributions, X, y)

    pd.DataFrame.to_pickle(results[(data_type, exchange)], os.path.join(PROCESSED_DATA_PATH, f"{exchange}_{data_type}_random_forest_result.pkl"))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [32]:
for (data_type, exchange), df in data.items():
    print(results[(data_type, exchange)]['best_params'])
    print(results[(data_type, exchange)]['best_score'])
    print(results[(data_type, exchange)]['nested_scores'])
    print(results[(data_type, exchange)]['evaluation']['accuracy'])
    print(results[(data_type, exchange)]['evaluation']['classification_report'])
    print(results[(data_type, exchange)]['evaluation']['confusion_matrix'])

{'n_estimators': 50, 'min_samples_split': 20, 'min_samples_leaf': 30, 'criterion': 'gini', 'ccp_alpha': 0}
0.6994329652399247
[0.69926335 0.69905948 0.69958954 0.70004757 0.69868841]
0.6978092374827162
              precision    recall  f1-score        support
positive       0.392096  0.087099  0.142536   24719.000000
neutral        0.721360  0.973015  0.828499  108395.000000
negative       0.401778  0.097564  0.157003   24548.000000
accuracy       0.697809  0.697809  0.697809       0.697809
macro avg      0.505078  0.385893  0.376012  157662.000000
weighted avg   0.619977  0.697809  0.616398  157662.000000
               pred:positive  pred:neutral  pred:negative
true:positive           2153         20479           2087
true:neutral            1446        105470           1479
true:negative           1892         20261           2395
{'n_estimators': 50, 'min_samples_split': 50, 'min_samples_leaf': 20, 'criterion': 'gini', 'ccp_alpha': 0.01}
0.6934123855456638
[0.69266028 0.69565153 0

In [None]:
# results = {}
# for (data_type, exchange), df in merged_df.items():
#     future = client.submit(process_dataset(data_type, exchange, param_distributions, merged_df[(CANDLES, BINANCE)]))
#     results[(data_type, exchange)] = future.compute()

In [None]:
# for (data_type, exchange), result in zip(merged_df.keys(), results):
#     print(f"Results for {exchange} {data_type}:")
#     print(f"Best parameters for {exchange} {data_type}: {result['best_params']}")
#     print(f'Nested CV Accuracy: {result["nested_scores"].mean():.2f}')
#     print(f'Test Set Accuracy: {result["evaluation"]["accuracy"]:.2f}')
#     print(f'Classification Report:')
#     display(result['evaluation']['classification_report'])
#     print(f'Confusion Matrix:')
#     display(result['evaluation']['confusion_matrix'])

#     plot_tree_learning_curves(exchange, data_type, result['depths'], result['train_scores'], result['test_scores'], 'random_forest')

#     pd.DataFrame.to_pickle(result, os.path.join(PROCESSED_DATA_PATH, f'{exchange}_{data_type}_random_forest_results.pkl'))

In [None]:
# for (data_type, exchange), df in merged_df.items():
#     result = process_dataset(data_type, exchange, param_distributions, df)
#     print(f"Results for {exchange} {data_type}:")
#     print(f"Best parameters: {result['best_params']}")
#     print(f'Nested CV Accuracy: {result["nested_scores"].mean():.2f}')
#     print(f'Accuracy: {result["evaluation"]["accuracy"]:.2f}')
#     print(f'Classification Report:')
#     display(result['evaluation']['classification_report'])
#     print(f'Confusion Matrix:')
#     display(result['evaluation']['confusion_matrix'])
#     plot_tree_learning_curves(exchange, data_type, result['depths'], result['train_scores'], result['test_scores'], 'random_forest')
#     pd.DataFrame.to_pickle(result, os.path.join(PROCESSED_DATA_PATH, f'{exchange}_{data_type}_random_forest_results.pkl'))