In [1]:
import sys
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

# Put the parent of the current working directory on the import path so the
# `src` package imported below can be resolved.
src_path = os.path.abspath(os.pardir)
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

from src.utils.constants import *
from src.visualization.visualize import *

In [2]:
# `gt_to_merge` holds every column of the ground-truth table; `ground_truth`
# is an independent frame narrowed to the join key and the target label.
gt_to_merge = pd.read_parquet(GROUND_TRUTH_PATH)
ground_truth = gt_to_merge[['origin_time', 'label']].copy()

In [3]:
# Load one pickled PCA frame per (data_type, exchange) combination.
# Insertion order is: for each exchange, candles first, then orderbooks.
imported_data = {}

for exchange in EXCHANGES:
    for data_type in (CANDLES, ORDERBOOKS):
        pkl_path = os.path.join(INTERIM_DATA_PATH, f'{exchange}_{data_type}_pca_data.pkl')
        imported_data[(data_type, exchange)] = pd.read_pickle(pkl_path)

In [4]:
# `imported_data` is keyed by (data_type, exchange) tuples, so the key must be
# unpacked in that order; unpacking it the other way round swaps the two
# labels in the printed line.
for (data_type, exchange), df in imported_data.items():
    print(f'{exchange} {data_type}: {df.shape}')

candles BINANCE: (525541, 6)
orderbooks BINANCE: (479971, 38)
candles HUOBI: (405542, 6)
orderbooks HUOBI: (393120, 38)
candles OKX: (405554, 6)
orderbooks OKX: (393120, 38)


In [5]:
merged_df = {}
# Despite the name, this list is used twice: to select the key/target columns
# from `ground_truth` for the join, and to drop them again when building X.
cols_to_drop = ['origin_time', 'label']

for (data_type, exchange), pca_frame in imported_data.items():
    # Inner join keeps only timestamps present in both the labels and the features.
    joined = pd.merge(ground_truth[cols_to_drop], pca_frame, on='origin_time', how='inner')
    merged_df[(data_type, exchange)] = {
        'full': joined,
        'X': joined.drop(cols_to_drop, axis=1),
        'y': joined['label'],
    }

In [6]:
data = {}
split_names = ('X_train', 'X_test', 'y_train', 'y_test')

# NOTE(review): train_test_split shuffles by default; the rows look
# time-indexed (`origin_time`), so a chronological split may be more
# appropriate - confirm before relying on the test metrics.
for key, parts in merged_df.items():
    splits = train_test_split(
        parts['X'],
        parts['y'],
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
    )
    # train_test_split returns (X_train, X_test, y_train, y_test) in that order.
    data[key] = dict(zip(split_names, splits))

In [7]:
def _scores_by_depth(cv_results):
    """Average the CV train/test scores of all sampled candidates per ``max_depth``.

    Parameters
    ----------
    cv_results : pd.DataFrame
        ``RandomizedSearchCV.cv_results_`` as a frame. Must contain the columns
        ``param_max_depth``, ``mean_train_score`` and ``mean_test_score``
        (the train column only exists when the search was created with
        ``return_train_score=True``).

    Returns
    -------
    tuple[list, list, list]
        The depths that were actually sampled (ascending) and the matching
        mean train and mean test scores. Depths the random search never drew
        are omitted instead of raising.
    """
    per_depth = (
        cv_results
        .assign(depth=cv_results['param_max_depth'].astype(int))
        .groupby('depth')[['mean_train_score', 'mean_test_score']]
        .mean()
        .sort_index()
    )
    return (
        per_depth.index.tolist(),
        per_depth['mean_train_score'].tolist(),
        per_depth['mean_test_score'].tolist(),
    )


best_params = {}
best_score = {}
evaluation = {}

# Search space for the randomized hyper-parameter search.
param_distributions = {
    'criterion': [RANDOM_FOREST_CRITERION],
    'n_estimators': [100, 200, 300],
    'max_depth': range(1, 20),
    'min_samples_split': [10, 20, 50, 100],
    'min_samples_leaf': [10, 20, 30, 50]
}

for (data_type, exchange), df in data.items():
    # Use the shared seed constant (same one as the train/test split) instead
    # of a hard-coded literal.
    clf = RandomForestClassifier(random_state=RANDOM_STATE)

    # return_train_score=True is required: without it cv_results_ has no
    # 'mean_train_score' column and the learning-curve step below fails.
    randomized_search = RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_distributions,
        n_iter=50,
        cv=CV_FOLDS,
        scoring='accuracy',
        n_jobs=-1,
        random_state=RANDOM_STATE,
        return_train_score=True,
    )

    # Fit the random search to the training split
    randomized_search.fit(df['X_train'], df['y_train'])

    # Best parameters and cross-validated score
    best_params[(data_type, exchange)] = randomized_search.best_params_
    best_score[(data_type, exchange)] = randomized_search.best_score_
    print(f"Best parameters for {exchange} {data_type}: {best_params[(data_type, exchange)]}")

    # Use the best estimator found by RandomizedSearchCV (refit on the full training split)
    best_clf = randomized_search.best_estimator_

    # Predict on the held-out test set; the predictions are stored next to the splits
    df['y_pred'] = best_clf.predict(df['X_test'])

    evaluation[(data_type, exchange)] = {}

    # Accuracy on the test set
    evaluation[(data_type, exchange)]['accuracy'] = accuracy_score(df['y_test'], df['y_pred'])

    # Classification report as a DataFrame.
    # NOTE(review): target_names are applied to the labels in sorted order;
    # this assumes the sorted label values correspond to
    # positive/neutral/negative - confirm against the label encoding.
    report = classification_report(df['y_test'], df['y_pred'], target_names=['positive', 'neutral', 'negative'], digits=2, output_dict=True)
    evaluation[(data_type, exchange)]['classification_report'] = pd.DataFrame(report).transpose()

    # Confusion matrix as a DataFrame.
    # NOTE(review): assumes all three classes occur in y_test/y_pred,
    # otherwise the matrix is not 3x3 and the constructor raises.
    evaluation[(data_type, exchange)]['confusion_matrix'] = pd.DataFrame(
        confusion_matrix(df['y_test'], df['y_pred']),
        index=['true:positive', 'true:neutral', 'true:negative'],
        columns=['pred:positive', 'pred:neutral', 'pred:negative']
    )
    # NOTE(review): this pickles the whole cumulative `evaluation` dict (every
    # combination processed so far) into a per-combination file. Kept as-is
    # because downstream readers are not visible here - confirm whether only
    # evaluation[(data_type, exchange)] was intended.
    pd.to_pickle(evaluation, os.path.join(PROCESSED_DATA_PATH, f'{exchange}_{data_type}_evaluation.pkl'))

    # Display results
    print(f'{exchange} {data_type} Accuracy: {evaluation[(data_type, exchange)]["accuracy"]:.2f}')
    print(f'{exchange} {data_type} Classification Report:')
    display(evaluation[(data_type, exchange)]['classification_report'])
    print(f'{exchange} {data_type} Confusion Matrix:')
    display(evaluation[(data_type, exchange)]['confusion_matrix'])

    # Learning curves over max_depth, reusing the search's CV results.
    # Only depths that were actually sampled are plotted (50 random draws do
    # not necessarily cover every depth), and candidates sharing a depth are
    # averaged rather than arbitrarily taking the first one.
    cv_results = pd.DataFrame(randomized_search.cv_results_)
    depths, train_scores, test_scores = _scores_by_depth(cv_results)

    plot_tree_learning_curves(exchange, data_type, depths, train_scores, test_scores, 'random_forest')

    # Persist the accumulated dicts (keyed by (data_type, exchange)) after each
    # iteration. Writing only the current entry to these fixed file names would
    # overwrite the previous combinations and keep just the last one.
    pd.to_pickle(best_params, os.path.join(PROCESSED_DATA_PATH, 'random_forest_best_params.pkl'))
    pd.to_pickle(best_score, os.path.join(PROCESSED_DATA_PATH, 'random_forest_best_score.pkl'))


In [None]:
# from sklearn.model_selection import cross_val_score

# for (data_type, exchange), df in data.items():
#     # Performing 10-fold cross-validation
#     scores = cross_val_score(clf[(data_type, exchange)], X[(data_type, exchange)], y[(data_type, exchange)], cv=10)
#     print("Average cross-validation score: {:.2f}".format(scores.mean()))
