In [None]:
import os, sys
import pandas as pd

# Put the parent directory on the import path so the `src.*` imports that
# follow in this cell can be resolved.
src_path = os.path.abspath('..')
if src_path not in sys.path:
    sys.path.append(src_path)

# Stop Python from writing .pyc files for the modules imported from here on.
sys.dont_write_bytecode = True
from src.utils.utils import *
from src.features.build_features import *
from src.models.predict_model import *
from src.models.train_model import *
from src.visualization.visualize import *

from src.utils.constants import *

In [None]:
# Directories used by this notebook; they are created up front so that later
# cells can write into them without checking first.
paths = [
    EXTERNAL_DATA_PATH,
    INTERIM_DATA_PATH,
    PROCESSED_DATA_PATH,
    os.path.dirname(GROUND_TRUTH_PATH),
    os.path.dirname(GROUND_TRUTH_SUMMARY),
    FIGURE_PATH
]

for path in paths:
    # os.path.dirname() returns '' for a bare filename (i.e. the current
    # directory); os.makedirs('') would raise, so there is nothing to create.
    if not path:
        continue
    # exist_ok=True keeps the cell idempotent and avoids the race between a
    # separate existence check and the creation call.
    os.makedirs(path, exist_ok=True)

# Settings

# Data inspection

In [None]:
# Ground truth summary: display the row labelled 0 of the summary table as a
# single horizontal row.

ground_truth_threshold = pd.read_parquet(GROUND_TRUTH_SUMMARY)

# .loc[0] yields a Series; as a one-column frame it can be transposed so each
# summary field becomes a column.
gtt_df = ground_truth_threshold.loc[0].to_frame()
gtt_df.T

In [None]:
# Ground truth: load it, report size / columns / time span, then drop the rows
# whose 'null' flag is set.
print("Ground truth")

ground_truth = pd.read_parquet(GROUND_TRUTH_PATH)
print("Length:", len(ground_truth))

ground_truth_features = list(ground_truth.columns)
print("Features", ground_truth_features)

# Ground truth null values

null_flag_set = ground_truth['null'] == True
ground_truth_null = ground_truth.loc[null_flag_set]

origin_times = ground_truth['origin_time']
print(f"Date range: {origin_times.min()} - {origin_times.max()}")
print(f"Null values length: {len(ground_truth_null)}")

# Keep every row that is not explicitly flagged (missing flags are kept too,
# because the comparison is `!= True`).
ground_truth = ground_truth.loc[ground_truth['null'] != True]
print(f"Ground truth new length: {len(ground_truth)}")

In [None]:
# Exchanges summary: load the candles and order-book files of every exchange
# into `data` (keyed by (data type, exchange)) and tabulate row counts and the
# covered time span.

data = {}
summary_data = []

for exchange in EXCHANGES:
    candles_path = os.path.join(EXTERNAL_DATA_PATH, f'BTC-{exchange}_candles.parquet')
    orderbooks_path = os.path.join(EXTERNAL_DATA_PATH, f'BTC-{exchange}_orderbook.parquet')

    candles = pd.read_parquet(candles_path)
    orderbooks = pd.read_parquet(orderbooks_path)

    # Candles are inserted before order books for each exchange.
    data[(CANDLES, exchange)] = candles
    data[(ORDERBOOKS, exchange)] = orderbooks

    candle_times = candles['origin_time']
    orderbook_times = orderbooks['origin_time']

    summary_data.append({
        'Exchange': exchange,
        'Candles Length': len(candles),
        'Candles Date Range Start': candle_times.min(),
        'Candles Date Range End': candle_times.max(),
        'Orderbook Length': len(orderbooks),
        'Orderbook Date Range Start': orderbook_times.min(),
        'Orderbook Date Range End': orderbook_times.max(),
    })

summary_df = pd.DataFrame(summary_data)
display(summary_df)

In [None]:
# Datasets features: list the feature names of one candles frame and one
# order-book frame.
# NOTE(review): this inspects a single exchange per data type and presumably
# assumes all exchanges share the same columns — confirm.

# Select the representative frames by data type instead of by position in the
# dict, so the result does not depend on the order in which `data` was filled.
first_candles_key = next(key for key in data if key[0] == CANDLES)
first_orderbooks_key = next(key for key in data if key[0] == ORDERBOOKS)

candles_features = get_features(data[first_candles_key])
print(f"Candles features len:\t{len(candles_features)}\n{candles_features}")
orderbook_features = get_features(data[first_orderbooks_key])
print(f"Orderbooks features len:\t{len(orderbook_features)}\n{orderbook_features}")

In [None]:
# Null values summary: one summary entry per exchange, collected separately
# for candles and for order books.

candles_summary = []
orderbooks_summary = []

# Route each frame's summary to the list that matches its data type.
summary_by_type = {
    CANDLES: candles_summary,
    ORDERBOOKS: orderbooks_summary,
}

for (data_type, exchange), df in data.items():
    target = summary_by_type.get(data_type)
    if target is not None:
        target.append(get_dataframe_null_summary(df, exchange))

display(pd.DataFrame(candles_summary))
display(pd.DataFrame(orderbooks_summary))

# Preprocessing

In [None]:
# Clean every frame in `data` (drop rows flagged as null, then drop the flag
# column), scale the remaining features and persist the scaled frame.
scaled_data = {}

for (data_type, exchange), df in data.items():

    # Remove null values, drop null feature.
    # The filter and the column drop are chained on the same frame; applying
    # them as two independent assignments from `df` would let the second one
    # discard the row filter. The column check keeps the cell re-runnable once
    # 'null' has already been removed.
    if 'null' in df.columns:
        df = df.loc[df['null'] != True].drop(columns=['null'])
        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over data.items().
        data[(data_type, exchange)] = df

    # Report the length of the cleaned frame, not of the raw one.
    print(f"Exchange: {exchange} - new {data_type} length: {len(df)}")

    # Scale data (the timestamp column is not a feature and is left out)

    features = df.drop(columns=['origin_time'])
    scaled_data[(data_type, exchange)] = standard_scale(features)

    scaled_data[(data_type, exchange)].to_parquet(
        os.path.join(INTERIM_DATA_PATH, f'{exchange}_{data_type}_scaled.parquet')
    )

In [None]:
# Correlation matrix: plot it for every scaled frame and list the feature
# pairs whose correlation exceeds CORRELATION_THRESHOLD.

for (data_type, exchange), df in scaled_data.items():
    correlation_matrix = df.corr()

    plot_correlation_matrix(data_type, exchange, correlation_matrix)

    # Flatten the matrix into (feature, feature) -> correlation, strongest first.
    ranked_pairs = correlation_matrix.unstack().sort_values(kind="quicksort", ascending=False)

    # NOTE(review): entries equal to exactly 1 are excluded by value; this is
    # presumably meant to remove each feature's correlation with itself, but it
    # also removes distinct pairs that are perfectly correlated — confirm.
    # Each remaining pair appears twice, as (a, b) and (b, a).
    above_threshold = ranked_pairs > CORRELATION_THRESHOLD
    not_exactly_one = ranked_pairs != 1
    highly_correlated_pairs = ranked_pairs[not_exactly_one & above_threshold]

    print("Highly correlated pairs:")
    display(pd.DataFrame(highly_correlated_pairs))

In [None]:
# PCA and explained variance: fit a PCA per (data type, exchange) on the
# scaled features, store the transformed data together with its timestamps and
# plot the explained variance.

pca = {}
pca_data = {}

for (data_type, exchange), df in data.items():
    key = (data_type, exchange)
    scaled = scaled_data[key]

    pca[key], explained_variance, cumulative_variance = perform_pca(scaled, PCA_VARIANCE_THRESHOLD)

    pca_transformed_df = pd.DataFrame(pca[key].transform(scaled))
    # The timestamps are attached positionally (.values), which relies on
    # scaled_data[key] having the same rows in the same order as data[key].
    pca_transformed_df['origin_time'] = df['origin_time'].values
    pca_data[key] = pca_transformed_df

    # NOTE(review): the component columns carry integer labels while
    # 'origin_time' is a string; some pandas/pyarrow versions reject non-string
    # column names when writing parquet — confirm for the pinned versions.
    pca_transformed_df.to_parquet(os.path.join(INTERIM_DATA_PATH, f"{exchange}_{data_type}_pca_data.parquet"))

    plot_pca_variance(data_type, exchange, explained_variance, cumulative_variance)

## Candles matrices

These matrices primarily include various market indicators and technical analysis features such as SMA (Simple Moving Average), EMA (Exponential Moving Average), RSI (Relative Strength Index), MACD (Moving Average Convergence Divergence), etc.
#### There are high correlations among similar indicators
There's a visible high correlation among similar types of indicators, especially among different periods of moving averages (SMA, EMA). This is expected as these indicators are derived from the price and tend to move together.

#### Potential Redundancy
High correlations (close to 1) suggest redundancy among features. For instance, SMA and EMA values that are calculated over similar time frames may provide overlapping information which could be redundant in predictive modeling.

#### Diverse Relationships
Some features show moderate to low correlations, suggesting that they capture different aspects of the market behavior. These features can add valuable diversity to models.

## Orderbooks matrices

These matrices represent features related to the sizes of bids and asks at different levels in an order book. The periodic patterns indicate:

#### Alternating High/Low Correlation
The alternating pattern of high and low correlations suggests a structured dependency in order sizes, possibly alternating between bid and ask sizes or different levels of depth in the order book.

#### Structured Market Dynamics
The structured high correlations (red squares) alternating with lower correlations might indicate typical behaviors in how bids and asks are placed and modified in relation to each other. These patterns might reflect strategic placing/removal of orders at certain levels, influenced by market conditions.

In [None]:
# PCA Loadings: how the original features contribute to the principal components.
# For every scaled frame the loadings are computed from its fitted PCA, saved
# to the interim data folder and shown as a heatmap.

pca_loadings = {}

for key, scaled in scaled_data.items():
    data_type, exchange = key

    loadings = compute_loadings(pca[key], scaled)
    pca_loadings[key] = loadings

    loadings_file = os.path.join(INTERIM_DATA_PATH, f'{exchange}_{data_type}_pca_loadings.parquet')
    loadings.to_parquet(loadings_file)

    plot_loadings_heatmap(data_type, exchange, loadings)

In [None]:
# Merge ground truth with candles and orderbooks.
# Each entry holds the inner join on 'origin_time' ('full'), the feature
# columns only ('X') and the label column ('y').
merged_df = {}

label_columns = ['origin_time', 'label']

for (data_type, exchange), df in data.items():
    labelled = pd.merge(ground_truth[label_columns], df, on='origin_time', how='inner')

    merged_df[(data_type, exchange)] = {
        'full': labelled,
        'X': labelled.drop(label_columns, axis=1),
        'y': labelled['label'],
    }

    labelled.to_parquet(os.path.join(INTERIM_DATA_PATH, f'{exchange}_{data_type}_merged.parquet'))

In [None]:
# Information gain of the features ('X') with respect to the label ('y') for
# every merged dataset; each result is saved to the interim data folder.
information_gain = {}

for key, split in merged_df.items():
    data_type, exchange = key

    gain = get_information_gain(split['X'], split['y'])
    information_gain[key] = gain

    gain.to_parquet(os.path.join(INTERIM_DATA_PATH, f"{exchange}_{data_type}_information_gain.parquet"))

In [None]:
# Combine the PCA loadings with the information gain of the same dataset and
# show the resulting table for each (data type, exchange).
best_features = {}

for key, loadings in pca_loadings.items():
    scores = compare_features_scores(loadings, information_gain[key])
    best_features[key] = scores
    display(scores)

In [None]:
# Print every feature-score table ordered by combined score, best first.
for scores in best_features.values():
    ranked = scores.sort_values(by='Combined_Scores', ascending=False)
    print(ranked)

In [None]:
# Plot the distribution of the three score columns for every dataset.
score_columns = ['Loadings_Norm', 'Information_Gain', 'Combined_Scores']
score_colors = ['blue', 'red', 'green']

for data_type, exchange in pca_loadings:
    plot_histogram_density(data_type, exchange, best_features[(data_type, exchange)], score_columns, score_colors)

In [None]:
# Count, per dataset, the features whose combined score reaches a percentile
# cut-off.
# NOTE(review): the value is kept at 80 to preserve existing results, although
# the previous variable name suggested the 90th percentile — confirm which one
# is intended.
SELECTION_PERCENTILE = 80

for (data_type, exchange), df in best_features.items():
    score_threshold = np.percentile(df['Combined_Scores'], SELECTION_PERCENTILE)

    # Select features whose score is at or above the cut-off
    selected_features = df[df['Combined_Scores'] >= score_threshold]

    # Count the number of selected features
    num_selected_features = len(selected_features)
    print(f"{exchange}-{data_type} - Number of selected features: {num_selected_features}")