In [1]:
import sys
import pandas as pd
from dask.dataframe import from_pandas

sys.dont_write_bytecode = True
from src.utils.utils import *
from src.features.build_features import *
from src.models.predict_model import *
from src.models.train_model import *
from src.visualization.visualize import *

from src.utils.constants import *

In [3]:
paths = [
    EXTERNAL_DATA_PATH,
    INTERIM_DATA_PATH,
    PROCESSED_DATA_PATH,
    os.path.dirname(GROUND_TRUTH_PATH),
    os.path.dirname(GROUND_TRUTH_SUMMARY),
    FIGURE_PATH
]

for path in paths:
    if not os.path.exists(path):
        os.makedirs(path)

# Settings

# Data inspection

In [4]:
# Ground truth summary

ground_truth_threshold = pd.read_parquet(GROUND_TRUTH_SUMMARY)

gtt_df = pd.DataFrame(ground_truth_threshold.loc[0])
gtt_df.T

Unnamed: 0,crypto,positive_Threshold,negative_Threshold,label,percentage
0,BTC,0.000343,-0.00034,"[positive, neutral, negative]","[15.497202877040758, 68.96639646839441, 15.536..."


In [51]:
# Ground truth
print("Ground truth")

ground_truth = pd.read_parquet(GROUND_TRUTH_PATH)
print("Length:", len(ground_truth))

ground_truth_features = [x for x in ground_truth.columns]
print("Features", ground_truth_features)

# Ground truth null values

ground_truth_null = ground_truth.loc[ground_truth['null'] == True]

print(f"Date range: {ground_truth['origin_time'].min()} - {ground_truth['origin_time'].max()}")
print(f"Null values length: {len(ground_truth_null)}")

ground_truth = ground_truth.loc[ground_truth['null'] != True]
print(f"Ground truth new length: {len(ground_truth)}")

Ground truth
Length: 525540
Features ['origin_time', 'null', 'close', 'next_change', 'label']
Date range: 2022-10-01 00:59:00 - 2023-09-30 23:58:00
Null values length: 1439
Ground truth new length: 524101


In [52]:
# Data cleaning

ground_truth.drop(columns=['null', 'close'], inplace=True)
ground_truth['next_change'] = standard_scale(ground_truth[['next_change']])

# Ground truth one hot encoding

ground_truth_one_hot = pd.get_dummies(ground_truth['label'], prefix='label')
ground_truth = pd.concat([ground_truth, ground_truth_one_hot], axis=1)
ground_truth.drop(columns=['label'], inplace=True)

display(ground_truth.head(2))

pd.DataFrame.to_pickle(ground_truth, os.path.join(PROCESSED_DATA_PATH, "ground_truth.pkl"))

Unnamed: 0,origin_time,next_change,label_positive,label_neutral,label_negative
0,2022-10-01 00:59:00,-1.084448,False,False,True
1,2022-10-01 01:00:00,-0.16116,False,True,False


In [6]:
# Exchanges summary

data = {}

summary_data = []

for exchange in EXCHANGES:
    candles_path = os.path.join(EXTERNAL_DATA_PATH, f'BTC-{exchange}_candles.parquet')
    orderbooks_path = os.path.join(EXTERNAL_DATA_PATH, f'BTC-{exchange}_orderbook.parquet')
    
    data[(CANDLES, exchange)] = pd.read_parquet(candles_path)
    data[(ORDERBOOKS, exchange)] = pd.read_parquet(orderbooks_path)
    
    summary_data.append({
        'Exchange': exchange,
        'Candles Length': len(data[(CANDLES, exchange)]),
        'Candles Date Range Start': data[(CANDLES, exchange)]['origin_time'].min(),
        'Candles Date Range End': data[(CANDLES, exchange)]['origin_time'].max(),
        'Orderbook Length': len(data[(ORDERBOOKS, exchange)]),
        'Orderbook Date Range Start': data[(ORDERBOOKS, exchange)]['origin_time'].min(),
        'Orderbook Date Range End': data[(ORDERBOOKS, exchange)]['origin_time'].max(),
    })

summary_df = pd.DataFrame(summary_data)
display(summary_df)

Unnamed: 0,Exchange,Candles Length,Candles Date Range Start,Candles Date Range End,Orderbook Length,Orderbook Date Range Start,Orderbook Date Range End
0,BINANCE,525541,2022-10-01 00:59:00,2023-09-30 23:59:00,479971,2022-11-01 16:29:00,2023-09-30 23:59:00
1,HUOBI,405542,2022-12-23 08:58:00,2023-09-30 23:59:00,393120,2023-01-01 00:00:00,2023-09-30 23:59:00
2,OKX,405554,2022-12-23 08:46:00,2023-09-30 23:59:00,393120,2023-01-01 00:00:00,2023-09-30 23:59:00


In [7]:
# Datasets features

first_candles_key = list(data.keys())[0]
first_orderbooks_key = list(data.keys())[1]

candles_fetures = get_features(data[first_candles_key])
print(f"Candles features len:\t{len(candles_fetures)}\n{candles_fetures}")
orderbook_features = get_features(data[first_orderbooks_key])
print(f"Orderbooks features len:\t{len(orderbook_features)}\n{orderbook_features}")

Candles features len:	26
['origin_time', 'open', 'high', 'low', 'close', 'volume', 'trades', 'SMA_60', 'SMA_30', 'SMA_15', 'SMA_5', 'EMA_60', 'EMA_30', 'EMA_15', 'EMA_5', 'RSI_14', 'STOCH_H', 'MACD_H', 'BBL_20_2', 'BBM_20_2', 'BBH_20_2', 'MOM_10', 'CMO_9', 'ULTOSC', 'KAMA_30', 'null']
Orderbooks features len:	82
['origin_time', 'bid_0_price', 'bid_0_size', 'bid_1_price', 'bid_1_size', 'bid_2_price', 'bid_2_size', 'bid_3_price', 'bid_3_size', 'bid_4_price', 'bid_4_size', 'bid_5_price', 'bid_5_size', 'bid_6_price', 'bid_6_size', 'bid_7_price', 'bid_7_size', 'bid_8_price', 'bid_8_size', 'bid_9_price', 'bid_9_size', 'bid_10_price', 'bid_10_size', 'bid_11_price', 'bid_11_size', 'bid_12_price', 'bid_12_size', 'bid_13_price', 'bid_13_size', 'bid_14_price', 'bid_14_size', 'bid_15_price', 'bid_15_size', 'bid_16_price', 'bid_16_size', 'bid_17_price', 'bid_17_size', 'bid_18_price', 'bid_18_size', 'bid_19_price', 'bid_19_size', 'ask_0_price', 'ask_0_size', 'ask_1_price', 'ask_1_size', 'ask_2_price

In [8]:
# Null values summary

candles_summary = []
orderbooks_summary = []

for (data_type, exchange), df in data.items():
    if data_type == CANDLES:
        dataset_df_summary = get_dataframe_null_summary(df, exchange)
        candles_summary.append(dataset_df_summary)
    elif data_type == ORDERBOOKS:
        dataset_df_summary = get_dataframe_null_summary(df, exchange)
        orderbooks_summary.append(dataset_df_summary)

display(pd.DataFrame(candles_summary))
display(pd.DataFrame(orderbooks_summary))

Unnamed: 0,Exchange,Total Entries,Null Entries,Null Percentage (%),origin_time,open,high,low,close,volume,...,STOCH_H,MACD_H,BBL_20_2,BBM_20_2,BBH_20_2,MOM_10,CMO_9,ULTOSC,KAMA_30,null
0,BINANCE,525541,2600,0.00495,0,0,0,0,0,2837,...,0,0,0,0,0,3492,2251,2727,0,522941
1,HUOBI,405542,4,1e-05,0,0,0,0,0,1222,...,16,0,0,0,0,827,0,0,0,405538
2,OKX,405554,1408,0.00347,0,0,0,0,0,1572,...,0,0,0,0,0,2975,1069,1424,0,404146


Unnamed: 0,Exchange,Total Entries,Null Entries,Null Percentage (%),origin_time,bid_0_price,bid_0_size,bid_1_price,bid_1_size,bid_2_price,...,ask_15_size,ask_16_price,ask_16_size,ask_17_price,ask_17_size,ask_18_price,ask_18_size,ask_19_price,ask_19_size,null
0,BINANCE,479971,2739,0.00571,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,477232
1,HUOBI,393120,54316,0.13817,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,338804
2,OKX,393120,934,0.00238,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,392186


# Preprocessing

In [18]:
for (data_type, exchange), df in data.items():
    display(df.head(2))

Unnamed: 0,origin_time,open,high,low,close,volume,trades,SMA_60,SMA_30,SMA_15,...,STOCH_H,MACD_H,BBL_20_2,BBM_20_2,BBH_20_2,MOM_10,CMO_9,ULTOSC,KAMA_30,null
0,2022-10-01 00:59:00,19389.949219,19402.890625,19388.789062,19402.880859,97.72231,1360,19420.161816,19390.319466,19384.258594,...,4.654771,4.225298,19353.981577,19384.593359,19415.205142,21.361328,16.449292,62.816524,19404.818674,False
1,2022-10-01 01:00:00,19401.839844,19402.880859,19384.980469,19390.060547,171.99713,1665,19419.7625,19388.869141,19385.421354,...,-2.218688,3.586515,19354.827081,19383.719922,19412.612763,8.179688,-0.993919,57.319306,19404.465252,False


Unnamed: 0,origin_time,bid_0_price,bid_0_size,bid_1_price,bid_1_size,bid_2_price,bid_2_size,bid_3_price,bid_3_size,bid_4_price,...,ask_15_size,ask_16_price,ask_16_size,ask_17_price,ask_17_size,ask_18_price,ask_18_size,ask_19_price,ask_19_size,null
0,2022-11-01 16:29:00,20430.31,0.45928,20430.3,0.017,20430.28,0.75656,20430.25,0.01019,20430.2,...,0.33599,20432.49,0.59007,20432.5,0.01033,20432.51,0.49064,20432.52,0.07531,False
1,2022-11-01 16:30:00,20437.44,0.00581,20437.14,0.16832,20437.08,0.5334,20437.07,0.0198,20437.04,...,0.08021,20439.53,0.20421,20439.54,0.057,20439.56,0.03428,20439.7,0.02273,False


Unnamed: 0,origin_time,open,high,low,close,volume,trades,SMA_60,SMA_30,SMA_15,...,STOCH_H,MACD_H,BBL_20_2,BBM_20_2,BBH_20_2,MOM_10,CMO_9,ULTOSC,KAMA_30,null
0,2022-12-23 08:58:00,16832.74,16832.75,16826.12,16827.83,1.610151,88,16833.9635,16834.492,16836.926,...,-18.01157,-1.063314,16829.158436,16836.8685,16844.578564,-10.3,-43.670269,30.492535,16832.870645,False
1,2022-12-23 08:59:00,16827.82,16833.64,16827.81,16833.64,2.309176,79,16833.7815,16834.658333,16836.904,...,-12.47351,-0.991988,16829.32945,16836.9255,16844.52155,-8.9,-8.828931,46.130679,16832.88,False


Unnamed: 0,origin_time,bid_0_price,bid_0_size,bid_1_price,bid_1_size,bid_2_price,bid_2_size,bid_3_price,bid_3_size,bid_4_price,...,ask_15_size,ask_16_price,ask_16_size,ask_17_price,ask_17_size,ask_18_price,ask_18_size,ask_19_price,ask_19_size,null
0,2023-01-01 00:00:00,16540.99,0.770922,16540.75,0.3,16540.71,0.056112,16540.13,0.725767,16540.12,...,2.702179,16547.21,0.09068,16547.3,0.031625,16547.31,0.022527,16547.52,0.2,False
1,2023-01-01 00:01:00,16539.29,0.56565,16539.04,0.006046,16538.8,0.006046,16538.56,0.006046,16538.31,...,0.36,16542.21,0.24,16542.45,0.04,16542.55,0.08,16542.96,0.236,False


Unnamed: 0,origin_time,open,high,low,close,volume,trades,SMA_60,SMA_30,SMA_15,...,STOCH_H,MACD_H,BBL_20_2,BBM_20_2,BBH_20_2,MOM_10,CMO_9,ULTOSC,KAMA_30,null
0,2022-12-23 08:46:00,16841.8,16841.8,16841.2,16841.2,0.318745,31,16841.055,16836.5,16839.36,...,-3.730738,0.713263,16831.350282,16838.45,16845.549718,3.4,12.086986,58.245871,16834.101427,False
1,2022-12-23 08:47:00,16841.3,16841.3,16841.2,16841.3,0.401436,17,16840.911667,16837.1,16839.68,...,-1.378751,0.548341,16831.5458,16838.69,16845.8342,3.4,12.802971,57.036327,16834.591311,False


Unnamed: 0,origin_time,bid_0_price,bid_0_size,bid_1_price,bid_1_size,bid_2_price,bid_2_size,bid_3_price,bid_3_size,bid_4_price,...,ask_15_size,ask_16_price,ask_16_size,ask_17_price,ask_17_size,ask_18_price,ask_18_size,ask_19_price,ask_19_size,null
0,2023-01-01 00:00:00,16547.6,0.485728,16547.2,0.011894,16547.0,0.748344,16546.9,0.444408,16546.8,...,0.004,16549.5,1.5113,16549.7,0.2,16550.1,0.1,16550.3,0.055259,False
1,2023-01-01 00:01:00,16544.3,0.550176,16544.2,0.00674,16543.9,0.010414,16543.8,0.530234,16543.7,...,1.5113,16548.1,0.191439,16548.4,0.285396,16548.5,0.713489,16548.7,0.457,False


In [50]:
scaled_data = {}

for (data_type, exchange), df in data.items():
    print(f"Exchange: {exchange} - {data_type} length: {len(df)}")

    # Remove null values, drop null feature

    df_non_null_values = df.loc[df['null'] != True]
    df_non_null_feature = df_non_null_values.drop(columns=['null'])
    
    print(f"Exchange: {exchange} - new {data_type} length: {len(df_non_null_feature)}")

    # Scale data
    
    df_no_origin_time = df_non_null_feature.drop(columns=['origin_time'])
    scaled_data[(data_type, exchange)] = standard_scale(df_no_origin_time)
    
    scaled_to_be_stored = scaled_data[(data_type, exchange)].copy()
    scaled_to_be_stored['origin_time'] = df_non_null_feature['origin_time']
    pd.DataFrame.to_parquet(scaled_to_be_stored, os.path.join(INTERIM_DATA_PATH, f"{exchange}_{data_type}_scaled.parquet"))

Exchange: BINANCE - candles length: 525541
Exchange: BINANCE - new candles length: 522941


Unnamed: 0,open,high,low,close,volume,trades,SMA_60,SMA_30,SMA_15,SMA_5,...,RSI_14,STOCH_H,MACD_H,BBL_20_2,BBM_20_2,BBH_20_2,MOM_10,CMO_9,ULTOSC,KAMA_30
0,-1.041427,-1.040203,-1.040142,-1.038659,-0.047591,-0.257661,-1.034891,-1.041321,-1.042636,-1.040881,...,0.278873,0.547092,0.86262,-1.042695,-1.042559,-1.042349,0.463645,0.536953,0.930479,-1.038222
1,-1.038878,-1.040205,-1.040959,-1.041407,0.398249,-0.141649,-1.034977,-1.041632,-1.042387,-1.040373,...,-0.158965,-0.260796,0.732206,-1.042514,-1.042747,-1.042904,0.175583,-0.042714,0.521482,-1.038297


Exchange: BINANCE - orderbooks length: 479971
Exchange: BINANCE - new orderbooks length: 477232


Unnamed: 0,bid_0_price,bid_0_size,bid_1_price,bid_1_size,bid_2_price,bid_2_size,bid_3_price,bid_3_size,bid_4_price,bid_4_size,...,ask_15_price,ask_15_size,ask_16_price,ask_16_size,ask_17_price,ask_17_size,ask_18_price,ask_18_size,ask_19_price,ask_19_size
0,-0.915737,-0.511076,-0.915719,-0.152638,-0.915706,0.359101,-0.915696,-0.139116,-0.915692,-0.112342,...,-0.915576,0.152928,-0.915585,0.323525,-0.915594,-0.135187,-0.915603,0.366027,-0.915611,-0.057532
1,-0.914203,-0.577886,-0.914247,-0.05276,-0.914243,0.200994,-0.914229,-0.133685,-0.91422,-0.155603,...,-0.914061,-0.05318,-0.91407,0.042746,-0.914079,-0.087605,-0.914086,-0.115994,-0.914066,-0.101065


Exchange: HUOBI - candles length: 405542
Exchange: HUOBI - new candles length: 405538


Unnamed: 0,open,high,low,close,volume,trades,SMA_60,SMA_30,SMA_15,SMA_5,...,RSI_14,STOCH_H,MACD_H,BBL_20_2,BBM_20_2,BBH_20_2,MOM_10,CMO_9,ULTOSC,KAMA_30
0,-2.496414,-2.497641,-2.496946,-2.497775,-0.137183,-0.08511,-2.495557,-2.495696,-2.495176,-2.495889,...,-1.048984,-2.023464,-0.210231,-2.490197,-2.495145,-2.499779,-0.223756,-1.382931,-1.36504,-2.496273
1,-2.49775,-2.4974,-2.496487,-2.496197,-0.035903,-0.110698,-2.495606,-2.495651,-2.495182,-2.496166,...,-0.231833,-1.401301,-0.196128,-2.49015,-2.495129,-2.499795,-0.194062,-0.290193,-0.345104,-2.49627


Exchange: HUOBI - orderbooks length: 393120
Exchange: HUOBI - new orderbooks length: 338804


Unnamed: 0,bid_0_price,bid_0_size,bid_1_price,bid_1_size,bid_2_price,bid_2_size,bid_3_price,bid_3_size,bid_4_price,bid_4_size,...,ask_15_price,ask_15_size,ask_16_price,ask_16_size,ask_17_price,ask_17_size,ask_18_price,ask_18_size,ask_19_price,ask_19_size
0,-2.800518,-0.047667,-2.800266,0.087804,-2.800147,-0.094893,-2.800146,0.34642,-2.799991,0.053043,...,-2.801222,1.859372,-2.801378,-0.212771,-2.801511,-0.311759,-2.801667,-0.320662,-2.801764,-0.127204
1,-2.801018,-0.075776,-2.800769,-0.085582,-2.800708,-0.127015,-2.800608,-0.15745,-2.800523,-0.214917,...,-2.802737,0.025819,-2.802849,-0.083113,-2.802937,-0.303419,-2.803067,-0.265857,-2.803106,-0.10031


Exchange: OKX - candles length: 405554
Exchange: OKX - new candles length: 404146


Unnamed: 0,open,high,low,close,volume,trades,SMA_60,SMA_30,SMA_15,SMA_5,...,RSI_14,STOCH_H,MACD_H,BBL_20_2,BBM_20_2,BBH_20_2,MOM_10,CMO_9,ULTOSC,KAMA_30
0,-2.501554,-2.502832,-2.500459,-2.501735,-0.231787,-0.222125,-2.501237,-2.502747,-2.502109,-2.50138,...,0.360386,-0.416163,0.138237,-2.497083,-2.50231,-2.507203,0.065406,0.384794,0.551564,-2.503539
1,-2.50169,-2.502968,-2.500459,-2.501708,-0.228565,-0.25868,-2.501276,-2.502584,-2.502022,-2.501587,...,0.375841,-0.153814,0.10625,-2.49703,-2.502244,-2.507126,0.065406,0.408184,0.468109,-2.503406


Exchange: OKX - orderbooks length: 393120
Exchange: OKX - new orderbooks length: 392186


Unnamed: 0,bid_0_price,bid_0_size,bid_1_price,bid_1_size,bid_2_price,bid_2_size,bid_3_price,bid_3_size,bid_4_price,bid_4_size,...,ask_15_price,ask_15_size,ask_16_price,ask_16_size,ask_17_price,ask_17_size,ask_18_price,ask_18_size,ask_19_price,ask_19_size
0,-2.922914,-0.20649,-2.922844,-0.241501,-2.922801,0.345459,-2.922749,0.14741,-2.922707,0.106469,...,-2.923468,-0.477529,-2.92349,1.372384,-2.923483,-0.167603,-2.923416,-0.438713,-2.923409,-0.439675
1,-2.923901,-0.191098,-2.923742,-0.24735,-2.923729,-0.163866,-2.923676,0.216117,-2.923634,0.088254,...,-2.923886,1.422383,-2.923909,-0.23953,-2.923872,-0.092908,-2.923894,0.480683,-2.923888,0.078345


In [None]:
# Correlation matrix

for (data_type, exchange), df in scaled_data.items():
    correlation_matrix = df.corr()
    
    plot_correlation_matrix(data_type, exchange, correlation_matrix)

    highly_correlated_pairs = correlation_matrix.unstack().sort_values(kind="quicksort", ascending=False)
    highly_correlated_pairs = highly_correlated_pairs[(highly_correlated_pairs != 1) & (highly_correlated_pairs > CORRELATION_THRESHOLD)]

    print("Highly correlated pairs:")
    display(pd.DataFrame(highly_correlated_pairs))

In [47]:
# PCA and explained variance

pca = {}
pca_data = {}

for (data_type, exchange), df in scaled_data.items():
    pca_fit, explained_variance, cumulative_variance = perform_pca(df, PCA_VARIANCE_THRESHOLD)

    pca_transformed = pca_fit.transform(df)
    pca_transformed_df = pd.DataFrame(pca_transformed)
    pca_transformed_df['origin_time'] = df['origin_time'].values

    pca_data[(data_type, exchange)] = {}
    pca_data[(data_type, exchange)] = pca_transformed_df

    pd.DataFrame.to_parquet(pca_data[(data_type, exchange)], os.path.join(INTERIM_DATA_PATH, f"{exchange}_{data_type}_pca_data.parquet"))

    plot_pca_variance(data_type, exchange, explained_variance, cumulative_variance)

Unnamed: 0,0,1,2,3,4
0,4.072248,-1.425993,-0.832263,0.196227,-0.350932
1,4.131692,-0.448744,-0.453845,-0.446913,-0.08697


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,-5.847353,-0.170574,-0.008292,0.051979,0.365381,-0.182759,-0.04718,0.146782,0.13471,-0.006214,...,0.000724,0.059995,0.189089,-0.234475,0.072252,-0.134166,0.005486,-0.026488,0.309547,-0.426545
1,-5.738043,2.186984,-2.464983,-0.737784,0.643677,-0.134798,-1.722882,0.480742,-2.055411,-2.53962,...,-2.212633,-2.659618,4.193672,0.930763,0.224282,-2.248556,0.004675,-1.10813,-0.245312,0.332633


Unnamed: 0,0,1,2,3,4
0,9.970286,2.329191,-0.271934,-1.897531,0.487464
1,9.979513,0.854882,-0.051722,-0.949368,0.866037


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,17.651409,-1.323814,-0.296391,0.270303,0.225537,-0.14356,0.521838,0.62295,-0.022481,-0.521688,...,1.286559,-0.769461,0.923547,0.105357,1.179183,0.030668,0.090124,-0.017436,-0.00933,-0.224525
1,17.634013,-1.581382,-0.388583,0.026518,0.194445,-0.54722,0.313486,0.094268,-0.534516,-0.00897,...,-0.150115,0.360578,0.036841,-0.105784,0.03491,-0.241776,0.275626,-0.72205,0.073596,-0.403396


Unnamed: 0,0,1,2,3,4
0,10.010785,-0.587351,-0.163834,-0.086835,0.607501
1,10.011316,-0.606014,-0.17584,0.091617,0.427919


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,18.4826,-0.685403,0.158733,0.018941,-0.212028,0.529342,-0.02516,0.481638,-0.729073,0.003704,...,0.475957,0.065035,-0.183573,-0.550048,-0.20228,-0.76795,-0.511815,0.120575,0.172582,0.147955
1,18.521184,-0.09843,0.462728,0.14775,0.886874,0.193805,0.108334,-0.081829,-0.133839,0.7333,...,-0.275349,0.070564,0.043493,0.266416,0.138052,-0.188521,-0.125603,-0.448511,-0.446876,-0.177104


## Candles matrices

These matrices primarily includes various market indicators and technical analysis features such as SMA (Simple Moving Average), EMA (Exponential Moving Average), RSI (Relative Strength Index), MACD (Moving Average Convergence Divergence), etc.
#### There are high correlations among similar indicators
There's a visible high correlation among similar types of indicators, especially among different periods of moving averages (SMA, EMA). This is expected as these indicators are derived from the price and tend to move together.

#### Potential Redundancy
High correlations (close to 1) suggest redundancy among features. For instance, SMA and EMA values that are calculated over similar time frames may provide overlapping information which could be redundant in predictive modeling.

#### Diverse Relationships
Some features show moderate to low correlations, suggesting that they capture different aspects of the market behavior. These features can add valuable diversity to models.

## Orderbooks matrices

These matrices represents features related to the sizes of bids and asks at different levels in an order book. The periodic patterns indicate:

#### Alternating High/Low Correlation
The alternating pattern of high and low correlations suggests a structured dependency in order sizes, possibly alternating between bid and ask sizes or different levels of depth in the order book.

#### Structured Market Dynamics
The structured high correlations (red squares) alternating with lower correlations might indicate typical behaviors in how bids and asks are placed and modified in relation to each other. These patterns might reflect strategic placing/removal of orders at certain levels, influenced by market conditions.

In [None]:
# PCA Loadings: how the original features contribute to the principal components

pca_loadings = {}

for (data_type, exchange), df in scaled_data.items():
    pca_loadings[(data_type, exchange)] = compute_loadings(pca[(data_type, exchange)], scaled_data[(data_type, exchange)])    
    plot_loadings_heatmap(data_type, exchange, pca_loadings[(data_type, exchange)])

In [None]:
# Merge ground truth with candles and orderbooks
merged_df = {}

for (data_type, exchange), df in data.items():
    merged_df[(data_type, exchange)] = {}
    merged_df[(data_type, exchange)]['full'] = pd.merge(ground_truth[['origin_time', 'label']], df, on='origin_time', how='inner')

    cols_to_drop = ['origin_time', 'label']

    merged_df[(data_type, exchange)]['X'] = merged_df[(data_type, exchange)]['full'].drop(cols_to_drop, axis=1)
    merged_df[(data_type, exchange)]['y'] = merged_df[(data_type, exchange)]['full']['label']

In [None]:
information_gain = {}

for (data_type, exchange), df in merged_df.items():

    # Inner merge ground truth with data on origin_time
    information_gain[(data_type, exchange)] = get_information_gain(df['X'], df['y'])

In [None]:
best_features = {}

for (data_type, exchange), df in pca_loadings.items():
    best_features[(data_type, exchange)] = compare_features_scores(df, information_gain[(data_type, exchange)])
    display(best_features[(data_type, exchange)])

In [None]:
for (data_type, exchange), df in best_features.items():
    print(best_features[(data_type, exchange)].sort_values(by='Combined_Scores', ascending=False))

In [None]:
for (data_type, exchange), df in pca_loadings.items():
    plot_histogram_density(data_type, exchange, best_features[(data_type, exchange)], ['Loadings_Norm', 'Information_Gain', 'Combined_Scores'], ['blue', 'red', 'green'])

In [None]:
for (data_type, exchange), df in best_features.items():
    percentile_90 = np.percentile(df['Combined_Scores'], 80)

    # Select features with scores above this percentile
    selected_features = df[df['Combined_Scores'] >= percentile_90]

    # Count the number of selected features
    num_selected_features = selected_features.shape[0]
    print(f"{exchange}-{data_type} - Number of selected features: {num_selected_features}")