In [None]:
import sys
import pandas as pd

sys.dont_write_bytecode = True
from src.utils.utils import *
from src.features.build_features import *
from src.visualization.visualize import *

from src.utils.constants import *
from functools import reduce

# Settings

In [None]:
# Notebook-level PCA switch. No cell visible in this section reads it
# — TODO confirm where (or whether) it is consumed.
USE_PCA = True

In [None]:
# Directories this notebook reads from or writes to. The two ground-truth
# constants go through os.path.dirname, so only their parent folders are listed.
paths = [
    EXTERNAL_DATA_PATH,
    INTERIM_DATA_PATH,
    PROCESSED_DATA_PATH,
    os.path.dirname(GROUND_TRUTH_PATH),
    os.path.dirname(GROUND_TRUTH_SUMMARY),
    FIGURE_PATH
]

for path in paths:
    # os.path.dirname returns '' for a bare file name; that already means the
    # current directory and os.makedirs('') would raise, so skip empty entries.
    if path:
        # exist_ok=True replaces the separate exists() check, so re-running the
        # cell (or a directory appearing in between) cannot raise FileExistsError.
        os.makedirs(path, exist_ok=True)

# Data inspection

In [None]:
# Ground truth summary: load the summary file and show its entry labelled 0

ground_truth_threshold = pd.read_parquet(GROUND_TRUTH_SUMMARY)

# Select the entry labelled 0, wrap it in a DataFrame and transpose it for display
first_summary_row = ground_truth_threshold.loc[0]
gtt_df = pd.DataFrame(first_summary_row)
gtt_df.T

In [None]:
# Ground truth: load the labels, report their size/columns and drop flagged rows
print("Ground truth")

ground_truth = pd.read_parquet(GROUND_TRUTH_PATH)
print("Length:", len(ground_truth))

ground_truth_features = list(ground_truth.columns)
print("Features", ground_truth_features)

# Ground truth null values: rows whose 'null' flag equals True
ground_truth_null = ground_truth.loc[ground_truth['null'] == True]

# The date range is reported on the full frame, i.e. before flagged rows are removed
print(f"Date range: {ground_truth['origin_time'].min()} - {ground_truth['origin_time'].max()}")
print(f"Null values length: {len(ground_truth_null)}")

# Keep every row whose flag is not True. The explicit .copy() makes the result an
# independent frame, so later in-place edits of `ground_truth` neither touch the
# loaded frame nor trigger pandas' SettingWithCopyWarning.
ground_truth = ground_truth.loc[ground_truth['null'] != True].copy()
print(f"Ground truth new length: {len(ground_truth)}")

In [None]:
# Data cleaning: remove the 'null', 'close' and 'next_change' columns

# Rebind instead of dropping in place: `ground_truth` is a filtered selection of
# the loaded frame, and in-place edits on such a selection can trigger pandas'
# SettingWithCopyWarning.
ground_truth = ground_truth.drop(columns=['null', 'close', 'next_change'])
ground_truth.head(2)

In [None]:
# Exchanges summary: read the candle and order-book files of every exchange
# into `data`, keyed by (exchange, data type)

data = {}

for exchange in EXCHANGES:
    # File name of each raw view for this exchange (candles first, then order book)
    raw_files = {
        CANDLES: f'BTC-{exchange}_candles.parquet',
        ORDERBOOK: f'BTC-{exchange}_orderbook.parquet',
    }

    # Candles and orderbooks views
    for data_type, file_name in raw_files.items():
        data[(exchange, data_type)] = pd.read_parquet(os.path.join(EXTERNAL_DATA_PATH, file_name))
    

In [None]:
# Null values summary: one summary record per loaded frame, grouped by data type

candles_summary = []
orderbooks_summary = []

# Route each frame's summary to the list that matches its data type;
# frames of any other type are ignored.
summary_by_type = {CANDLES: candles_summary, ORDERBOOK: orderbooks_summary}

for (exchange, data_type), df in data.items():
    target_summary = summary_by_type.get(data_type)
    if target_summary is not None:
        target_summary.append(get_dataframe_null_summary(df, exchange))

display(pd.DataFrame(candles_summary))
display(pd.DataFrame(orderbooks_summary))

In [None]:
# Clean null values: drop the rows flagged as null, then the flag column itself

# Only existing keys are reassigned, so the dict does not change size while iterating.
for (exchange, data_type), df in data.items():
    # Frames that no longer carry the flag (e.g. when this cell is re-run) are
    # left untouched instead of raising KeyError.
    if 'null' not in df.columns:
        continue
    df = df.loc[df['null'] != True]
    data[(exchange, data_type)] = df.drop(columns=['null'])


In [None]:
# Exchange unified view: combine each exchange's candles and order book
# and store the result next to them under the UNIFIED key

for exchange in EXCHANGES:
    candles_df = data[(exchange, CANDLES)]
    orderbook_df = data[(exchange, ORDERBOOK)]
    data[(exchange, UNIFIED)] = merge_datasets(candles_df, orderbook_df)

In [None]:
# Show data: a caption followed by the first two rows of every stored frame

for key, frame in data.items():
    exchange, data_type = key
    caption = f"{exchange} {data_type}"
    display(caption, frame.head(2))

# Preprocessing

In [None]:
# Add lag and time features, and report how many rows each frame lost on the way

views = {}

for (exchange, data_type), df in data.items():

    # Row count of the untouched input. It is taken before any feature step so
    # the report below covers the whole pipeline; previously it was measured
    # after add_lag_features had already replaced `df`.
    original_length = len(df)

    # Add lag features (order-book frames are skipped)
    if data_type != ORDERBOOK:
        df = add_lag_features(df, LAGS)

    # Add time features
    view = add_time_features(df)
    views[(exchange, data_type)] = view

    # Rows removed between the raw input and the final view
    removed_rows = original_length - len(view)
    print(f"Exchange: {exchange} {data_type} - old length: {original_length}, new length: {len(view)}, null values: {removed_rows}")

    display(view.head(2))

In [None]:
# Suffix every column of each exchange's 'unified' view with the exchange name.
# 'origin_time' keeps its name so it stays a shared column across exchanges.
unified_data = {}
for exchange in EXCHANGES:
    unified_view = views[(exchange, UNIFIED)]
    suffixed_names = {
        col: f"{col}_{exchange}"
        for col in unified_view.columns
        if col != 'origin_time'
    }
    unified_data[exchange] = unified_view.rename(columns=suffixed_names)

In [None]:
ALL = 'ALL'


def _inner_join_on_origin_time(left, right):
    """Inner-join two frames on their shared 'origin_time' column."""
    return pd.merge(left, right, on='origin_time', how='inner')


# Fold the per-exchange frames into a single frame; the inner join keeps only
# the 'origin_time' values present in every one of them.
views[(ALL, UNIFIED)] = reduce(_inner_join_on_origin_time, unified_data.values())

In [None]:
# List every (exchange, data_type) key now stored in `views`, including the (ALL, UNIFIED) entry.
display(views.keys())

In [None]:
# Scale data: standardise every view, keeping 'origin_time' out of the scaling

scaled_data = {}

for (exchange, data_type), df in views.items():

    # Scale every column except the timestamp
    df_no_time = df.drop(columns=['origin_time'])
    scaled = standard_scale(df_no_time)

    # Re-attach the timestamp by position instead of by index label. Assigning
    # the Series itself aligns on the index, which yields NaN/misplaced
    # timestamps whenever standard_scale returns a frame with a fresh index.
    # `.array` hands over the values without an index and keeps their dtype.
    # NOTE(review): assumes standard_scale preserves row order — confirm.
    scaled['origin_time'] = df['origin_time'].array
    scaled_data[(exchange, data_type)] = scaled

    display(scaled.head(2))

In [None]:
# Evaluate data correlation for every scaled frame against CORRELATION_THRESHOLD

for key, scaled_df in scaled_data.items():
    exchange, data_type = key
    evaluate_correlation(scaled_df, exchange, data_type, CORRELATION_THRESHOLD)

In [None]:
# PCA and explained variance: project every scaled frame, persist the result
# and plot the variance curves

pca_data = {}
pca_loadings = {}

for key, scaled_df in scaled_data.items():
    exchange, data_type = key

    # Detach the time feature before computing PCA
    features_only = scaled_df.drop(columns=['origin_time'])

    pca_result = compute_pca(features_only, PCA_VARIANCE_THRESHOLD)
    pca_df, explained_variance, cumulative_variance, loadings = pca_result

    # And reattach it (by position, via .values) for merging it with GT later
    pca_df['origin_time'] = scaled_df['origin_time'].values

    pca_data[key] = pca_df
    pca_loadings[key] = loadings

    # Persist the projected frame in the interim data folder
    output_path = os.path.join(INTERIM_DATA_PATH, f"{exchange}_{data_type}_pca_data.parquet")
    pca_df.to_parquet(output_path)

    plot_explained_variance(data_type, exchange, explained_variance, cumulative_variance)

In [None]:
# PCA loadings heatmap: one plot per stored loadings entry

for key, loadings in pca_loadings.items():
    exchange, data_type = key
    plot_loadings_heatmap(data_type, exchange, loadings)

In [None]:
# Merge datasets with ground truth before training, and persist each merged frame

merged_data = {}

for key, pca_df in pca_data.items():
    exchange, data_type = key

    merged = merge_datasets(pca_df, ground_truth)
    merged_data[key] = merged

    # Store the merged frame in the interim data folder
    merged_path = os.path.join(INTERIM_DATA_PATH, f"{exchange}_{data_type}_merged.parquet")
    merged.to_parquet(merged_path)