In [None]:
# Install the necessary packages
!pip install -r ../requirements.txt

In [None]:
# Run parameters: date window, sampling interval and the list of symbols.
# All four names are passed to load_data() in a later cell.
start, end = "2024-01-01", "2024-03-01"
interval = "1h"
tickers = [
    "BTCUSDT",
    "ETHUSDT",
    "BNBUSDT",
    "SOLUSDT",
    "XRPUSDT",
    "ADAUSDT",
    "AVAXUSDT",
    "DOGEUSDT",
    "DOTUSDT",
    "LINKUSDT",
    "TRXUSDT",
    "LTCUSDT",
    "SHIBUSDT",
    "ICPUSDT",
    "BCHUSDT",
]

In [None]:
from modules.data_services.data_pipeline import load_data

# 1. Load data
# Call load_data with the ticker list, date window and interval configured above.
df = load_data(tickers=tickers, start=start, end=end, interval=interval)

# Show the first rows as a quick sanity check of what was loaded.
print(df.head())

In [None]:
# 1. Calculate Pearson's correlation matrix

from modules.pair_selection.statistical_tests import pearson_correlation

# Order by 'corr' (highest first), renumber the rows, then relabel the column
# as 'corr_prices' so it can be told apart from the return-based variants.
corr_prices_df = (
    pearson_correlation(df)
    .sort_values('corr', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'corr': 'corr_prices'})
)
print(corr_prices_df.head())

In [None]:
# 2. Calculate returns correlation

# Simple returns: each value divided by the previous row's value, minus one.
# Rows containing NaN (e.g. the first row, which has no predecessor) are dropped.
returns = df.div(df.shift(1)).sub(1).dropna()

# Same ranking steps as for prices, with the column relabelled 'corr_returns'.
corr_returns_df = (
    pearson_correlation(returns)
    .sort_values('corr', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'corr': 'corr_returns'})
)
print(corr_returns_df.head())

In [None]:
# 3. Calculate log-returns correlation
import numpy as np

# Log returns: natural log of each value over the previous row's value.
# Rows containing NaN (e.g. the first row, which has no predecessor) are dropped.
log_returns = np.log(df.div(df.shift(1))).dropna()

# Rank by 'corr' (highest first), renumber, and relabel as 'corr_log_returns'.
corr_log_returns_df = (
    pearson_correlation(log_returns)
    .sort_values('corr', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'corr': 'corr_log_returns'})
)
print(corr_log_returns_df.head())

In [None]:
# 4. Perform Engle-Granger cointegration test

from modules.pair_selection.statistical_tests import engle_granger_cointegration

# Run the test on df and list the results with the smallest 'eg_p_value' first.
eg_df = (
    engle_granger_cointegration(df)
    .sort_values('eg_p_value', ascending=True)
    .reset_index(drop=True)
)
print(eg_df.head())

In [None]:
# 5. Perform Johansen cointegration test

from modules.pair_selection.statistical_tests import johansen_cointegration

# Run the test on df and list the results with the smallest 'joh_p_value' first.
johansen_df = (
    johansen_cointegration(df)
    .sort_values('joh_p_value', ascending=True)
    .reset_index(drop=True)
)
print(johansen_df.head())

In [None]:
# 6. Merge dataframes

from modules.data_services.data_pipeline import merge_by_pair

merged_df = merge_by_pair(
    dfs=[corr_prices_df, corr_returns_df, corr_log_returns_df, eg_df, johansen_df],
    keep_cols=[
        ['corr_prices'],
        ['corr_returns'],
        ['corr_log_returns'],
        ['p_value'],
        ['p_est']
    ]
).sort_values('p_value', ascending=True).reset_index(drop=True)
print(merged_df.head())

In [None]:
merged_df['corr_returns x (1 - p_value)'] = merged_df['corr_returns'] * (1 - merged_df['p_value'])
print(merged_df.sort_values('corr_returns x (1 - p_value)', ascending=False).head())