In [105]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import statsmodels.api as sm
from tqdm import tqdm

# Notebook-wide plotting defaults: 'bmh' style sheet and a 15x10 inch figure size.
plt.style.use('bmh')
plt.rcParams.update({"figure.figsize": (15, 10)})

In [106]:
def plot_multiaxis(data,
                   name_to_hightlight="nan",
                   start_date="2017-01-01",
                   end_date="2020-01-01",
                   force_fix_second_axis=False,
                   colors=['black', 'red', 'blue', 'green', 'purple']):
    """Plot every column of ``data`` on its own y-axis in a single figure.

    The first column is drawn on the left axis; each further column gets a
    twin axis whose right spine is pushed outward by 50 points per extra
    column so the scales do not overlap.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index is sliced with ``.loc[start_date:end_date]``; one
        line is drawn per column.
    name_to_hightlight : str, default "nan"
        Column label to emphasise. When it is left at the sentinel ``"nan"``
        all lines are drawn fully opaque; otherwise the matching column is
        drawn with alpha 1 and all other columns with alpha 0.1.
    start_date, end_date : label, default "2017-01-01" / "2020-01-01"
        Bounds of the plotted window.
    force_fix_second_axis : bool, default False
        Accepted for interface compatibility; not used by the function.
    colors : sequence of color specs
        Palette for the lines, axis labels and ticks. It is cycled when
        ``data`` has more columns than there are colors, so no column is
        dropped. The sequence is never mutated.

    Returns
    -------
    matplotlib.figure.Figure
        The figure, returned after ``plt.show()`` has been called.
    """
    fig, ax_orig = plt.subplots(figsize=(15, 10))
    # Forward-fill inside the window so sparsely sampled columns draw as
    # continuous lines.
    time_series = data.loc[start_date:end_date].ffill()
    labels = list(data.columns)

    # Copy the palette (never touch the shared default list) and fall back to
    # a single colour if an empty palette is supplied.
    palette = list(colors) or ['black']

    # Without a highlighted column everything is opaque; otherwise the
    # non-highlighted lines are faded.
    background_alpha = 1 if name_to_hightlight == "nan" else 0.1

    for i, label in enumerate(labels):
        # Cycle through the palette instead of zipping with it, which would
        # silently skip every column beyond len(colors).
        color = palette[i % len(palette)]

        if i == 0:
            ax = ax_orig
        else:
            ax = ax_orig.twinx()
            ax.spines['right'].set_position(('outward', 50 * (i - 1)))

        line_alpha = 1 if label == name_to_hightlight else background_alpha
        # Positional selection keeps column i paired with labels[i].
        ax.plot(time_series.index, time_series.iloc[:, i], color=color, alpha=line_alpha)
        ax.set_ylabel(label, color=color)
        ax.tick_params(axis='y', colors=color)
        ax.set_title('')

    plt.show()

    return fig

In [107]:
# Read the workbook and index the frame by its "date" column.
data_path = os.path.join("data", "crypto_quant_data.xlsx")
data = pd.read_excel(data_path, engine='openpyxl').set_index("date")
data.tail()

Unnamed: 0_level_0,XBTUSD Index,ACA,Funding Rates,Exchange Inflow,Mean Fees per Transaction,MPI,MVRV,Exchange Netflow,NUPL,Open Interest,...,12m_18m,18m_2y,2y_3y,3y_5y,5y_7y,7y_10y,10y_,SOPR,Supply in Profit,Transactions Count
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-07-19,,679417.0,0.006209,49762.514503,9.5e-05,0.296877,1.067241,-7447.300041,0.063004,9489528000.0,...,48715.223571,22244.844982,8635.084159,7101.000951,1017.510398,363.157394,1.463749,0.7043,60.600928,279812.0
2022-07-20,,640433.0,0.006102,57328.395469,9.9e-05,-0.534073,1.058846,-9274.62512,0.055575,9029214000.0,...,48745.356847,22330.946457,8633.516423,7104.756626,1020.55282,363.290762,1.466195,0.846652,59.490927,260069.0
2022-07-21,,683135.0,0.007127,40920.216053,8.8e-05,-0.784885,1.057568,-1618.080672,0.054434,9025240000.0,...,48743.177663,22306.915146,8634.403784,7110.649653,1024.719268,363.705064,1.470494,0.721688,59.95946,273837.0
2022-07-22,,648393.0,0.002772,43964.376473,8.3e-05,-0.519118,1.037782,-1982.575534,0.036406,8890100000.0,...,48753.243351,22383.852291,8635.857256,7113.423451,1027.777942,363.753817,1.471332,0.379805,58.2211,263144.0
2022-07-25,,,-0.000505,29699.140132,,,,-1419.078914,,8549086000.0,...,,,,,,,,,,


In [182]:
lags = [1, 5, 20, 20*3, 20*6]
target_name = "XBTUSD Index"
start_date = "2017-01-01"  # first date kept in every regression sample


def _add_lagged_columns(pair_data, colname, lags):
    """Return ``pair_data`` plus one ``<colname>lag<k>`` column per lag k.

    Rows where any lagged column is missing are dropped from the lag block
    before it is joined back, so the joined frame has NaN in the lag columns
    for those rows (they are removed later, pair by pair).
    """
    lagged_columns = [
        pair_data[[colname]].shift(lag).rename(columns={colname: colname + "lag" + str(lag)})
        for lag in lags
    ]
    lags_data = pd.concat(lagged_columns, axis=1).dropna()
    return pd.concat([pair_data, lags_data], axis=1)


def _build_regression_frame(lag_data, target_name, predictor):
    """Return the differenced (target, predictor) frame used by both models.

    If every value is strictly positive, both columns are log-differenced.
    Otherwise only the target is logged and the predictor is differenced in
    levels. The check is ``<= 0`` rather than ``< 0`` because the log of zero
    is ``-inf``, which would leave infinities in the differenced data.
    """
    pair = lag_data[[target_name, predictor]].dropna()
    if (pair <= 0).to_numpy().any():
        reg_df = pair.copy()
        reg_df[target_name] = np.log(reg_df[target_name])
    else:
        reg_df = np.log(pair)
    return reg_df.diff().dropna()


def _linear_summary_row(reg_df, target_name, colname, predictor):
    """Fit OLS of the target on one predictor and return its summary record."""
    fit_linear_reg = sm.OLS(endog=reg_df[target_name],
                            exog=reg_df[predictor]).fit()
    # .iloc[0]: params/pvalues are label-indexed Series, so integer keys
    # must be positional explicitly.
    return {
        'variable': colname,
        'predictor': predictor,
        'beta': np.round(fit_linear_reg.params.iloc[0], 3),
        'pval': np.round(fit_linear_reg.pvalues.iloc[0], 3),
    }


def _logit_summary_row(reg_df, target_name, colname, predictor):
    """Fit a logit of the target's sign on one predictor and return its record."""
    class_reg_df = reg_df.copy()
    # Binary outcome: 1 when the differenced target is >= 0, else 0.
    class_reg_df[target_name] = np.where(class_reg_df[target_name] >= 0, 1, 0)
    # sm.Logit: the bare name Logit is never imported in this notebook.
    fit_logit_reg = sm.Logit(endog=class_reg_df[target_name],
                             exog=class_reg_df.drop([target_name], axis=1)).fit(disp=0)

    beta = np.round(fit_logit_reg.params.iloc[0], 3)
    # Odds and probability are derived from the rounded beta, as displayed.
    odds = np.e ** beta
    return {
        'variable': colname,
        'predictor': predictor,
        'beta': beta,
        'odds': np.round(odds, 2),
        'prob': np.round(odds / (1 + odds), 3),
        'pval': np.round(fit_logit_reg.pvalues.iloc[0], 3),
    }


# NOTE(review): no constant is added to exog, so both models are fitted
# without an intercept - confirm this is intended.
linear_reg_summary = []
logit_reg_summary = []
for colname in tqdm(list(data.columns),
                    total=len(list(data.columns))):
    if colname == target_name:
        continue

    # Align target and predictor on business days, carrying the last value forward.
    tmp_data = data[[target_name, colname]].dropna().resample("B").last().ffill()
    end_date = tmp_data.index[-1]
    tmp_data = tmp_data.loc[start_date:end_date]

    tmp_lag_data = _add_lagged_columns(tmp_data, colname, lags)

    # One record per predictor (contemporaneous column plus each lag). The
    # row label is the column position in tmp_lag_data, where the target is 0.
    linear_rows = []
    logit_rows = []
    row_labels = []
    for i, c in enumerate(list(tmp_lag_data.columns)):
        if c == target_name:
            continue
        reg_df = _build_regression_frame(tmp_lag_data, target_name, c)
        linear_rows.append(_linear_summary_row(reg_df, target_name, colname, c))
        logit_rows.append(_logit_summary_row(reg_df, target_name, colname, c))
        row_labels.append(i)

    linear_reg_summary.append(pd.DataFrame(linear_rows, index=row_labels))
    logit_reg_summary.append(pd.DataFrame(logit_rows, index=row_labels))

linear_reg_summary_df = pd.concat(linear_reg_summary, axis=0)
logit_reg_summary_df = pd.concat(logit_reg_summary, axis=0)

100%|██████████| 29/29 [00:02<00:00, 10.12it/s]


## Linear regression summary

In [183]:
linear_reg_summary_df.head(12)

Unnamed: 0,variable,predictor,beta,pval
1,ACA,ACA,0.032,0.022
2,ACA,ACAlag1,0.027,0.076
3,ACA,ACAlag5,-0.007,0.629
4,ACA,ACAlag20,-0.018,0.233
5,ACA,ACAlag60,-0.002,0.884
6,ACA,ACAlag120,0.023,0.109
1,Funding Rates,Funding Rates,0.574,0.0
2,Funding Rates,Funding Rateslag1,0.073,0.374
3,Funding Rates,Funding Rateslag5,-0.162,0.046
4,Funding Rates,Funding Rateslag20,0.025,0.755


## Logistic regression summary

The value $e^\beta$ tells us by how much the odds of the outcome (BTC having a positive return) are multiplied for each one-unit change in the predictor.

Thus, a one-unit increase in the predictor multiplies the odds of BTC having a positive return by $e^\beta$.

In [184]:
logit_reg_summary_df.head(12)

Unnamed: 0,variable,predictor,beta,odds,prob,pval
1,ACA,ACA,1.291,3.64,0.784,0.032
2,ACA,ACAlag1,1.066,2.9,0.744,0.103
3,ACA,ACAlag5,-0.105,0.9,0.474,0.871
4,ACA,ACAlag20,-0.679,0.51,0.336,0.295
5,ACA,ACAlag60,-0.125,0.88,0.469,0.841
6,ACA,ACAlag120,0.888,2.43,0.708,0.145
1,Funding Rates,Funding Rates,14.961,3143979.78,1.0,0.0
2,Funding Rates,Funding Rateslag1,-1.14,0.32,0.242,0.741
3,Funding Rates,Funding Rateslag5,-1.251,0.29,0.223,0.715
4,Funding Rates,Funding Rateslag20,-0.159,0.85,0.46,0.962
