# In [None]:  (Jupyter cell marker left over from the notebook export)
# Funding Rate vs Future Returns Analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
from scipy.stats import pearsonr, spearmanr
import matplotlib.dates as mdates

# Set plot style
# Global matplotlib style for every figure produced by this module.
plt.style.use('ggplot')
# Seaborn theme with a 1.2x font scale; called after plt.style.use, and the
# order is kept as-is because both calls touch global plotting defaults.
sns.set(font_scale=1.2)
# Module-level 4-colour viridis palette. It is indexed as colors[i] by
# analyze_returns_by_funding_levels; several other functions define their own
# local `colors` list, which shadows this one inside those functions.
colors = sns.color_palette("viridis", 4)

# Load data
def load_and_preprocess_data(file_path='../binance_data_pipeline/data/markets/BTCUSDT_1d.csv',
                             start_date="2023-06-25"):
    """Load the market CSV and derive basis / APR columns.

    The CSV must provide the columns ``Timestamp``, ``spot_close``,
    ``prompt_close``, ``next_close``, ``prompt_days_till_expiry``,
    ``next_days_till_expiry`` and ``funding_annualized``.

    Args:
        file_path: Path of the CSV file to read.
        start_date: Rows with ``Timestamp`` earlier than this are discarded.

    Returns:
        A ``(df, df_with_next)`` tuple, both indexed by ``Timestamp``:

        * ``df`` holds every row from ``start_date`` on, with the added
          columns ``prompt_basis`` and ``prompt_apr`` and with
          ``funding_annualized`` multiplied by 100 and reduced by the
          baseline rate.
        * ``df_with_next`` holds only the rows where ``next_close`` is
          present, plus ``next_basis`` and ``next_apr``. Its
          ``funding_annualized`` column carries the same adjusted values
          as ``df``.
    """
    df = pd.read_csv(file_path)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    # Copy the filtered slice so the column assignments below operate on an
    # independent frame instead of a view of the unfiltered data.
    df = df[df["Timestamp"] >= start_date].copy()

    # Baseline subtracted from every annualised figure: 0.0001 * 3 * 365 * 100
    # = 10.95. NOTE(review): presumably a 0.01% rate paid three times a day,
    # annualised and expressed in percent - confirm.
    binance_rate = 0.0001 * 3 * 365 * 100
    df = df.rename(columns={"prompt_days_till_expiry": "prompt_dte",
                            "next_days_till_expiry": "next_dte"})

    # Annualising over zero remaining days divides by zero and yields +/-inf,
    # which dropna() does not remove and which then breaks the rolling
    # statistics and correlations downstream. Treat such rows as missing.
    prompt_dte = df["prompt_dte"].replace(0, np.nan)
    df["prompt_basis"] = df["prompt_close"] - df["spot_close"]
    df["prompt_apr"] = (df["prompt_basis"] / df["spot_close"]) * (365 / prompt_dte) * 100 - binance_rate

    # Bring the funding rate to the same units as the APR columns *before*
    # df_with_next is split off, so both returned frames agree on it.
    df["funding_annualized"] = df["funding_annualized"] * 100 - binance_rate

    # Only use rows where next futures data is available
    df_with_next = df.dropna(subset=['next_close']).copy()
    next_dte = df_with_next["next_dte"].replace(0, np.nan)
    df_with_next["next_basis"] = df_with_next["next_close"] - df_with_next["spot_close"]
    df_with_next["next_apr"] = (df_with_next["next_basis"] / df_with_next["spot_close"]) * (365 / next_dte) * 100 - binance_rate

    # Set index for time series operations
    df = df.set_index('Timestamp')
    df_with_next = df_with_next.set_index('Timestamp')

    return df, df_with_next

# Calculate future returns for different periods
def calculate_future_returns(df, periods=(1, 3, 5, 10, 20)):
    """Add trailing and forward-looking percentage returns to ``df``.

    For each of ``spot``, ``perp`` and ``prompt`` the frame gains
    ``{instrument}_returns`` (one-row percentage change of
    ``{instrument}_close``) and, per horizon ``p`` in ``periods``,
    ``{instrument}_future_{p}d_returns``: the percentage change from the
    current row to the row ``p`` positions later. The last ``p`` rows of a
    forward column are NaN because no later close exists.

    Args:
        df: Frame with ``spot_close``, ``perp_close``, ``prompt_close`` and
            ``next_close`` columns. It is modified in place.
        periods: Iterable of forward horizons, counted in rows.

    Returns:
        ``(df, df_with_next)`` where ``df`` is the same object that was
        passed in and ``df_with_next`` is a copy restricted to rows with a
        non-null ``next_close``, extended with ``next_returns`` and
        ``next_future_{p}d_returns``. The ``next`` columns are computed after
        the null rows are dropped, so their horizons count remaining rows.
    """
    # Materialise once: the horizons are walked for every instrument and
    # again for the "next" contract, which a one-shot iterator cannot serve.
    horizons = list(periods)

    for instrument in ('spot', 'perp', 'prompt'):
        close = df[f'{instrument}_close']
        df[f'{instrument}_returns'] = close.pct_change() * 100

        for horizon in horizons:
            # pct_change(h) at row t+h compares t+h with t; shifting it back
            # by h places that figure on row t, i.e. the return still to come.
            df[f'{instrument}_future_{horizon}d_returns'] = (
                close.pct_change(periods=horizon).shift(-horizon) * 100
            )

    # Add next future returns if available
    df_with_next = df.dropna(subset=['next_close']).copy()
    next_close = df_with_next['next_close']
    df_with_next['next_returns'] = next_close.pct_change() * 100

    for horizon in horizons:
        df_with_next[f'next_future_{horizon}d_returns'] = (
            next_close.pct_change(periods=horizon).shift(-horizon) * 100
        )

    return df, df_with_next

# Analyze the relationship between funding rate and future returns
def analyze_funding_vs_returns(df, df_with_next, period=1):
    """Scatter funding rate against forward returns and report correlations.

    Draws a 2x2 grid of regression scatter plots (spot, perpetual, prompt
    quarterly, next quarterly) of ``funding_annualized`` against
    ``{instrument}_future_{period}d_returns`` and shows the figure.

    Args:
        df: Frame holding ``funding_annualized`` and the spot / perp / prompt
            forward-return columns.
        df_with_next: Frame holding ``funding_annualized`` and the ``next``
            forward-return column.
        period: Forward horizon used to pick the return columns.

    Returns:
        Dict mapping instrument name to
        ``{'correlation': r, 'p_value': p}`` (Pearson). Instruments whose
        return column is absent are omitted; instruments with fewer than two
        complete observations get NaN for both values.
    """
    _, axes = plt.subplots(2, 2, figsize=(16, 12))
    axes = axes.flatten()

    panels = [
        ('spot', 'Spot BTC'),
        ('perp', 'Perpetual Futures'),
        ('prompt', 'Prompt Quarterly Futures'),
        ('next', 'Next Quarterly Futures'),
    ]

    correlations = {}
    for ax, (instrument, title) in zip(axes, panels):
        # Only the "next" contract lives in its own, shorter frame.
        data = df_with_next if instrument == 'next' else df
        return_col = f'{instrument}_future_{period}d_returns'
        if return_col not in data.columns:
            continue

        sns.regplot(x='funding_annualized', y=return_col,
                    data=data.reset_index(),
                    scatter_kws={'alpha': 0.5},
                    line_kws={'color': 'red'},
                    ax=ax)

        # Computed once and reused for both the panel title and the result.
        corr, p_value = _funding_return_pearson(data, return_col)
        correlations[instrument] = {'correlation': corr, 'p_value': p_value}

        ax.set_title(f'{title}\nCorrelation: {corr:.3f} (p-value: {p_value:.3f})')
        ax.set_xlabel('Annualized Funding Rate (%)')
        ax.set_ylabel(f'{period}-Day Future Returns (%)')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.suptitle(f'Funding Rate vs {period}-Day Future Returns', fontsize=20, y=1.02)
    plt.show()

    return correlations


def _funding_return_pearson(data, return_col):
    """Return Pearson (r, p) of funding_annualized vs return_col, or NaNs if < 2 complete rows."""
    complete = data.dropna(subset=['funding_annualized', return_col])
    if len(complete) < 2:
        # pearsonr raises on fewer than two observations.
        return np.nan, np.nan
    corr, p_value = pearsonr(complete['funding_annualized'], complete[return_col])
    return corr, p_value

# Time series analysis of the relationship
def analyze_time_series_relationship(df, period=1):
    """Plot the funding rate and forward returns over time on twin y-axes.

    The left axis carries ``funding_annualized``; the right axis carries the
    ``{instrument}_future_{period}d_returns`` series for spot, perp and
    prompt, skipping any column that is not present. The figure is shown and
    nothing is returned.
    """
    plt.figure(figsize=(16, 10))

    # Left-hand axis: the funding rate.
    funding_ax = plt.gca()
    funding_ax.plot(df.index, df['funding_annualized'], 'b-', label='Funding Rate (Annualized %)')
    funding_ax.set_xlabel('Date')
    funding_ax.set_ylabel('Funding Rate (%)', color='b')
    funding_ax.tick_params(axis='y', labelcolor='b')

    # Right-hand axis: one line per instrument that has the requested horizon.
    returns_ax = funding_ax.twinx()
    line_colors = {'spot': 'g', 'perp': 'r', 'prompt': 'purple'}
    for name, shade in line_colors.items():
        column = f'{name}_future_{period}d_returns'
        if column not in df.columns:
            continue
        returns_ax.plot(df.index, df[column], color=shade, alpha=0.7,
                        label=f'{name.capitalize()} {period}-Day Future Returns')

    returns_ax.set_ylabel('Future Returns (%)', color='k')
    returns_ax.tick_params(axis='y', labelcolor='k')

    # A single legend that lists the artists of both axes.
    handles, captions = [], []
    for axis in (funding_ax, returns_ax):
        axis_handles, axis_captions = axis.get_legend_handles_labels()
        handles.extend(axis_handles)
        captions.extend(axis_captions)
    funding_ax.legend(handles, captions, loc='best')

    plt.title(f'Funding Rate vs {period}-Day Future Returns Over Time', fontsize=16)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

# Perform Granger causality tests
def test_granger_causality(df, max_lag=10):
    """Test whether the funding rate Granger-causes forward returns.

    For every instrument (spot, perp, prompt) and every ``lag`` in
    ``1..max_lag`` the column ``{instrument}_future_{lag}d_returns`` is
    tested against ``funding_annualized`` with ``lag`` lags, and the p-value
    of the SSR F-test at that lag is recorded. Combinations whose return
    column is missing, or with ``lag + 10`` or fewer complete rows, are
    skipped. The p-values are plotted against the lag with a 5% reference
    line.

    NOTE(review): the forward-return horizon and the number of lags are tied
    to the same number here, so only horizons that were actually computed
    upstream produce a result - confirm this coupling is intended.

    Args:
        df: Frame with ``funding_annualized`` and forward-return columns.
        max_lag: Largest lag / horizon to try.

    Returns:
        ``{instrument: {lag: p_value_or_None}}``; ``None`` marks a test that
        raised.
    """
    instruments = ['spot', 'perp', 'prompt']
    results = {}

    for instrument in instruments:
        results[instrument] = {}

        for lag in range(1, max_lag + 1):
            return_col = f'{instrument}_future_{lag}d_returns'
            if return_col not in df.columns:
                continue

            # grangercausalitytests asks whether the SECOND column helps to
            # predict the FIRST, so the returns must come first to test
            # "funding -> returns".
            test_df = df[[return_col, 'funding_annualized']].dropna()

            if len(test_df) <= lag + 10:  # Ensure we have enough data
                continue

            try:
                gc_result = grangercausalitytests(test_df, maxlag=lag, verbose=False)
            except Exception as exc:
                # Best effort: one failing lag must not abort the sweep, but
                # say why the point is missing from the plot.
                print(f"Granger test failed for {instrument} at lag {lag}: {exc}")
                results[instrument][lag] = None
            else:
                results[instrument][lag] = gc_result[lag][0]['ssr_ftest'][1]

    # Visualize the results
    plt.figure(figsize=(14, 8))

    for instrument in instruments:
        # Keep only the lags whose test produced a p-value.
        valid = [(lag, p) for lag, p in results[instrument].items() if p is not None]
        if valid:
            valid_lags, valid_p_values = zip(*valid)
            plt.plot(valid_lags, valid_p_values, 'o-', label=f'{instrument.capitalize()} Returns')

    plt.axhline(y=0.05, color='r', linestyle='--', label='5% Significance Level')
    plt.xlabel('Lag (Days)')
    plt.ylabel('p-value')
    plt.title('Granger Causality Test: Does Funding Rate Cause Future Returns?')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    return results

# Analyze the relationship between funding rate, prompt APR and future returns
def analyze_funding_prompt_vs_returns(df, period=1):
    """Relate the funding-minus-prompt-APR spread to forward returns.

    Adds ``funding_prompt_spread`` (``funding_annualized - prompt_apr``) to
    ``df`` in place, then draws one scatter series per instrument (spot,
    perp, prompt) of the spread against
    ``{instrument}_future_{period}d_returns``, with a least-squares line when
    more than 10 usable rows exist, and shows the figure.

    Args:
        df: Frame with ``funding_annualized``, ``prompt_apr`` and the
            forward-return columns. Modified in place.
        period: Forward horizon used to pick the return columns.

    Returns:
        Dict mapping instrument name to
        ``{'correlation': r, 'p_value': p}`` (Pearson). Instruments whose
        return column is absent are omitted; instruments with fewer than two
        usable rows get NaN for both values.
    """
    # Calculate the spread between funding rate and prompt APR
    df['funding_prompt_spread'] = df['funding_annualized'] - df['prompt_apr']

    plt.figure(figsize=(14, 8))

    instruments = ['spot', 'perp', 'prompt']
    colors = ['g', 'r', 'purple']

    correlations = {}
    for instrument, color in zip(instruments, colors):
        return_col = f'{instrument}_future_{period}d_returns'
        if return_col not in df.columns:
            continue

        # dropna() keeps +/-inf, and a single infinite value makes both the
        # fit and the correlation meaningless, so map them to NaN first.
        usable = (
            df[['funding_prompt_spread', return_col]]
            .replace([np.inf, -np.inf], np.nan)
            .dropna()
        )
        spread = usable['funding_prompt_spread'].to_numpy()
        returns = usable[return_col].to_numpy()

        plt.scatter(spread, returns, color=color, alpha=0.5,
                    label=f'{instrument.capitalize()}')

        # Add regression line
        if len(usable) > 10:
            slope, intercept = np.polyfit(spread, returns, 1)
            ordered = np.sort(spread)
            plt.plot(ordered, slope * ordered + intercept, color=color)

        # pearsonr raises on fewer than two observations.
        if len(usable) >= 2:
            corr, p_value = pearsonr(spread, returns)
        else:
            corr, p_value = np.nan, np.nan
        correlations[instrument] = {'correlation': corr, 'p_value': p_value}

    plt.axhline(y=0, color='k', linestyle='--', alpha=0.5)
    plt.axvline(x=0, color='k', linestyle='--', alpha=0.5)
    plt.title(f'Funding-Prompt Spread vs {period}-Day Future Returns', fontsize=16)
    plt.xlabel('Funding Rate - Prompt APR Spread (%)')
    plt.ylabel(f'{period}-Day Future Returns (%)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    return correlations

# Analyze returns based on different funding rate levels
def analyze_returns_by_funding_levels(df, period=1, bins=5):
    """Plot mean forward returns per funding-rate quantile bucket.

    ``funding_annualized`` is cut into ``bins`` quantile buckets (stored in
    ``df['funding_bin']``, so ``df`` is modified in place). For spot, perp
    and prompt a bar chart shows the mean
    ``{instrument}_future_{period}d_returns`` per bucket, with a t-based
    confidence interval of the mean as error bar where a bucket has more than
    one observation, and the bucket's mean funding rate as annotation. The
    figure is shown and nothing is returned.

    Args:
        df: Frame with ``funding_annualized`` and forward-return columns.
        period: Forward horizon used to pick the return columns.
        bins: Passed to ``pandas.qcut`` as the number of quantiles.
    """
    # Create bins of funding rates
    # Use 'duplicates="drop"' to handle duplicate bin edges
    try:
        df['funding_bin'] = pd.qcut(df['funding_annualized'], bins, labels=False, duplicates="drop")
    except ValueError:
        # Ranking first makes every value distinct, so the edges are unique.
        df['funding_bin'] = pd.qcut(df['funding_annualized'].rank(method='first'), bins, labels=False)

    # Name the buckets after the requested split instead of always calling
    # them quintiles; bins=5 keeps the original wording.
    named_splits = {3: 'Tercile', 4: 'Quartile', 5: 'Quintile', 10: 'Decile'}
    if isinstance(bins, (int, np.integer)):
        bucket_name = named_splits.get(int(bins), 'Quantile')
    else:
        bucket_name = 'Quantile'

    # Identical for every instrument, so computed once up front.
    funding_avgs = df.groupby('funding_bin')['funding_annualized'].mean()

    _, axes = plt.subplots(1, 3, figsize=(18, 6))

    instruments = ['spot', 'perp', 'prompt']
    titles = ['Spot BTC', 'Perpetual Futures', 'Prompt Quarterly Futures']

    for i, (instrument, title) in enumerate(zip(instruments, titles)):
        return_col = f'{instrument}_future_{period}d_returns'
        if return_col not in df.columns:
            continue
        ax = axes[i]

        returns_by_bin = df.groupby('funding_bin')[return_col]
        mean_returns = returns_by_bin.mean()
        # Buckets that actually occur (qcut may have dropped duplicates).
        bin_ids = list(mean_returns.index)
        bin_labels = [f'Q{int(b) + 1}' for b in bin_ids]
        means = mean_returns.to_numpy(dtype=float)

        # Confidence interval of the mean; stays NaN for buckets with at
        # most one observation.
        ci_low = np.full(len(bin_ids), np.nan)
        ci_high = np.full(len(bin_ids), np.nan)
        for pos, bin_id in enumerate(bin_ids):
            sample = returns_by_bin.get_group(bin_id).dropna()
            if len(sample) > 1:
                ci_low[pos], ci_high[pos] = sm.stats.DescrStatsW(sample).tconfint_mean()

        # `colors` is the module-level palette.
        ax.bar(bin_labels, means, color=colors[i], alpha=0.7)

        # Only add error bars if we have valid confidence intervals
        has_ci = ~np.isnan(ci_low) & ~np.isnan(ci_high)
        if has_ci.any():
            ax.errorbar(
                [label for label, ok in zip(bin_labels, has_ci) if ok],
                means[has_ci],
                yerr=[means[has_ci] - ci_low[has_ci], ci_high[has_ci] - means[has_ci]],
                fmt='o', color='black', capsize=5
            )

        ax.set_title(title)
        ax.set_xlabel(f'Funding Rate {bucket_name} (Low to High)')
        ax.set_ylabel(f'{period}-Day Future Returns (%)')
        ax.grid(True, alpha=0.3)

        # Add the average value of funding rate in each bin, above positive
        # bars and below negative ones.
        for pos, bin_id in enumerate(bin_ids):
            return_val = means[pos]
            ax.annotate(
                f'{funding_avgs.loc[bin_id]:.2f}%',
                xy=(pos, return_val),
                xytext=(0, 10 if return_val >= 0 else -25),
                textcoords='offset points',
                ha='center'
            )

    plt.tight_layout()
    plt.suptitle(f'Average {period}-Day Future Returns by Funding Rate {bucket_name}', fontsize=16, y=1.02)
    plt.show()

# Create a combined strategy visualization
def visualize_combined_strategy(df, period=1):
    """Evaluate a long/short signal built from funding and prompt-APR z-scores.

    Adds the columns ``funding_zscore``, ``prompt_apr_zscore``, ``signal``
    and ``signal_desc`` to ``df`` in place. Z-scores use a 30-row rolling
    mean and standard deviation. The signal is ``-1`` (short) when the
    funding z-score is above 1 and the prompt-APR z-score is below -1,
    ``1`` (long) for the mirrored condition, and NaN otherwise.

    A box plot of ``{instrument}_future_{period}d_returns`` per instrument
    and signal is shown, and a summary table is printed.

    Args:
        df: Frame with ``funding_annualized``, ``prompt_apr`` and
            forward-return columns. Modified in place.
        period: Forward horizon used to pick the return columns.

    Returns:
        DataFrame with columns ``Instrument``, ``Signal``,
        ``Average Return``, ``Win Rate`` and ``Count``. A win is a positive
        return for long rows and a negative return for short rows; the win
        rate is NaN for "No Signal".
    """
    df['funding_zscore'] = _rolling_zscore_30(df['funding_annualized'])
    df['prompt_apr_zscore'] = _rolling_zscore_30(df['prompt_apr'])

    # Create signal based on both metrics
    df['signal'] = np.nan

    # High funding + Low prompt APR = Short signal
    df.loc[(df['funding_zscore'] > 1) & (df['prompt_apr_zscore'] < -1), 'signal'] = -1

    # Low funding + High prompt APR = Long signal
    df.loc[(df['funding_zscore'] < -1) & (df['prompt_apr_zscore'] > 1), 'signal'] = 1

    # Signal description
    df['signal_desc'] = df['signal'].map({1: 'Long', -1: 'Short'}).fillna('No Signal')

    instruments = ['spot', 'perp', 'prompt']
    # None stands for "no signal" (NaN in the column).
    signal_names = {-1: 'Short', None: 'No Signal', 1: 'Long'}

    def returns_for(return_col, signal):
        """Non-null forward returns of the rows carrying the given signal."""
        if signal is None:
            mask = df['signal'].isna()
        else:
            mask = df['signal'] == signal
        return df.loc[mask, return_col].dropna()

    available = [
        (instrument, f'{instrument}_future_{period}d_returns')
        for instrument in instruments
        if f'{instrument}_future_{period}d_returns' in df.columns
    ]

    # Box plot: one box per instrument and signal that has data.
    plt.figure(figsize=(14, 8))
    data_to_plot = []
    labels = []
    for instrument, return_col in available:
        for signal in (-1, None, 1):
            returns = returns_for(return_col, signal)
            if len(returns) > 0:
                data_to_plot.append(returns.values)  # plain arrays for matplotlib
                labels.append(f'{instrument.capitalize()} {signal_names[signal]}')

    if data_to_plot:
        plt.boxplot(data_to_plot, patch_artist=True)
        # Boxes sit at positions 1..N; labelling through xticks avoids the
        # `labels=` keyword, which newer matplotlib releases deprecate.
        plt.xticks(range(1, len(labels) + 1), labels, rotation=45)
    plt.title(f'Distribution of {period}-Day Future Returns by Trading Signal', fontsize=16)
    plt.ylabel('Future Returns (%)')
    plt.axhline(y=0, color='k', linestyle='--', alpha=0.5)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Summary table, built from a list of rows and converted once.
    summary_rows = []
    for instrument, return_col in available:
        for signal in (1, -1, None):
            returns = returns_for(return_col, signal)
            if len(returns) == 0:
                continue

            # For long signals, win if return > 0
            # For short signals, win if return < 0
            if signal == 1:
                win_rate = (returns > 0).mean() * 100
            elif signal == -1:
                win_rate = (returns < 0).mean() * 100
            else:
                win_rate = np.nan

            summary_rows.append({
                'Instrument': instrument.capitalize(),
                'Signal': signal_names[signal],
                'Average Return': returns.mean(),
                'Win Rate': win_rate,
                'Count': len(returns),
            })

    return_summary = pd.DataFrame(
        summary_rows,
        columns=['Instrument', 'Signal', 'Average Return', 'Win Rate', 'Count'],
    )

    print("\nStrategy Performance Summary:")
    print(return_summary.to_string(index=False))

    return return_summary


def _rolling_zscore_30(series):
    """Z-score of series against its own 30-row rolling mean and standard deviation."""
    window = series.rolling(30)
    # Add a small epsilon to avoid division by zero
    return (series - window.mean()) / (window.std() + 1e-10)

# Analyze cross-correlation between funding and returns
def analyze_cross_correlation(df, max_lag=20):
    """Cross-correlate the funding rate with returns across leads and lags.

    Two figures are shown:

    * a line plot of ``corr(funding[t], returns[t + lag])`` for
      ``lag`` in ``-max_lag..max_lag`` using ``{instrument}_returns`` of
      spot, perp and prompt (positive lag: funding leads returns);
    * a heat map of the same statistic for ``lag`` in ``1..max_lag`` across
      the plain and the 1/3/5/10-day forward return columns that exist in
      ``df``. It is skipped when none of those columns exist.

    Lags are counted in rows, so the frame is expected to be evenly spaced.

    Args:
        df: Frame with ``funding_annualized`` and return columns.
        max_lag: Largest absolute lag to evaluate.

    Returns:
        DataFrame indexed by lag (``1..max_lag``) with one column per
        evaluated return metric.
    """
    instruments = ['spot', 'perp', 'prompt']
    colors = ['g', 'r', 'purple']
    funding = df['funding_annualized']

    lags = range(-max_lag, max_lag + 1)

    plt.figure(figsize=(14, 7))

    for instrument, color in zip(instruments, colors):
        returns = df[f'{instrument}_returns']
        xcorr_values = [_lead_lag_correlation(funding, returns, lag) for lag in lags]
        plt.plot(lags, xcorr_values, 'o-', color=color, label=f'{instrument.capitalize()} Returns')

    plt.axvline(x=0, color='k', linestyle='--', alpha=0.5)
    plt.axhline(y=0, color='k', linestyle='--', alpha=0.5)
    plt.grid(True, alpha=0.3)
    plt.xlabel('Lag (Negative: Returns lead Funding, Positive: Funding leads Returns)')
    plt.ylabel('Cross-Correlation')
    plt.title('Cross-Correlation: Funding Rate vs. Returns')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Heat map: funding leading each available return metric by 1..max_lag.
    metrics = ['returns', 'future_1d_returns', 'future_3d_returns', 'future_5d_returns', 'future_10d_returns']
    heatmap_lags = range(1, max_lag + 1)

    xcorr_matrix = pd.DataFrame(index=heatmap_lags)

    for instrument in instruments:
        for metric in metrics:
            col_name = f'{instrument}_{metric}'
            if col_name in df.columns:
                xcorr_matrix[col_name] = [
                    _lead_lag_correlation(funding, df[col_name], lag) for lag in heatmap_lags
                ]

    # sns.heatmap cannot draw a frame without rows or columns.
    if not xcorr_matrix.empty:
        plt.figure(figsize=(15, 10))
        sns.heatmap(xcorr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
        plt.title('Cross-Correlation: Funding Rate vs. Returns at Different Lags')
        plt.xlabel('Return Metrics')
        plt.ylabel('Lag (Days)')
        plt.tight_layout()
        plt.show()

    return xcorr_matrix


def _lead_lag_correlation(funding, returns, lag):
    """Return corr(funding[t], returns[t + lag]); NaN with 10 or fewer pairs or a constant side."""
    # The offset has to be applied with shift(): slicing the two series and
    # handing them to the DataFrame constructor re-aligns them on the shared
    # index, which silently undoes the offset and yields the lag-0 value.
    paired = pd.DataFrame({'funding': funding, 'returns': returns.shift(-lag)}).dropna()
    if len(paired) <= 10:
        return np.nan
    if not (paired['funding'].std() > 0 and paired['returns'].std() > 0):
        return np.nan
    return paired['funding'].corr(paired['returns'])

# Analyze autocorrelation of various metrics
def analyze_autocorrelation(df, lags=20):
    """Analyze and visualize autocorrelation of funding rates, premiums, and returns.

    For each metric column that exists in ``df`` (funding rate, prompt APR,
    funding-prompt spread and the spot / perp / prompt returns) this:

    * plots the ACF and PACF up to ``lags`` when the series has more than
      ``lags + 2`` non-null values and non-zero spread, otherwise writes a
      placeholder message into the two panels;
    * runs a Ljung-Box test at lag 10 and prints the results table;
    * prints the autocorrelation coefficients at lags 1, 3, 5 and 10.

    Args:
        df: Frame holding any of the metric columns.
        lags: Number of lags shown in the ACF / PACF plots.

    Returns:
        ``(results_table, autocorr_table)``: the Ljung-Box table (columns
        ``Metric``, ``Test Statistic``, ``p-value``, ``Significant at 5%``)
        and the coefficient table (columns ``Metric``, ``Lag-1``, ``Lag-3``,
        ``Lag-5``, ``Lag-10``).
    """
    from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
    from statsmodels.stats.diagnostic import acorr_ljungbox

    # List of metrics to analyze
    metrics = [
        ('funding_annualized', 'Funding Rate'),
        ('prompt_apr', 'Prompt APR'),
        ('funding_prompt_spread', 'Funding-Prompt Spread'),
        ('spot_returns', 'Spot Returns'),
        ('perp_returns', 'Perpetual Returns'),
        ('prompt_returns', 'Prompt Returns')
    ]
    # Non-null series of every metric that is actually available.
    available = [(title, df[col].dropna()) for col, title in metrics if col in df.columns]

    # One row of (ACF, PACF) panels per metric; rows of absent metrics stay empty.
    _, axes = plt.subplots(len(metrics), 2, figsize=(16, 4*len(metrics)))
    centered = {'horizontalalignment': 'center', 'verticalalignment': 'center'}

    for i, (col, title) in enumerate(metrics):
        if col not in df.columns:
            continue
        series = df[col].dropna()
        acf_ax, pacf_ax = axes[i, 0], axes[i, 1]

        # Only plot if we have enough data and variation
        if not (len(series) > lags + 2 and series.std() > 0):
            acf_ax.text(0.5, 0.5, "Insufficient data for ACF",
                        transform=acf_ax.transAxes, **centered)
            pacf_ax.text(0.5, 0.5, "Insufficient data for PACF",
                         transform=pacf_ax.transAxes, **centered)
            continue

        panels = (
            (plot_acf, acf_ax, 'Autocorrelation', 'ACF'),
            (plot_pacf, pacf_ax, 'Partial Autocorrelation', 'PACF'),
        )
        for plotter, ax, heading, short in panels:
            try:
                plotter(series, lags=lags, ax=ax, title=f'{heading}: {title}')
                ax.set_xlabel('Lag')
                ax.set_ylabel(short)
            except Exception as e:
                # Report the failure inside the panel instead of aborting.
                ax.text(0.5, 0.5, f"Error plotting {short}: {str(e)}",
                        transform=ax.transAxes, **centered)

    plt.tight_layout()
    plt.show()

    # Statistical significance test for autocorrelation
    print("\nLjung-Box Test for Autocorrelation:")
    print("H0: Data is independently distributed (no autocorrelation)")
    print("H1: Data exhibits autocorrelation\n")

    ljung_rows = []
    for title, series in available:
        try:
            result = acorr_ljungbox(series, lags=[10], return_df=True)
        except Exception as exc:
            # Leave the metric out of the table, but say so.
            print(f"Skipping Ljung-Box test for {title}: {exc}")
            continue

        p_value = result['lb_pvalue'].iloc[0]
        ljung_rows.append({
            'Metric': title,
            'Test Statistic': round(result['lb_stat'].iloc[0], 3),
            'p-value': round(p_value, 4),
            'Significant at 5%': 'Yes' if p_value < 0.05 else 'No',
        })

    results_table = pd.DataFrame(
        ljung_rows,
        columns=['Metric', 'Test Statistic', 'p-value', 'Significant at 5%']
    )
    print(results_table.to_string(index=False))

    # Calculate autocorrelation at specific lags for metrics
    print("\nAutocorrelation Coefficients at Selected Lags:")

    autocorr_rows = []
    for title, series in available:
        row = {'Metric': title}
        for lag in (1, 3, 5, 10):
            # A lag needs more observations than its own length.
            coefficient = series.autocorr(lag=lag) if len(series) > lag else np.nan
            row[f'Lag-{lag}'] = round(coefficient, 3)
        autocorr_rows.append(row)

    autocorr_table = pd.DataFrame(
        autocorr_rows,
        columns=['Metric', 'Lag-1', 'Lag-3', 'Lag-5', 'Lag-10']
    )
    print(autocorr_table.to_string(index=False))

    return results_table, autocorr_table

# Main function to run all analyses
def main(file_path='../data/markets/BTCUSDT_1d.csv', period=1):
    """Run the full analysis pipeline for one forward-return horizon.

    Loads the data, computes the ``period``-day forward returns and then
    calls every analysis step in turn, printing a progress line before each.

    Returns:
        Dict with the keys ``df``, ``df_with_next``, ``correlations``,
        ``gc_results``, ``spread_correlations``, ``strategy_results``,
        ``autocorrelation_results`` and ``autocorrelation_coeffs``.
    """
    print(f"Loading data from {file_path}...")
    market, market_next = load_and_preprocess_data(file_path)

    print(f"Calculating {period}-day future returns...")
    market, market_next = calculate_future_returns(market, periods=[period])

    # Results are collected as each step finishes.
    report = {'df': market, 'df_with_next': market_next}

    print("\nAnalyzing relationship between funding rate and future returns...")
    report['correlations'] = analyze_funding_vs_returns(market, market_next, period=period)

    print("\nVisualizing time series relationship...")
    analyze_time_series_relationship(market, period=period)

    print("\nTesting Granger causality...")
    report['gc_results'] = test_granger_causality(market, max_lag=10)

    print("\nAnalyzing relationship between funding-prompt spread and future returns...")
    report['spread_correlations'] = analyze_funding_prompt_vs_returns(market, period=period)

    print("\nAnalyzing returns by funding rate levels...")
    analyze_returns_by_funding_levels(market, period=period)

    print("\nVisualizing combined strategy...")
    report['strategy_results'] = visualize_combined_strategy(market, period=period)

    print("\nAnalyzing autocorrelation of key metrics...")
    ljung_box, coefficients = analyze_autocorrelation(market, lags=20)
    report['autocorrelation_results'] = ljung_box
    report['autocorrelation_coeffs'] = coefficients

    print("\nAnalyzing cross-correlation between funding and future returns...")
    analyze_cross_correlation(market, max_lag=20)

    print("\nAnalysis complete!")

    return report

# If run as script
if __name__ == "__main__":
    # Forward-return horizon, in days, used for this run.
    analysis_period = 3

    results = main(period=analysis_period)

    # Heading and result key of each correlation table printed at the end.
    sections = (
        (f"\nCorrelations between funding rate and {analysis_period}-day future returns:",
         'correlations'),
        (f"\nCorrelations between funding-prompt spread and {analysis_period}-day future returns:",
         'spread_correlations'),
    )
    for heading, key in sections:
        print(heading)
        for instrument, stats in results[key].items():
            print(f"  {instrument.capitalize()}: {stats['correlation']:.3f} (p-value: {stats['p_value']:.3f})")