In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.stattools import adfuller, acf, pacf, q_stat
from statsmodels.stats.stattools import durbin_watson
import warnings

warnings.simplefilter(action='ignore', category='FutureWarning')

In [None]:
# Functions
def check_stationarity(returns):
    """
    Run an Augmented Dickey-Fuller (ADF) test on a series and report the outcome.

    Missing values are removed first, the test summary is printed, and a verdict
    is printed using a fixed 5% p-value cut-off.

    Parameters:
    - returns: pandas object exposing ``dropna()`` (e.g. a Series of returns).

    Returns:
    - result: Dictionary with the keys 'ADF Statistic', 'p-value',
      'Critical Values', 'Used Lag' and 'Number of Observations'.
    """
    # NaNs (e.g. left over from differencing) must be removed before testing
    cleaned = returns.dropna()

    # Positions 0-4 of the adfuller output are picked up in order:
    # statistic, p-value, used lag, number of observations, critical values
    statistic, p_value, used_lag, n_obs, critical_values = adfuller(cleaned)[:5]

    result = {
        'ADF Statistic': statistic,
        'p-value': p_value,
        'Critical Values': critical_values,
        'Used Lag': used_lag,
        'Number of Observations': n_obs,
    }

    # Report the test summary
    print(f'ADF Statistic: {statistic}')
    print(f'p-value: {p_value}')
    for level, threshold in critical_values.items():
        print(f'Critical Value ({level}): {threshold}')

    # A p-value above 0.05 means the unit-root null is not rejected
    rejects_unit_root = not (p_value > 0.05)
    print(
        "The time series is likely stationary."
        if rejects_unit_root
        else "The time series is likely non-stationary."
    )

    return result


def find_best_lag_and_rho_for_ar(returns, max_lag=30):
    """
    Select an AutoRegressive (AR) lag order by BIC and return the autocorrelations
    (rho values) up to that lag.

    Parameters:
    - returns: Pandas Series (or single-column DataFrame) of returns to model.
    - max_lag: Largest AR order to try; candidate orders are 1..max_lag (default 30).

    Returns:
    - best_lag: The candidate lag with the lowest BIC (0 if no fit produced a
      BIC below infinity, e.g. all BICs were NaN).
    - rho_values: List of autocorrelation coefficients for lags 1..best_lag.

    Raises:
    - ValueError: If max_lag is smaller than 1.
    """
    if max_lag < 1:
        raise ValueError("max_lag must be at least 1")

    # Drop NaN values, as check_stationarity does, so the fits and the ACF
    # are computed on actual observations only
    series = returns.dropna() if hasattr(returns, "dropna") else returns

    best_lag = 0
    best_bic = np.inf

    for lag in range(1, max_lag + 1):
        # hold_back=max_lag makes every candidate use the same estimation sample;
        # without it each lag drops a different number of initial observations
        # and the BIC values are not comparable across lags.
        bic = AutoReg(series, lags=lag, hold_back=max_lag).fit().bic
        if bic < best_bic:
            best_bic = bic
            best_lag = lag

    # Autocorrelations for lags 1..best_lag (index 0 is lag 0 and is skipped)
    acf_values = acf(series, nlags=best_lag)
    rho_values = acf_values[1:best_lag + 1].tolist()

    return best_lag, rho_values


def determine_rho_and_lag(time_series, nlags=20, alpha=0.05):
    """
    Determine the significant autocorrelations (rho values) of a time series
    and the first lag at which one occurs, for use when unsmoothing.

    Parameters:
    - time_series: Pandas Series (or single-column DataFrame) containing the data.
    - nlags: Number of lags to consider for the ACF.
    - alpha: Significance level used for the ACF confidence intervals.

    Returns:
    - rho_values: Array of autocorrelation coefficients at the significant lags
      (empty if none are significant).
    - best_lag: The smallest lag (>= 1) with a significant autocorrelation,
      or None if there is none.

    Raises:
    - ValueError: If a DataFrame with more than one column is passed.
    """
    # Ensure time_series is a Series
    if isinstance(time_series, pd.DataFrame):
        if time_series.shape[1] != 1:
            raise ValueError("DataFrame must have only one column")
        time_series = time_series.iloc[:, 0]

    acf_values, acf_confint = acf(time_series, nlags=nlags, alpha=alpha, fft=False)

    # The returned interval is centred on the estimate itself, so comparing the
    # estimate against its own bounds can never flag anything. A lag is
    # significant when its interval excludes zero.
    excludes_zero = (acf_confint[:, 0] > 0) | (acf_confint[:, 1] < 0)
    significant_lags = np.flatnonzero(excludes_zero)
    # Lag 0 is the series correlated with itself (always 1) and carries no information
    significant_lags = significant_lags[significant_lags > 0]

    rho_values = acf_values[significant_lags]
    best_lag = int(significant_lags[0]) if significant_lags.size > 0 else None

    return rho_values, best_lag


def unsmooth_returns(returns_series, rho, order=1):
    """
    Adjusts a series of returns for smoothing as per Geltner (1993) and Okunev & White (2003).

    For each lag i in 1..order the term rho_i / (1 - rho_i) * (r_t - r_{t-i}) is
    added to the reported return r_t. For order=1 this is algebraically
    (r_t - rho_1 * r_{t-1}) / (1 - rho_1).

    Parameters:
    - returns_series: Pandas Series (or DataFrame) containing the smoothed returns.
    - rho: Sequence (list, tuple, NumPy array, ...) of autocorrelation
      coefficients, one per lag; only the first `order` entries are used.
    - order: The order of the autocorrelation (number of lags to adjust for).

    Returns:
    - unsmoothed_returns: The unsmoothed returns, with the leading rows that
      have no lagged value (NaN after shifting) dropped. The input is not modified.

    Raises:
    - ValueError: If rho is not a sequence of coefficients or has fewer than
      `order` entries.
    """
    message = "rho must be a sequence with at least `order` autocorrelation coefficients"
    # Strings are iterable but are never valid coefficient sequences
    if isinstance(rho, (str, bytes)):
        raise ValueError(message)
    try:
        # Accept any iterable of coefficients, e.g. the NumPy array returned by
        # determine_rho_and_lag, not just a plain list
        coefficients = list(rho)
    except TypeError:
        raise ValueError(message) from None
    if len(coefficients) < order:
        raise ValueError(message)

    # Work on a copy so the caller's data is left untouched
    unsmoothed_returns = returns_series.copy()
    for lag, coefficient in zip(range(1, order + 1), coefficients):
        weight = coefficient / (1 - coefficient)
        unsmoothed_returns = unsmoothed_returns + weight * (returns_series - returns_series.shift(lag))

    # Drop NaN values resulting from shifting
    return unsmoothed_returns.dropna()



In [None]:
# Load the data: the first CSV column is used as the index and parsed as dates,
# then the frame is conformed to a monthly ('M') datetime frequency
df_hfri = pd.read_csv(
    r'index_ror_perf_download-2899_1705242702.csv',
    index_col=0,
    parse_dates=True,
).asfreq('M')


In [None]:
# Test for stationarity: runs the ADF test on the loaded data, prints the summary
# and verdict, and keeps the returned result dictionary
stationarity_results = check_stationarity(df_hfri)

In [None]:
# Find the best lag for the AR model (by BIC) and the rho values up to that lag.
# NOTE(review): this is intended for a stationary series, but the call is
# unconditional — stationarity_results is not checked here.
best_lag, rho_values = find_best_lag_and_rho_for_ar(df_hfri)
print(f"Best Lag: {best_lag}")
print(f"Rho Values: {rho_values}")

In [None]:
# Remove the autocorrelations (unsmooth the returns) using the rho values found
# above, with the adjustment order set to the selected best lag
unsmoothed_series = unsmooth_returns(df_hfri, rho_values, order=best_lag)

In [None]:
# Calculate the annualized volatility (monthly std * sqrt(12), in percent, rounded
# to 2 decimals) of the first column, before and after the unsmoothing adjustment
reported_vol = np.round((df_hfri.iloc[:, 0].std() * np.sqrt(12)) * 100, 2)
adjusted_vol = np.round((unsmoothed_series.iloc[:, 0].std() * np.sqrt(12)) * 100, 2)

# NOTE(review): `ia` is not imported in any visible cell of this notebook; these
# two lines raise NameError unless the module providing `annualize_returns` is
# imported first — TODO confirm which module this is and add it to the import cell.
reported_annualized_return = ia.annualize_returns(df_hfri, 'M').iloc[:, 0].values[0]
actual_annualized_return = ia.annualize_returns(unsmoothed_series, 'M').iloc[:, 0].values[0]

# The reported series is the smoothed one, so the first figure of each pair is
# labelled "Before Adjustment" (it is not a pre-smoothing value)
print(f'Annualized Volatility Before Adjustment: {reported_vol}')
print(f'Annualized Volatility After Adjustment: {adjusted_vol}')

print(f"Annualized Return Before Adjustment: {reported_annualized_return}")
print(f"Annualized Return After Adjustment: {actual_annualized_return}")