In [1]:
import pandas as pd
from pathlib import Path

In [2]:
EXOG_PATH = Path("../data/exogenous/Co2CoalTTF.xlsx")


def prepare_exog(path=EXOG_PATH, value_cols=("TTF", "co2", "coal")):
    """
    Load the exogenous price workbook and expand it to a gap-free daily frame.

    Parameters
    ----------
    path : path-like, default EXOG_PATH
        Excel file with a ``date`` column and the columns named in ``value_cols``.
        Pre-computed ``year`` / ``month`` / ``day`` columns are optional; they
        are discarded and recomputed from ``date`` either way.
    value_cols : sequence of str, default ("TTF", "co2", "coal")
        Columns that are forward-filled across the inserted calendar days.

    Returns
    -------
    pandas.DataFrame
        Columns ``date, year, month, day`` followed by ``value_cols``, one row
        per calendar day between the first and last date found in the file.

    Raises
    ------
    ValueError
        If the file contains no row with a usable date.
    """
    value_cols = list(value_cols)

    # Load
    exog_raw = pd.read_excel(path)

    # Strip any time-of-day so every observation lands exactly on the daily grid
    # built below; rows without a parseable date cannot be placed on it at all.
    exog_raw["date"] = pd.to_datetime(exog_raw["date"]).dt.normalize()
    exog_raw = exog_raw.dropna(subset=["date"]).sort_values("date", kind="stable")

    if exog_raw.empty:
        raise ValueError(f"No dated rows found in exogenous file: {path}")

    # reindex() refuses duplicate labels, so keep one row per date (the last one
    # in file order, thanks to the stable sort above) and say so.
    n_dupes = int(exog_raw["date"].duplicated().sum())
    if n_dupes:
        print(f"Dropping {n_dupes} duplicate date rows (keeping the last per date)")
        exog_raw = exog_raw.drop_duplicates(subset="date", keep="last")

    # Full daily range between first and last date
    full_idx = pd.date_range(
        start=exog_raw["date"].min(),
        end=exog_raw["date"].max(),
        freq="D",
    )

    # Identify missing dates (for logging / sanity check)
    missing_dates = full_idx.difference(pd.DatetimeIndex(exog_raw["date"]))
    print(f"Number of missing days in original exog: {len(missing_dates)}")
    if len(missing_dates) > 0:
        print("First few missing dates:", missing_dates[:10])

    # Reindex to the full daily index. The calendar columns are dropped first
    # (tolerating their absence) because they are recomputed after reindexing.
    exog = (
        exog_raw
        .drop(columns=["year", "month", "day"], errors="ignore")
        .set_index("date")
        .reindex(full_idx)
    )

    # Forward fill value columns so inserted days carry the last observed price
    exog[value_cols] = exog[value_cols].ffill()

    # Recreate date + calendar breakdown; naming the axis explicitly avoids
    # depending on reset_index() producing a column called "index".
    exog = exog.rename_axis("date").reset_index()
    exog["year"] = exog["date"].dt.year
    exog["month"] = exog["date"].dt.month
    exog["day"] = exog["date"].dt.day

    # Fixed column order: calendar fields first, then the value columns
    return exog[["date", "year", "month", "day", *value_cols]]

In [3]:
# Build the gap-free daily exogenous frame, then show its first and last rows.
exog = prepare_exog()
for preview in (exog.head(), exog.tail()):
    print(preview)

Number of missing days in original exog: 0
        date  year  month  day     TTF   co2       coal
0 2015-01-01  2015      1    1  21.296  7.24  55.105750
1 2015-01-02  2015      1    2  20.448  6.99  54.611347
2 2015-01-03  2015      1    3  20.448  6.99  54.611347
3 2015-01-04  2015      1    4  20.448  6.99  54.611347
4 2015-01-05  2015      1    5  20.145  6.89  53.335009
           date  year  month  day     TTF    co2        coal
3648 2024-12-27  2024     12   27  47.731  69.70  107.674881
3649 2024-12-28  2024     12   28  47.731  69.70  107.674881
3650 2024-12-29  2024     12   29  47.731  69.70  107.674881
3651 2024-12-30  2024     12   30  47.633  69.95  108.234120
3652 2024-12-31  2024     12   31  48.889  71.48  110.584100


In [4]:
# Persist the forward-filled daily frame; the positional index is not written.
exog.to_parquet(
    path="../data/exogenous/Co2CoalTTF_daily_ffill.parquet",
    index=False,
)


## Helper: run ADF + KPSS and summarize

In [6]:
from statsmodels.tsa.stattools import adfuller, kpss
import numpy as np

def unit_root_tests(series, name, adf_reg="c", kpss_reg="c", alpha=0.05):
    """
    Run ADF (H0: unit root) and KPSS (H0: stationary) on a 1D series.

    Parameters
    ----------
    series : array-like
        Observations; NaN and +/-inf entries are discarded before testing.
    name : str
        Label stored under the "series" key of the result.
    adf_reg, kpss_reg : str
        Deterministic terms passed as ``regression=`` to ``adfuller`` and
        ``kpss`` respectively.
    alpha : float, default 0.05
        Significance level used by the combined decision rule.

    Returns
    -------
    dict
        Keys: series, nobs, adf_stat, adf_pvalue, kpss_stat, kpss_pvalue,
        decision. When the series cannot be tested (no usable values, or all
        values identical) the statistics are NaN and ``decision`` states why.

    Notes
    -----
    statsmodels interpolates the KPSS p-value from a look-up table and reports
    the boundary value when the statistic falls outside it (the
    InterpolationWarning seen in this notebook's output), so KPSS p-values of
    exactly 0.01 / 0.10 are bounds rather than point estimates.
    """
    s = pd.Series(series).dropna().astype(float)
    # Infinite values would break both test regressions; treat them as missing.
    s = s[np.isfinite(s)]

    def _untested(reason):
        # Placeholder row with the same keys as a real result.
        return {
            "series": name,
            "nobs": int(len(s)),
            "adf_stat": np.nan,
            "adf_pvalue": np.nan,
            "kpss_stat": np.nan,
            "kpss_pvalue": np.nan,
            "decision": reason,
        }

    if s.empty:
        return _untested("no data")
    if s.nunique() < 2:
        # A constant series (including a single observation) has no variation to
        # test; recent statsmodels releases reject such input in adfuller.
        return _untested("constant series")

    # ADF: H0 = unit root
    adf_res = adfuller(s, regression=adf_reg, autolag="AIC")
    adf_stat, adf_pvalue = adf_res[0], adf_res[1]

    # KPSS: H0 = stationary
    # nlags="auto" (Bartlett kernel + automatic bandwidth)
    kpss_res = kpss(s, regression=kpss_reg, nlags="auto")
    kpss_stat, kpss_pvalue = kpss_res[0], kpss_res[1]

    # Combine both tests: a verdict is only given when they agree; anything
    # else (including a p-value exactly equal to alpha) is reported as mixed.
    if (adf_pvalue < alpha) and (kpss_pvalue > alpha):
        decision = "stationary (no unit root)"
    elif (adf_pvalue > alpha) and (kpss_pvalue < alpha):
        decision = "unit root / non-stationary"
    else:
        decision = "inconclusive / mixed"

    return {
        "series": name,
        "nobs": int(len(s)),
        "adf_stat": adf_stat,
        "adf_pvalue": adf_pvalue,
        "kpss_stat": kpss_stat,
        "kpss_pvalue": kpss_pvalue,
        "decision": decision,
    }


def unit_root_table(df, cols, adf_reg="c", kpss_reg="c"):
    """
    Apply unit_root_tests to multiple columns of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the series to test.
    cols : iterable of str
        Column names in ``df``; each yields one row of the result. May be empty.
    adf_reg, kpss_reg : str
        Forwarded unchanged to ``unit_root_tests``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``cols`` with the columns series, nobs, adf_stat,
        adf_pvalue, kpss_stat, kpss_pvalue, decision (in that order).
    """
    summary_columns = [
        "series",
        "nobs",
        "adf_stat",
        "adf_pvalue",
        "kpss_stat",
        "kpss_pvalue",
        "decision",
    ]
    results = [
        unit_root_tests(df[col], col, adf_reg=adf_reg, kpss_reg=kpss_reg)
        for col in cols
    ]
    # Passing the column list to the constructor (instead of selecting after the
    # fact) keeps the schema intact even when `cols` is empty.
    return pd.DataFrame(results, columns=summary_columns)


## Apply to endogenous variables (rolling TSI)

In [7]:
import pandas as pd
from pathlib import Path

# Folder holding the rolling-TSI parquet inputs, and the folder used for the
# differenced output written later in the notebook.
BASE_TSI_DIR = Path("../data/endogenous/prices")
OUT_DIR = Path("../data/endogenous/prices/differenced")

# Series label -> parquet file containing that rolling TSI series.
tsi_files = dict(
    tsi_mhar_recov_neg=BASE_TSI_DIR.joinpath("rolling_tsi_mhar_recov_neg_recalculated.parquet"),
    tsi_mhar_recov_pos=BASE_TSI_DIR.joinpath("rolling_tsi_mhar_recov_pos_recalculated.parquet"),
    tsi_mhar_recov=BASE_TSI_DIR.joinpath("rolling_tsi_mhar_recov_recalculated.parquet"),
    tsi_mhar_revar=BASE_TSI_DIR.joinpath("rolling_tsi_mhar_revar_recalculated.parquet"),
)

def load_tsi_df(path: Path, series_name: str) -> pd.DataFrame:
    """
    Load a rolling TSI parquet file and return it as a tidy two-column frame.

    The file is expected to hold the date either as the index or as a column
    named 'date', plus the TSI value column (e.g. 'TSI_recalculated').

    Parameters
    ----------
    path : Path
        Parquet file to read.
    series_name : str
        Name given to the value column in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ['date', series_name], sorted by date with a fresh RangeIndex.
        If the file holds several non-date columns, only the first is kept.

    Raises
    ------
    ValueError
        If the file has no column besides the date.
    """
    df = pd.read_parquet(path)

    if "date" in df.columns:
        # Use the column by name: it is not guaranteed to be the first one.
        date_col = "date"
    else:
        # Date lives in the index; after reset_index it is the first column.
        df = df.reset_index()
        date_col = df.columns[0]

    value_candidates = [col for col in df.columns if col != date_col]
    if not value_candidates:
        raise ValueError(f"No value column found next to the date in: {path}")
    value_col = value_candidates[0]

    # Keep exactly the two documented columns so later merges on "date" do not
    # pick up stray columns from the source file.
    out = df[[date_col, value_col]].rename(
        columns={date_col: "date", value_col: series_name}
    )
    out["date"] = pd.to_datetime(out["date"])
    return out.sort_values("date").reset_index(drop=True)


In [8]:
# Test every rolling TSI series in levels: print a one-row summary per series
# and collect the rows for a combined table at the end.
all_tsi_results = []
combined = None  # outer merge of all series on date, built up across iterations

for series_name, fpath in tsi_files.items():
    print(f"\n=== {series_name} ===")
    df_tsi = load_tsi_df(fpath, series_name)

    # One-row unit-root summary for this series, tagged with its TSI type
    tsi_unit_root = unit_root_table(
        df_tsi, [series_name], adf_reg="c", kpss_reg="c"
    )
    tsi_unit_root.insert(1, "tsi_type", series_name)
    print(tsi_unit_root.round(4))
    all_tsi_results.append(tsi_unit_root)

    # Extend the date-aligned frame holding all series seen so far
    combined = (
        df_tsi
        if combined is None
        else combined.merge(df_tsi, on="date", how="outer")
    )

# Summary table for all TSI tests
tsi_unit_root_all = pd.concat(all_tsi_results, ignore_index=True)
print("\n=== Unit-root tests for ALL rolling TSI series ===")
print(tsi_unit_root_all.round(4))

# `combined` is not used further in this cell; uncomment to inspect it:
# print(combined.head())



=== tsi_mhar_recov_neg ===


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_res = kpss(s, regression=kpss_reg, nlags="auto")
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_res = kpss(s, regression=kpss_reg, nlags="auto")
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_res = kpss(s, regression=kpss_reg, nlags="auto")


               series            tsi_type  nobs  adf_stat  adf_pvalue  \
0  tsi_mhar_recov_neg  tsi_mhar_recov_neg  1077   -0.7278      0.8394   

   kpss_stat  kpss_pvalue                    decision  
0     3.7854         0.01  unit root / non-stationary  

=== tsi_mhar_recov_pos ===
               series            tsi_type  nobs  adf_stat  adf_pvalue  \
0  tsi_mhar_recov_pos  tsi_mhar_recov_pos  1077   -0.6529      0.8586   

   kpss_stat  kpss_pvalue                    decision  
0     4.1123         0.01  unit root / non-stationary  

=== tsi_mhar_recov ===
           series        tsi_type  nobs  adf_stat  adf_pvalue  kpss_stat  \
0  tsi_mhar_recov  tsi_mhar_recov  1077   -1.3489      0.6065     3.9541   

   kpss_pvalue                    decision  
0         0.01  unit root / non-stationary  

=== tsi_mhar_revar ===
           series        tsi_type  nobs  adf_stat  adf_pvalue  kpss_stat  \
0  tsi_mhar_revar  tsi_mhar_revar  1077    -2.148      0.2256     3.4167   

   kpss_pv

look-up table. The actual p-value is smaller than the p-value returned.

  kpss_res = kpss(s, regression=kpss_reg, nlags="auto")


In [9]:
# 1) load the four TSI series and outer-merge them on date into one frame
dfs = [load_tsi_df(path, name) for name, path in tsi_files.items()]

tsi_all = dfs[0]
for other in dfs[1:]:
    tsi_all = tsi_all.merge(other, on="date", how="outer")
tsi_all = tsi_all.sort_values("date").reset_index(drop=True)

tsi_cols = list(tsi_all.columns)
tsi_cols.remove("date")

# 2) first differences, appended as d_<name> columns next to the levels
tsi_diff = tsi_all.assign(
    **{f"d_{col}": tsi_all[col].diff() for col in tsi_cols}
)
diff_cols = [f"d_{c}" for c in tsi_cols]

# 3) unit-root tests on the differenced columns only
tsi_diff_unit_root = unit_root_table(
    tsi_diff, diff_cols, adf_reg="c", kpss_reg="c"
)

print("\n=== Unit-root tests for FIRST DIFFERENCES of TSI ===")
print(tsi_diff_unit_root.round(4))


=== Unit-root tests for FIRST DIFFERENCES of TSI ===
                 series  nobs  adf_stat  adf_pvalue  kpss_stat  kpss_pvalue  \
0  d_tsi_mhar_recov_neg  1076  -13.9681         0.0     0.4728       0.0478   
1  d_tsi_mhar_recov_pos  1076  -11.4653         0.0     0.6835       0.0150   
2      d_tsi_mhar_recov  1076   -5.0242         0.0     0.5882       0.0237   
3      d_tsi_mhar_revar  1076  -11.8078         0.0     0.1067       0.1000   

                    decision  
0       inconclusive / mixed  
1       inconclusive / mixed  
2       inconclusive / mixed  
3  stationary (no unit root)  


look-up table. The actual p-value is greater than the p-value returned.

  kpss_res = kpss(s, regression=kpss_reg, nlags="auto")


In [10]:
# --- build combined endogenous DF ---
# Reload every TSI series and outer-merge them on date so this cell does not
# depend on frames left over from earlier cells.
dfs = []
for name, path in tsi_files.items():
    dfs.append(load_tsi_df(path, name))

tsi_all = dfs[0]
for df in dfs[1:]:
    tsi_all = tsi_all.merge(df, on="date", how="outer")

tsi_all = tsi_all.sort_values("date").reset_index(drop=True)

# --- first differences ---
# Output keeps only the date plus one d_<name> column per series (no levels).
tsi_cols = [c for c in tsi_all.columns if c != "date"]
tsi_diff = tsi_all[["date"]].copy()

for col in tsi_cols:
    tsi_diff[f"d_{col}"] = tsi_all[col].diff()

# drop rows where every diff is NaN (the first row always is)
tsi_diff = tsi_diff.dropna(how="all", subset=[c for c in tsi_diff.columns if c != "date"]).reset_index(drop=True)

# save -- create the output folder first so the write also succeeds when
# OUT_DIR does not exist yet (same pattern as the exogenous export below)
endog_path = OUT_DIR / "endogenous_tsi_diff.parquet"
endog_path.parent.mkdir(parents=True, exist_ok=True)
tsi_diff.to_parquet(endog_path, index=False)
print(f"Saved differenced endogenous TSI to: {endog_path}")
print(tsi_diff.head())

Saved differenced endogenous TSI to: ..\data\endogenous\prices\differenced\endogenous_tsi_diff.parquet
        date  d_tsi_mhar_recov_neg  d_tsi_mhar_recov_pos  d_tsi_mhar_recov  \
0 2022-05-21              0.565159              0.353851          0.318845   
1 2022-05-22              0.028779              0.053486          0.018242   
2 2022-05-23              0.017773              0.001561          0.009926   
3 2022-05-24             -0.010183             -0.026224         -0.020802   
4 2022-05-25              0.021521             -0.015650         -0.001137   

   d_tsi_mhar_revar  
0          0.464115  
1          0.090025  
2         -0.035204  
3         -0.033416  
4         -0.004788  


## Apply to exogenous variables

In [11]:
exog_cols = ["TTF", "co2", "coal"]

# For exogenous prices it often makes sense to allow for trend in KPSS
# (regression="ct"); the ADF regression keeps a constant only.
exog_unit_root = unit_root_table(
    df=exog,
    cols=exog_cols,
    adf_reg="c",
    kpss_reg="ct",
)

print(
    "\n=== Unit-root tests for exogenous variables (levels) ===",
    exog_unit_root.round(4),
    sep="\n",
)


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_res = kpss(s, regression=kpss_reg, nlags="auto")
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_res = kpss(s, regression=kpss_reg, nlags="auto")



=== Unit-root tests for exogenous variables (levels) ===
  series  nobs  adf_stat  adf_pvalue  kpss_stat  kpss_pvalue  \
0    TTF  3653   -2.3025      0.1712     0.4645         0.01   
1    co2  3653   -0.8487      0.8044     0.8999         0.01   
2   coal  3653   -1.6953      0.4335     0.4573         0.01   

                     decision  
0  unit root / non-stationary  
1  unit root / non-stationary  
2  unit root / non-stationary  


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_res = kpss(s, regression=kpss_reg, nlags="auto")


In [12]:
# First-difference every exogenous series; the d_<name> columns are appended to
# a copy of the level frame, leaving `exog` itself untouched.
exog_diff = exog.assign(
    **{f"d_{col}": exog[col].diff() for col in exog_cols}
)
diff_cols = [f"d_{c}" for c in exog_cols]

exog_diff_unit_root = unit_root_table(
    df=exog_diff,
    cols=diff_cols,
    adf_reg="c",
    kpss_reg="c",
)
print(
    "\n=== Unit-root tests for exogenous variables (first differences) ===",
    exog_diff_unit_root.round(4),
    sep="\n",
)


look-up table. The actual p-value is greater than the p-value returned.

  kpss_res = kpss(s, regression=kpss_reg, nlags="auto")
look-up table. The actual p-value is greater than the p-value returned.

  kpss_res = kpss(s, regression=kpss_reg, nlags="auto")



=== Unit-root tests for exogenous variables (first differences) ===
   series  nobs  adf_stat  adf_pvalue  kpss_stat  kpss_pvalue  \
0   d_TTF  3652  -11.0989         0.0     0.0274          0.1   
1   d_co2  3652  -18.1318         0.0     0.0547          0.1   
2  d_coal  3652  -13.4012         0.0     0.0511          0.1   

                    decision  
0  stationary (no unit root)  
1  stationary (no unit root)  
2  stationary (no unit root)  


look-up table. The actual p-value is greater than the p-value returned.

  kpss_res = kpss(s, regression=kpss_reg, nlags="auto")


In [14]:
# 1) date + differenced columns only, 2) without rows that have a NaN diff
#    (the first row), re-indexed from zero
diff_cols = [f"d_{c}" for c in exog_cols]

exog_diff_out = (
    exog[["date"]]
    .join(exog_diff[diff_cols])  # index-aligned, like column assignment
    .dropna(subset=diff_cols)
    .reset_index(drop=True)
)

# 3) write to parquet, creating the target folder if needed
out_path = Path("../data/exogenous/Co2CoalTTF_diff.parquet")
out_path.parent.mkdir(parents=True, exist_ok=True)
exog_diff_out.to_parquet(out_path, index=False)

print(f"Saved differenced exogenous variables to: {out_path}")
print(exog_diff_out.head())

Saved differenced exogenous variables to: ..\data\exogenous\Co2CoalTTF_diff.parquet
        date  d_TTF  d_co2    d_coal
0 2015-01-02 -0.848  -0.25 -0.494403
1 2015-01-03  0.000   0.00  0.000000
2 2015-01-04  0.000   0.00  0.000000
3 2015-01-05 -0.303  -0.10 -1.276338
4 2015-01-06 -0.607  -0.13 -0.597184
