# In [None]:
import pandas as pd

def build_feature_dataset(
    input_paths: list[str],
    output_path: str,
    region: str,
    cols: list[str],
    freq: str = "30min",
    plot: bool = True
) -> tuple[pd.DataFrame, pd.Series]:
    """
    Build a single-region feature table and an imbalance-minus-spot target.

    The pipeline is: load every Parquet file, normalise its index to
    Asia/Tokyo, resample to `freq` with the mean, cut all frames to their
    common time range, concatenate them column-wise, select `region` and
    `cols`, crop to the span where both price columns are present,
    forward-fill single-step gaps, add time features, and compute the target.

    Args:
        input_paths:    Iterable of Parquet file paths. Each file must load into a
                        DataFrame whose index is convertible to datetimes and whose
                        columns are a MultiIndex with levels [region, variable_name].
        output_path:    Passed as `document_name` to `generate_feature_report(...)`
                        when `plot` is True; unused otherwise.
        region:         The first-level column key (region) to subset by after
                        concatenation.
        cols:           Variable names (second-level columns) to keep for `region`.
                        Must include both price columns used for the target:
                        "pri_imb_down_%_kwh_jst_min30_a" and
                        "pri_spot_jepx_%_kwh_jst_min30_a".
        freq:           A pandas offset alias (e.g. "30min", "15min") used to resample
                        each DataFrame. Default is "30min". (The older "30T" spelling
                        is deprecated since pandas 2.2.)
        plot:           If True, calls `generate_feature_report(...)` on the final
                        feature set.

    Returns:
        feat:   The cropped, forward-filled DataFrame restricted to `cols`, after
                `construct_time_features(feat)` has been called on it.
                NOTE(review): the return value of `construct_time_features` is not
                used here, so any time features appear in `feat` only if that helper
                mutates its argument in place -- confirm against its definition.
        tar:    A pd.Series named "Imbalance_Minus_Spot", aligned with `feat.index`,
                containing (imbalance_price - spot_price) at each timestamp. It can
                still contain NaN where a gap longer than one step was not filled.

    Raises:
        ValueError: If `input_paths` is empty, the files share no overlapping time
                    range, a target column is duplicated across the input files, or
                    no timestamp has both target columns non-NaN.
        KeyError:   If `region`, any of `cols`, or either target column is missing.
    """
    imb_col = "pri_imb_down_%_kwh_jst_min30_a"
    spot_col = "pri_spot_jepx_%_kwh_jst_min30_a"

    # ----------------------------------------------------------------------------------
    # Helper for timezone-normalization + resampling
    # ----------------------------------------------------------------------------------
    def _load_and_resample_one(path: str, freq_rule: str) -> pd.DataFrame:
        """
        Load one Parquet file, normalise its index to Asia/Tokyo, and resample
        to `freq_rule` using .mean().
        """
        # read_parquet returns a fresh frame, so re-assigning its index is safe
        # without an extra defensive copy.
        df = pd.read_parquet(path)
        df.index = pd.to_datetime(df.index)

        # tz-naive -> assumed to already be JST, so only attach the zone.
        # tz-aware (e.g. UTC) -> convert the instants to Asia/Tokyo.
        if df.index.tz is None:
            df.index = df.index.tz_localize("Asia/Tokyo")
        else:
            df.index = df.index.tz_convert("Asia/Tokyo")

        # Each `freq_rule` bucket is replaced by the mean of the rows inside it.
        return df.resample(freq_rule).mean()

    # ----------------------------------------------------------------------------------
    # 1) Validate the inputs, then load + resample each file
    # ----------------------------------------------------------------------------------
    # Materialise first so that an empty generator is rejected with the intended
    # message and nothing is read from disk before validation.
    paths = list(input_paths)
    if not paths:
        raise ValueError("`input_paths` must contain at least one parquet file.")

    loaded_dfs = [_load_and_resample_one(path, freq) for path in paths]

    # ----------------------------------------------------------------------------------
    # 2) Find the common date-range: [latest_start, earliest_end]
    # ----------------------------------------------------------------------------------
    latest_start = max(df.index.min() for df in loaded_dfs)
    earliest_end = min(df.index.max() for df in loaded_dfs)

    if latest_start >= earliest_end:
        raise ValueError(
            "No overlapping time‐range found among the loaded files. "
            f"latest_start={latest_start}, earliest_end={earliest_end}"
        )

    # ----------------------------------------------------------------------------------
    # 3) Truncate to the common range and 4) concatenate side-by-side (axis=1)
    # ----------------------------------------------------------------------------------
    # Label-based slicing on a DatetimeIndex is inclusive on both ends.
    aligned_dfs = [df.loc[latest_start:earliest_end] for df in loaded_dfs]
    concatenated = pd.concat(aligned_dfs, axis=1)

    # ----------------------------------------------------------------------------------
    # 5) Subset by region (first level) and then by `cols` (second level)
    # ----------------------------------------------------------------------------------
    try:
        df_region = concatenated[region]
    except KeyError as err:
        # Chain the cause so the underlying lookup failure stays visible.
        raise KeyError(
            f"Region '{region}' not found in the concatenated columns."
        ) from err

    missing = [c for c in cols if c not in df_region.columns]
    if missing:
        raise KeyError(f"The following requested columns are not present for region {region}: {missing}")

    df_region = df_region[cols]

    # ----------------------------------------------------------------------------------
    # 6) Find the first+last index where BOTH spot & imbalance are non-NaN
    # ----------------------------------------------------------------------------------
    # Both price columns must have been requested, or the target cannot be formed.
    if imb_col not in df_region.columns or spot_col not in df_region.columns:
        raise KeyError(
            f"Cannot find both target columns ('{imb_col}' and '{spot_col}') in df_region. "
            f"Got columns={list(df_region.columns)}"
        )

    # If the same target column came from more than one file, selecting it yields a
    # DataFrame and the mask logic below fails with an opaque "ambiguous truth value"
    # error; report the real cause instead.
    duplicated_names = set(df_region.columns[df_region.columns.duplicated()])
    dup_targets = [c for c in (imb_col, spot_col) if c in duplicated_names]
    if dup_targets:
        raise ValueError(
            f"Target column(s) {dup_targets} appear more than once for region "
            f"'{region}'; each target column must be unique across the input files."
        )

    both_valid = df_region[imb_col].notna() & df_region[spot_col].notna()
    if not both_valid.any():
        raise ValueError(
            f"No timestamp exists where both '{imb_col}' and '{spot_col}' are non‐NaN."
        )

    # Crop so that the first and the last row both have the two prices present.
    valid_times = df_region.index[both_valid]
    df_region = df_region.loc[valid_times.min():valid_times.max()]

    # ----------------------------------------------------------------------------------
    # 7) Forward-fill remaining NaNs, bridging at most one consecutive missing step
    # ----------------------------------------------------------------------------------
    # ffill returns a new frame, so later in-place changes do not touch `concatenated`.
    df_region = df_region.ffill(limit=1)

    # ----------------------------------------------------------------------------------
    # 8) Construct time-features
    #    (`construct_time_features` is expected to be defined/imported elsewhere.)
    # ----------------------------------------------------------------------------------
    # NOTE(review): the result is discarded -- presumably the helper adds its columns
    # to `df_region` in place; verify against its implementation.
    construct_time_features(df_region)

    # ----------------------------------------------------------------------------------
    # 9) Create the target series: "Imbalance_Minus_Spot"
    # ----------------------------------------------------------------------------------
    tar = df_region[imb_col] - df_region[spot_col]
    tar.name = "Imbalance_Minus_Spot"

    # ----------------------------------------------------------------------------------
    # 10) Optionally generate a feature report
    #    (`generate_feature_report` is expected to be defined/imported elsewhere.)
    # ----------------------------------------------------------------------------------
    if plot:
        generate_feature_report(
            features=df_region,
            target=tar,
            document_name=output_path,
            name="Features"
        )

    # ----------------------------------------------------------------------------------
    # 11) Return the final feature-DataFrame and the target-Series
    # ----------------------------------------------------------------------------------
    return df_region, tar