In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
BASE_DIR = "./test-data/" 

In [None]:
def introduce_missing_data(df, missing_rate, seed=42):
    """
    Return a copy of ``df`` with a random subset of 'throughput_bps' set to NaN.

    Each row is masked independently with probability ``missing_rate`` (one
    uniform draw per row), so the realised share of missing rows is only
    approximately ``missing_rate``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'throughput_bps' column (a missing column raises KeyError).
    missing_rate : float
        Per-row masking probability, in [0, 1].
    seed : int
        Seed for ``np.random.default_rng``; the same seed always yields the same draws.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` whose 'throughput_bps' column is float and holds NaN at the
        masked rows. ``df`` itself is not modified.

    Raises
    ------
    ValueError
        If ``missing_rate`` is outside [0, 1] (or NaN).
    """
    if not 0.0 <= missing_rate <= 1.0:
        raise ValueError(f"missing_rate must be in [0, 1], got {missing_rate!r}")

    rng = np.random.default_rng(seed)
    df_missing = df.copy()
    # Cast before masking: NaN cannot be stored in an integer column, and relying on
    # an implicit upcast during assignment is deprecated in recent pandas.
    df_missing["throughput_bps"] = df_missing["throughput_bps"].astype(float)
    mask = rng.random(len(df_missing)) < missing_rate
    df_missing.loc[mask, "throughput_bps"] = np.nan
    # ':.1%' avoids float artefacts such as "30.000000000000004%".
    print(f"Introduced {missing_rate:.1%} missing data.")
    return df_missing

# Maps dataset name -> {missing-rate percentage as a string -> DataFrame with injected NaNs}.
datasets_missing = {}

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# dictionary order (and anything derived from it) differ between runs/machines.
for file in sorted(os.listdir(BASE_DIR)):
    if not file.endswith(".csv"):
        continue

    df = pd.read_csv(os.path.join(BASE_DIR, file))
    # Only the ".throughput.csv" suffix is stripped; any other "*.csv" file keeps
    # its full file name as the key.
    base_key = file.removesuffix(".throughput.csv")
    datasets_missing[base_key] = {}

    for rate in [0.1, 0.2, 0.3, 0.4]:
        df_missing = introduce_missing_data(df, missing_rate=rate, seed=42)
        # round() before int(): rate * 100 can land just below the integer
        # (e.g. 0.29 * 100 == 28.999999999999996), which int() alone would truncate.
        rate_key = f"{int(round(rate * 100))}"
        datasets_missing[base_key][rate_key] = df_missing

# Compact overview (NaN count per dataset and rate) instead of echoing every
# DataFrame in full as the cell output.
{
    name: {pct: int(frame["throughput_bps"].isna().sum()) for pct, frame in by_rate.items()}
    for name, by_rate in datasets_missing.items()
}

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
from sklearn.impute import KNNImputer


In [None]:
def impute_knn_imputer(df_missing, k=5, use_position=True):
    """
    Impute NaNs in 'throughput_bps' with scikit-learn's KNNImputer.

    Parameters
    ----------
    df_missing : pd.DataFrame
        Must contain a 'throughput_bps' column.
    k : int
        Number of neighbours averaged (uniform weights).
    use_position : bool
        When True (default) the row position (0..n-1) is passed as a second
        feature, so the neighbours of a missing sample are the ``k`` rows
        closest to it in sequence that have an observed throughput.
        When False only the throughput column is passed, as in the previous
        implementation. A row that needs imputation then has no observed
        feature to measure distances with, and scikit-learn documents that it
        falls back to the column average in that case, so ``k`` has no effect.

    Returns
    -------
    pd.DataFrame
        Copy of ``df_missing`` with 'throughput_bps' imputed (float).

    Raises
    ------
    ValueError
        If every throughput value is missing (nothing to learn from).
    """
    df_imp = df_missing.copy()
    values = df_imp["throughput_bps"].to_numpy(dtype=float)

    if np.isnan(values).all():
        # KNNImputer drops all-NaN columns from its output, so indexing the
        # result would be wrong or fail; report the real problem instead.
        raise ValueError("'throughput_bps' has no observed values to impute from")

    if use_position:
        positions = np.arange(len(values), dtype=float)
        features = np.column_stack([positions, values])
    else:
        features = values.reshape(-1, 1)

    imputer = KNNImputer(n_neighbors=k, weights="uniform")
    imputed_values = imputer.fit_transform(features)

    # Throughput is always the last column of the feature matrix.
    df_imp["throughput_bps"] = imputed_values[:, -1]
    return df_imp

In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.statespace.structural import UnobservedComponents

In [None]:
def impute_kalman(
    df_missing,
    model="arima",
    # --- ARIMA/SARIMA params ---
    arima_order=(1, 1, 1),
    seasonal_order=(0, 0, 0, 0),
    # --- Structural-model params ---
    level="local level",         # common choices: "local level", "local linear trend"
    seasonal_period=None         # e.g. 24 for a daily cycle in hourly data
):
    """
    Impute NaNs in 'throughput_bps' with Kalman-smoothed state-space estimates.

    The model is fitted on the throughput values in row order; no time column
    is passed to the model.

    Parameters
    ----------
    df_missing : pd.DataFrame
        Must contain a 'throughput_bps' column.
    model : {'arima', 'structural'}
        - 'arima': fits SARIMAX (ARIMA/SARIMA).
        - 'structural': fits UnobservedComponents (level / trend / seasonal).
    arima_order : tuple
        (p, d, q) order of the ARIMA part.
    seasonal_order : tuple
        (P, D, Q, s) seasonal order for SARIMA.
    level : str
        Level specification of the structural model
        (e.g. 'local level', 'local linear trend').
    seasonal_period : int or None
        Seasonal period of the structural model (e.g. 24, 7*24). None = no seasonality.

    Returns
    -------
    pd.DataFrame
        Copy of ``df_missing`` with 'throughput_bps' (float) imputed at the NaN
        positions; observed values are left untouched.

    Raises
    ------
    ValueError
        If ``model`` is neither 'arima' nor 'structural'.
    """
    # Validate before doing any work.
    if model not in ("arima", "structural"):
        raise ValueError("model must be 'arima' or 'structural'")

    df_imp = df_missing.copy()
    y = df_imp["throughput_bps"].astype(float)
    miss_mask = y.isna().to_numpy()

    if model == "arima":
        # SARIMAX accepts NaNs in the endogenous series.
        state_space = SARIMAX(
            y,
            order=arima_order,
            seasonal_order=seasonal_order,
            enforce_stationarity=False,
            enforce_invertibility=False
        )
    else:
        state_space = UnobservedComponents(
            y,
            level=level,
            seasonal=seasonal_period  # None => no seasonal component
        )
    res = state_space.fit(disp=False)

    # The default get_prediction() is a one-step-ahead prediction (conditioned on
    # data up to t-1 only). Conditioning on the "smoothed" information set uses
    # observations on both sides of each gap, which is what smoothing-based
    # imputation is meant to do.
    y_hat = res.get_prediction(information_set="smoothed").predicted_mean

    # Positional assignment: overwrite only the originally missing entries.
    y_imp = y.to_numpy(copy=True)
    y_imp[miss_mask] = np.asarray(y_hat)[miss_mask]

    df_imp["throughput_bps"] = y_imp
    return df_imp


In [None]:
# --- Moving Average (SMA) centrada ---
def impute_moving_average(df_missing, window=5, center=True):
    """
    Impute NaNs in 'throughput_bps' with a simple moving average (SMA).

    How it works:
      - The rolling mean over ``window`` samples is computed with
        ``min_periods=1``, so NaNs inside a window are skipped and edge windows
        still produce a value.
      - A gap at least as wide as the window has no observed sample in range and
        the rolling mean is NaN there; those positions are bridged by linear
        interpolation of the rolling mean (edges take the nearest value).
      - Only the originally missing positions are replaced.

    Parameters
    ----------
    df_missing : pd.DataFrame
        Must contain a 'throughput_bps' column.
    window : int
        Window size in samples (3-10 is typical).
    center : bool
        True for a centred window (past and future neighbours).

    Returns
    -------
    pd.DataFrame
        Copy of ``df_missing`` with the missing throughput values filled. Values
        stay NaN only if the column has no observed value at all.
    """
    df_imp = df_missing.copy()
    y = df_imp["throughput_bps"]
    mask_missing = y.isna()

    sma = y.rolling(window=window, center=center, min_periods=1).mean()
    # Windows containing no observation yield NaN; without this step long gaps
    # would survive "imputation" and break downstream metrics.
    sma = sma.interpolate(method="linear", limit_direction="both")

    df_imp.loc[mask_missing, "throughput_bps"] = sma[mask_missing].values
    return df_imp


# --- Exponentially Weighted Moving Average (EWMA) ---
def impute_ewma(df_missing, alpha=0.2):
    """
    Impute NaNs in 'throughput_bps' with an exponentially weighted moving average.

    How it works:
      - The EWMA is computed causally (past samples only) in its recursive form
        (``adjust=False``); at a missing sample it carries the last average forward.
      - Missing samples before the first observation have no past to average, so
        they take the first available EWMA value (back-fill).
      - Only the originally missing positions are replaced.

    Parameters
    ----------
    df_missing : pd.DataFrame
        Must contain a 'throughput_bps' column.
    alpha : float
        Smoothing factor; 0.1-0.3 is a common range, larger values weight recent
        samples more.

    Returns
    -------
    pd.DataFrame
        Copy of ``df_missing`` with the missing throughput values filled. Values
        stay NaN only if the column has no observed value at all.
    """
    df_imp = df_missing.copy()
    y = df_imp["throughput_bps"]
    mask_missing = y.isna()

    ew = y.ewm(alpha=alpha, adjust=False).mean()
    # The EWMA is NaN until the first observation; without this, leading gaps
    # would be "imputed" with NaN.
    ew = ew.bfill()

    df_imp.loc[mask_missing, "throughput_bps"] = ew[mask_missing].values
    return df_imp


In [None]:
import numpy as np
import pandas as pd
from typing import Tuple, Optional

# ----------------------------
# 1) PERIOD (n_lines) ESTIMATION
# ----------------------------

def _acf(y: np.ndarray, max_lag: int) -> np.ndarray: # ->basicamente isso aqui calcula a galera com maior correlação e deixa em uma só linha
    """Biased ACF up to max_lag (lag 0..max_lag). NaNs are linearly interpolated first."""
    y = pd.Series(y).interpolate(limit_direction="both").to_numpy()
    y = y - np.nanmean(y)
    n = len(y)
    acf_vals = np.empty(max_lag + 1)
    denom = np.dot(y, y) + 1e-12
    for lag in range(max_lag + 1):
        acf_vals[lag] = np.dot(y[: n - lag], y[lag:]) / denom
    return acf_vals

def _fft_period(y: np.ndarray, min_period: int, max_period: int) -> Optional[int]: # isso aqui faz a mesma coisa só que com fourrier 
    """FFT-based dominant period in [min_period, max_period], None if not found."""
    y = pd.Series(y).interpolate(limit_direction="both").to_numpy()
    y = y - np.mean(y)
    n = len(y)
    if n < 4:
        return None
    # Real FFT spectrum
    spec = np.fft.rfft(y)
    freqs = np.fft.rfftfreq(n, d=1.0)  # assume unit sampling
    # Exclude DC
    mask = freqs > 0
    freqs = freqs[mask]
    power = (spec[mask].real**2 + spec[mask].imag**2)
    # Convert freq -> period
    periods = np.round(1.0 / freqs).astype(int)
    # Keep only within bounds
    sel = (periods >= min_period) & (periods <= max_period)
    if not np.any(sel):
        return None
    # Aggregate power by period (many freqs can map to same rounded period)
    dfp = pd.DataFrame({"period": periods[sel], "power": power[sel]})
    top = dfp.groupby("period", as_index=False)["power"].sum().sort_values("power", ascending=False)
    return int(top["period"].iloc[0]) if len(top) else None

def estimate_period(
    y: np.ndarray,
    min_period: int = 4,
    max_period: Optional[int] = None
) -> int:
    """
    Pick the folding period (n_lines) automatically: ACF peak with FFT fallback.

    Parameters
    ----------
    y : array-like
        Series to analyse; NaNs are allowed.
    min_period : int
        Smallest period considered (>= 1).
    max_period : int or None
        Largest period considered. None selects ``max(7, min(len(y) // 4, 1000))``.
        The value is capped at ``len(y) - 1`` because longer lags have no data.

    Returns
    -------
    int
        The estimated period. Series shorter than ``2 * min_period`` return
        ``max(min_period, min(len(y), 8))``; when the search range is empty
        (``max_period < min_period``) ``min_period`` is returned.

    Raises
    ------
    ValueError
        If ``min_period`` is smaller than 1.
    """
    if min_period < 1:
        # Lag 0 always has autocorrelation 1 and would win, yielding period 0.
        raise ValueError("min_period must be >= 1")

    y = np.asarray(y, dtype=float)
    n = len(y)
    if max_period is None:
        max_period = max(7, min(n // 4, 1000))  # sensible cap

    if n < min_period * 2:
        # too short — just return something small
        return max(min_period, min(n, 8))

    # Never ask for lags the series cannot support, and bail out when the
    # search range is empty (np.argmax on an empty slice raises).
    max_period = min(int(max_period), n - 1)
    if max_period < min_period:
        return int(min_period)

    acf_vals = _acf(y, max_period)
    # Ignore lag 0; pick the best lag in [min_period, max_period]
    candidate_lags = np.arange(min_period, max_period + 1)
    best_lag = candidate_lags[np.argmax(acf_vals[min_period: max_period + 1])]

    # FFT fallback check: if ACF peak is weak, try FFT suggestion
    acf_strength = acf_vals[best_lag]
    fft_suggestion = _fft_period(y, min_period, max_period)
    if fft_suggestion is not None:
        if acf_strength < 0.15:  # weak ACF; trust FFT
            return int(fft_suggestion)
        # If both agree closely, prefer the smaller (more stable) period
        if abs(fft_suggestion - best_lag) <= 2:
            return int(min(fft_suggestion, best_lag))
    return int(best_lag)

# ---------------------------------
# 2) FOLDING (PERIODIC TEMPORAL MATRIX)
# ---------------------------------

def fold_series_to_matrix(y: np.ndarray, period: int) -> Tuple[np.ndarray, int]:
    """
    Build the periodic temporal matrix: one row per cycle of ``period`` samples.

    The series is cut row-wise into ``ceil(len(y) / period)`` rows; when the
    length is not a multiple of ``period`` the tail of the last row is padded
    with NaN.

    Returns ``(M, original_len)`` where ``M`` has shape (n_blocks, period) and
    ``original_len`` is the unpadded length of ``y``.
    """
    values = np.asarray(y, dtype=float)
    original_len = len(values)

    n_blocks = int(np.ceil(original_len / period))  # number of rows
    tail_padding = n_blocks * period - original_len
    if tail_padding > 0:
        filler = np.full(tail_padding, np.nan)
        values = np.concatenate([values, filler])

    return values.reshape(n_blocks, period), original_len

def unfold_matrix_to_series(M: np.ndarray, original_len: int) -> np.ndarray:
    """
    Undo the folding: flatten the matrix row by row and keep only the first
    ``original_len`` values, discarding any padding at the end.
    """
    flattened = M.ravel()
    return flattened[:original_len]

# ----------------------------
# 3) SVD + RANK SELECTION
# ----------------------------

def svd_rank(M_filled: np.ndarray, energy: float = 0.9) -> Tuple[np.ndarray, np.ndarray, np.ndarray, int]:
    """
    Thin SVD of ``M_filled`` plus a rank chosen by cumulative 'energy'.

    'Energy' here is the cumulative share of the *sum of singular values* (not
    of their squares). The rank is the number of leading singular values needed
    for that share to reach ``energy``, limited to the range
    [1, min(M_filled.shape)].

    Returns ``(U, s, Vt, r)``.
    """
    U, s, Vt = np.linalg.svd(M_filled, full_matrices=False)

    total = np.sum(s) + 1e-12  # epsilon guards the all-zero matrix
    cumulative_share = np.cumsum(s) / total
    first_reaching = np.searchsorted(cumulative_share, energy)

    r = int(first_reaching + 1)
    largest_rank = min(M_filled.shape)
    if r > largest_rank:
        r = largest_rank
    if r < 1:
        r = 1
    return U, s, Vt, r

# -------------------------------------------
# 4) KNN IMPUTE USING ROW EMBEDDINGS FROM SVD
# -------------------------------------------

def _warm_start_fill(M: np.ndarray) -> np.ndarray: # coloca mediana em tudo e vapo só pra começar
    """Column-wise median fill as a stable warm start."""
    M_filled = M.copy()
    col_medians = np.nanmedian(M_filled, axis=0)
    # If an entire column is NaN, fallback to global median
    if np.any(np.isnan(col_medians)):
        global_med = np.nanmedian(M_filled)
        col_medians = np.where(np.isnan(col_medians), global_med, col_medians)
    inds = np.where(np.isnan(M_filled))
    M_filled[inds] = np.take(col_medians, inds[1])
    return M_filled

def impute_with_knn_in_latent(
    M: np.ndarray,
    k: int = 5,
    energy: float = 0.9,
    allow_future: bool = True
) -> np.ndarray:
    """
    Impute NaNs in ``M`` by k-nearest-neighbour averaging in SVD latent space.

    Each row of ``M`` (one folded cycle) is embedded as ``U_r * s_r``, computed
    from the SVD of a median-filled copy of ``M``. For every missing cell
    (i, j) the donors are the rows that have column j observed (only rows
    before i when ``allow_future`` is False; row 0 has no past, so it may use
    any row). The ``k`` donors closest to row i in the embedding are averaged
    with inverse-distance weights. A cell without any donor keeps its
    median-fill value.

    Parameters
    ----------
    M : np.ndarray
        2-D float matrix with NaN at the missing cells.
    k : int
        Number of neighbours (>= 1).
    energy : float
        Cumulative singular-value share used to choose the embedding rank.
    allow_future : bool
        Whether rows after the current one may act as donors.

    Returns
    -------
    np.ndarray
        Copy of ``M`` with the NaN cells imputed.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # k == 0 would select no neighbour and produce 0/0.
        raise ValueError("k must be >= 1")

    M_filled0 = _warm_start_fill(M)
    U, s, Vt, r = svd_rank(M_filled0, energy=energy)
    # Row embeddings (T x r): each column of U scaled by its singular value.
    Z = U[:, :r] * s[:r]

    M_imp = M.copy()
    n_rows = M.shape[0]
    eps = 1e-8  # keeps the inverse-distance weights finite for identical rows

    observed_mask = ~np.isnan(M)
    for i in range(n_rows):
        miss_cols = np.where(~observed_mask[i])[0]
        if len(miss_cols) == 0:
            continue

        # Donor pool for this row; row 0 has no past, so it may use any row.
        if allow_future or i == 0:
            candidate_rows = np.arange(n_rows)
        else:
            candidate_rows = np.arange(0, i)  # only past

        # Distances depend on the row only, so compute them once per row and
        # reuse them for every missing column.
        dists = np.linalg.norm(Z[candidate_rows] - Z[i][None, :], axis=1) + eps

        for j in miss_cols:
            # Keep only donors with this column observed (row i is never one
            # of them, since its own cell is missing).
            has_value = observed_mask[candidate_rows, j]
            obs_rows = candidate_rows[has_value]
            if len(obs_rows) == 0:
                # fall back to warm-start value if nothing observed
                M_imp[i, j] = M_filled0[i, j]
                continue

            d = dists[has_value]
            if len(d) > k:
                idx = np.argpartition(d, k)[:k]
                nn_rows = obs_rows[idx]
                d = d[idx]
            else:
                nn_rows = obs_rows

            w = 1.0 / d  # inverse-distance weights
            # Donor values are observed by construction, so no NaN can appear here.
            M_imp[i, j] = np.sum(w * M[nn_rows, j]) / np.sum(w)

    return M_imp

# ----------------------------
# 5) MAIN PIPELINE
# ----------------------------

def impute_throughput_svd_knn(
    df: pd.DataFrame,
    col: str = "throughput_bps",
    min_period: int = 4,
    max_period: Optional[int] = None,
    energy: float = 0.9,
    k: int = 5,
    allow_future: bool = True
) -> Tuple[pd.Series, dict]:
    """
    Full pipeline:
    1) auto period detection -> n_lines (period)
    2) fold into (n_blocks, period) matrix
    3) SVD -> latent row embeddings
    4) KNN in latent space to impute NaNs
    5) unfold back to 1D series

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the series to impute.
    col : str
        Name of the column to impute.
    min_period, max_period : int, Optional[int]
        Search range forwarded to ``estimate_period``.
    energy : float
        Singular-value share forwarded to the rank selection.
    k : int
        Number of latent-space neighbours.
    allow_future : bool
        Whether later cycles may act as donors.

    Returns
    -------
    (pd.Series, dict)
        The imputed series, indexed like ``df`` and named ``f"{col}_imputed"``,
        and a diagnostics dict with the keys 'period_estimated', 'matrix_shape',
        'rank_energy_target', 'k_neighbors' and 'allow_future'.

    Raises
    ------
    ValueError
        If ``col`` is not a column of ``df``.
    """
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in df.")

    y = df[col].to_numpy(dtype=float)
    original_index = df.index

    # Detect period (n_lines)
    period = estimate_period(y, min_period=min_period, max_period=max_period)

    # Fold
    M, orig_len = fold_series_to_matrix(y, period=period)

    # Impute in latent KNN space
    M_imp = impute_with_knn_in_latent(M, k=k, energy=energy, allow_future=allow_future)

    # Unfold (the leftover debug print of the full array was removed; inspect
    # the returned series instead)
    y_imp = unfold_matrix_to_series(M_imp, original_len=orig_len)

    diagnostics = {
        "period_estimated": period,
        "matrix_shape": M.shape,
        "rank_energy_target": energy,
        "k_neighbors": k,
        "allow_future": allow_future,
    }
    return pd.Series(y_imp, index=original_index, name=f"{col}_imputed"), diagnostics



In [None]:
# import numpy as np
# import pandas as pd
# from fancyimpute import SoftImpute, IterativeSVD
# from statsmodels.tsa.seasonal import seasonal_decompose

# def impute_hankel_knn_svd(
#     df_missing,
#     window=168,          # tamanho da janela L da Hankel (p.ex. ~1/2 a 1x do período sazonal)
#     k=5,                # vizinhos do KNN time-indexed
#     blend=0.2,          # mistura final entre SVD e KNN nos pontos faltantes (0 = só SVD)
#     seasonal_period=None  # None => inferência simples (diária) pelo passo temporal
# ):
#     """
#     Híbrido: Decomposição + KNN time-indexed + Hankel + SVD (SoftImpute) + reconstrução.

#     Suposições:
#       - df_missing tem colunas 'time' (YYYY-MM-DD HH:MM:SS+00:00) e 'throughput_bps'.
#       - Série univariada; index regular ou quase regular.

#     Passos:
#       1) Índice temporal.
#       2) KNN time-indexed para um preenchimento provisório (base local, suave).
#       3) Normalização robusta (log1p simétrico + mediana/MAD).
#       4) Decomposição sazonal (trend + seasonal) sobre a série *preenchida*.
#       5) Resíduo observado = resíduo com NaN nos pontos originalmente faltantes.
#       6) Embedding Hankel do resíduo observado (NaNs ficam na matriz).
#       7) Matrix completion com SoftImpute (ou IterativeSVD como fallback).
#       8) Reconstrução por média de anti-diagonais (hankelização inversa).
#       9) Combina: trend + seasonal + residual_svd; denormaliza.
#      10) Imputa só onde havia NaN originalmente, com mistura SVD/KNN (blend).
#     """

#     # -------- utilidades internas --------
#     def _ensure_dt_index(dfx):
#         out = dfx.copy()
#         out["time"] = pd.to_datetime(out["time"])
#         out = out.set_index("time").sort_index()
#         return out

#     def _robust_norm(y):
#         log_vals = np.sign(y) * np.log1p(np.abs(y))
#         med = np.nanmedian(log_vals)
#         mad = np.nanmedian(np.abs(log_vals - med))
#         z = (log_vals - med) / (mad + 1e-8)
#         return pd.Series(z, index=y.index), med, mad

#     def _robust_denorm(z, med, mad):
#         log_vals = z * (mad + 1e-8) + med
#         return np.sign(log_vals) * np.expm1(np.abs(log_vals))

#     def _infer_daily_period(idx):
#         if not isinstance(idx, pd.DatetimeIndex) or len(idx) < 3:
#             return 24
#         step_sec = np.median(np.diff(idx.view("int64"))) / 1e9
#         step_min = max(1.0, step_sec / 60.0)
#         p = int(round(24 * 60 / step_min))
#         p = max(2, min(p, max(2, len(idx)//3)))
#         return p

#     def _decompose(series_z, period):
#         filled = series_z.interpolate(method="time", limit_direction="both")
#         if period is None:
#             period = _infer_daily_period(series_z.index)
#         if len(filled) < 3 * max(2, period):
#             # fallback simples
#             trend = filled.rolling(period, min_periods=1).mean()
#             seasonal = (
#                 filled.groupby(filled.index.time).transform("median")
#                 if isinstance(series_z.index, pd.DatetimeIndex) else
#                 pd.Series(0.0, index=filled.index)
#             )
#             seasonal = seasonal - seasonal.mean()
#             resid = filled - trend - seasonal
#             return trend, seasonal, resid
#         dec = seasonal_decompose(filled, model="additive", period=period, extrapolate_trend="freq")
#         return dec.trend, dec.seasonal, dec.resid

#     def _knn_timeindexed(series, k):
#         # KNN 1D no eixo do tempo (proximidade temporal)
#         s = series.copy()
#         idx = s.index
#         x = (idx.view("int64") // 10**9).astype(np.int64)  # seg desde epoch
#         y = s.to_numpy(dtype=float)
#         obs = ~np.isnan(y)
#         miss = ~obs
#         x_obs, y_obs = x[obs], y[obs]
#         x_miss = x[miss]
#         y_imp = y.copy()
#         for i, xm in enumerate(x_miss):
#             d = np.abs(x_obs - xm)
#             if len(d) <= k: nn = np.argsort(d)
#             else: nn = np.argpartition(d, k-1)[:k]
#             y_imp[np.where(miss)[0][i]] = np.mean(y_obs[nn])
#         return pd.Series(y_imp, index=idx)

#     def _build_hankel(x, L):
#         # Trajectory/Hankel: shape (L, K) com K = N-L+1, H[i,j] = x[i+j]
#         x = np.asarray(x, float)
#         N = len(x); K = N - L + 1
#         if K <= 0:  # série muito curta p/ L
#             L = max(2, min(N, L))
#             K = N - L + 1
#         H = np.empty((L, K))
#         for j in range(K):
#             H[:, j] = x[j:j+L]
#         return H

#     def _diagonal_averaging(H):
#         # Reconstrução da série pela média das anti-diagonais
#         L, K = H.shape
#         N = L + K - 1
#         y = np.zeros(N, dtype=float)
#         w = np.zeros(N, dtype=float)
#         for i in range(L):
#             for j in range(K):
#                 n = i + j
#                 v = H[i, j]
#                 if not np.isnan(v):
#                     y[n] += v
#                     w[n] += 1.0
#         y = np.where(w > 0, y / w, np.nan)
#         return y

#     def _choose_imputer(H):
#         frac_nan = np.isnan(H).sum() / H.size
#         if frac_nan > 0:
#             shrink = max(0.1, min(1.0, frac_nan))
#             return SoftImpute(shrinkage_value=shrink)
#         # caso raro (sem NaNs): baixa-rank por IterativeSVD
#         rank = max(5, int(min(H.shape) * 0.3))
#         rank = min(rank, 15, H.shape[0]//2 if H.shape[0]>=2 else 1, H.shape[1]//2 if H.shape[1]>=2 else 1)
#         return IterativeSVD(rank=rank)

#     # -------- pipeline --------
#     df_idx = _ensure_dt_index(df_missing)
#     y = df_idx["throughput_bps"].astype(float)
#     miss_mask = y.isna()

#     # (2) preenchimento provisório por KNN (base local)
#     y_knn = _knn_timeindexed(y, k=k)

#     # (3) normalização robusta e (4) decomposição sobre a série preenchida
#     y_fill = y.copy(); y_fill[miss_mask] = y_knn[miss_mask]
#     z_fill, med, mad = _robust_norm(y_fill)
#     trend, seasonal, resid = _decompose(z_fill, seasonal_period)

#     # (5) resíduo observado: NaN onde originalmente faltava
#     resid_obs = resid.copy()
#     resid_obs[miss_mask] = np.nan

#     # (6) Hankel do resíduo observado (NaNs preservados)
#     L = int(max(2, min(window, len(resid_obs) - 1)))
#     H = _build_hankel(resid_obs.values, L)

#     # (7) Imputação na Hankel via SoftImpute/IterativeSVD
#     imputer = _choose_imputer(H)
#     H_hat = imputer.fit_transform(H)

#     # (8) Reconstrução por média das anti-diagonais -> resíduo refinado
#     r_hat = _diagonal_averaging(H_hat)
#     r_hat = pd.Series(r_hat, index=resid_obs.index)  # mesmo tamanho da série

#     # (9) Reconstrução total em z e denormalização
#     z_hat = trend.fillna(0.0) + seasonal.fillna(0.0) + r_hat.fillna(0.0)
#     y_svd = _robust_denorm(z_hat, med, mad)

#     # (10) Imputar somente onde faltava (mistura SVD + KNN)
#     y_final = y.copy()
#     y_final[miss_mask] = (1 - blend) * y_svd[miss_mask] + blend * y_knn[miss_mask]

    

#     out = df_idx.copy()
#     out["throughput_bps"] = y_final.values
#     return out.reset_index().rename(columns={"index": "time"})

import numpy as np
import pandas as pd
from fancyimpute import SoftImpute, IterativeSVD
from statsmodels.tsa.seasonal import seasonal_decompose

def impute_hankel_knn_svd(  # name kept for backward compatibility
    df_missing,
    window=168,          # Hankel window length L
    k=5,                 # unused (kept for compatibility)
    blend=0.2,           # final mixing weight between SVD result and interpolation
    seasonal_period=None # None => simple (daily) inference from the time step
):
    """
    Hybrid imputer: decomposition + linear-in-time interpolation + Hankel
    embedding + SVD-based matrix completion (SoftImpute) + reconstruction.

    Assumptions (stated by the original author):
      - df_missing has columns 'time' (YYYY-MM-DD HH:MM:SS+00:00) and 'throughput_bps'.
      - Univariate series; regular or nearly regular index.

    Steps:
      1) Build a time index.
      2) Linear-in-time interpolation (both directions) as a provisional fill.
      3) Robust normalisation (signed log1p + median/MAD).
      4) Seasonal decomposition (trend + seasonal) of the filled series.
      5) Observed residual = residual with NaN at the originally missing points.
      6) Hankel embedding of the observed residual (NaNs stay in the matrix).
      7) Matrix completion with SoftImpute (IterativeSVD when there is no NaN).
      8) Reconstruction by anti-diagonal averaging (inverse Hankelisation).
      9) Recombine trend + seasonal + completed residual; denormalise.
     10) Impute only where the value was originally NaN, as
         (1 - blend) * SVD estimate + blend * interpolation.

    Returns
    -------
    pd.DataFrame
        The frame after ``set_index('time').sort_index()`` and ``reset_index()``:
        rows are in time order with a fresh integer index, 'time' holds parsed
        datetimes and 'throughput_bps' holds the imputed series.

    NOTE(review): because rows are re-sorted and re-indexed, callers that align
    the result with the input by index should confirm the input is already
    sorted by time with a default index.
    """

    # -------- internal helpers --------
    # Parse 'time', use it as the index and sort chronologically (on a copy).
    def _ensure_dt_index(dfx):
        out = dfx.copy()
        out["time"] = pd.to_datetime(out["time"])
        out = out.set_index("time").sort_index()
        return out

    # Signed log1p transform followed by median/MAD scaling.
    # Returns the normalised series plus (median, MAD) needed to invert it.
    def _robust_norm(y):
        log_vals = np.sign(y) * np.log1p(np.abs(y))
        med = np.nanmedian(log_vals)
        mad = np.nanmedian(np.abs(log_vals - med))
        z = (log_vals - med) / (mad + 1e-8)  # epsilon avoids division by zero when MAD is 0
        return pd.Series(z, index=y.index), med, mad

    # Inverse of _robust_norm: undo the scaling, then the signed log1p.
    def _robust_denorm(z, med, mad):
        log_vals = z * (mad + 1e-8) + med
        return np.sign(log_vals) * np.expm1(np.abs(log_vals))

    # Samples per 24 h, derived from the median step between timestamps and
    # clamped to [2, len(idx) // 3]; 24 when the index is not datetime or too short.
    def _infer_daily_period(idx):
        if not isinstance(idx, pd.DatetimeIndex) or len(idx) < 3:
            return 24
        # presumably the int64 view is nanoseconds since epoch, hence the / 1e9 — TODO confirm
        step_sec = np.median(np.diff(idx.view("int64"))) / 1e9
        step_min = max(1.0, step_sec / 60.0)
        p = int(round(24 * 60 / step_min))
        p = max(2, min(p, max(2, len(idx)//3)))
        return p

    # Split the (interpolated) series into trend, seasonal and residual parts.
    def _decompose(series_z, period):
        filled = series_z.interpolate(method="time", limit_direction="both")
        if period is None:
            period = _infer_daily_period(series_z.index)
        if len(filled) < 3 * max(2, period):
            # Fewer than three periods of data: simple fallback using a rolling-mean
            # trend and the per-time-of-day median as the seasonal component.
            trend = filled.rolling(period, min_periods=1).mean()
            seasonal = (
                filled.groupby(filled.index.time).transform("median")
                if isinstance(series_z.index, pd.DatetimeIndex) else
                pd.Series(0.0, index=filled.index)
            )
            seasonal = seasonal - seasonal.mean()
            resid = filled - trend - seasonal
            return trend, seasonal, resid
        dec = seasonal_decompose(filled, model="additive", period=period, extrapolate_trend="freq")
        return dec.trend, dec.seasonal, dec.resid

    # Trajectory/Hankel matrix of shape (L, K) with K = N - L + 1 and H[i, j] = x[i + j].
    def _build_hankel(x, L):
        x = np.asarray(x, float)
        N = len(x); K = N - L + 1
        if K <= 0:
            # series too short for the requested L
            L = max(2, min(N, L))
            K = N - L + 1
        H = np.empty((L, K))
        for j in range(K):
            H[:, j] = x[j:j+L]
        return H

    # Rebuild a series of length L + K - 1 by averaging each anti-diagonal of H,
    # skipping NaN entries; positions with no valid entry become NaN.
    def _diagonal_averaging(H):
        L, K = H.shape
        N = L + K - 1
        y = np.zeros(N, dtype=float)
        w = np.zeros(N, dtype=float)
        for i in range(L):
            for j in range(K):
                n = i + j
                v = H[i, j]
                if not np.isnan(v):
                    y[n] += v
                    w[n] += 1.0
        y = np.where(w > 0, y / w, np.nan)
        return y

    # SoftImpute when H contains NaNs (shrinkage = NaN fraction clamped to [0.1, 1.0]);
    # otherwise IterativeSVD with a small rank derived from the matrix shape.
    def _choose_imputer(H):
        frac_nan = np.isnan(H).sum() / H.size
        if frac_nan > 0:
            shrink = max(0.1, min(1.0, frac_nan))
            return SoftImpute(shrinkage_value=shrink)
        rank = max(5, int(min(H.shape) * 0.3))
        rank = min(rank, 15, H.shape[0]//2 if H.shape[0]>=2 else 1, H.shape[1]//2 if H.shape[1]>=2 else 1)
        return IterativeSVD(rank=rank)

    # -------- pipeline --------
    # (1) time index
    df_idx = _ensure_dt_index(df_missing)
    y = df_idx["throughput_bps"].astype(float)
    miss_mask = y.isna()

    # (2) provisional fill by linear-in-time interpolation (both directions)
    y_lin = y.interpolate(method="time", limit_direction="both")

    # (3) robust normalisation and (4) decomposition of the filled series
    y_fill = y.copy(); y_fill[miss_mask] = y_lin[miss_mask]
    z_fill, med, mad = _robust_norm(y_fill)
    trend, seasonal, resid = _decompose(z_fill, seasonal_period)

    # (5) observed residual: NaN where the value was originally missing
    resid_obs = resid.copy()
    resid_obs[miss_mask] = np.nan

    # (6) Hankel matrix of the observed residual (NaNs preserved)
    L = int(max(2, min(window, len(resid_obs) - 1)))
    H = _build_hankel(resid_obs.values, L)

    # (7) completion of the Hankel matrix via SoftImpute/IterativeSVD
    imputer = _choose_imputer(H)
    H_hat = imputer.fit_transform(H)

    # (8) anti-diagonal averaging -> refined residual
    r_hat = _diagonal_averaging(H_hat)
    r_hat = pd.Series(r_hat, index=resid_obs.index)

    # (9) full reconstruction in normalised space, then denormalisation
    z_hat = trend.fillna(0.0) + seasonal.fillna(0.0) + r_hat.fillna(0.0)
    y_svd = _robust_denorm(z_hat, med, mad)

    # (10) impute only where values were missing (blend of SVD estimate and interpolation)
    y_final = y.copy()
    y_final[miss_mask] = (1 - blend) * y_svd[miss_mask] + blend * y_lin[miss_mask]

    out = df_idx.copy()
    out["throughput_bps"] = y_final.values
    # NOTE(review): the index is named 'time' after set_index, so reset_index should
    # already produce a 'time' column and this rename looks like a no-op — confirm.
    return out.reset_index().rename(columns={"index": "time"})



In [None]:
# Accumulator for the evaluation records produced further down.
results = []

def impute_linear_interpolation(df_missing):
    """
    Fill NaNs in 'throughput_bps' by linear interpolation between neighbouring
    samples; gaps at either end of the series are filled as well
    (``limit_direction="both"``). Returns a new DataFrame and leaves the input
    untouched.
    """
    interpolated = df_missing["throughput_bps"].interpolate(
        method="linear",
        limit_direction="both",
    )
    return df_missing.assign(throughput_bps=interpolated)

# def evaluate_imputation(mask_missing, df, df_imputed, method):
#     # real and imputed values where theres missing
#     y_true = df.loc[mask_missing, "throughput_bps"].values
#     y_pred = df_imputed.loc[mask_missing, "throughput_bps"].values
    
#     if len(y_true) > 0: 
#         rmse = np.sqrt(mean_squared_error(y_true, y_pred))
#         nrmse = rmse / (y_true.max() - y_true.min()) # range
#         nrmse_mean = rmse / y_true.mean() # mean
#         mae = mean_absolute_error(y_true, y_pred)
#         mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
#         r2 = r2_score(y_true, y_pred)
        
#         results.append({
#             "file": file,
#             "rate": rate,
#             "method": method,
#             "rmse": rmse,
#             "nrmse": nrmse, # range
#             "nrmse_mean": nrmse_mean, # mean
#             "mae": mae,
#             "mape": mape,
#             "r2": r2,
#         })

#         return results

# for file, rates_dict in datasets_missing.items():
#     for rate, df_missing in rates_dict.items():
#         mask_missing = df_missing["throughput_bps"].isna()
        
#         # linear interpolation
#         df_imputed = impute_linear_interpolation(df_missing)
#         results = evaluate_imputation(mask_missing, df, df_imputed, "linear interpolation")

#         df_imputed_knn = impute_knn_imputer(df_missing, k=5)
#         results = evaluate_imputation(mask_missing, df, df_imputed_knn, "knn imputer (k=5)")

#         # 4) Kalman - ARIMA (ex.: ARIMA(1,1,1); sem sazonalidade)
#         df_kalman_arima = impute_kalman(
#             df_missing,
#             model="arima",
#             arima_order=(1, 1, 1),
#             seasonal_order=(0, 0, 0, 0)
#         )
#         results = evaluate_imputation(mask_missing, df, df_kalman_arima, "kalman arima (1,1,1)")

#         # 5) Kalman - Modelo Estrutural (nível local + sazonal diária de 24, se fizer sentido)
#         df_kalman_struct = impute_kalman(
#             df_missing,
#             model="structural",
#             level="local level",
#             seasonal_period=None  # ou 24, 7*24 etc., conforme seu dado
#         )

#         # df_sma = impute_moving_average(df_missing, window=5, center=True)
#         # results = evaluate_imputation(mask_missing, df, df_sma, "moving average (win=5, centered)")

#         # EWMA (alpha 0.2)
#         df_ewma = impute_ewma(df_missing, alpha=0.2)
#         results = evaluate_imputation(mask_missing, df, df_ewma, "ewma (alpha=0.2)")

#         results = evaluate_imputation(mask_missing, df, df_kalman_struct, "kalman structural (level)")
        
#         df_hankel = impute_hankel_knn_svd(df_missing, window=72, k=5, blend=0.2, seasonal_period=None)
#         results = evaluate_imputation(mask_missing, df, df_hankel, "hankel+knn+svd (L=72, k=5, λ=0.2)")

#         df_imputed, diag = impute_throughput_svd_knn(
#                 df_missing,
#                 col="throughput_bps",
#                 min_period=24,   # adjust for your sampling rate
#                 max_period=1000, # search window for period detection
#                 energy=0.9,
#                 k=5,
#                 allow_future=True
#             )

#         # Merge back into DataFrame with same structure
#         df_result = df_missing.copy()
#         df_result["throughput_bps"] = df_imputed.values

#         # --------------------------
#         # Evaluate
#         # --------------------------
#         results = evaluate_imputation(
#             mask_missing, 
#             df, df_result, 
#             method="Hankel+SVD+KNN"
#         )
        
# df_results = pd.DataFrame(results)
# df_results.head()
# df_results.to_csv("results.csv", index=False)

# print("Resultados salvos em results.csv")


In [None]:
# def create_temporal_matrix(df, n_lines=100):
#     throughput_vector = df["throughput_bps"].values
#     columns_quantity = df.shape[1]
#     temporal_matrix = (
#         throughput_vector[: columns_quantity * n_lines]
#         .reshape(columns_quantity, n_lines)
#         .T
#     )
#     return temporal_matrix


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Shared accumulator: every call to evaluate_imputation appends one row here.
results = []

def evaluate_imputation(mask_missing, df_true, df_imp, method, file, rate):
    """Score one imputation run and append the metrics to the module-level ``results``.

    Only the rows selected by ``mask_missing`` are compared, using the
    "throughput_bps" column of both frames.

    Parameters
    ----------
    mask_missing : boolean indexer accepted by ``DataFrame.loc``
        Selects the rows to score.
    df_true : pandas.DataFrame
        Frame holding the reference "throughput_bps" values.
    df_imp : pandas.DataFrame
        Frame holding the imputed "throughput_bps" values.
    method : str
        Label stored in the "method" field of the result row.
    file : str
        Label stored in the "file" field of the result row.
    rate : int-convertible
        Stored in the "rate" field as ``int(rate)``.

    Returns
    -------
    list of dict
        The shared ``results`` list (unchanged when the mask selects no rows).

    Notes
    -----
    Metrics whose denominator is zero are reported as NaN instead of inf:
    ``nrmse`` when all selected true values are equal, ``nrmse_mean`` when
    their mean is zero, and ``mape`` when every selected true value is zero.
    ``mape`` is averaged over the non-zero true values only, because the
    percentage error is undefined where the true value is zero.
    """
    y_true = df_true.loc[mask_missing, "throughput_bps"].values
    y_pred = df_imp.loc[mask_missing, "throughput_bps"].values

    if y_true.size == 0:
        return results

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Range-normalised RMSE is undefined for a constant reference signal.
    value_range = y_true.max() - y_true.min()
    nrmse = rmse / value_range if value_range != 0 else np.nan

    # Mean-normalised RMSE is undefined when the reference mean is zero.
    true_mean = y_true.mean()
    nrmse_mean = rmse / true_mean if true_mean != 0 else np.nan

    # A single zero in y_true would otherwise turn the whole MAPE into inf.
    nonzero = y_true != 0
    if nonzero.any():
        relative_error = np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])
        mape = np.mean(relative_error) * 100
    else:
        mape = np.nan

    results.append({
        "file": file,
        "rate": int(rate),
        "method": method,
        "rmse": rmse,
        "nrmse": nrmse,
        "nrmse_mean": nrmse_mean,
        "mae": mae,
        "mape": mape,
        "r2": r2,
    })
    return results

# Original frames keyed by file name without the ".throughput.csv" suffix.
datasets_original = {}
# Degraded copies: datasets_missing[base_key][rate_key], rate_key in {"10", "20", "30", "40"}.
datasets_missing = {}

# os.listdir returns entries in arbitrary order; sorting keeps the dict
# insertion order (and everything derived from it) stable across runs.
for file in sorted(os.listdir(BASE_DIR)):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(BASE_DIR, file))

        # Keep an untouched copy of the original dataset.
        base_key = file.removesuffix(".throughput.csv")
        datasets_original[base_key] = df.copy()

        # Build one degraded version per missing-data rate.
        datasets_missing[base_key] = {}
        for rate in [0.1, 0.2, 0.3, 0.4]:
            df_missing = introduce_missing_data(df, missing_rate=rate, seed=42)
            rate_key = str(int(rate * 100))
            datasets_missing[base_key][rate_key] = df_missing

def _impute_svd_knn_frame(df_missing):
    """Run impute_throughput_svd_knn and return its output inside a copy of ``df_missing``."""
    imputed_series, _diag = impute_throughput_svd_knn(
        df_missing,
        col="throughput_bps",
        min_period=24,
        max_period=1000,
        energy=0.9,
        k=5,
        allow_future=True
    )
    df_result = df_missing.copy()
    df_result["throughput_bps"] = imputed_series.values
    return df_result


# Methods under test, evaluated in this order. Each entry is
# (label written to the "method" column, callable: frame with gaps -> imputed frame).
IMPUTATION_METHODS = [
    ("linear interpolation", impute_linear_interpolation),
    ("knn imputer (k=5)", lambda d: impute_knn_imputer(d, k=5)),
    (
        "kalman arima (1,1,1)",
        lambda d: impute_kalman(
            d,
            model="arima",
            arima_order=(1, 1, 1),
            seasonal_order=(0, 0, 0, 0)
        ),
    ),
    (
        "kalman structural (level)",
        lambda d: impute_kalman(
            d,
            model="structural",
            level="local level",
            seasonal_period=None
        ),
    ),
    ("ewma (alpha=0.2)", lambda d: impute_ewma(d, alpha=0.2)),
    (
        "hankel+knn+svd (L=72, k=5, λ=0.2)",
        lambda d: impute_hankel_knn_svd(d, window=72, k=5, blend=0.2, seasonal_period=None),
    ),
    ("Hankel+SVD+KNN", _impute_svd_knn_frame),
]

for file, rates_dict in datasets_missing.items():
    df = datasets_original[file].copy()

    for rate_key, df_missing in rates_dict.items():
        rate = int(rate_key)
        # Rows that were blanked out are exactly the ones to score.
        mask_missing = df_missing["throughput_bps"].isna()

        for method, impute in IMPUTATION_METHODS:
            evaluate_imputation(mask_missing, df, impute(df_missing), method, file, rate)

df_results = pd.DataFrame(results)
df_results.to_csv("results.csv", index=False)
print("Resultados salvos em results.csv")
df_results.head()
