In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# Directory that is listed for input files; every entry ending in ".csv" is loaded.
BASE_DIR = "./test-data/" 

In [None]:
def introduce_missing_data(df, missing_rate, seed=42, column="throughput_bps"):
    """Return a copy of ``df`` with randomly chosen values of ``column`` set to NaN.

    Every row is selected independently with probability ``missing_rate``
    using a generator seeded with ``seed``, so the share of blanked rows only
    approximates ``missing_rate``. The input frame itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    missing_rate : float
        Per-row probability of being blanked, between 0 and 1 inclusive.
    seed : int, default 42
        Seed for ``numpy.random.default_rng``; the same seed and frame length
        always produce the same mask.
    column : str, default "throughput_bps"
        Name of the column whose values are replaced with NaN.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with NaN in ``column`` on the selected rows.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    ValueError
        If ``missing_rate`` is not within [0, 1].
    """
    # Without this check, the .loc assignment below would silently create a
    # brand-new column instead of reporting the missing one.
    if column not in df.columns:
        raise KeyError(f"Column {column!r} not found in DataFrame.")
    if not 0 <= missing_rate <= 1:
        raise ValueError(f"missing_rate must be within [0, 1], got {missing_rate!r}.")

    rng = np.random.default_rng(seed)
    df_missing = df.copy()
    # Positional boolean mask: one uniform draw per row, compared to the rate.
    mask = rng.random(len(df_missing)) < missing_rate
    df_missing.loc[mask, column] = np.nan
    print(f"Introduced {missing_rate * 100}% missing data.")
    return df_missing

# Build one degraded copy of every input CSV per missing-data rate:
# datasets_missing[<file key>][<rate in percent, as a string>] -> DataFrame.
datasets_missing = {}

# os.listdir returns entries in arbitrary order; sorting keeps the dictionary
# (and everything derived from it) in the same order on every run.
for file in sorted(os.listdir(BASE_DIR)):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(BASE_DIR, file))
        # Names that do not end in ".throughput.csv" are kept unchanged as key.
        base_key = file.removesuffix(".throughput.csv")
        datasets_missing[base_key] = {}

        # The same seed is used for every rate, so the random draws are
        # identical and the rows blanked at a lower rate are a subset of
        # those blanked at a higher rate.
        for rate in [0.1, 0.2, 0.3, 0.4]:
            df_missing = introduce_missing_data(df, missing_rate=rate, seed=42)
            # round() instead of int(): int() truncates, e.g. int(0.29 * 100) == 28.
            rate_key = str(round(rate * 100))
            datasets_missing[base_key][rate_key] = df_missing

datasets_missing  # nested dict: file key -> rate key ("10", "20", ...) -> DataFrame

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Module-level accumulator: evaluate_imputation appends one metrics dict per
# scored call; the list is later converted to a DataFrame and saved as CSV.
results = []

In [None]:
def evaluate_imputation(mask_missing, df, df_imputed, method):
    """Score imputed ``throughput_bps`` values against the original ones.

    Only the rows selected by ``mask_missing`` are compared. When at least one
    row is selected, a record holding the RMSE and the range-normalised RMSE
    is appended to the module-level ``results`` list.

    Note
    ----
    The ``file`` and ``rate`` fields of the record are read from module-level
    variables of those names, so this function must be called while they are
    bound to the dataset being evaluated (e.g. inside the loop that defines
    them).

    Parameters
    ----------
    mask_missing : boolean mask usable with ``DataFrame.loc``
        Selects the rows whose values were removed and then imputed.
    df : pandas.DataFrame
        Original data with the true ``throughput_bps`` values.
    df_imputed : pandas.DataFrame
        Data after imputation, with a ``throughput_bps`` column.
    method : str
        Label stored in the record to identify the imputation method.

    Returns
    -------
    list
        The module-level ``results`` list, whether or not a record was added.
    """
    # True and imputed values on the rows that were missing.
    y_true = df.loc[mask_missing, "throughput_bps"].values
    y_pred = df_imputed.loc[mask_missing, "throughput_bps"].values

    if len(y_true) > 0:
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        value_range = y_true.max() - y_true.min()
        # The normalisation is undefined when all true values are equal
        # (including the single-row case); record NaN instead of dividing by 0.
        nrmse = rmse / value_range if value_range > 0 else np.nan

        results.append({
            "file": file,
            "rate": rate,
            "method": method,
            "rmse": rmse,
            "nrmse": nrmse,  # normalised by the range of the true values
        })

    # Always hand back the accumulator so `results = evaluate_imputation(...)`
    # never replaces it with None when there was nothing to score.
    return results
    
# The loop variables must keep the names `file` and `rate`:
# evaluate_imputation reads them as module-level variables when it builds
# each result record.
for file, rates_dict in datasets_missing.items():
    for rate, df_missing in rates_dict.items():
        mask_missing = df_missing["throughput_bps"].isna()

        # TODO: the evaluation call is disabled, so nothing is appended to
        # `results` by this loop.
        # NOTE(review): `df` here would be whichever frame was loaded last, not
        # necessarily the source of `file`, and `df_kalman_arima` is not defined
        # in the visible cells — confirm both before enabling.
        # results = evaluate_imputation(mask_missing, df, df_kalman_arima, "kalman arima (1,1,1)")


df_results = pd.DataFrame(results)
df_results.to_csv("results.csv", index=False)

print("Resultados salvos em results.csv")

# Kept as the last expression of the cell so the preview is actually rendered.
df_results.head()