# Análise Exploratória dos Dados
---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide presentation settings.
plt.style.use("seaborn-v0_8")  # named matplotlib style sheet applied to every plot
pd.set_option("display.max_columns", 100)  # display up to 100 columns of a frame

# Directory that is searched for the input parquet files; relative to the
# current working directory.
BASE_DIR = Path("data_ons")

def find_latest_file(base_dir: Path, pattern: str, recursive: bool = False) -> Path:
    """Return the most recently modified path matching *pattern*.

    Args:
        base_dir: Directory in which the search starts.
        pattern: Glob pattern (for example ``"name*.parquet"``).
        recursive: When true, subdirectories are searched as well
            (``Path.rglob``); otherwise only ``base_dir`` itself
            (``Path.glob``).

    Returns:
        The matching path with the greatest modification time.

    Raises:
        FileNotFoundError: If nothing matches *pattern*.
    """
    if recursive:
        candidates = list(base_dir.rglob(pattern))
    else:
        candidates = list(base_dir.glob(pattern))

    if not candidates:
        raise FileNotFoundError(f"Nenhum arquivo encontrado para o padrão: {pattern}")

    def modified_at(path: Path) -> float:
        # Modification timestamp used to rank the candidates.
        return path.stat().st_mtime

    return max(candidates, key=modified_at)

# Wildcard patterns are used to cope with the date suffixes in the file names.
# Each pattern is resolved exactly once: the path that is printed below is then
# guaranteed to be the file that was actually loaded, and the directory is not
# scanned a second time just for reporting.
path_carga_v = find_latest_file(BASE_DIR, "carga_verificada_SECO*.parquet")
df_carga_v = pd.read_parquet(path_carga_v)

path_carga_p = find_latest_file(BASE_DIR, "carga_programada_SECO*.parquet")
df_carga_p = pd.read_parquet(path_carga_p)

path_balanco = find_latest_file(BASE_DIR, "balanco_subsistemas*.parquet")
df_balanco = pd.read_parquet(path_balanco)

path_cmo = find_latest_file(BASE_DIR, "cmo_semi_horario_SECO*.parquet")
df_cmo = pd.read_parquet(path_cmo)

path_gtm = find_latest_file(BASE_DIR, "geracao_termica_motivo*.parquet")
df_gtm = pd.read_parquet(path_gtm)

# Report which file backs each DataFrame.
print("Arquivos carregados:")
print("df_carga_v:", path_carga_v)
print("df_carga_p:", path_carga_p)
print("df_balanco:", path_balanco)
print("df_cmo:", path_cmo)
print("df_gtm:", path_gtm)

# Last expression of the cell: preview of the first rows.
df_carga_v.head()


In [None]:
def fix_datetime(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of *df* with *col* parsed as UTC datetimes, cleaned and sorted.

    The input frame is left untouched; all work happens on a copy, so passing
    a slice of another frame neither modifies the parent nor triggers
    chained-assignment warnings.

    Args:
        df: Frame containing the column to normalize.
        col: Name of the column holding the timestamps.

    Returns:
        A new DataFrame in which *col* is timezone-aware (UTC), rows whose
        value could not be parsed are removed, and the remaining rows are
        sorted by *col* in ascending order.

    Raises:
        KeyError: If *col* is not a column of *df*.
    """
    result = df.copy()
    # errors="coerce" turns unparseable values into NaT instead of raising,
    # so they can be dropped in the next step.
    result[col] = pd.to_datetime(result[col], utc=True, errors="coerce")
    result = result.dropna(subset=[col])
    return result.sort_values(col)

# Normalize the timestamp column of every loaded dataset.
df_carga_v = fix_datetime(df_carga_v, "din_referenciautc")
df_carga_p = fix_datetime(df_carga_p, "din_referenciautc")
df_balanco = fix_datetime(df_balanco, "din_instante")

# The CMO frame is normalized on "din_instante" when that column is present
# and on "din_referenciautc" otherwise.
if "din_instante" in df_cmo.columns:
    cmo_time_col = "din_instante"
else:
    cmo_time_col = "din_referenciautc"
df_cmo = fix_datetime(df_cmo, cmo_time_col)

df_gtm = fix_datetime(df_gtm, "din_instante")

# Last expression of the cell: preview of the first rows.
df_balanco.head()