In [5]:
# === analizar_frecuencia.py ===
# Script para analizar el intervalo real de muestreo por variable

import pandas as pd
import os
from pathlib import Path

# === CONFIGURACION ===
BRONZE_DIR = Path("../data/bronze")
VARIABLES = [
    "voltaje",
    "corriente_carga",
    "temperatura_aceite",
    "temperatura_ambiente",
    "temperatura_punto_caliente",
    "temperatura_burbujeo",
    "tap_position",
    "potencia_aparente",
]

# === FUNCION AUXILIAR ===
def limpiar_timestamp(columna_timestamp: pd.Series) -> pd.Series:
    columna = (
        columna_timestamp.astype(str)
        .str.replace(r"(\.\d{6})\d+", r"\1", regex=True)
        .str.replace("Z", "")
    )
    columna = pd.to_datetime(columna, format="mixed", errors="coerce")
    return columna.dt.tz_localize(None).dt.floor("s")

# === ANALISIS ===
print("\n=== Análisis de frecuencia de muestreo por variable ===")
for var in VARIABLES:
    path = BRONZE_DIR / f"{var}.parquet"
    if not path.exists():
        print(f"[ADVERTENCIA] No se encuentra: {path}")
        continue
    df = pd.read_parquet(path)
    if "timestamp" not in df.columns or "value" not in df.columns:
        print(f"[ERROR] {var}: columnas esperadas no encontradas")
        continue

    df["timestamp"] = limpiar_timestamp(df["timestamp"])
    print( f"nulos : {df["timestamp"].isnull().sum} ")
    # df = df.dropna(subset=["timestamp"])
    # df = df.sort_values("timestamp")
    # df["delta"] = df["timestamp"].diff().dt.total_seconds() / 60

    # print(f"\n{var.upper()}")
    # print(df["delta"].describe().round(2))


=== Análisis de frecuencia de muestreo por variable ===
nulos : <bound method Series.sum of 0          False
1          False
2          False
3          False
4          False
           ...  
3131306    False
3131307    False
3131308    False
3131309    False
3131310    False
Name: timestamp, Length: 3131311, dtype: bool> 
nulos : <bound method Series.sum of 0          False
1          False
2          False
3          False
4          False
           ...  
4419007    False
4419008    False
4419009    False
4419010    False
4419011    False
Name: timestamp, Length: 4419012, dtype: bool> 
nulos : <bound method Series.sum of 0        False
1        False
2        False
3        False
4        False
         ...  
11859    False
11860    False
11861    False
11862    False
11863    False
Name: timestamp, Length: 11864, dtype: bool> 
nulos : <bound method Series.sum of 0        False
1        False
2        False
3        False
4        False
         ...  
22869    False
22870    Fals

In [6]:
print(df)

                 timestamp      value
0      2024-09-10 04:00:00  30.917398
1      2024-09-10 04:01:00  31.090598
2      2024-09-10 04:02:00  30.933044
3      2024-09-10 04:03:00  30.888354
4      2024-09-10 04:04:00  30.664779
...                    ...        ...
372871 2025-06-30 02:33:00  28.294413
372872 2025-06-30 02:50:00  27.802991
372873 2025-06-30 03:23:00  26.947633
372874 2025-06-30 03:36:00  26.722704
372875 2025-06-30 03:49:00  26.494724

[372876 rows x 2 columns]


In [14]:
import pandas as pd
df_silver= pd.read_parquet('../data/silver/silver_data.parquet')
df_silver.head(50)

Unnamed: 0_level_0,voltaje,corriente_carga,temperatura_aceite,temperatura_ambiente,temperatura_punto_caliente,temperatura_burbujeo,potencia_aparente,tap_position
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-09-10 04:00:00,131.041825,739.327177,,26.5,,178.928398,30.745757,
2024-09-10 04:15:00,130.693073,728.593612,53.5,26.65,58.642727,178.854984,30.232734,
2024-09-10 04:30:00,130.812457,715.310359,53.342857,26.8,58.308074,178.78157,29.725678,
2024-09-10 04:45:00,130.924854,701.8143,53.185714,26.6,57.973421,178.41114,29.164683,
2024-09-10 05:00:00,130.564417,688.681148,,26.233334,57.638768,176.250397,28.610652,
2024-09-10 05:15:00,130.947408,679.651644,,25.866667,57.304115,176.126086,28.242603,
2024-09-10 05:30:00,131.011387,671.082906,52.714285,25.5,57.232162,176.001775,27.921659,
2024-09-10 05:45:00,131.140706,663.630621,52.557143,26.299999,56.91333,,27.625818,
2024-09-10 06:00:00,131.046645,656.529251,52.4,26.5,56.74929,175.753153,27.32453,
2024-09-10 06:15:00,131.39203,646.330457,52.042857,25.5,56.585251,175.628843,26.964923,


In [11]:
print(df.columns)

Index(['voltaje', 'corriente_carga', 'temperatura_aceite',
       'temperatura_ambiente', 'temperatura_punto_caliente',
       'temperatura_burbujeo', 'potencia_aparente', 'tap_position'],
      dtype='object')


In [9]:
for col in df_silver.columns:
    
    print(f" {col}, {df_silver[col].isnull().sum()}")

 voltaje, 303
 corriente_carga, 246
 temperatura_aceite, 2017
 temperatura_ambiente, 874
 temperatura_punto_caliente, 840
 temperatura_burbujeo, 2302
 potencia_aparente, 184
 tap_position, 24


In [None]:
for col in df_silver.columns:
    print(f" {col}, {df_silver[col].isnull().sum()}")

 voltaje, 318
 corriente_carga, 259
 temperatura_aceite, 4036
 temperatura_ambiente, 1764
 temperatura_punto_caliente, 1826
 temperatura_burbujeo, 4337
 potencia_aparente, 199
 tap_position, 24
