In [None]:
# MODULE IMPORTS AND CONFIGURATION


import pandas as pd
import logging
import sys
import os

# Logging configuration.
# force=True replaces any handlers already attached to the root logger, so the
# level/format below also take effect when this cell is re-run in a live kernel
# (without it, basicConfig does nothing once the root logger has handlers).
# NOTE(review): the saved output of the loading cell shows every loader message
# twice (once in a "|"-separated format, once in this format) -- presumably
# scripts.data_loader attaches its own handler and also propagates to the root
# logger; confirm in that module.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Project pipeline modules
from scripts.data_loader import CovidDataLoader
from scripts.data_cleaner import CovidDataCleaner
from scripts.data_imputer import CovidDataImputer
from scripts.feature_engineer import CovidFeatureEngineer

print("✅ Importaciones exitosas")

✅ Importaciones exitosas


In [None]:
# DATA LOADING

# Instantiate the loader
loader = CovidDataLoader()

# Start from None so the checks below (and in later cells) can tell that
# loading did not succeed
df_raw = None

# Load the data; a failure is reported but does not abort the notebook
try:
    # Adjust the path if necessary.
    # If the notebook is in 'api/', it looks in 'data/owid-covid-data.csv'
    df_raw = loader.load_data(local_filepath="data/owid-covid-data.csv")
    print(f"✅ Datos cargados exitosamente. Dimensiones: {df_raw.shape}")
except Exception as e:
    print(f"❌ Error crítico cargando datos: {e}")

# Preview the first rows ONLY when there is something to show: the load must
# have succeeded AND returned at least one row (an empty frame used to be
# treated as a success even though the message below reports it as the failure).
if df_raw is None or df_raw.empty:
    print("⚠️ No se puede continuar: El DataFrame está vacío.")
else:
    display(df_raw.head())  # rich HTML table instead of plain-text print()

2025-11-18 17:48:57 | INFO     | scripts.data_loader | Cargando datos directamente desde archivo local: data\owid-covid-data.csv
2025-11-18 17:48:57,578 - INFO - Cargando datos directamente desde archivo local: data\owid-covid-data.csv
2025-11-18 17:49:04 | INFO     | scripts.data_loader | 334,813 registros cargados — 244 países/agregados detectados.
2025-11-18 17:49:04,120 - INFO - 334,813 registros cargados — 244 países/agregados detectados.


✅ Datos cargados exitosamente. Dimensiones: (334813, 67)


Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-03,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-04,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,


In [5]:
# DATA CLEANING


# Default to None so later cells can detect that cleaning did not complete
df_clean = None

# Guard first: without a loaded raw frame there is nothing to clean
if 'df_raw' not in locals() or df_raw is None:
    print("⚠️ No se puede ejecutar la limpieza: Falta 'df_raw'. Por favor corre la Celda 2 exitosamente primero.")

else:
    print("--- 2. Iniciando Limpieza ---")
    cleaner = CovidDataCleaner()

    try:
        # Run the cleaning stage on the raw frame
        df_clean = cleaner.clean_data(df_raw)

        # Print every entry of the cleaner's report, one "name: value" per line
        print("\n--- Reporte de Limpieza ---")
        report = cleaner.get_cleaning_report()
        for key, value in report.items():
            print(f"{key}: {value}")

        # The resulting shape shows whether rows/columns were removed
        print(f"\n✅ Limpieza completada. Dimensiones post-limpieza: {df_clean.shape}")

    except Exception as e:
        # Report the failure and keep the notebook running
        print(f"❌ Error ejecutando la limpieza: {e}")

2025-11-18 17:53:10,159 - INFO - Starting data cleaning... Initial shape: (334813, 67)


--- 2. Iniciando Limpieza ---


2025-11-18 17:53:11,023 - INFO - Dropped 31 columns with >50.0% missing values
2025-11-18 17:53:16,599 - INFO - Handled 228190 outliers (capped) across numeric columns
2025-11-18 17:53:16,631 - INFO - ✅ Data cleaning completed. Final shape: (334813, 36)
2025-11-18 17:53:16,636 - INFO -    Duplicates removed: 0
2025-11-18 17:53:16,642 - INFO -    Columns dropped: 31
2025-11-18 17:53:16,645 - INFO -    Outliers handled: 228190



--- Reporte de Limpieza ---
duplicates_removed: 0
columns_dropped: 31
dropped_column_names: ['icu_patients', 'icu_patients_per_million', 'hosp_patients', 'hosp_patients_per_million', 'weekly_icu_admissions', 'weekly_icu_admissions_per_million', 'weekly_hosp_admissions', 'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests', 'total_tests_per_thousand', 'new_tests_per_thousand', 'new_tests_smoothed', 'new_tests_smoothed_per_thousand', 'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'total_boosters', 'new_vaccinations', 'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred', 'people_fully_vaccinated_per_hundred', 'total_boosters_per_hundred', 'handwashing_facilities', 'excess_mortality_cumulative_absolute', 'excess_mortality_cumulative', 'excess_mortality', 'excess_mortality_cumulative_per_million']
outliers_handled: 228190

✅ Limpieza completada. Dimensiones post-limpieza: (334813, 36)


In [6]:
# DATA IMPUTATION

# Default to None so later cells can detect that imputation did not complete
df_imputed = None

# Guard first: imputation needs the cleaned frame from the previous cell
if 'df_clean' not in locals() or df_clean is None:
    print("⚠️ No se puede ejecutar la imputación: Falta 'df_clean'. Por favor corre la Celda 3 exitosamente primero.")

else:
    print("--- 3. Iniciando Imputación ---")
    imputer = CovidDataImputer()

    try:
        # Null count BEFORE imputing (skipped when the column is absent)
        if 'total_cases' in df_clean.columns:
            nulos_antes = df_clean['total_cases'].isna().sum()
            print(f"Nulos en 'total_cases' antes: {nulos_antes}")

        # Run the imputer on the cleaned frame
        df_imputed = imputer.smart_imputation(df_clean)

        # Null count AFTER imputing, for comparison with the figure above
        if 'total_cases' in df_imputed.columns:
            nulos_despues = df_imputed['total_cases'].isna().sum()
            print(f"Nulos en 'total_cases' después: {nulos_despues}")

        # Imputer's own report
        print("\n--- Estadísticas de Imputación ---")
        print(imputer.get_imputation_report())

        print(f"\n✅ Imputación completada. Dimensiones: {df_imputed.shape}")

    except Exception as e:
        # Report the failure and keep the notebook running
        print(f"❌ Error durante la imputación: {e}")

2025-11-18 17:55:34,432 - INFO - Starting smart imputation...


--- 3. Iniciando Imputación ---
Nulos en 'total_cases' antes: 37734


2025-11-18 17:55:35,180 - INFO - Filled static columns (ffill/bfill) for: ['population', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older', 'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'hospital_beds_per_thousand', 'life_expectancy', 'human_development_index']
2025-11-18 17:55:35,692 - INFO - Forward filled 798 values across 4 columns
2025-11-18 17:55:35,939 - INFO - Filled 754958 'new_*' values with 0
2025-11-18 17:55:38,484 - INFO - Interpolated 10373 numeric values across 6 columns
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmea

Nulos en 'total_cases' después: 12036

--- Estadísticas de Imputación ---
{'forward_filled': 798, 'interpolated': 10373, 'filled_with_stats': 260781, 'filled_new_with_zero': 754958}

✅ Imputación completada. Dimensiones: (334813, 36)


In [9]:
# FEATURE ENGINEERING

# Default to None so later cells can detect that this stage did not complete
df_final = None

# Guard first: feature engineering needs the imputed frame from the previous cell
if 'df_imputed' not in locals() or df_imputed is None:
    print("⚠️ No se puede ejecutar la ingeniería: Falta 'df_imputed'. Por favor corre la Celda 4 exitosamente primero.")

else:
    print("--- 4. Iniciando Ingeniería de Características ---")
    engineer = CovidFeatureEngineer()

    try:
        # Build the engineered dataset from the imputed frame
        df_final = engineer.create_all_features(df_imputed)

        # Summarise the result: shape plus the first ten column names
        print(f"\n✅ ÉXITO FINAL: Pipeline completado. Dimensiones: {df_final.shape}")
        print("\nColumnas en el Dataset Final (Primeras 10):")
        print(df_final.columns[:10].tolist())

        # Spot-check one engineered metric when it is present
        if 'case_fatality_rate' in df_final.columns:
            print("\nEjemplo de métrica calculada ('case_fatality_rate'):")
            cols_to_show = ['location', 'date', 'case_fatality_rate']
            preview = df_final[cols_to_show].dropna().head()
            # Prefer the rich display() when running under Jupyter; fall back
            # to print() where display is not defined
            try:
                display(preview)
            except NameError:
                print(preview)

    except Exception as e:
        # Report the failure and keep the notebook running
        print(f"❌ Error durante la ingeniería de características: {e}")

2025-11-18 18:00:02,917 - INFO - Starting feature engineering pipeline...


--- 4. Iniciando Ingeniería de Características ---


2025-11-18 18:00:03,953 - INFO - Created 8 temporal features
2025-11-18 18:00:04,089 - INFO - Created 0 per capita features
2025-11-18 18:00:04,225 - INFO - Created 1 rate features
2025-11-18 18:00:05,665 - INFO - Created 4 moving average features
  df_feat[growth_col] = df_feat.groupby('location')[col].pct_change(periods=periods) * 100
  df_feat[growth_col] = df_feat.groupby('location')[col].pct_change(periods=periods) * 100
2025-11-18 18:00:06,631 - INFO - Created 2 growth rate features
2025-11-18 18:00:07,461 - INFO - Created 6 lag features
2025-11-18 18:00:08,156 - INFO - Created 0 cumulative features
2025-11-18 18:00:08,192 - INFO - Feature engineering completed: 21 raw features created (pre-cleanup)
2025-11-18 18:00:08,296 - INFO - Final cleanup: Dropped 18 intermediate engineering features.
2025-11-18 18:00:08,298 - INFO - Final dataset shape (post-cleanup): (334813, 39)



✅ ÉXITO FINAL: Pipeline completado. Dimensiones: (334813, 39)

Columnas en el Dataset Final (Primeras 10):
['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases', 'new_cases_smoothed', 'total_deaths', 'new_deaths', 'new_deaths_smoothed']

Ejemplo de métrica calculada ('case_fatality_rate'):


Unnamed: 0,location,date,case_fatality_rate
0,Afghanistan,2020-01-03,4.663385
1,Afghanistan,2020-01-04,4.663385
2,Afghanistan,2020-01-05,4.663385
3,Afghanistan,2020-01-06,4.663385
4,Afghanistan,2020-01-07,4.663385


In [None]:
import plotly.express as px

# Country used for the smoke-test plot; change it to try another location.
PLOT_COUNTRY = 'Ecuador'

# Only plot when the feature-engineering cell produced a DataFrame
if 'df_final' in locals() and df_final is not None:

    print("--- 5. Prueba de Visualización ---")

    # Validate the schema up front so a missing column is reported by name
    # here instead of surfacing as a long exception from the filter or plotly.
    required_columns = ['location', 'date', 'new_cases_smoothed']
    missing_columns = [col for col in required_columns if col not in df_final.columns]

    if missing_columns:
        print(f"❌ Error generando la gráfica: faltan columnas en 'df_final': {missing_columns}")
    else:
        try:
            # Keep only the rows of the selected country
            country_data = df_final[df_final['location'] == PLOT_COUNTRY]

            if not country_data.empty:
                print(f"Graficando {len(country_data)} registros para {PLOT_COUNTRY}...")

                # Line chart of the smoothed new-cases series over time
                fig = px.line(
                    country_data,
                    x='date',
                    y='new_cases_smoothed',
                    title=f'Prueba de Flujo: Casos Suavizados en {PLOT_COUNTRY}'
                )
                fig.show()
            else:
                print(f"⚠️ No se encontraron datos para '{PLOT_COUNTRY}'.")
                # Show a few available locations to help pick a valid one
                print("Países disponibles:", df_final['location'].unique()[:5])

        except Exception as e:
            # Report the failure and keep the notebook running
            print(f"❌ Error generando la gráfica: {e}")

else:
    print("⚠️ No se puede graficar: Falta 'df_final'. Por favor corre la Celda 5 exitosamente primero.")

--- 5. Prueba de Visualización ---
Graficando 1385 registros para Ecuador...
❌ Error generando la gráfica: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases', 'new_cases_smoothed', 'total_deaths', 'new_deaths', 'new_deaths_smoothed', 'total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'total_deaths_per_million', 'new_deaths_per_million', 'new_deaths_smoothed_per_million', 'reproduction_rate', 'new_vaccinations_smoothed', 'new_vaccinations_smoothed_per_million', 'new_people_vaccinated_smoothed', 'new_people_vaccinated_smoothed_per_hundred', 'stringency_index', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older', 'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'hospital_beds_per_thousand', 'life_expectancy', 'human_development_index', 'population', 'case_fatality_rate', 'to