# Exploratory Data Analysis — TrilliumWatts

This notebook performs EDA on the Leticia energy dataset using the modular `trillium_watts` package.

In [None]:
import sys
from pathlib import Path

# Make the sibling `src` directory importable so the `trillium_watts` imports
# below resolve (presumably the package lives there and is not installed).
# The path is resolved to an absolute one so that a later change of working
# directory does not invalidate the entry, and the membership check keeps
# re-running this cell from stacking duplicate entries on sys.path.
_src_dir = str(Path('../src').resolve())
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)

import pandas as pd
import numpy as np
from trillium_watts.config import load_config
from trillium_watts.data.loader import load_raw_data
from trillium_watts.data.cleaning import run_cleaning_pipeline
from trillium_watts.data.imputation import run_imputation_pipeline
from trillium_watts.data.outliers import detect_outliers_iqr, replace_outliers_with_interpolation
from trillium_watts.features.pipeline import build_feature_pipeline
from trillium_watts.visualization.plots_matplotlib import (
    plot_missing_values, plot_correlation_heatmap, plot_acf_pacf,
    plot_boxplots, plot_outlier_detection, plot_time_series,
    plot_seasonal_decomposition,
)

# Load the project configuration once; later cells read their settings from
# `config.data` (raw data path, CSV separator/encoding, date column and cutoff,
# missing periods).
config = load_config()

## 1. Load and Clean Data

In [None]:
# All data-related settings used in this cell come from the `data` section of the config.
data_cfg = config.data

# Load the raw dataset using the configured path, separator and encoding.
df_raw = load_raw_data(
    data_cfg.raw_data_path,
    data_cfg.csv_separator,
    data_cfg.csv_encoding,
)

# Clean the raw frame, then run the feature pipeline on the cleaned result.
# NOTE(review): features are built before the imputation and outlier steps in
# later cells; if any feature is derived from ACTIVA it will reflect the
# un-imputed values — confirm this ordering is intended.
df = run_cleaning_pipeline(
    df_raw,
    date_column=data_cfg.date_column,
    date_cutoff=data_cfg.date_cutoff,
)
df = build_feature_pipeline(df)

# Print the column / dtype / non-null summary of the resulting frame.
df.info()

## 2. Missing Values

In [None]:
# Count the missing entries in every column and print the totals.
missing_counts = df.isna().sum()
print(missing_counts)

# Plot the missing values of the ACTIVA series as it stands before imputation.
activa_before_imputation = df['ACTIVA']
plot_missing_values(activa_before_imputation)

In [None]:
# Keep a snapshot of the frame as it was before imputation so that both
# versions of ACTIVA can be passed to the plot below.
df_original = df.copy()

# Run the imputation pipeline with the missing periods listed in the config.
missing_periods = config.data.missing_periods
df = run_imputation_pipeline(df, missing_periods)

# Pre-imputation series first, imputed series second, then the plot title.
plot_missing_values(
    df_original['ACTIVA'],
    df['ACTIVA'],
    'Serie ACTIVA imputada',
)

## 3. Outlier Detection

In [None]:
# Columns shown in the boxplots.
boxplot_columns = ['ACTIVA', 'REACTIVA', 'FP', 'ALLSKY_SFC_SW_DWN', 'T2M']
plot_boxplots(df, boxplot_columns)

In [None]:
# Column whose outliers are detected, replaced and plotted in this cell.
target_column = 'ACTIVA'

# The mask is computed separately and is only used for the plot; the
# replacement helper is called on its own right after.
# NOTE(review): presumably both helpers apply the same IQR rule, so the mask
# matches the points that get replaced — confirm against their defaults.
outlier_mask = detect_outliers_iqr(df[target_column])
df_clean, n = replace_outliers_with_interpolation(df, target_column)
print(f'Outliers replaced: {n}')

# Series before replacement, series after replacement, mask, then the column name.
plot_outlier_detection(
    df[target_column],
    df_clean[target_column],
    outlier_mask,
    target_column,
)

# The remaining cells work on the frame with outliers replaced.
df = df_clean

## 4. Correlation Analysis

In [None]:
# Select every numeric column rather than only the 64-bit ones: listing just
# 'int64' and 'float64' silently drops other numeric widths (e.g. int32 or
# float32 columns) from the correlation matrices. Timedelta columns are
# excluded explicitly because pandas treats them as a numeric subtype here.
num_cols = df.select_dtypes(include='number', exclude='timedelta').columns.tolist()
numeric_df = df[num_cols]

# Pearson correlation between the numeric columns.
corr_pearson = numeric_df.corr(method='pearson')
plot_correlation_heatmap(corr_pearson, 'Correlacion Pearson')

# Spearman (rank-based) correlation between the same columns.
corr_spearman = numeric_df.corr(method='spearman')
plot_correlation_heatmap(corr_spearman, 'Correlacion Spearman')

## 5. Autocorrelation

In [None]:
# Number of lags passed to the ACF/PACF plot of the ACTIVA series.
acf_lags = 60
plot_acf_pacf(df['ACTIVA'], lags=acf_lags)

## 6. Time Series and Seasonality

In [None]:
# Plot the ACTIVA column of the frame with the given title.
time_series_title = 'Energia activa a lo largo del tiempo'
plot_time_series(df, 'ACTIVA', time_series_title)

# Seasonal decomposition of ACTIVA.
# NOTE(review): period=365 presumably assumes daily observations with a yearly
# cycle — confirm against the sampling frequency of the data.
seasonal_period = 365
plot_seasonal_decomposition(df['ACTIVA'], period=seasonal_period)

## 7. Stationarity Test

In [None]:
from statsmodels.tsa.stattools import adfuller

# Significance level below which the p-value is reported as 'Stationary'.
ADF_ALPHA = 0.05

# adfuller raises on missing values, so drop any NaN that survived the earlier
# imputation / interpolation steps (e.g. at the edges of the series) before
# running the test. This is a no-op when the series is already complete.
activa_for_adf = df['ACTIVA'].dropna()

# Augmented Dickey-Fuller test; the first two entries of the result are the
# test statistic and the p-value.
adf_result = adfuller(activa_for_adf)
adf_statistic, adf_p_value = adf_result[0], adf_result[1]

print(f'ADF statistic: {adf_statistic:.4f}')
print(f'p-value: {adf_p_value:.6f}')
print('Stationary' if adf_p_value < ADF_ALPHA else 'Non-stationary')