In [1]:
# 02 - AED Univariada

import os
import sys
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Garantir import local de src/
BASE_DIR = Path.cwd().parent
SRC_DIR = BASE_DIR / "src"
if str(BASE_DIR) not in sys.path:
    sys.path.append(str(BASE_DIR))
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

from src.plotting import plot_histograms, save_fig

INTERIM_PATH = os.path.join(str(BASE_DIR), "data", "interim", "focos_2019_2024.csv")
FIG_DIR = os.path.join(str(BASE_DIR), "docs", "figures")

# Carregar dataset unificado
df = pd.read_csv(INTERIM_PATH, low_memory=False)

# Estatísticas descritivas gerais
summary = df.describe(include="all").T
summary.to_csv(os.path.join(str(BASE_DIR), "data", "interim", "summary_univariado.csv"))
summary.head()



Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
month,144.0,72.0,2019-01,2.0,,,,,,,
Amazônia,72.0,,,,8631.180556,10981.887267,384.0,1219.5,3116.0,12139.5,41463.0
Caatinga,72.0,,,,1454.222222,1782.277244,22.0,154.0,441.0,2847.75,6939.0
Cerrado,72.0,,,,5270.652778,6169.121053,574.0,932.25,2806.0,7082.0,29319.0
Mata Atlântica,72.0,,,,1367.597222,1464.647212,240.0,450.5,722.5,1553.75,6715.0


In [2]:
# Relatório de NAs e tipos por coluna
na_report = (
    df.isna().sum().to_frame("num_na")
    .assign(percent_na=lambda x: (x["num_na"] / len(df) * 100).round(2))
    .sort_values("percent_na", ascending=False)
)
na_report_path = BASE_DIR / "data" / "interim" / "na_report.csv"
na_report.to_csv(na_report_path)
na_report.head(20)


Unnamed: 0,num_na,percent_na
Lara,2246,96.89
PIAUÍ,2246,96.89
Napo,2246,96.89
Nariño,2246,96.89
Neuquén,2246,96.89
Nickerie,2246,96.89
Norte de Santander,2246,96.89
Nueva Esparta,2246,96.89
Orellana,2246,96.89
Oruro,2246,96.89


In [3]:
# Histogramas e boxplots para colunas numéricas relevantes
import matplotlib.pyplot as plt
import seaborn as sns
from src.plotting import save_fig

num_cols = [c for c in ["lat","lon"] if c in df.columns]
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6,4))
    sns.histplot(df[col].dropna(), kde=True, ax=ax)
    ax.set_title(f"Distribuição de {col}")
    save_fig(fig, str(FIG_DIR), f"hist_{col}.png")
    plt.close(fig)

# Boxplots por bioma/UF se existirem
if "bioma" in df.columns and num_cols:
    for col in num_cols:
        fig, ax = plt.subplots(figsize=(8,4))
        sns.boxplot(data=df, x="bioma", y=col, ax=ax)
        ax.set_title(f"Boxplot de {col} por Bioma")
        ax.tick_params(axis='x', rotation=30)
        save_fig(fig, str(FIG_DIR), f"box_{col}_por_bioma.png")
        plt.close(fig)

