# Exploratory Analysis — Pindamonhangaba Climate Data (1940–2025)

> **Purpose**: Quick sanity checks and visual exploration of the raw merged CSV before processing.
>
> **Input**: `data/raw/pindamonhangaba_1940_2025.csv` (produced by `fetch_climate_data.py`)
>
> Run the cells in order — later cells reuse `df`, `NOTEBOOK_DIR`, and the helper columns created by earlier cells. Requires: `pandas`, `numpy`, `matplotlib`, `seaborn`, `scipy`.

In [None]:
# ── Cell 1: Load raw CSV, display .head(), .info(), .describe() ──────────────

from pathlib import Path

import numpy as np
import pandas as pd

# The current working directory stands in for the notebook's folder (this
# holds when the kernel is started from that folder); the raw CSV is looked up
# in the sibling "raw" directory.
NOTEBOOK_DIR = Path().resolve()
RAW_CSV = NOTEBOOK_DIR.parent.joinpath("raw", "pindamonhangaba_1940_2025.csv")

# Fail early, with a hint on how to produce the file, when it is missing.
if not RAW_CSV.exists():
    missing_csv_message = (
        f"Raw CSV not found at {RAW_CSV}.\n"
        "Run `python data/scripts/fetch_climate_data.py` first."
    )
    raise FileNotFoundError(missing_csv_message)

# "date" is parsed to datetime64 so the .dt accessor is usable afterwards.
df = pd.read_csv(RAW_CSV, parse_dates=["date"])

print(f"Shape: {df.shape}")

first_day = df["date"].min().date()
last_day = df["date"].max().date()
print(f"Date range: {first_day} → {last_day}")

# Number of calendar days in the nominal period, both endpoints included.
expected_rows = (pd.Timestamp("2025-12-31") - pd.Timestamp("1940-01-01")).days + 1
print(f"Expected rows (1940-01-01 to 2025-12-31): {expected_rows}")
print()

# First ten rows.
print("── head() ──────────────────────────────────────────────")
preview = df.head(10)
display(preview)

# Column dtypes and non-null counts (info() prints directly).
print("\n── info() ──────────────────────────────────────────────")
df.info()

# Summary statistics, rounded to two decimals for readability.
print("\n── describe() ──────────────────────────────────────────")
summary = df.describe().round(2)
display(summary)

In [None]:
# ── Cell 2: Missing values heatmap (seaborn) ──────────────────────────────────
#
# Strategy: count the NaN days of every variable per calendar year and draw the
# resulting (variable × year) matrix as a heatmap.  Systematic gaps show up as
# dark cells or runs of dark cells.
#
# Only NaN cells are counted: dates whose row is absent from the CSV altogether
# do not appear here (compare the row count printed in Cell 1 for that).

import matplotlib.pyplot as plt
import seaborn as sns

# Helper columns; "year" is the grouping key used below.
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month

COLS = ["temp_max", "temp_min", "temp_mean", "precipitation", "humidity", "wind_max"]

# Build a (year × variable) missing-count matrix.
# Grouping on the integer year column yields integer row labels directly and
# avoids the "YE" resample alias, which pandas older than 2.2 rejects.
# The reindex keeps one (all-zero) row for any year between the first and last
# that has no rows at all, so the x-axis stays continuous.
year_span = range(int(df["year"].min()), int(df["year"].max()) + 1)
missing_by_year = (
    df[COLS]
    .isnull()
    .groupby(df["year"])
    .sum()
    .reindex(year_span, fill_value=0)
    .astype(int)
)

total_missing = missing_by_year.values.sum()
print(f"Total NaN values across all columns: {total_missing}")
print(f"Missing per column:\n{df[COLS].isnull().sum().to_string()}")

fig, ax = plt.subplots(figsize=(12, 10))

# Transposed so that variables are rows and years run along the x-axis.
sns.heatmap(
    missing_by_year.T,
    ax=ax,
    cmap="YlOrRd",
    linewidths=0.3,
    linecolor="#cccccc",
    annot=False,
    cbar_kws={"label": "Missing days per year"},
    xticklabels=10,  # show every 10th year to avoid crowding
)

ax.set_title(
    "Missing Values Heatmap — Pindamonhangaba Climate Data (1940–2025)",
    fontsize=14, fontweight="bold", pad=16,
)
ax.set_xlabel("Year", fontsize=11)
ax.set_ylabel("Variable", fontsize=11)
ax.tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.savefig(NOTEBOOK_DIR / "missing_values_heatmap.png", dpi=150, bbox_inches="tight")
plt.show()
print("Saved → missing_values_heatmap.png")

In [None]:
# ── Cell 3: T_max boxplot by decade ──────────────────────────────────────────
#
# One box per decade, to see how the distribution of daily T_max moves over
# time.  The decade column is passed as both `x` and `hue` (with the legend
# switched off) because seaborn emits a FutureWarning when `palette` is given
# without `hue`.

import matplotlib.pyplot as plt
import seaborn as sns

# Label every row with its decade, e.g. 1987 → "1980s".
# Relies on the "year" column added in Cell 2.
df["decade"] = df["year"].floordiv(10).mul(10).astype(str).add("s")

# One colour per decade, sampled from the cool → warm ramp in chronological order.
decades = sorted(df["decade"].unique())
palette = sns.color_palette("coolwarm", n_colors=len(decades))

# Small, semi-transparent outlier markers keep the dense daily data readable.
outlier_style = {"marker": ".", "markersize": 3, "alpha": 0.4}
title_style = {"fontsize": 14, "fontweight": "bold", "pad": 16}

fig, ax = plt.subplots(figsize=(14, 6))

sns.boxplot(
    data=df,
    x="decade",
    y="temp_max",
    order=decades,
    hue="decade",
    hue_order=decades,
    palette=palette,
    legend=False,
    flierprops=outlier_style,
    ax=ax,
)

ax.set_title(
    "Daily T_max Distribution by Decade — Pindamonhangaba (1940–2025)",
    **title_style,
)
ax.set_xlabel("Decade", fontsize=11)
ax.set_ylabel("T_max (°C)", fontsize=11)
ax.tick_params(axis="x", rotation=0)

fig.tight_layout()
fig.savefig(NOTEBOOK_DIR / "tmax_boxplot_by_decade.png", dpi=150, bbox_inches="tight")
plt.show()
print("Saved → tmax_boxplot_by_decade.png")

In [None]:
# ── Cell 4: Hot days per year (HD30) — bars coloured by count ────────────────
#
# Definition: HD30 = days per year with T_max >= 30 °C.
#
# Colour encoding (sequential ramp, driven by the yearly count):
#   - white at the LOW end (0 hot days)
#   - dark red (#7f0000) for the HIGHEST count
# Bars therefore deepen in colour wherever hot days become more frequent.
#
# NOTE(review): counts are raw day totals, so a year with many missing T_max
# values (or an incomplete final year) reads low and pulls the trend line —
# check coverage in the Cell 2 heatmap before interpreting the slope.

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np

HD30_THRESHOLD = 30.0  # °C

# Count HD30 per year.  Rows with NaN temp_max are dropped first, so a year
# whose temp_max is entirely missing gets no bar (rather than a bar of 0).
# Relies on the "year" column added in Cell 2.
valid_tmax = df.loc[df["temp_max"].notna(), ["year", "temp_max"]]
hd30 = (
    valid_tmax["temp_max"]
    .ge(HD30_THRESHOLD)            # boolean: is this a hot day?
    .groupby(valid_tmax["year"])
    .sum()                         # True counts as 1
    .rename("hd30")
    .reset_index()
)

# Without any valid temperature there is nothing to plot; say so explicitly
# instead of failing later inside np.polyfit.
if hd30.empty:
    raise ValueError("No non-missing temp_max values — cannot compute HD30.")

# ── colour map: white → dark red, driven by hd30 count ──────────────────────
cmap = mcolors.LinearSegmentedColormap.from_list(
    "hd30_ramp",
    ["#ffffff", "#fdd0a2", "#fc8d59", "#d7301f", "#7f0000"],
)

# Clamp the upper bound to at least 1 so that a record without a single hot
# day still gets a valid colour scale and a non-degenerate y-axis.
vmin, vmax = 0, max(int(hd30["hd30"].max()), 1)
norm = mcolors.Normalize(vmin=vmin, vmax=vmax)
bar_colours = [cmap(norm(v)) for v in hd30["hd30"]]

# ── plot ─────────────────────────────────────────────────────────────────────
fig, ax = plt.subplots(figsize=(18, 5))

bars = ax.bar(
    hd30["year"],
    hd30["hd30"],
    color=bar_colours,
    width=0.85,
    edgecolor="none",
)

# Colour-bar legend (a ScalarMappable is needed because the bars were coloured
# manually rather than through a mappable artist).
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax, orientation="vertical", pad=0.01, fraction=0.015)
cbar.set_label("Hot days (T_max ≥ 30 °C)", fontsize=10)

# Trend line (ordinary least-squares, degree 1).  A line needs at least two
# points, so the fit is skipped for a single-year record.
if len(hd30) >= 2:
    m, b = np.polyfit(hd30["year"], hd30["hd30"], 1)
    ax.plot(
        hd30["year"],
        m * hd30["year"] + b,
        color="#333333",
        linewidth=1.5,
        linestyle="--",
        # Two decimals: slopes below 0.1 day/yr would otherwise print as "+0.0".
        label=f"Trend ({m:+.2f} days/yr)",
    )
    ax.legend(fontsize=10, loc="upper left")

ax.set_title(
    "Hot Days per Year (T_max ≥ 30 °C) — Pindamonhangaba 1940–2025",
    fontsize=14, fontweight="bold", pad=14,
)
ax.set_xlabel("Year", fontsize=11)
ax.set_ylabel("Days per year", fontsize=11)
ax.set_xlim(hd30["year"].min() - 0.5, hd30["year"].max() + 0.5)
ax.set_ylim(0, vmax * 1.08)  # 8 % headroom above the tallest bar
ax.tick_params(axis="x", rotation=45)
ax.spines[["top", "right"]].set_visible(False)

plt.tight_layout()
plt.savefig(NOTEBOOK_DIR / "hd30_per_year.png", dpi=150, bbox_inches="tight")
plt.show()
print("Saved → hd30_per_year.png")

In [None]:
# ── Cell 5: Integrity checks ─────────────────────────────────────────────────
#   • T_min never exceeds T_max
#   • T_max stays below 50 °C and T_min stays above -10 °C
#   • Precipitation is never negative
#
# Comparisons involving NaN evaluate to False, so rows with missing values are
# never reported as violations here.

# Physically impossible pairs: daily minimum above the daily maximum.
tmin_above_tmax = df["temp_min"].gt(df["temp_max"])
inv = df.loc[tmin_above_tmax]
print(f"T_min > T_max violations : {len(inv)}")
if len(inv) > 0:
    # Show a few offending rows to make the problem easy to trace.
    display(inv.head(10)[["date", "temp_min", "temp_max"]])

# Implausibly hot maxima.
out_tmax = df.loc[df["temp_max"].ge(50)]
print(f"T_max >= 50 °C           : {len(out_tmax)}")

# Implausibly cold minima.
out_tmin = df.loc[df["temp_min"].le(-10)]
print(f"T_min <= -10 °C          : {len(out_tmin)}")

# Negative rainfall totals.
neg_precip = df.loc[df["precipitation"].lt(0)]
print(f"Precipitation < 0        : {len(neg_precip)}")

# Overall verdict: every individual check must have found zero rows.
violation_total = len(inv) + len(out_tmax) + len(out_tmin) + len(neg_precip)
if violation_total == 0:
    print("\nAll checks passed ✓")
else:
    print("\nSome checks FAILED — review output above.")