# 02 · EDA & Anomaly Detection
Explore distributions and trends, then detect anomalies with per-station z-scores and a multivariate IsolationForest.

In [None]:

import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

# Load the water-quality samples; the "timestamp" column is parsed to datetimes.
DATA_DIR = Path("../data")
samples_csv = DATA_DIR / "water_quality_samples.csv"
df = pd.read_csv(samples_csv, parse_dates=["timestamp"])

# One turbidity time-series figure per station, in order of first appearance.
station_ids = df["station_id"].unique()
for st in station_ids:
    # Rows for this station only, ordered chronologically for the line plot.
    d = df.loc[df["station_id"] == st].sort_values("timestamp")
    fig, ax = plt.subplots()
    ax.plot(d["timestamp"], d["turbidity_NTU"])
    ax.set_title(f"Turbidity (NTU) – {st}")
    ax.set_xlabel("Time")
    ax.set_ylabel("NTU")
    plt.show()


## Z-score anomaly flags

In [None]:

def zscore(x):
    """Standardize *x* to zero mean and unit spread, ignoring NaNs.

    The mean and standard deviation are computed with ``np.nanmean`` and
    ``np.nanstd`` (population std, ddof=0), so NaN entries do not affect the
    statistics; they remain NaN in the result. A small epsilon is added to
    the denominator so constant input yields zeros instead of dividing by 0.
    """
    center = np.nanmean(x)
    spread = np.nanstd(x) + 1e-9  # epsilon guards against zero variance
    return (x - center) / spread

# Order rows by station, then chronologically within each station.
df = df.sort_values(["station_id", "timestamp"]).copy()

# Per-station z-scores: output column -> source measurement column.
# Every column goes through the same zscore() helper so the shared cutoff
# below means the same thing for each of them. zscore() uses np.nanstd
# (population std, ddof=0), whereas Series.std() defaults to ddof=1; mixing
# the two would scale one column's z-scores differently from the others.
z_columns = {
    "turbidity_z": "turbidity_NTU",
    "nitrate_z": "nitrate_mgL",
    "ecoli_z": "e_coli_CFU_100mL",
}
for z_col, raw_col in z_columns.items():
    df[z_col] = df.groupby("station_id")[raw_col].transform(zscore)

# Flag a row (1) when any of its z-scores exceeds the cutoff in magnitude.
# NaN z-scores compare False, so missing measurements never raise the flag.
Z_THRESHOLD = 3
df["anomaly_flag"] = (df[list(z_columns)].abs() > Z_THRESHOLD).any(axis=1).astype(int)
df["anomaly_flag"].value_counts()


## IsolationForest (multivariate)

In [None]:

from sklearn.ensemble import IsolationForest

# Measurement columns fed to the multivariate detector.
features = ["turbidity_NTU","nitrate_mgL","phosphate_mgL","ammonia_mgL","e_coli_CFU_100mL","chlorine_mgL"]
# Fill gaps forward, then backward for any leading gaps that remain.
# DataFrame.ffill()/bfill() are used because fillna(method=...) is deprecated
# since pandas 2.1.
# NOTE(review): the fill runs over the whole frame in its current row order,
# without grouping by station_id, so values can carry across station
# boundaries — confirm this is acceptable.
X = df[features].ffill().bfill()

iso = IsolationForest(n_estimators=200, contamination=0.02, random_state=42)
# Despite the column name, this stores fit_predict's labels, not a
# continuous anomaly score.
df["iforest_score"] = iso.fit_predict(X)  # -1 = anomaly, 1 = normal
df["iforest_anomaly"] = (df["iforest_score"] == -1).astype(int)

df["iforest_anomaly"].value_counts()
