In [None]:
# --- ALGERIA POPULATION-WEIGHTED SPEED ANALYSIS (dtype-safe & memory-safe) ---

import os, re, glob
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterstats import zonal_stats
from shapely import wkt
from shapely.geometry import box

# Input / output locations used by this analysis.
GRID_WKT     = "processed_data/algeria_grid_z12_wkt.csv"  # tile grid: quadkey plus a "wkt" geometry column
GRID_LONG    = "processed_data/algeria_grid_data_long_z12_mobile.csv"  # long-format speed table keyed by quadkey_z12
WORLDPOP_DIR = "worldpop_tifs/"  # directory scanned for *.tif population rasters
OUT_CSV      = "processed_data/algeria_pop_weighted_trends_2019_2025.csv"  # final per-year result table

YEARS = list(range(2019, 2026))  # analysis window: 2019 through 2025 inclusive
CHUNK_SIZE = 2500  # number of tile geometries passed to each zonal_stats call
POP_TMP_CSV = "processed_data/_tmp_pop_by_tile_year.csv"  # scratch CSV that per-chunk population sums are appended to

# Load the tile grid. Both candidate key columns are forced to str: quadkeys
# are digit strings that may start with "0", and letting pandas parse the
# fallback "quadkey" column as an integer would silently drop those leading
# zeros and break the later join against the (string) speed-table keys.
g_tiles = pd.read_csv(GRID_WKT, dtype={"quadkey_z12": str, "quadkey": str})
if "quadkey_z12" not in g_tiles.columns:
    # Fall back to the generic "quadkey" column when the z12-specific one is absent.
    g_tiles["quadkey_z12"] = g_tiles["quadkey"].astype(str)

# Parse the WKT text into shapely geometries and keep only key + geometry.
g_tiles["geometry"] = g_tiles["wkt"].apply(wkt.loads)
g_tiles = gpd.GeoDataFrame(g_tiles[["quadkey_z12", "geometry"]], geometry="geometry", crs="EPSG:4326")

# Long-format speed table; quadkeys are read as strings.
df_long = pd.read_csv(GRID_LONG, dtype={"quadkey_z12": str}, encoding="utf-8")

columns_present = set(df_long.columns)
if "date" in columns_present and "year" not in columns_present:
    # No explicit year column: take the first four characters of the date.
    df_long["year"] = df_long["date"].astype(str).str.slice(0, 4).astype(int)

# Keep only rows whose year lies inside the analysis window (inclusive).
first_year, last_year = min(YEARS), max(YEARS)
df_long = df_long[df_long["year"].ge(first_year) & df_long["year"].le(last_year)]

# Mean download / upload speed for every (tile, year) pair.
mean_speeds = {
    "download": pd.NamedAgg(column="avg_download_mbps", aggfunc="mean"),
    "upload": pd.NamedAgg(column="avg_upload_mbps", aggfunc="mean"),
}
tile_year = df_long.groupby(["quadkey_z12", "year"], as_index=False).agg(**mean_speeds)

# Every GeoTIFF directly inside the WorldPop directory, in sorted path order.
tif_pattern = os.path.join(WORLDPOP_DIR, "*.tif")
tifs = sorted(glob.glob(tif_pattern))

def infer_year(path):
    """Return the first 20xx year found in the file name of *path*.

    Only the base name is searched, so digits in parent directories are
    ignored. Returns the year as an int, or None when the file name
    contains no four-digit run starting with "20".
    """
    filename = os.path.basename(path)
    found = re.search(r"(20\d{2})", filename)
    if found is None:
        return None
    return int(found.group(1))

# Map each analysis year to its raster. When several files resolve to the same
# year, the one that comes last in sorted order is the one kept.
year_file_pairs = ((infer_year(fp), fp) for fp in tifs)
year_to_tif = {yr: fp for yr, fp in year_file_pairs if yr in YEARS}

# Population sums are appended to the scratch CSV chunk by chunk, so make sure
# its directory exists and that no file from a previous run is left behind.
tmp_dir = os.path.dirname(POP_TMP_CSV)
os.makedirs(tmp_dir, exist_ok=True)
if os.path.exists(POP_TMP_CSV):
    os.remove(POP_TMP_CSV)

import gc

# Reprojected, raster-clipped tile subsets. The subset depends on both the
# raster CRS and the raster extent, so both are part of the cache key; keying
# on the CRS alone would reuse a subset clipped to a different raster's bounds.
tiles_cache = {}

for y in YEARS:
    tif = year_to_tif.get(y)
    if tif is None:
        print(f"⚠ Missing WorldPop raster for {y}")
        continue

    # Cap the GDAL block cache while this year's raster is being read.
    with rasterio.Env(GDAL_CACHEMAX=128):
        with rasterio.open(tif) as src:
            raster_bounds = tuple(src.bounds)
            cache_key = (str(src.crs), raster_bounds)

            if cache_key not in tiles_cache:
                tp = g_tiles.to_crs(src.crs)
                rbox = box(*raster_bounds)
                try:
                    # Cheap bounding-box pre-filter through the spatial index.
                    cand_idx = list(tp.sindex.intersection(rbox.bounds))
                    tp = tp.iloc[cand_idx]
                except Exception as exc:
                    # Best effort only: the exact intersects() test below is
                    # still applied, just against every tile.
                    print(f"⚠ Spatial index pre-filter skipped ({exc!r}); testing all tiles")
                tp = tp[tp.intersects(rbox)].reset_index(drop=True)
                tiles_cache[cache_key] = tp[["quadkey_z12", "geometry"]]

            tiles_proj = tiles_cache[cache_key]

            if tiles_proj.empty:
                print(f"⚠ No overlapping tiles for {y}")
                continue

            # Process the tiles in fixed-size chunks and append each chunk's
            # result to the scratch CSV instead of holding all years in memory.
            n = len(tiles_proj)
            for start in range(0, n, CHUNK_SIZE):
                sub = tiles_proj.iloc[start:start + CHUNK_SIZE]

                zs = zonal_stats(
                    list(sub["geometry"]),
                    tif,
                    stats=["sum"],
                    nodata=src.nodata,
                    all_touched=True
                )
                # A missing sum (reported as None) is stored as zero population.
                pop = np.fromiter(
                    (d["sum"] if d["sum"] is not None else 0.0 for d in zs),
                    dtype=np.float64,
                    count=len(zs),
                )

                out_chunk = pd.DataFrame({
                    "quadkey_z12": sub["quadkey_z12"].astype(str).values,
                    "year": y,
                    "pop": pop
                })

                # Only the very first chunk written to the file carries the header.
                write_header = not os.path.exists(POP_TMP_CSV)
                out_chunk.to_csv(POP_TMP_CSV, mode="a", header=write_header, index=False, encoding="utf-8")
                del out_chunk, pop, zs

    # Release the memory used for this year before the next raster is opened.
    gc.collect()

# The scratch CSV only exists if at least one yearly raster was processed.
# Fail with an actionable message rather than an opaque read error.
if not os.path.exists(POP_TMP_CSV):
    raise FileNotFoundError(
        f"No population statistics were written to {POP_TMP_CSV}; "
        f"check that {WORLDPOP_DIR} contains rasters for {min(YEARS)}-{max(YEARS)} "
        f"that overlap the tile grid"
    )

pop_by_tile_year = pd.read_csv(POP_TMP_CSV, dtype={"quadkey_z12": str, "year": int})

# Normalise the join keys identically on both sides: string quadkeys without
# surrounding whitespace and integer years.
for _frame in (tile_year, pop_by_tile_year):
    _frame["quadkey_z12"] = _frame["quadkey_z12"].astype(str).str.strip()
    _frame["year"] = _frame["year"].astype(int)

# Left join keeps every (tile, year) speed record; records without a
# population match get a weight of zero.
df_m = (tile_year.merge(pop_by_tile_year, on=["quadkey_z12", "year"], how="left")
                 .fillna({"pop": 0.0}))

def weighted_average(group, col, wcol="pop"):
    """Return the weighted mean of ``group[col]`` using ``group[wcol]`` as weights.

    Rows where either the value or the weight is missing are ignored, and
    negative weights are treated as zero. Returns NaN when no usable row
    remains or when the usable weights sum to zero.
    """
    complete_rows = group[[col, wcol]].notna().all(axis=1)
    usable = group[complete_rows]
    if usable.shape[0] == 0:
        return np.nan

    weights = usable[wcol].clip(lower=0)
    weight_sum = weights.sum()
    if weight_sum == 0:
        return np.nan

    weighted_values = usable[col] * weights
    return weighted_values.sum() / weight_sum

# National population-weighted means, one row per year. The groups are
# iterated explicitly instead of going through groupby(...).apply(...):
# whether apply keeps the grouping column in its result depends on the pandas
# version, and the later sort on "year" needs that column to be present.
nat_rows = [
    {
        "year": year,
        "pw_download_mbps": weighted_average(year_rows, "download"),
        "pw_upload_mbps": weighted_average(year_rows, "upload"),
    }
    for year, year_rows in df_m.groupby("year")
]
nat = (
    pd.DataFrame(nat_rows, columns=["year", "pw_download_mbps", "pw_upload_mbps"])
      .sort_values("year")
      .reset_index(drop=True)
)

# os.makedirs("") raises, so only create the directory when the path has one.
out_dir = os.path.dirname(OUT_CSV)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
nat.to_csv(OUT_CSV, index=False, encoding="utf-8")

print(f"Saved: {OUT_CSV}")
print(nat)
