In [None]:
"""
This script merges annual CMIP6 NetCDF files (downloaded from ESGF, one file per year)
into a single continuous file along the time dimension for a given variable and model.
Used here to create merged datasets for 2015–2030 and 2080–2100 periods.
"""
import xarray as xr
from tqdm import tqdm
import argparse
import os

# Command-line interface.
# Options are registered in three groups (required strings, optional labels
# with defaults, required integer years) in the order they appear in --help.
parser = argparse.ArgumentParser(
    description="Merge yearly NetCDF files into one along the time dimension."
)
for flag, help_text in (
    ("--base_dir", "Base directory containing the NetCDF files"),
    ("--variable", "Variable name, e.g., pr, tas"),
    ("--model", "Climate model name"),
):
    parser.add_argument(flag, required=True, help=help_text)
for flag, fallback, help_text in (
    ("--scenario", "ssp585", "Scenario name (default: ssp585)"),
    ("--member", "r1i1p1f1", "Ensemble member (default: r1i1p1f1)"),
    ("--grid", "gn", "Grid label (default: gn)"),
):
    parser.add_argument(flag, default=fallback, help=help_text)
for flag, help_text in (("--start", "Start year"), ("--end", "End year")):
    parser.add_argument(flag, type=int, required=True, help=help_text)
args = parser.parse_args()

# Expected yearly inputs, one per year from --start to --end (both included):
#   <base_dir>/<model>/Daily/<variable>/
#       <variable>_day_<model>_<scenario>_<member>_<grid>_<year>0101-<year>1231.nc
_yearly_dir = os.path.join(args.base_dir, args.model, "Daily", args.variable)
_name_prefix = f"{args.variable}_day_{args.model}_{args.scenario}_{args.member}_{args.grid}"

nc_files = []
for year in range(args.start, args.end + 1):
    nc_files.append(os.path.join(_yearly_dir, f"{_name_prefix}_{year}0101-{year}1231.nc"))

# Load the yearly files, concatenate them along time, and write the merged file.
#
# An empty file list means --start is greater than --end; fail with a clear
# message instead of letting xr.concat complain about an empty sequence.
if not nc_files:
    raise ValueError(
        f"No years to merge: --start ({args.start}) is greater than --end ({args.end})."
    )

# Every dataset that was successfully opened is closed in the `finally`
# clause, so file handles are released even when a later file is missing,
# the concatenation fails, or the write fails.
datasets = []
try:
    # Load all files into a list of xarray Datasets with progress bar
    for file in tqdm(nc_files, desc="Loading files"):
        datasets.append(xr.open_dataset(file))
        tqdm.write(f"Loaded {file}")

    # Concatenate the datasets along the time dimension
    combined_dataset = xr.concat(datasets, dim='time')
    print("All files have been concatenated successfully.")

    # Output file path: <base_dir>/<model>/Daily/<variable>/merged/<name>.nc
    output_dir = os.path.join(args.base_dir, args.model, "Daily", args.variable, "merged")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(
        output_dir,
        f"{args.variable}_day_{args.model}_{args.scenario}_{args.member}_{args.grid}_{args.start}0101-{args.end}1231.nc"
    )

    # Save the combined dataset
    combined_dataset.to_netcdf(output_path)
    print(f"Merged file saved to {output_path}")
finally:
    # Close datasets to free resources (runs on success and on failure)
    for ds in datasets:
        ds.close()
    print("All datasets have been closed.")

In [None]:
"""
Filter CMIP6 NetCDF climate model outputs (tas, tasmax, tasmin, pr) 
to include only grid cells corresponding to populated places from 
a Natural Earth shapefile. The script:

1. Loads the shapefile of populated places and ensures WGS84 CRS.
2. Opens each NetCDF, slices to the given time period (default 2080–2100).
3. Identifies the nearest model grid cell to each city centroid (nearest-neighbor selection, no interpolation).
4. Masks all other grid cells and keeps data only for these urban points.
5. Saves the filtered dataset to a new NetCDF file.

Note:
- Uses nearest-neighbor selection to match shapefile centroids to model grid points.
- Does not perform spatial averaging; results correspond to the single closest grid cell.

Intended for extracting climate projections over urban locations.
"""
import argparse
import os
import numpy as np
import xarray as xr
import geopandas as gpd
from scipy.spatial import cKDTree

# --- time slicing helper ---
def time_slice(dataset, start="2080-01-01", end="2100-12-31"):
    """Return *dataset* selected with ``time=slice(start, end)``.

    The calendar name is read from ``dataset.time.encoding`` (falling back to
    "standard"). For "360_day" calendars every month has 30 days, so a bound
    given as ``YYYY-MM-31`` is moved to ``YYYY-MM-30`` before slicing. With the
    default arguments this yields the end date 2100-12-30.

    Args:
        dataset: Object exposing ``time.encoding`` and ``sel(time=...)``
            (an xarray Dataset in this script).
        start: First date of the period, as an ISO ``YYYY-MM-DD`` string.
        end: Last date of the period, as an ISO ``YYYY-MM-DD`` string.

    Returns:
        Whatever ``dataset.sel(time=slice(start, end))`` returns.
    """

    def _clamp_day_31(date):
        # Only ISO-like strings are adjusted; any other value is passed through
        # untouched so callers can still hand in non-string bounds.
        looks_iso = (
            isinstance(date, str)
            and len(date) >= 10
            and date[4] == "-"
            and date[7] == "-"
        )
        if looks_iso and date[8:10] == "31":
            return f"{date[:8]}30{date[10:]}"
        return date

    cal = dataset.time.encoding.get("calendar", "standard")
    if cal == "360_day":
        # Respect the caller's bounds; only replace a day-of-month of 31,
        # which does not exist in a 360-day calendar.
        start = _clamp_day_31(start)
        end = _clamp_day_31(end)
    return dataset.sel(time=slice(start, end))

def _lonlat_to_unit_xyz(lon_deg, lat_deg):
    """Map lon/lat in degrees to unit-sphere points, one (x, y, z) row per input element."""
    lon_rad = np.deg2rad(np.asarray(lon_deg, dtype=float)).ravel()
    lat_rad = np.deg2rad(np.asarray(lat_deg, dtype=float)).ravel()
    cos_lat = np.cos(lat_rad)
    return np.column_stack(
        [cos_lat * np.cos(lon_rad), cos_lat * np.sin(lon_rad), np.sin(lat_rad)]
    )


def _nearest_cell_mask(lon2d, lat2d, point_lon, point_lat):
    """Return a boolean array shaped like *lon2d*, True at the grid cell nearest to each point."""
    mask = np.zeros(lon2d.shape, dtype=bool)

    grid_xyz = _lonlat_to_unit_xyz(lon2d, lat2d)
    point_xyz = _lonlat_to_unit_xyz(point_lon, point_lat)

    # Ignore grid cells and points whose coordinates are NaN/inf: they can
    # neither be a nearest cell nor be located.
    usable_cells = np.flatnonzero(np.isfinite(grid_xyz).all(axis=1))
    point_xyz = point_xyz[np.isfinite(point_xyz).all(axis=1)]
    if usable_cells.size == 0 or point_xyz.shape[0] == 0:
        return mask

    # Straight-line (chord) distance between unit-sphere points grows with the
    # great-circle distance, so the Euclidean nearest neighbour here is the
    # geographically nearest cell. Because sin/cos are periodic, grids on
    # 0..360 and points on -180..180 need no explicit longitude shift, and
    # cells on either side of the +/-180 degree seam are compared correctly.
    tree = cKDTree(grid_xyz[usable_cells])
    _, nearest = tree.query(point_xyz, k=1)
    mask[np.unravel_index(usable_cells[nearest], mask.shape)] = True
    return mask


def process_nc_file(nc_file_path, shapefile_path, output_folder, start_date, end_date):
    """Mask one NetCDF variable to the grid cells nearest to the shapefile's features and save it.

    Steps:
      1. Read the shapefile, make sure it is in EPSG:4326 and take the centroid
         of every feature.
      2. Open the NetCDF file and restrict it to start_date..end_date via
         ``time_slice``.
      3. Pick the first data variable named tas, tasmax, tasmin or pr.
      4. Flag, for every centroid, the single nearest grid cell
         (nearest neighbour, no interpolation or spatial averaging).
      5. Keep only the flagged cells (``where(..., drop=True)``) and write
         ``<input basename>_filtered_HW_Drought.nc`` into *output_folder*.

    Args:
        nc_file_path: Path of the NetCDF file to filter.
        shapefile_path: Path of the populated-places shapefile.
        output_folder: Directory for the output file; created if missing.
        start_date: Start of the period, passed to ``time_slice``.
        end_date: End of the period, passed to ``time_slice``.

    Returns:
        None. The file is skipped (with a printed message) when the period
        contains no time steps, when no target variable is present, when
        'lat'/'lon' coordinates are missing, or when their dimensionality is
        not 1-D/1-D or 2-D/2-D on the same dimensions.

    Raises:
        ValueError: If the shapefile contains no features.
    """
    # 1) Load shapefile (Natural Earth populated places is WGS84 / lon-lat)
    urban_areas = gpd.read_file(shapefile_path)
    if urban_areas.empty:
        raise ValueError("Shapefile is empty.")
    if urban_areas.crs is None:
        # Geometries without a CRS cannot be reprojected with to_crs(); the
        # coordinates are assumed to already be lon/lat degrees (WGS84), as
        # they are for Natural Earth data.
        urban_areas = urban_areas.set_crs(epsg=4326)
    elif urban_areas.crs.to_epsg() != 4326:
        urban_areas = urban_areas.to_crs(4326)

    # Centroids of all features, skipping missing and empty geometries
    # (x = longitude, y = latitude).
    centroids = urban_areas.geometry.centroid
    centroids = centroids[~(centroids.isna() | centroids.is_empty)]
    city_lon = centroids.x.to_numpy(dtype=float)
    city_lat = centroids.y.to_numpy(dtype=float)

    # 2) Open NetCDF and slice time (use_cftime handles non-standard calendars).
    # The context manager closes the file on every exit path.
    with xr.open_dataset(nc_file_path, use_cftime=True) as source:
        ds = time_slice(source, start=start_date, end=end_date)
        if "time" in ds.sizes and ds.sizes["time"] == 0:
            print(f"Skipping {nc_file_path}: no time steps between {start_date} and {end_date}.")
            return

        # 3) Pick the climate variable
        candidates = [v for v in ds.data_vars if v in {"tas", "tasmax", "tasmin", "pr"}]
        if not candidates:
            print(f"Skipping {nc_file_path}: no target variable found (tas/tasmax/tasmin/pr).")
            return
        variable = candidates[0]
        da = ds[variable]

        # 4) Prepare the grid for the nearest-neighbour search
        if "lon" not in ds.coords or "lat" not in ds.coords:
            print(f"Skipping {nc_file_path}: 'lat'/'lon' coords not found.")
            return

        lon_var = ds["lon"]
        lat_var = ds["lat"]
        lon = lon_var.values
        lat = lat_var.values

        # Handle 1D vs 2D lon/lat. The mask must carry the dimension names the
        # coordinates actually use (e.g. curvilinear grids index lat/lon by
        # other dimension names), not hard-coded "lat"/"lon".
        if lon.ndim == 1 and lat.ndim == 1:
            # 1D coords -> build 2D mesh of shape (n_lat, n_lon)
            lon2d, lat2d = np.meshgrid(lon, lat)
            mask_dims = (lat_var.dims[0], lon_var.dims[0])
        elif lon.ndim == 2 and lat.ndim == 2 and lon_var.dims == lat_var.dims:
            lon2d, lat2d = lon, lat
            mask_dims = lon_var.dims
        else:
            print(f"Skipping {nc_file_path}: unexpected lat/lon dimensions.")
            return

        # 5-6) Flag the nearest grid cell of every centroid
        mask = _nearest_cell_mask(lon2d, lat2d, city_lon, city_lat)

        # 7) Apply mask: keep only those grid cells (all time)
        mask_da = xr.DataArray(
            mask,
            dims=mask_dims,
            coords={dim: ds[dim] for dim in mask_dims if dim in ds.coords},
        )
        filtered = da.where(mask_da, drop=True)

        # 8) Save output (written while the source file is still open)
        os.makedirs(output_folder, exist_ok=True)
        # splitext removes only the final extension; str.replace(".nc", "")
        # would also strip ".nc" occurring elsewhere in the file name.
        base = os.path.splitext(os.path.basename(nc_file_path))[0]
        out_path = os.path.join(output_folder, f"{base}_filtered_HW_Drought.nc")
        filtered.to_netcdf(out_path)
        print(f"Saved: {out_path}")

def main():
    """Parse the command line and run ``process_nc_file`` on every NetCDF path given.

    A failure on one file is printed and does not stop the remaining files.
    """
    cli = argparse.ArgumentParser(
        description="Filter NetCDFs to urban cells from a Natural Earth shapefile, 2080–2100."
    )
    cli.add_argument(
        "--shapefile",
        required=True,
        help="Path to Natural Earth populated places shapefile (e.g., ne_10m_populated_places.shp)",
    )
    cli.add_argument(
        "--output_dir",
        required=True,
        help="Directory to write filtered NetCDFs",
    )
    cli.add_argument(
        "--start",
        default="2080-01-01",
        help="Start date (default: 2080-01-01)",
    )
    cli.add_argument(
        "--end",
        default="2100-12-31",
        help="End date (default: 2100-12-31; 360_day calendars use 2100-12-30)",
    )
    cli.add_argument("nc_files", nargs="+", help="One or more NetCDF file paths")
    opts = cli.parse_args()

    for nc_path in opts.nc_files:
        try:
            process_nc_file(nc_path, opts.shapefile, opts.output_dir, opts.start, opts.end)
        except Exception as err:
            # Best effort: report the problem and move on to the next file.
            print(f"Failed: {nc_path} -> {err}")


if __name__ == "__main__":
    main()

In [None]:
"""
Compute multi-model 75th percentile daily climate values (Step 1 + Step 2 of method).

Method summary:
1. For each GCM file, compute the per-model daily mean (averaged over all years in the target period).
2. For each day-of-year and grid cell, compute the 75th percentile across all models.

Assumptions:
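- Input files are per-model CSV files of daily averages for the selected period (e.g., 2080–2100), each with 'lon' and 'lat' columns.
- Each file contains only one climate variable of interest ('tas', 'tasmax', 'tasmin', 'pr'); if several are present, only the first match in --variables order is used.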
- All models share the same spatial grid.
- If 'day_of_year' is not in the file, a 'time' column must exist to derive it.
- Leap days (day 366) are included if present; adjust preprocessing if not desired.

Example usage:
    python compute_75th_percentile.py \
        --input_dir "./data/Daily/2080-2100/averages" \
        --output_file "./output/75th_percentile_values-daily-2080-2100_HW_drought.csv"
"""

import pandas as pd
import os
import argparse

# ------------------------
# Parse arguments
# ------------------------
parser = argparse.ArgumentParser(
    description="Compute multi-model 75th percentile daily climate values."
)
parser.add_argument(
    "--input_dir",
    required=True,
    help="Directory containing per-model daily averages CSV files",
)
parser.add_argument(
    "--output_file",
    required=True,
    help="Path to save the final merged CSV file",
)
parser.add_argument(
    "--variables",
    nargs="+",
    default=['tas', 'tasmax', 'tasmin', 'pr'],
    help="List of climate variables to process",
)
args = parser.parse_args()

# Short aliases used by the rest of the script
directory_path, variables = args.input_dir, args.variables

# Messages for files that could not be processed
error_log = []
# One list of per-model daily-mean DataFrames for each requested variable
per_model_means = {name: [] for name in variables}

def detect_var(df):
    """Return the first name in ``variables`` that is a column of *df*, or None if there is none."""
    return next((name for name in variables if name in df.columns), None)

# ------------------------
# Step 1: Per-model daily mean
# ------------------------
# os.listdir() returns entries in arbitrary order; sort so files are processed
# and logged in a reproducible order.
file_paths = sorted(
    os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith('.csv')
)

for i, file_path in enumerate(file_paths):
    print(f"Processing file {i+1}/{len(file_paths)}: {file_path}")
    try:
        df = pd.read_csv(file_path)

        # Ensure day_of_year exists; derive it from 'time' when it is absent
        if 'day_of_year' not in df.columns:
            if 'time' not in df.columns:
                raise ValueError("No 'day_of_year' or 'time' column found.")

            parsed_time = pd.to_datetime(df['time'], errors='coerce')
            dropped = int(parsed_time.isna().sum())
            if dropped:
                # Rows with a missing or unparseable 'time' cannot be assigned
                # a day of year and are excluded below; record how many so the
                # data loss is visible instead of silent.
                error_log.append(
                    f"Warning for {file_path}: dropped {dropped} row(s) "
                    f"with missing or unparseable 'time' values."
                )
                print(error_log[-1])

            df = df.assign(time=parsed_time).dropna(subset=['time'])
            df = df.assign(day_of_year=df['time'].dt.dayofyear)

        # Identify variable
        var = detect_var(df)
        if var is None:
            raise ValueError("No valid variable column found (tas/tasmax/tasmin/pr).")

        # Compute per-model daily mean over years:
        # one value per (lon, lat, day_of_year)
        daily_mean = (
            df.groupby(['lon', 'lat', 'day_of_year'])[var]
              .mean()
              .reset_index()
        )
        per_model_means[var].append(daily_mean)

    except Exception as e:
        # Best effort: a file that cannot be processed is logged and skipped
        # so the remaining models are still used.
        error_log.append(f"Error processing {file_path}: {e}")
        print(error_log[-1])

# ------------------------
# Step 2: 75th percentile across models
# ------------------------
# For every variable that has at least one per-model frame: stack the frames
# and take the 0.75 quantile of the values sharing (lon, lat, day_of_year).
final_results = {
    var: (
        pd.concat(parts, ignore_index=True)
        .groupby(['lon', 'lat', 'day_of_year'])[var]
        .quantile(0.75)
        .reset_index()
    )
    for var, parts in per_model_means.items()
    if parts
}

if len(final_results) == 0:
    raise RuntimeError("No results computed. Check input files.")

# ------------------------
# Merge all variables
# ------------------------
print("Merging results...")

# Start from the first variable that produced results ...
first_var = next(iter(final_results))
merged_df = final_results[first_var].rename(columns={first_var: f'75th_{first_var}'})

# ... then outer-join every other available variable, in --variables order,
# on the shared (lon, lat, day_of_year) key.
remaining = [name for name in variables if name != first_var and name in final_results]
for var in remaining:
    renamed = final_results[var].rename(columns={var: f'75th_{var}'})
    merged_df = merged_df.merge(renamed, on=['lon', 'lat', 'day_of_year'], how='outer')

# ------------------------
# Save results
# ------------------------
# A bare file name such as "result.csv" has an empty dirname, and
# os.makedirs("") raises FileNotFoundError; only create the directory when
# the output path actually names one.
out_dir = os.path.dirname(args.output_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
merged_df.to_csv(args.output_file, index=False)
print(f"Saved final 75th percentile file to: {args.output_file}")

# Save error log if needed (one message per line, next to the output file)
if error_log:
    error_log_file = os.path.join(out_dir, "error_log.csv")
    # Explicit encoding: messages contain file paths, which may be non-ASCII.
    with open(error_log_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{line}\n" for line in error_log)
    print(f"Error log saved to {error_log_file}")