In [1]:
import os
import re
import glob
import xarray as xr
import rioxarray
import geopandas as gpd
import pandas as pd




In [None]:

# --- [ACTION REQUIRED] Point these two paths at your own data ---
# Directory that is scanned for Daymet NetCDF (*.nc) files
nc_dir = 'C:/Users/arthu/OneDrive/Desktop/data'
# Shapefile whose geometry is used to clip the Daymet grids
shapefile_path = 'C:/Users/arthu/OneDrive/Desktop/data/Shapefile/RussianSubbasinsSimple.shp'

# File names are expected to look like "11549_2020_prcp.nc" or
# "11549_2021_srad.nc"; the three capture groups are the leading digits,
# a 4-digit year and the variable name.
filename_pattern = re.compile(r'(\d+)_(\d{4})_(\w+)\.nc')

# Accumulator filled by the processing loop:
# variable name -> list of per-file DataFrames
var_dataframes = dict()

# PROJ string describing the Daymet Lambert Conformal Conic (LCC) CRS
daymet_crs = (
    "+proj=lcc +lat_1=25 +lat_2=60 +lat_0=42.5 +lon_0=-100 "
    "+x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs"
)

# --- Read the shapefile and reproject it into the Daymet CRS in one step ---
basin = gpd.read_file(shapefile_path).to_crs(daymet_crs)


In [3]:

# --- Loop through all .nc files in the directory ---
# Start from an empty collection each time this cell runs. Without the reset,
# re-running the cell appends every file a second time to whatever an earlier
# run left behind, so each time step would later appear more than once.
var_dataframes = {}

# sorted() gives a deterministic processing order (and log); glob.glob()
# returns paths in arbitrary order.
for nc_file in sorted(glob.glob(os.path.join(nc_dir, '*.nc'))):
    filename = os.path.basename(nc_file)
    # fullmatch() requires the entire name to fit the pattern. match() only
    # anchors the start, so it would also accept a name such as
    # "11549_2020_prcp.nc_old.nc" and file it under 'prcp'.
    match = filename_pattern.fullmatch(filename)
    if not match:
        print(f"Skipping file with unexpected name format: {filename}")
        continue

    # tile and year are only used for the log line below
    tile, year, var_name = match.groups()
    print(f"\n=== Processing file: {filename} ===")
    print(f"Tile: {tile}, Year: {year}, Variable: {var_name}")

    # 1) Test if the file can be opened
    try:
        opened_ds = xr.open_dataset(nc_file)
    except Exception as e:
        print(f"Error opening {filename}: {e}")
        continue

    # The with-block closes the opened dataset on every exit path: normal
    # completion, each `continue` below, and any unexpected exception.
    # NOTE(review): close() is deliberately tied to the object returned by
    # open_dataset(); the copy produced by write_crs(inplace=False) may not
    # carry the file's close hook - confirm against the xarray version in use.
    with opened_ds:
        # 2) Stamp the Daymet CRS on a copy of the dataset. This is done
        #    unconditionally, so a CRS already recorded in the file is replaced.
        ds = opened_ds.rio.write_crs(daymet_crs, inplace=False)

        # 3) Extract the variable named in the file name, or fall back to the
        #    first data variable when that name is absent.
        if var_name in ds.data_vars:
            da = ds[var_name]
        else:
            if len(ds.data_vars) == 0:
                print(f"No data variables found in {filename}, skipping.")
                continue
            da = list(ds.data_vars.values())[0]
            print(f"Warning: Expected variable '{var_name}' not found in {filename}. Using '{da.name}' instead.")

        # Print dimension names for debugging
        print("Original dims:", da.dims)
        print("Original coords:", list(da.coords))

        # 4) Declare 'x' and 'y' as the spatial dimensions for rioxarray.
        #    If a file uses other dimension names, this step fails and the
        #    file is skipped; rename the dims here in that case.
        try:
            da = da.rio.set_spatial_dims(x_dim='x', y_dim='y', inplace=False)
        except Exception as e:
            print(f"Failed to set spatial dims for {filename}: {e}")
            continue

        # 5) Clip the DataArray to the basin polygon (already in the Daymet CRS)
        try:
            da_clipped = da.rio.clip(basin.geometry, basin.crs, drop=True)
        except Exception as e:
            print(f"Clipping failed for {filename}: {e}")
            continue

        # Flatten to one row per (time, cell); must happen while the file is
        # still open because the data may be loaded lazily.
        df = da_clipped.to_dataframe().reset_index()

        # Ensure the 'time' column is datetime
        if 'time' in df.columns and not pd.api.types.is_datetime64_any_dtype(df['time']):
            df['time'] = pd.to_datetime(df['time'])

        # 6) Any column besides 'time' and the variable (e.g. spatial
        #    coordinates) means several rows per time step remain, so collapse
        #    them to an unweighted mean per time step. pandas' mean() ignores
        #    NaN values by default.
        expected_cols = {'time', da_clipped.name}
        extra_cols = set(df.columns) - expected_cols
        if extra_cols:
            df = df.groupby('time')[da_clipped.name].mean().reset_index()

        # Keep only time + value, labelled with the variable name taken from
        # the file name (also when the fallback variable was used).
        df = df[['time', da_clipped.name]].rename(columns={da_clipped.name: var_name})

    # Store the DataFrame by variable (skipped files never reach this line)
    var_dataframes.setdefault(var_name, []).append(df)



=== Processing file: 11549_2020_prcp.nc ===
Tile: 11549, Year: 2020, Variable: prcp
Original dims: ('time', 'y', 'x')
Original coords: ['lambert_conformal_conic', 'x', 'y', 'time', 'lat', 'lon']

=== Processing file: 11549_2021_prcp.nc ===
Tile: 11549, Year: 2021, Variable: prcp
Original dims: ('time', 'y', 'x')
Original coords: ['lambert_conformal_conic', 'x', 'y', 'time', 'lat', 'lon']

=== Processing file: 11549_2021_srad.nc ===
Tile: 11549, Year: 2021, Variable: srad
Original dims: ('time', 'y', 'x')
Original coords: ['lambert_conformal_conic', 'x', 'y', 'time', 'lat', 'lon']

=== Processing file: 11549_2021_tmax.nc ===
Tile: 11549, Year: 2021, Variable: tmax
Original dims: ('time', 'y', 'x')
Original coords: ['lambert_conformal_conic', 'x', 'y', 'time', 'lat', 'lon']

=== Processing file: 11549_2021_tmin.nc ===
Tile: 11549, Year: 2021, Variable: tmin
Original dims: ('time', 'y', 'x')
Original coords: ['lambert_conformal_conic', 'x', 'y', 'time', 'lat', 'lon']

=== Processing file

In [4]:

# --- Concatenate & merge DataFrames for each variable ---
# Fail early with a clear message instead of an AttributeError on None further
# down when the processing loop produced nothing.
if not var_dataframes:
    raise RuntimeError("No Daymet files were processed; there is nothing to merge.")

# Build the per-variable time series in a NEW dict. Overwriting the lists in
# var_dataframes with DataFrames would make this cell fail on a second run,
# because pd.concat() would then be handed a DataFrame instead of a list.
var_series = {}
for var, frames in var_dataframes.items():
    series_df = pd.concat(frames).sort_values('time').reset_index(drop=True)
    # Repeated timestamps within one variable multiply rows in the outer
    # merge below, so make that situation visible.
    if series_df['time'].duplicated().any():
        print(f"Warning: variable '{var}' has repeated timestamps; merged rows will be duplicated.")
    var_series[var] = series_df

# Outer-join all variables on 'time' so that a date missing from one variable
# is kept (with NaN in that column) rather than dropped.
merged_df = None
for var, df in var_series.items():
    if merged_df is None:
        merged_df = df
    else:
        merged_df = pd.merge(merged_df, df, on='time', how='outer')

merged_df = merged_df.sort_values('time').reset_index(drop=True)

# --- Save final merged DataFrame ---
# Written relative to the current working directory.
merged_df.to_csv('merged_daymet_data.csv', index=False)
print("Final merged Daymet DataFrame shape:", merged_df.shape)


Final merged Daymet DataFrame shape: (730, 6)
