# Group daily MERRA2 netCDF files by month, resampling the hourly time dimension to daily values


In [None]:
import xarray as xr
import pandas as pd
from pathlib import Path
from datetime import datetime


def get_aggregation_method(var_name):
    """Return the name of the daily aggregation to apply to a variable.

    Args:
        var_name: Name of a data variable.

    Returns:
        The string ``"mean"`` for ``BCSMASS``, ``DUSMASS25`` and ``OCSMASS``.

    Raises:
        ValueError: If ``var_name`` is not one of the names listed above.
    """
    mean_variables = ("BCSMASS", "DUSMASS25", "OCSMASS")

    # Guard clause: anything outside the known list is rejected outright.
    if var_name not in mean_variables:
        raise ValueError(f"Unknown variable name: {var_name}")
    return "mean"


def process_merra2_files(file_paths, output_folder):
    """Resample files to daily values and write one file per variable and month.

    Files are grouped by the year and month parsed from their names. Each
    group is concatenated along ``time``; every data variable of the combined
    dataset is then resampled to daily frequency and saved as
    ``MERRA2_<variable>_daily_<YYYYMM>.nc`` inside ``output_folder``.

    Args:
        file_paths: Iterable of ``pathlib.Path`` objects. The second-to-last
            dot-separated token of each file stem must be a ``YYYYMMDD`` date.
        output_folder: ``pathlib.Path`` of the directory that receives the
            output files.

    Raises:
        IndexError: If a file stem has fewer than two dot-separated tokens.
        ValueError: If the date token cannot be parsed, if
            ``get_aggregation_method`` rejects a variable, or if it returns a
            method other than ``sum``, ``mean``, ``max`` or ``min``.
    """
    supported_methods = ("sum", "mean", "max", "min")

    # Group files by month
    monthly_files = {}
    for file_path in file_paths:
        date = datetime.strptime(file_path.stem.split(".")[-2], "%Y%m%d")
        month_key = f"{date.year}{date.month:02d}"
        monthly_files.setdefault(month_key, []).append(file_path)

    # Process each month
    for month_key, files in monthly_files.items():
        print(f"Processing month: {month_key}")
        datasets = []

        # try/finally guarantees the opened files are released even when a
        # variable is rejected or a write fails part-way through the month.
        try:
            for file_path in files:
                ds = xr.open_dataset(file_path)
                # Register the handle before touching it so it is closed if
                # the time conversion below raises.
                datasets.append(ds)
                ds["time"] = pd.to_datetime(ds["time"].values)

            # Combine all datasets for the month
            combined_ds = xr.concat(datasets, dim="time")

            # Process each variable in the combined dataset
            for var_name in combined_ds.data_vars:
                print(f"Processing variable: {var_name}")

                # Determine the aggregation method based on the variable name
                agg_method = get_aggregation_method(var_name)
                if agg_method not in supported_methods:
                    # Fail loudly instead of silently reusing the previous
                    # variable's result under a new file name.
                    raise ValueError(
                        f"Unsupported aggregation method: {agg_method}"
                    )

                # Resample data to daily frequency with the chosen reduction
                resampler = combined_ds[var_name].resample(time="1D")
                ds_daily = getattr(resampler, agg_method)()

                # Generate output file path
                output_file_path = (
                    output_folder / f"MERRA2_{var_name}_daily_{month_key}.nc"
                )

                # Save the daily aggregated data to a new netCDF file
                ds_daily.to_netcdf(output_file_path)

                print(
                    f"Monthly aggregated data for {var_name} saved to {output_file_path}"
                )
        finally:
            # Close the datasets
            for ds in datasets:
                ds.close()


def main():
    """Prompt for input and output folders and run the monthly processing.

    Reads both folder paths from standard input. Returns early with a message
    when the input folder is not a directory or contains no ``*.nc`` files;
    the output folder is created when it is not already a directory.
    """
    source_prompt = "Enter the path to the folder containing MERRA-2 .nc files: "
    target_prompt = "Enter the path to the output folder for monthly files: "

    input_folder = Path(input(source_prompt))
    output_folder = Path(input(target_prompt))

    if not input_folder.is_dir():
        print(f"The input folder {input_folder} does not exist.")
        return

    if not output_folder.is_dir():
        output_folder.mkdir(parents=True, exist_ok=True)

    # Sorted so the files are handed over in a stable, name-based order.
    nc_files = sorted(input_folder.glob("*.nc"))

    if nc_files:
        process_merra2_files(nc_files, output_folder)
        print("All files processed successfully.")
    else:
        print(f"No .nc files found in {input_folder}")


# Start the interactive monthly-grouping workflow when this code is the
# entry point (``__name__`` is ``"__main__"``).
if __name__ == "__main__":
    main()


# Daily aggregation of MERRA2 netCDF files


In [None]:
import xarray as xr
import pandas as pd
from pathlib import Path

# def investigate_netcdf(file_path):
#     ds = xr.open_dataset(file_path)
#     print(f"File: {file_path}")
#     print("Dimensions:", ds.dims)
#     print("Coordinates:", ds.coords)
#     print("Data variables:")
#     for var in ds.data_vars:
#         print(f"  {var}: {ds[var].attrs.get('long_name', 'No long name')} ({ds[var].attrs.get('units', 'No units')})")
#     print("Time range:", ds['time'].values[0], "to", ds['time'].values[-1])
#     print("Time resolution:", ds['time'].diff('time').values[0])
#     print("\n")
#     ds.close()


def get_aggregation_method(var_name):
    """Map a variable name to the aggregation used for daily resampling.

    Args:
        var_name: Name of a data variable.

    Returns:
        ``"mean"`` when ``var_name`` is ``BCSMASS``, ``DUSMASS25`` or
        ``OCSMASS``.

    Raises:
        ValueError: For any other variable name.
    """
    averaged = ("BCSMASS", "DUSMASS25", "OCSMASS")
    recognised = var_name in averaged

    if not recognised:
        raise ValueError(f"Unknown variable name: {var_name}")

    return "mean"


def process_merra2_file(file_path):
    """Resample every variable of one netCDF file to daily values.

    For each data variable the aggregation method is looked up with
    ``get_aggregation_method``, the variable is resampled to daily frequency,
    and the result is written next to the input file as
    ``<stem>_<variable>_daily_aggregated_<method>.nc``.

    Args:
        file_path: ``pathlib.Path`` of the netCDF file to process.

    Raises:
        ValueError: If ``get_aggregation_method`` rejects a variable or
            returns a method other than ``sum``, ``mean``, ``max`` or ``min``.
    """
    supported_methods = ("sum", "mean", "max", "min")

    # The context manager closes the file even when a variable is rejected
    # or a write fails part-way through.
    with xr.open_dataset(file_path) as ds:
        # Ensure the 'time' coordinate is in datetime format
        ds["time"] = pd.to_datetime(ds["time"].values)

        # Process each variable in the file
        for var_name in ds.data_vars:
            print(f"Processing variable: {var_name}")

            # Determine the aggregation method based on the variable name
            agg_method = get_aggregation_method(var_name)
            if agg_method not in supported_methods:
                # Fail loudly instead of silently reusing the previous
                # variable's result under a new file name.
                raise ValueError(
                    f"Unsupported aggregation method: {agg_method}"
                )

            # Resample data to daily frequency with the chosen reduction
            resampler = ds[var_name].resample(time="1D")
            ds_daily = getattr(resampler, agg_method)()

            # Generate output file path
            output_file_path = file_path.with_name(
                f"{file_path.stem}_{var_name}_daily_aggregated_{agg_method}.nc"
            )

            # Save the daily aggregated data to a new netCDF file
            ds_daily.to_netcdf(output_file_path)

            print(
                f"Daily aggregated data for {var_name} saved to {output_file_path} using {agg_method} method"
            )


def main():
    """Prompt for a folder and aggregate every netCDF file in it to daily data.

    Reads the folder path from standard input and calls
    ``process_merra2_file`` on each ``*.nc`` file in name order. Returns early
    with a message when the folder is not a directory or holds no input files.

    Files whose name contains ``_daily_aggregated_`` are skipped: the results
    are written into the same folder with that marker, so without the filter
    a second run would re-aggregate its own output.
    """
    folder_path = input(
        "Enter the path to the folder containing MERRA-2 .nc files: "
    )
    folder = Path(folder_path)

    if not folder.is_dir():
        print(f"The folder {folder_path} does not exist.")
        return

    nc_files = sorted(
        path
        for path in folder.glob("*.nc")
        if "_daily_aggregated_" not in path.name
    )

    if not nc_files:
        print(f"No .nc files found in {folder_path}")
        return

    for file_path in nc_files:
        # print(f"Investigating {file_path}...")
        # investigate_netcdf(file_path)

        print(f"Processing {file_path}...")
        process_merra2_file(file_path)

    print("All files processed successfully.")


# Start the interactive per-file daily aggregation when this code is the
# entry point (``__name__`` is ``"__main__"``).
if __name__ == "__main__":
    main()
