# Addressing Forecast Hour Duplication Issue

The NAM and uWRF datasets are both outputs from operational weather models, and the data we are using contains forecasts generated from different model initialization times. For our project, we are focusing on the 00 UTC initialization time. We have daily data for the entire year of 2019, with the model being initialized once per day. Each initialization produces forecasts for the subsequent 84 hours at 3-hour intervals, which means we have forecasts for up to 3.5 days ahead.

While this data is valuable, it can create confusion for our deep learning model. Forecasts for the same hour from different initialization times are treated as duplicates, even though they represent different predictions. This issue arises because xarray assumes these samples represent the exact same time, when in fact they should be interpreted as separate forecasts. This notebook explores strategies to resolve this duplication problem.

In [16]:
import xarray as xr
import numpy as np
import ecubevis as ecv
import scipy as sp
import netCDF4 as nc
import dl4ds as dds

In [3]:
# Files containing all preprocessed data needed for the model.
# High-resolution (uWRF) train / validation / test splits:
uwrf_train = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/pred-NYC-split/train/uWRF_final_01-02.nc')
uwrf_val = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/pred-NYC-split/val/uWRF_final_03.nc')
uwrf_test = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/pred-NYC-split/test/uWRF_final_03.nc')

# Low-resolution (NAM) train / validation / test splits.
# The validation and test paths used to be swapped (nam_val opened the
# "test" file and nam_test the "val" file); each name now opens the file
# of the same split.
nam_train = xr.open_dataset('/home/gvaillant1/aligned-NYC-data/aligned_nam_train_data.nc')
nam_val = xr.open_dataset('/home/gvaillant1/aligned-NYC-data/aligned_nam_val_data.nc')
nam_test = xr.open_dataset('/home/gvaillant1/aligned-NYC-data/aligned_nam_test_data.nc')

In [4]:
# High-resolution (uWRF) variables, one DataArray per split.
T2_hr_train, T2_hr_val, T2_hr_test = (
    split["T2"] for split in (uwrf_train, uwrf_val, uwrf_test)
)
PRES_hr_train, PRES_hr_val, PRES_hr_test = (
    split["PSFC"] for split in (uwrf_train, uwrf_val, uwrf_test)
)

# --------------------------
# Low-resolution (NAM) variables, one DataArray per split.
T2_lr_train, T2_lr_val, T2_lr_test = (
    split["T2"] for split in (nam_train, nam_val, nam_test)
)
PRES_lr_train, PRES_lr_val, PRES_lr_test = (
    split["PSFC"] for split in (nam_train, nam_val, nam_test)
)

# Checking duplicates

In [5]:
# uWRF: list the timestamps that occur more than once on the time axis.
times = T2_hr_train['time'].to_index()
# Index.duplicated() flags every occurrence after the first, so a timestamp
# that appears k times contributes k-1 entries here.
duplicate_times = times[times.duplicated()]
print(duplicate_times)
# 1063 duplicated entries (see the printed length below)

DatetimeIndex(['2019-01-02 00:00:00', '2019-01-02 03:00:00',
               '2019-01-02 06:00:00', '2019-01-02 09:00:00',
               '2019-01-02 12:00:00', '2019-01-02 15:00:00',
               '2019-01-02 18:00:00', '2019-01-02 21:00:00',
               '2019-01-03 00:00:00', '2019-01-03 03:00:00',
               ...
               '2019-03-01 09:00:00', '2019-03-01 12:00:00',
               '2019-03-01 15:00:00', '2019-03-01 18:00:00',
               '2019-03-01 21:00:00', '2019-03-02 00:00:00',
               '2019-03-02 03:00:00', '2019-03-02 06:00:00',
               '2019-03-02 09:00:00', '2019-03-02 12:00:00'],
              dtype='datetime64[ns]', name='time', length=1063, freq=None)


In [6]:
# NAM: list the timestamps that occur more than once on the time axis.
times = T2_lr_train['time'].to_index()
# Index.duplicated() flags every occurrence after the first, so a timestamp
# that appears k times contributes k-1 entries here.
duplicate_times = times[times.duplicated()]
print(duplicate_times)
# 1092 duplicated entries (see the printed length below)

DatetimeIndex(['2019-01-02 00:00:00', '2019-01-02 03:00:00',
               '2019-01-02 06:00:00', '2019-01-02 09:00:00',
               '2019-01-02 12:00:00', '2019-01-02 15:00:00',
               '2019-01-02 18:00:00', '2019-01-02 21:00:00',
               '2019-01-03 00:00:00', '2019-01-03 03:00:00',
               ...
               '2019-02-24 06:00:00', '2019-02-24 09:00:00',
               '2019-02-24 12:00:00', '2019-02-23 00:00:00',
               '2019-02-23 03:00:00', '2019-02-23 06:00:00',
               '2019-02-23 09:00:00', '2019-02-23 12:00:00',
               '2019-02-23 15:00:00', '2019-02-23 18:00:00'],
              dtype='datetime64[ns]', name='time', length=1092, freq=None)


Here we just want to see what the model output shape looks like when we use only 1 day of data from both datasets.

# Fixing NAM:

In [7]:
import os
import xarray as xr
from collections import defaultdict

def NAM_combine_seq(input_dirs, output_dir):
    """Combine the first eight NAM files of each day from Jan 10 to Jan 18, 2019.

    Every ``.nc`` file in ``input_dirs`` whose name carries a date inside the
    range is grouped by that date; for each date the first eight files (in
    sorted path order) are kept, and all kept files are concatenated along
    ``time`` and written to a single NetCDF file in ``output_dir``.

    Args:
        input_dirs: Directories holding files named like
            ``domnys-nam_218_YYYYMMDD_HHMM_XXX.nc``.
        output_dir: Directory for the combined file; created if missing.

    Returns:
        None. If no file matches, a message is printed and nothing is written.
    """
    # YYYYMMDD strings, so plain string comparison orders them chronologically.
    start_date = "20190110"
    end_date = "20190118"

    # Files grouped by the date found in their name, in sorted path order.
    daily_files = defaultdict(list)

    for input_dir in input_dirs:
        nc_files = sorted(
            os.path.join(input_dir, name)
            for name in os.listdir(input_dir)
            if name.endswith('.nc')
        )

        for path in nc_files:
            # Parse the file name only: an underscore anywhere in the directory
            # part of the path would otherwise shift the date field.
            file_name = os.path.basename(path)
            parts = file_name.split('_')
            if len(parts) < 3:
                print(f"Skipping invalid file name format: {file_name}")
                continue

            date = parts[2]
            if start_date <= date <= end_date:
                daily_files[date].append(path)

    # Keep the first 8 files per date (3-hourly steps 00, 03, ..., 21).
    selected_files = []
    for files in daily_files.values():
        selected_files.extend(files[:8])

    if not selected_files:
        # xr.concat cannot concatenate an empty list; report instead of crashing.
        print("No datasets to concatenate. Please check your input directory or file naming format.")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Output name is built from the last path component of each input directory.
    month_range = "-".join([os.path.basename(dir) for dir in input_dirs])
    output_file_name = f'NAM_final_{month_range}_Jan10_to_Jan18.nc'
    output_file_path = os.path.join(output_dir, output_file_name)

    datasets = []
    try:
        for path in selected_files:
            datasets.append(xr.open_dataset(path))

        combined_dataset = xr.concat(datasets, dim='time')
        combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': 'hours since 2019-1-1'}})
    finally:
        # Release the input file handles whether or not the write succeeded.
        for dataset in datasets:
            dataset.close()

    print(f"Saved combined dataset to: {output_file_path}")

def main():
    """Run NAM_combine_seq on the NAM '01' directory, writing into the train folder."""
    output_dir = "/D4/data/gvaillant/NAM/2019/tendays-combined/train"
    print(f"Output directory: {output_dir}")

    source_dirs = ["/D4/data/gvaillant/NAM/2019/match-NYC-cut/01"]
    NAM_combine_seq(source_dirs, output_dir)

    print("Done combining the NAM files for Jan 10 to Jan 18!")

#main()

In [8]:
#now do the same for the val and test:

import os
import xarray as xr
from collections import defaultdict

def NAM_create_validation_set(input_dirs, output_dir, target_date="20190120"):
    """Combine the first eight NAM files of a single day into one NetCDF file.

    Every ``.nc`` file in ``input_dirs`` whose name carries ``target_date`` is
    collected in sorted path order; the first eight are concatenated along
    ``time`` and written to ``output_dir``.

    Args:
        input_dirs: Directories holding files named like
            ``domnys-nam_218_YYYYMMDD_HHMM_XXX.nc``.
        output_dir: Directory for the combined file; created if missing.
        target_date: Day to extract, as a ``YYYYMMDD`` string. Defaults to
            ``"20190120"``, the value that used to be hard-coded.

    Returns:
        None. If no file matches, a message is printed and nothing is written.

    NOTE(review): the output file name and the time-encoding reference date
    say "Jan19" regardless of ``target_date``. They are left unchanged because
    later cells open ``NAM_validation_01_Jan19.nc`` by that name; rename both
    sides together if this is cleaned up.
    """
    matching_files = []

    for input_dir in input_dirs:
        nc_files = sorted(
            os.path.join(input_dir, name)
            for name in os.listdir(input_dir)
            if name.endswith('.nc')
        )

        for path in nc_files:
            # Parse the file name only: an underscore anywhere in the directory
            # part of the path would otherwise shift the date field.
            file_name = os.path.basename(path)
            parts = file_name.split('_')
            if len(parts) < 3:
                print(f"Skipping invalid file name format: {file_name}")
                continue

            if parts[2] == target_date:
                matching_files.append(path)

    # Keep the first 8 files for the day (3-hourly steps 00, 03, ..., 21).
    selected_files = matching_files[:8]

    if not selected_files:
        # xr.concat cannot concatenate an empty list; report instead of crashing.
        print("No datasets to concatenate. Please check your input directory or file naming format.")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Output name is built from the last path component of each input directory.
    month_range = "-".join([os.path.basename(dir) for dir in input_dirs])
    output_file_name = f'NAM_validation_{month_range}_Jan19.nc'
    output_file_path = os.path.join(output_dir, output_file_name)

    datasets = []
    try:
        for path in selected_files:
            datasets.append(xr.open_dataset(path))

        combined_dataset = xr.concat(datasets, dim='time')
        combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': 'hours since 2019-1-19'}})
    finally:
        # Release the input file handles whether or not the write succeeded.
        for dataset in datasets:
            dataset.close()

    print(f"Saved validation dataset to: {output_file_path}")

def main():
    """Run NAM_create_validation_set on the NAM '01' directory, writing into the test folder."""
    output_dir = "/D4/data/gvaillant/NAM/2019/tendays-combined/test"
    print(f"Output directory: {output_dir}")

    source_dirs = ["/D4/data/gvaillant/NAM/2019/match-NYC-cut/01"]
    NAM_create_validation_set(source_dirs, output_dir)

    # NOTE(review): the message below says "validation set for January 19"
    # while the output directory above is the test folder - confirm which
    # split/date this run is meant to produce.
    print("Done creating the validation set for January 19, 2019!")

#main()

# Fixing uWRF:

In [19]:
import os
import glob
import xarray as xr

def uWRF_combine_seq(input_dirs, output_dir):
    """Concatenate every uWRF d02 file dated Jan 10-18, 2019 at a 3-hourly time.

    Each input directory is expected to contain day sub-directories holding
    ``wrfout_d02_<date>_<time>`` files. Every file whose date and time match
    the targets is opened and appended, so a date/time present in several day
    directories is included once per directory (no de-duplication here).
    The result is written to ``output_dir`` (created if missing).
    """
    wanted_dates = [f'2019-01-{str(day).zfill(2)}' for day in range(10, 19)]
    wanted_hours = ['00:00:00', '03:00:00', '06:00:00', '09:00:00',
                    '12:00:00', '15:00:00', '18:00:00', '21:00:00']

    os.makedirs(output_dir, exist_ok=True)

    opened = []

    for input_dir in input_dirs:
        print(f"Processing input directory: {input_dir}")

        # os.walk yields nothing for an unreadable/missing directory.
        try:
            _, subdirs, _ = next(os.walk(input_dir))
        except StopIteration:
            print(f"Error accessing directory: {input_dir}")
            continue
        day_dirs = sorted(subdirs)

        print(f"Found day directories: {day_dirs}")

        for day_dir in day_dirs:
            day_path = os.path.join(input_dir, day_dir)
            print(f"Processing day directory: {day_path}")

            candidates = sorted(glob.glob(os.path.join(day_path, 'wrfout_d02_*')))
            print(f"Found {len(candidates)} files in {day_path}")

            for path in candidates:
                base = os.path.basename(path)
                fields = base.split('_')
                # Need at least wrfout, d02, <date>, <time>.
                if len(fields) < 4:
                    print(f"Skipping invalid file name format: {base}")
                    continue
                stamp_date, stamp_time = fields[2], fields[3]

                print(f"File: {base}, Date: {stamp_date}, Time: {stamp_time}")

                if stamp_date not in wanted_dates or stamp_time not in wanted_hours:
                    continue

                print(f"Selected file: {path}")
                try:
                    opened.append(xr.open_dataset(path))
                except Exception as e:
                    print(f"Error opening file {path}: {e}")

    if not opened:
        print("No datasets to concatenate. Please check your input directory or file naming format.")
        return

    try:
        merged = xr.concat(opened, dim='time')
        target_path = os.path.join(output_dir, 'uWRF_final_2019-01-10_to_2019-01-18.nc')
        merged.to_netcdf(target_path, encoding={'time': {'units': 'hours since 2019-01-10'}})
        print(f'Combined dataset saved to {target_path}')
    except Exception as e:
        print(f"Error during concatenation or saving: {e}")

def main():
    """Run uWRF_combine_seq on the pred-stage3 '01' directory, writing into the train folder."""
    output_dir = '/D4/data/gvaillant/prep-uwrf/d02/tendays-combined/train'
    print(f"Output directory: {output_dir}")

    source_dirs = ["/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01"]
    uWRF_combine_seq(source_dirs, output_dir)

    print("Done combining uWRF files!")

# Uncomment below to run:
#main()

In [1]:
#new:
import os
import glob
import xarray as xr

def uWRF_combine_seq(input_dirs, output_dir):
    """Concatenate one uWRF d02 file per 3-hourly time from Jan 10-18, 2019.

    Each input directory is expected to contain day sub-directories holding
    ``wrfout_d02_<date>_<time>`` files. For every target (date, time) pair the
    first file that opens successfully is used; later files with the same
    pair are skipped, which keeps the combined time axis free of duplicates.
    The result is written to ``output_dir`` (created if missing).

    Args:
        input_dirs: Directories whose immediate sub-directories hold the files.
        output_dir: Directory for the combined NetCDF file.

    Returns:
        None. Problems are reported with printed messages rather than raised.

    NOTE(review): day directories are visited in sorted order, so when a
    date/time exists in several of them the earliest-sorted directory wins.
    If each day directory holds one model run (the 29-file listings suggest
    so - TODO confirm), that is the longest-lead forecast for that time, not
    the first 24 hours of a run as in the NAM selection.
    """
    combined_datasets = []

    target_dates = [f'2019-01-{str(day).zfill(2)}' for day in range(10, 19)]
    target_hours = ['00:00:00', '03:00:00', '06:00:00', '09:00:00',
                    '12:00:00', '15:00:00', '18:00:00', '21:00:00']

    os.makedirs(output_dir, exist_ok=True)

    # (date, time) pairs for which a dataset has already been opened.
    processed = set()

    for input_dir in input_dirs:
        print(f"Processing input directory: {input_dir}")

        # os.walk yields nothing for an unreadable/missing directory.
        try:
            day_dirs = sorted(next(os.walk(input_dir))[1])
        except StopIteration:
            print(f"Error accessing directory: {input_dir}")
            continue

        print(f"Found day directories: {day_dirs}")

        for day_dir in day_dirs:
            day_path = os.path.join(input_dir, day_dir)
            print(f"Processing day directory: {day_path}")

            input_files = sorted(glob.glob(os.path.join(day_path, 'wrfout_d02_*')))
            print(f"Found {len(input_files)} files in {day_path}")

            for file in input_files:
                file_name = os.path.basename(file)
                try:
                    file_date = file_name.split('_')[2]
                    file_time = file_name.split('_')[3]
                except IndexError:
                    print(f"Skipping invalid file name format: {file_name}")
                    continue

                print(f"File: {file_name}, Date: {file_date}, Time: {file_time}")

                # Process only the first occurrence of each date and time
                if (file_date, file_time) in processed:
                    print(f"Skipping already processed file: {file_name}")
                    continue

                if file_date not in target_dates or file_time not in target_hours:
                    continue

                print(f"Selected file: {file}")
                try:
                    dataset = xr.open_dataset(file)
                except Exception as e:
                    print(f"Error opening file {file}: {e}")
                    continue

                combined_datasets.append(dataset)
                # Mark as processed only after a successful open, so that an
                # unreadable file does not block a later copy of the same
                # date/time from being used.
                processed.add((file_date, file_time))

    if not combined_datasets:
        print("No datasets to concatenate. Please check your input directory or file naming format.")
        return

    try:
        combined_dataset = xr.concat(combined_datasets, dim='time')
        output_file_name = 'uWRF_final_2019-01-10_to_2019-01-18.nc'
        output_file_path = os.path.join(output_dir, output_file_name)
        combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': 'hours since 2019-01-10'}})
        print(f'Combined dataset saved to {output_file_path}')
    except Exception as e:
        print(f"Error during concatenation or saving: {e}")
    finally:
        # Release the input file handles whether or not the write succeeded.
        for dataset in combined_datasets:
            dataset.close()

def main():
    """Run uWRF_combine_seq on the pred-stage3 '01' directory, writing into the train folder."""
    output_dir = '/D4/data/gvaillant/prep-uwrf/d02/tendays-combined/train'
    print(f"Output directory: {output_dir}")

    source_dirs = ["/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01"]
    uWRF_combine_seq(source_dirs, output_dir)

    print("Done combining uWRF files!")

# Runs immediately; comment out the call to define the functions without running.
main()


Output directory: /D4/data/gvaillant/prep-uwrf/d02/tendays-combined/train
Processing input directory: /D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01
Found day directories: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']
Processing day directory: /D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/01
Found 29 files in /D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/01
File: wrfout_d02_2019-01-01_00:00:00, Date: 2019-01-01, Time: 00:00:00
File: wrfout_d02_2019-01-01_03:00:00, Date: 2019-01-01, Time: 03:00:00
File: wrfout_d02_2019-01-01_06:00:00, Date: 2019-01-01, Time: 06:00:00
File: wrfout_d02_2019-01-01_09:00:00, Date: 2019-01-01, Time: 09:00:00
File: wrfout_d02_2019-01-01_12:00:00, Date: 2019-01-01, Time: 12:00:00
File: wrfout_d02_2019-01-01_15:00:00, Date: 2019-01-01, Time: 15:00:00
File: wrfout_d02_2019-01-01_18:00:00, Date: 2019-01-01, Time: 18:00:

In [5]:
# Re-open the combined uWRF training file written by uWRF_combine_seq.
ds = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/tendays-combined/train/uWRF_final_2019-01-10_to_2019-01-18.nc')

# Bare expression: the notebook displays the dataset summary.
ds
# The combined file now looks correct.

In [4]:
# Print every value on the time axis to check for repeated timestamps.
for i in ds['time'].values:
    print(i)

# Each timestamp now appears once, in 3-hour steps (see the output below).

2019-01-10T00:00:00.000000000
2019-01-10T03:00:00.000000000
2019-01-10T06:00:00.000000000
2019-01-10T09:00:00.000000000
2019-01-10T12:00:00.000000000
2019-01-10T15:00:00.000000000
2019-01-10T18:00:00.000000000
2019-01-10T21:00:00.000000000
2019-01-11T00:00:00.000000000
2019-01-11T03:00:00.000000000
2019-01-11T06:00:00.000000000
2019-01-11T09:00:00.000000000
2019-01-11T12:00:00.000000000
2019-01-11T15:00:00.000000000
2019-01-11T18:00:00.000000000
2019-01-11T21:00:00.000000000
2019-01-12T00:00:00.000000000
2019-01-12T03:00:00.000000000
2019-01-12T06:00:00.000000000
2019-01-12T09:00:00.000000000
2019-01-12T12:00:00.000000000
2019-01-12T15:00:00.000000000
2019-01-12T18:00:00.000000000
2019-01-12T21:00:00.000000000
2019-01-13T00:00:00.000000000
2019-01-13T03:00:00.000000000
2019-01-13T06:00:00.000000000
2019-01-13T09:00:00.000000000
2019-01-13T12:00:00.000000000
2019-01-13T15:00:00.000000000
2019-01-13T18:00:00.000000000
2019-01-13T21:00:00.000000000
2019-01-14T00:00:00.000000000
2019-01-14

In [10]:
import os
import glob
import xarray as xr

def uWRF_val(input_dirs, output_dir, target_date='2019-01-20'):
    """Concatenate one uWRF d02 file per 3-hourly time of a single day.

    Each input directory is expected to contain day sub-directories holding
    ``wrfout_d02_<date>_<time>`` files. For every target time on
    ``target_date`` the first file that opens successfully is used; later
    files with the same time are skipped. The result is written to
    ``output_dir`` (created if missing) as ``uWRF_final_<target_date>.nc``.

    Args:
        input_dirs: Directories whose immediate sub-directories hold the files.
        output_dir: Directory for the combined NetCDF file.
        target_date: Day to extract, as ``YYYY-MM-DD``. Defaults to
            ``'2019-01-20'``, the value that used to be hard-coded.

    Returns:
        None. Problems are reported with printed messages rather than raised.
    """
    combined_datasets = []

    target_hours = ['00:00:00', '03:00:00', '06:00:00', '09:00:00',
                    '12:00:00', '15:00:00', '18:00:00', '21:00:00']

    os.makedirs(output_dir, exist_ok=True)

    # Times for which a dataset has already been opened.
    processed_times = set()

    for input_dir in input_dirs:
        print(f"Processing input directory: {input_dir}")

        # os.walk yields nothing for an unreadable/missing directory.
        try:
            day_dirs = sorted(next(os.walk(input_dir))[1])
        except StopIteration:
            print(f"Error accessing directory: {input_dir}")
            continue
        print(f"Found day directories: {day_dirs}")

        for day_dir in day_dirs:
            day_path = os.path.join(input_dir, day_dir)
            print(f"Processing day directory: {day_path}")

            input_files = sorted(glob.glob(os.path.join(day_path, 'wrfout_d02_*')))
            print(f"Found {len(input_files)} files in {day_path}")

            for file in input_files:
                file_name = os.path.basename(file)
                try:
                    file_date = file_name.split('_')[2]
                    file_time = file_name.split('_')[3]
                except IndexError:
                    print(f"Skipping invalid file name format: {file_name}")
                    continue

                if file_date != target_date or file_time not in target_hours:
                    continue

                if file_time in processed_times:
                    print(f"Skipping already processed time: {file_time}")
                    continue

                print(f"Selected file: {file}")
                try:
                    dataset = xr.open_dataset(file)
                except Exception as e:
                    print(f"Error opening file {file}: {e}")
                    continue

                combined_datasets.append(dataset)
                # Mark the time as processed only after a successful open, so
                # that an unreadable file does not block a later copy of the
                # same time from being used.
                processed_times.add(file_time)

    if not combined_datasets:
        print("No datasets to concatenate. Please check your input directory or file naming format.")
        return

    try:
        # Concatenate all datasets along the 'time' dimension
        combined_dataset = xr.concat(combined_datasets, dim='time')
        output_file_name = f'uWRF_final_{target_date}.nc'
        output_file_path = os.path.join(output_dir, output_file_name)
        combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': f'hours since {target_date}'}})
        print(f'Combined dataset saved to {output_file_path}')
    except Exception as e:
        print(f"Error during concatenation or saving: {e}")
    finally:
        # Release the input file handles whether or not the write succeeded.
        for dataset in combined_datasets:
            dataset.close()

def main():
    """Run uWRF_val on the pred-stage3 '01' directory, writing into the test folder."""
    output_dir = '/D4/data/gvaillant/prep-uwrf/d02/tendays-combined/test'
    print(f"Output directory: {output_dir}")

    # Update this path as needed
    source_dirs = ["/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01"]
    uWRF_val(source_dirs, output_dir)

    print("Done combining uWRF files!")

# Runs immediately; comment out the call to define the functions without running.
main()


Output directory: /D4/data/gvaillant/prep-uwrf/d02/tendays-combined/test
Processing input directory: /D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01
Found day directories: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']
Processing day directory: /D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/01
Found 29 files in /D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/01
Processing day directory: /D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/02
Found 29 files in /D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/02
Processing day directory: /D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/03
Found 0 files in /D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/03
Processing day directory: /D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/04
Found 0 files in /D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/04
Processing day directory: /D4/data/gvaillant/prep-uwrf/d02/

In [11]:
# Re-open the single-day uWRF file written by uWRF_val.
test = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/tendays-combined/test/uWRF_final_2019-01-20.nc')

# Bare expression: the notebook displays the dataset summary.
test

# Now that we have the uWRF files, we use them to regrid the NAM files:

In [16]:
#The files are already cut to NYC
## REGRIDDING NYC AREA:

import xarray as xr
import numpy as np

# uWRF files produced by the combine cells above (train / val / test).
uwrf_train_data = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/tendays-combined/train/uWRF_final_2019-01-10_to_2019-01-18.nc')
uwrf_val_data = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/tendays-combined/val/uWRF_final_2019-01-19.nc')
uwrf_test_data = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/tendays-combined/test/uWRF_final_2019-01-20.nc')
# NAM files (train / val / test). The val and test files share the same file
# name and differ only by directory.
nam_train_data = xr.open_dataset('/D4/data/gvaillant/NAM/2019/tendays-combined/train/NAM_final_01_Jan10_to_Jan18.nc')
nam_val_data = xr.open_dataset('/D4/data/gvaillant/NAM/2019/tendays-combined/val/NAM_validation_01_Jan19.nc')
nam_test_data = xr.open_dataset('/D4/data/gvaillant/NAM/2019/tendays-combined/test/NAM_validation_01_Jan19.nc')

# Rename NAM's 'PRES' variable to 'PSFC' so both sources use the same
# variable name (align_datasets looks up 'PSFC' in both datasets).
nam_train_data = nam_train_data.rename({'PRES': 'PSFC'})
nam_val_data = nam_val_data.rename({'PRES': 'PSFC'})
nam_test_data = nam_test_data.rename({'PRES': 'PSFC'})

# Function to align datasets
def align_datasets(uwrf_data, nam_data, uwrf_cells_per_lat=4, uwrf_cells_per_lon=4):
    """Build a coarse-grid dataset by block-averaging uWRF onto a NAM-sized grid.

    ``T2`` and ``PSFC`` from ``uwrf_data`` are averaged over non-overlapping
    blocks of ``uwrf_cells_per_lat`` x ``uwrf_cells_per_lon`` cells. Note that
    the values come from uWRF only: ``nam_data`` contributes its time
    coordinate, its global attributes, and the check that both variables
    exist in it.

    Args:
        uwrf_data: Dataset with ``T2`` and ``PSFC`` indexed as
            (time, latitude, longitude) and ``latitude``/``longitude``
            coordinates (assumed 1-D - TODO confirm).
        nam_data: Dataset with ``T2`` and ``PSFC`` and a ``time`` coordinate.
        uwrf_cells_per_lat: uWRF cells per coarse cell along latitude.
            Defaults to 4; the original comment suggests 12 for d03.
        uwrf_cells_per_lon: uWRF cells per coarse cell along longitude.

    Returns:
        xr.Dataset with ``T2`` and ``PSFC`` on the coarse grid.

    Raises:
        ValueError: If ``T2`` or ``PSFC`` is missing from either dataset.
    """
    # T2 is taken as representative of the (time, latitude, longitude) shape.
    uwrf_shape = uwrf_data.T2.shape
    uwrf_lons = uwrf_data.longitude
    uwrf_lats = uwrf_data.latitude

    # Number of whole coarse cells that fit along each axis.
    new_nam_lon_count = uwrf_shape[2] // uwrf_cells_per_lon
    new_nam_lat_count = uwrf_shape[1] // uwrf_cells_per_lat

    # Extent of the uWRF grid that is covered by whole blocks.
    lat_extent = new_nam_lat_count * uwrf_cells_per_lat
    lon_extent = new_nam_lon_count * uwrf_cells_per_lon

    def aggregate_blocks(data):
        # Drop trailing rows/columns that do not fill a whole block; without
        # this the reshape raises whenever the grid size is not an exact
        # multiple of the block size.
        cropped = data[:, :lat_extent, :lon_extent]
        reshaped = cropped.reshape(
            data.shape[0],  # Time dimension remains unchanged
            new_nam_lat_count, uwrf_cells_per_lat,
            new_nam_lon_count, uwrf_cells_per_lon
        )
        # Average over the two within-block axes.
        return reshaped.mean(axis=(2, 4))

    # Use the same number of time steps from both datasets.
    min_time_steps = min(nam_data.time.size, uwrf_data.time.size)
    nam_data_sliced = nam_data.isel(time=slice(0, min_time_steps))
    uwrf_data_sliced = uwrf_data.isel(time=slice(0, min_time_steps))

    aligned_data = {}

    for var_name in ['T2', 'PSFC']:
        if var_name not in uwrf_data_sliced or var_name not in nam_data_sliced:
            raise ValueError(f"Variable '{var_name}' not found in one of the datasets.")

        uwrf_var = uwrf_data_sliced[var_name].values
        aligned_data[var_name] = (['time', 'latitude', 'longitude'], aggregate_blocks(uwrf_var))

    # Coarse coordinates are the first uWRF coordinate of each block (not the
    # block centre), trimmed to the number of whole blocks.
    aligned_nam = xr.Dataset(
        data_vars=aligned_data,
        coords={
            'time': nam_data_sliced.time,
            'latitude': uwrf_lats[::uwrf_cells_per_lat][:new_nam_lat_count],
            'longitude': uwrf_lons[::uwrf_cells_per_lon][:new_nam_lon_count]
        },
        attrs=nam_data.attrs
    )

    return aligned_nam

# Build and save the aligned validation dataset. The train and test files
# opened in the final cell presumably come from re-running this cell with the
# corresponding inputs and output path - TODO confirm.
aligned_nam_data = align_datasets(uwrf_val_data, nam_val_data)
aligned_nam_data.to_netcdf("/home/gvaillant1/aligned-NYC-tendays-data/aligned_nam_val_data.nc")
print("Done")

Done


In [17]:
# Check the regridded NAM validation data written by the previous cell.

ds = xr.open_dataset('/home/gvaillant1/aligned-NYC-tendays-data/aligned_nam_val_data.nc')

# Bare expression: the notebook displays the dataset summary.
ds

# Final files:

In [14]:
# Only the first 24 hours of each day, for a ten-day time slice; the goal is
# to see whether the model output shape is fixed.
# ------
# NAM files (regridded with align_datasets, which needs the uWRF files first):
nam_train = xr.open_dataset('/home/gvaillant1/aligned-NYC-tendays-data/aligned_nam_train_data.nc')
nam_val = xr.open_dataset('/home/gvaillant1/aligned-NYC-tendays-data/aligned_nam_val_data.nc')
nam_test = xr.open_dataset('/home/gvaillant1/aligned-NYC-tendays-data/aligned_nam_test_data.nc')

# -------
# uWRF files:


uwrf_train = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/tendays-combined/train/uWRF_final_2019-01-10_to_2019-01-18.nc')
uwrf_val = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/tendays-combined/val/uWRF_final_2019-01-19.nc')
uwrf_test = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/tendays-combined/test/uWRF_final_2019-01-20.nc')