In [None]:
def NAM_filter_and_match_dims(input_dir, output_dir, variables, n_lat=67, n_lon=67):
    """Filter NetCDF files to selected variables and regrid them onto a regular grid.

    For every ``.nc`` file found in ``input_dir`` this function:

    1. keeps only the variables named in ``variables`` and renames them,
    2. moves ``time`` to the leading dimension and maps longitudes greater
       than 180 to negative values (``lon - 360``),
    3. linearly interpolates every time step onto an ``n_lat`` x ``n_lon``
       grid spanning the min/max of the source latitude/longitude values,
    4. writes the result to ``output_dir`` under the same file name.

    Parameters
    ----------
    input_dir : str
        Directory scanned (non-recursively) for ``.nc`` files.
    output_dir : str
        Directory the regridded files are written to; created if missing.
    variables : dict
        Mapping of original variable name -> new variable name. Names that
        are absent from a file are ignored; a file containing none of them
        is reported and skipped.
    n_lat, n_lon : int, optional
        Number of points of the target latitude / longitude axes
        (both default to 67, the previously hard-coded value).

    Notes
    -----
    * Files that fail while being opened or filtered are reported and skipped.
    * Each selected variable is assumed to be laid out as (time, y, x) with
      2-D ``latitude``/``longitude`` coordinates -- the original comments
      mention a (67, 71) source grid. TODO confirm for other inputs.
    * ``griddata(method='linear')`` leaves target points outside the convex
      hull of the source points as NaN.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so files are processed (and logged) in a deterministic order.
    file_names = sorted(name for name in os.listdir(input_dir) if name.endswith('.nc'))

    for file_name in file_names:
        file_path = os.path.join(input_dir, file_name)
        try:
            # Step 1: keep and rename the requested variables.
            with xr.open_dataset(file_path) as ds:
                existing_vars = {var: ds[var] for var in variables if var in ds}
                if not existing_vars:
                    print(f"No matching variables found in {file_name}.")
                    continue

                # Rename only what is actually present: Dataset.rename raises on
                # names that are missing, which would discard the whole file.
                rename_map = {orig_var: variables[orig_var] for orig_var in existing_vars}
                ds_filtered = xr.Dataset(existing_vars).rename(rename_map)

                # Put 'time' first so each time step is a 2-D slice below.
                for var in list(ds_filtered.data_vars):
                    if 'time' in ds_filtered[var].dims:
                        dims = ('time',) + tuple(d for d in ds_filtered[var].dims if d != 'time')
                        ds_filtered[var] = ds_filtered[var].transpose(*dims)

                for orig_var, new_var in rename_map.items():
                    ds_filtered[new_var].attrs = dict(ds[orig_var].attrs)

                ds_filtered.attrs = dict(ds.attrs)

                # Pull everything into memory while the file is still open;
                # Step 2 reads the values after the `with` block has closed it.
                ds_filtered.load()

                # Map longitudes from [0, 360) to (-180, 180].
                if 'longitude' in ds_filtered:
                    lon = ds_filtered['longitude'].values
                    ds_filtered['longitude'].values = np.where(lon > 180, lon - 360, lon)
                    # NOTE(review): negative-west values are conventionally labelled
                    # 'degrees_east'; the original label is kept so existing outputs
                    # do not change -- confirm which is intended.
                    ds_filtered['longitude'].attrs['units'] = 'degrees_west'

        except Exception as e:
            print(f"Error processing file {file_name}: {e}")
            continue

        # Step 2: interpolate every variable onto the regular target grid.
        latitudes = ds_filtered['latitude'].values
        longitudes = ds_filtered['longitude'].values
        time_values = ds_filtered['time'].values

        # Copy the coordinate attributes so they can be re-attached to the new axes.
        lat_attrs = dict(ds_filtered['latitude'].attrs)
        lon_attrs = dict(ds_filtered['longitude'].attrs)
        time_attrs = dict(ds_filtered['time'].attrs)

        # (lon, lat) pairs in row-major order, matching the flattened data below.
        points = np.column_stack((longitudes.ravel(), latitudes.ravel()))

        new_latitudes = np.linspace(np.min(latitudes), np.max(latitudes), num=n_lat)
        new_longitudes = np.linspace(np.min(longitudes), np.max(longitudes), num=n_lon)
        new_lon_grid, new_lat_grid = np.meshgrid(new_longitudes, new_latitudes)

        new_vars = {}
        for var_name in ds_filtered.data_vars:
            var = ds_filtered[var_name]
            data = var.values
            regridded_steps = [
                griddata(points, data[t, :, :].ravel(), (new_lon_grid, new_lat_grid), method='linear')
                for t in range(var.sizes['time'])
            ]
            # Carry the variable's own attributes along with the data so they
            # are not lost in the regridded file.
            new_vars[var_name] = (
                ['time', 'latitude', 'longitude'],
                np.stack(regridded_steps),
                dict(var.attrs),
            )

        new_ds = xr.Dataset(
            new_vars,
            coords={
                'latitude': ('latitude', new_latitudes, lat_attrs),
                'longitude': ('longitude', new_longitudes, lon_attrs),
                'time': ('time', time_values, time_attrs),
            },
        )

        # Add global attributes
        new_ds.attrs.update(ds_filtered.attrs)

        # Save under the same file name in the output directory.
        output_file = os.path.join(output_dir, os.path.basename(file_name))
        print(f"Saving file to: {output_file}")
        new_ds.to_netcdf(output_file)
#-------------------------------------------------------------------------
def main(input_dir="/D4/data/gvaillant/NAM-2019-netcdf/07",
         output_dir="/D3/data/gvaillant/NAM/2019/intermediate/07",
         variables=None):
    """Run the filter-and-regrid stage on one directory of NAM files.

    Parameters
    ----------
    input_dir : str, optional
        Directory holding the source ``.nc`` files.
    output_dir : str, optional
        Directory the processed files are written to.
    variables : dict, optional
        Mapping of original variable name -> new variable name. Defaults to
        ``{'TMP_2maboveground': 'T2'}``.

    Note: other cells in this notebook also define ``main``; the most
    recently executed definition is the one a bare ``main()`` call runs.
    """
    if variables is None:
        # Keys are the names found in the source files, values are the names
        # written to the output files.
        variables = {'TMP_2maboveground': 'T2'}
    print(f"Output directory: {output_dir}")

    NAM_filter_and_match_dims(input_dir, output_dir, variables)

    print("Done processing stage1 NAM files!")

#main()

In [None]:
def NAM_spatial_filter(input_dir, output_dir,
                       min_lat=40.533801, max_lat=40.955109,
                       min_lon=-74.131557, max_lon=-73.762832):
    """Crop every NetCDF file in ``input_dir`` to a latitude/longitude box.

    Each ``.nc`` file is opened, reduced to the points whose ``latitude`` and
    ``longitude`` coordinates fall inside the (inclusive) bounds, and written
    to ``output_dir`` under the same file name. Other directory entries are
    ignored.

    Parameters
    ----------
    input_dir : str
        Directory scanned (non-recursively) for ``.nc`` files.
    output_dir : str
        Directory the cropped files are written to; created if missing.
    min_lat, max_lat, min_lon, max_lon : float, optional
        Inclusive bounds of the box. The defaults are the values previously
        hard-coded here (described in the original comment as covering
        Manhattan).
    """
    lat_var = 'latitude'
    lon_var = 'longitude'

    os.makedirs(output_dir, exist_ok=True)

    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith('.nc'):
            continue
        print(f"Processing file: {file_name}")

        file_path = os.path.join(input_dir, file_name)
        output_file_path = os.path.join(output_dir, file_name)

        # The context manager closes the file handle once the cropped copy
        # has been written, so handles do not accumulate across the loop.
        with xr.open_dataset(file_path) as dataset:
            inside_box = (
                (dataset[lat_var] >= min_lat) & (dataset[lat_var] <= max_lat) &
                (dataset[lon_var] >= min_lon) & (dataset[lon_var] <= max_lon)
            )
            # drop=True trims coordinate labels where the mask is entirely False.
            filtered_data = dataset.where(inside_box, drop=True)
            filtered_data.to_netcdf(output_file_path)

def main(input_dir="/D4/data/gvaillant/NAM/2019/intermediate/06",
         output_dir="/D4/data/gvaillant/NAM/2019/NYC-final/06"):
    """Run the spatial-filter stage on one directory of NAM files.

    Parameters
    ----------
    input_dir : str, optional
        Directory holding the ``.nc`` files to crop.
    output_dir : str, optional
        Directory the cropped files are written to.

    Note: other cells in this notebook also define ``main``; the most
    recently executed definition is the one a bare ``main()`` call runs.
    """
    print(f"Output directory: {output_dir}")

    NAM_spatial_filter(input_dir, output_dir)

    print("Done processing stage1 NAM files!")

#main()

In [32]:
import os
import xarray as xr

def NAM_combine_seq(input_dirs, output_dir):
    """Concatenate all NetCDF files from the given directories along ``time``.

    Directories are processed in the order given; within each directory the
    ``.nc`` files are taken in sorted path order. The combined dataset is
    written to ``output_dir`` as ``NAM_final_<dir1>-<dir2>-....nc``, where
    each ``<dir>`` is the last path component of an input directory.

    Parameters
    ----------
    input_dirs : list of str
        One or more directories containing ``.nc`` files.
    output_dir : str
        Directory the combined file is written to; created if missing.

    Raises
    ------
    ValueError
        If no ``.nc`` file is found in any of the input directories.
    """
    opened = []
    try:
        for input_dir in input_dirs:
            nc_files = sorted(
                os.path.join(input_dir, file)
                for file in os.listdir(input_dir)
                if file.endswith('.nc')
            )
            # Append one at a time so every successfully opened dataset is
            # tracked for cleanup even if a later open fails.
            for nc_file in nc_files:
                opened.append(xr.open_dataset(nc_file))

        if not opened:
            raise ValueError(f"No .nc files found in input directories: {input_dirs!r}")

        # Concatenate along the time dimension
        combined_dataset = xr.concat(opened, dim='time')

        # normpath strips a trailing slash, which would otherwise make
        # basename return an empty month identifier.
        month_range = "-".join(
            os.path.basename(os.path.normpath(input_dir)) for input_dir in input_dirs
        )
        output_file_name = f'NAM_final_{month_range}.nc'
        output_file_path = os.path.join(output_dir, output_file_name)

        os.makedirs(output_dir, exist_ok=True)
        combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': 'hours since 2019-1-1'}})
    finally:
        # Release the file handles of every source dataset.
        for dataset in opened:
            dataset.close()

    print(f"Saved combined dataset to: {output_file_path}")

def main(input_dirs=None,
         output_dir="/D4/data/gvaillant/NAM/2019/final/trial-run/test"):
    """Combine every NAM file from the given directories into one dataset.

    Parameters
    ----------
    input_dirs : list of str, optional
        Directories whose ``.nc`` files are combined, in order. Defaults to
        the single directory ``/D4/data/gvaillant/NAM/2019/NYC-final/03``.
    output_dir : str, optional
        Directory the combined file is written to.

    Note: other cells in this notebook also define ``main``; the most
    recently executed definition is the one a bare ``main()`` call runs.
    """
    if input_dirs is None:
        input_dirs = [
            "/D4/data/gvaillant/NAM/2019/NYC-final/03"
        ]
    print(f"Output directory: {output_dir}")

    NAM_combine_seq(input_dirs, output_dir)

    # The previous message claimed "the first two months" regardless of how
    # many directories were actually combined.
    print("Done combining NAM files!")

main()

Output directory: /D3/data/gvaillant/NAM/2019/final/trial-run/train
Saved combined dataset to: /D3/data/gvaillant/NAM/2019/final/trial-run/train/NAM_final_01-02.nc
Done combining the first two months of NAM files!


In [35]:
import os
import xarray as xr

def NAM_val_test(input_dirs, output_dir, half="second"):
    """Concatenate one half of each directory's NetCDF files along ``time``.

    For every directory the ``.nc`` files are sorted by path and split at
    ``len(files) // 2``; either the first or the second part is kept (with an
    odd count the second part holds the extra file). The kept files from all
    directories are concatenated in order and written to ``output_dir`` as
    ``NAM_final_<dir1>-<dir2>-....nc``, where each ``<dir>`` is the last path
    component of an input directory.

    Parameters
    ----------
    input_dirs : list of str
        One or more directories containing ``.nc`` files.
    output_dir : str
        Directory the combined file is written to; created if missing.
    half : {"second", "first"}, optional
        Which part of each directory's sorted file list to keep. Defaults to
        ``"second"``, the selection that was previously hard-coded.

    Raises
    ------
    ValueError
        If ``half`` is not ``"first"`` or ``"second"``, or if no file is
        selected from any input directory.
    """
    if half not in ("first", "second"):
        raise ValueError(f"half must be 'first' or 'second', got {half!r}")

    opened = []
    try:
        for input_dir in input_dirs:
            nc_files = sorted(
                os.path.join(input_dir, file)
                for file in os.listdir(input_dir)
                if file.endswith('.nc')
            )

            half_length = len(nc_files) // 2
            if half == "first":
                selected_files = nc_files[:half_length]
            else:
                selected_files = nc_files[half_length:]

            # Append one at a time so every successfully opened dataset is
            # tracked for cleanup even if a later open fails.
            for nc_file in selected_files:
                opened.append(xr.open_dataset(nc_file))

        if not opened:
            raise ValueError(f"No .nc files selected from input directories: {input_dirs!r}")

        # Concatenate along the time dimension
        combined_dataset = xr.concat(opened, dim='time')

        # normpath strips a trailing slash, which would otherwise make
        # basename return an empty month identifier.
        month_range = "-".join(
            os.path.basename(os.path.normpath(input_dir)) for input_dir in input_dirs
        )
        output_file_name = f'NAM_final_{month_range}.nc'
        output_file_path = os.path.join(output_dir, output_file_name)

        os.makedirs(output_dir, exist_ok=True)
        combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': 'hours since 2019-1-1'}})
    finally:
        # Release the file handles of every source dataset.
        for dataset in opened:
            dataset.close()

    print(f"Saved combined dataset to: {output_file_path}")

def main(input_dirs=None,
         output_dir="/D4/data/gvaillant/NAM/2019/final/trial-run/test"):
    """Combine half of the NAM files from each given directory into one dataset.

    Parameters
    ----------
    input_dirs : list of str, optional
        Directories whose ``.nc`` files are split and combined. Defaults to
        the single directory ``/D4/data/gvaillant/NAM/2019/NYC-final/03``.
    output_dir : str, optional
        Directory the combined file is written to.

    Note: other cells in this notebook also define ``main``; the most
    recently executed definition is the one a bare ``main()`` call runs.
    """
    if input_dirs is None:
        input_dirs = [
            "/D4/data/gvaillant/NAM/2019/NYC-final/03"
        ]
    print(f"Output directory: {output_dir}")

    NAM_val_test(input_dirs, output_dir)

    # NAM_val_test keeps the second half of each directory's sorted files;
    # the previous message said "first half".
    print("Done combining the second half of the NAM files!")

#main()

Output directory: /D3/data/gvaillant/NAM/2019/final/trial-run/test
Saved combined dataset to: /D3/data/gvaillant/NAM/2019/final/trial-run/test/NAM_final_03.nc
Done combining the first half of the NAM files!
