# Preprocessing NAM-NMM Dataset


Pre-req:
Download NAM-NMM files from BNL's remote servers
(using command terminal).

## Step 1:

Use the NAM_filter_vars function to filter the dataset down to the 4 variables of interest and change the names of the variables to match uWRF naming: 

* TMP_2maboveground -> T2
* UGRD_10maboveground -> U10
* VGRD_10maboveground -> V10
* PRES_surface -> PSFC

This function also changes the way the longitude values are measured.

## Step 2:
Change the dimensions of the dataset from y,x to latitude, longitude and keep them a consistent size

## Step 3: 
Filter spatially to only include area covering Manhattan

min_lat = 40.57384924257281

max_lat = 40.92

min_lon = -74.0481110602903

max_lon = -73.84627819243957

## Step 4:
Combine each day of data into a sequential format



In [382]:
import netCDF4
import xarray as xarray
import os
import glob
import numpy as np
from scipy.interpolate import griddata

In [331]:
def NAM_filter_vars(input_dir, output_dir, variables):

    os.makedirs(output_dir, exist_ok=True)
    input_files = glob.glob(os.path.join(input_dir, '*.nc')) #Take all the .nc files from input_dir

    for file in input_files:
        try:
            with xr.open_dataset(file) as ds:
                existing_vars = {var: ds[var] for var in variables.keys() if var in ds}
                if not existing_vars:
                    print(f"No matching variables found in file {file}.")
                    continue

                #Filter the variables
                ds_filtered = xr.Dataset(existing_vars)
                
                #Rename the variables
                ds_filtered = ds_filtered.rename(variables)

                for var in ds_filtered.data_vars:
                    if 'time' in ds_filtered[var].dims:
                        dims = ('time',) + tuple(d for d in ds_filtered[var].dims if d != 'time')
                        ds_filtered[var] = ds_filtered[var].transpose(*dims)
                
                
                for orig_var, new_var in variables.items():
                    if orig_var in ds:
                        ds_filtered[new_var].attrs = ds[orig_var].attrs
        
                ds_filtered.attrs = ds.attrs
                
                #Change longitude values
                if 'longitude' in ds_filtered:
                    lon = ds_filtered['longitude'].values
                    lon = np.where(lon > 180, lon - 360, lon)
                    ds_filtered['longitude'].values = lon
                    ds_filtered['longitude'].attrs['units'] = 'degrees_west'

                filename = os.path.basename(file)
                output_file = os.path.join(output_dir, filename)
                ds_filtered.to_netcdf(output_file)

        except Exception as e:
            print(f"Error processing file {file}: {e}")

    print('Done filtering files!')


In [332]:
def NAM_match_dims(input_dir, output_dir):

    os.makedirs(output_dir, exist_ok=True)
    
    for file_name in os.listdir(input_dir):
        if file_name.endswith('.nc'):
            
            input_file = os.path.join(input_dir, file_name)
            
            ds = xr.open_dataset(input_file)
            
            latitudes = ds['latitude'].values  # Shape: (67, 71)
            longitudes = ds['longitude'].values  # Shape: (67, 71)
            time = ds['time']
            
            #Save all the attributes for each variable
            lat_attrs = ds['latitude'].attrs
            lon_attrs = ds['longitude'].attrs
            time_attrs = ds['time'].attrs 
            tmp_attrs = ds['T2'].attrs
            u_attrs = ds['U10'].attrs
            v_attrs = ds['V10'].attrs
            psfc_attrs = ds['PSFC'].attrs
            
            #Flatten latitude and longitude for interpolation
            points = np.array([(lon, lat) for lat_row, lon_row in zip(latitudes, longitudes) for lat, lon in zip(lat_row, lon_row)])
            
            #Define the new latitude and longitude grid
            new_latitudes = np.linspace(np.min(latitudes), np.max(latitudes), num=67)
            new_longitudes = np.linspace(np.min(longitudes), np.max(longitudes), num=67)  #**CHANGED TO 67**
            
            #Create new meshgrid
            new_lon_grid, new_lat_grid = np.meshgrid(new_longitudes, new_latitudes)
            
            new_vars = {}
            
            for var_name in ds.data_vars:
                
                var = ds[var_name]
                new_var_list = []
                
                for t in range(len(var.time)):
                    weather_variable = var.values[t, :, :]  # Shape (67, 71)
                    
                    #Flatten the weather variable data
                    values = weather_variable.flatten()
                    
                    #Interpolate the data onto the new grid
                    new_weather_variable = griddata(points, values, (new_lon_grid, new_lat_grid), method='linear')
                    
                    #Append the interpolated data for the current time step
                    new_var_list.append(new_weather_variable)
                
                #Stack the new variables along the time dimension
                new_vars[var_name] = (['time', 'latitude', 'longitude'], np.stack(new_var_list))
            
            #Create a new xarray Dataset
            new_ds = xr.Dataset(
                new_vars, coords={'latitude': new_latitudes,
                    'longitude': new_longitudes,
                    'time': time.values})
            
            #Add the original variable attributes
            new_ds['time'].attrs.update(time_attrs)
            new_ds['latitude'].attrs.update(lat_attrs)
            new_ds['longitude'].attrs.update(lon_attrs)
            new_ds['T2'].attrs.update(tmp_attrs)
            new_ds['U10'].attrs.update(u_attrs)
            new_ds['V10'].attrs.update(v_attrs)
            new_ds['PSFC'].attrs.update(psfc_attrs)
            
            #Add global attributes
            new_ds.attrs.update(ds.attrs)
            
            output_file = os.path.join(output_dir, file_name)
            new_ds.to_netcdf(output_file)
            
    print('Done filtering files!')

In [390]:
def NAM_spatial_filter(input_dir, output_dir):
    
    os.makedirs(output_dir, exist_ok=True)
    
    #Bounds to cover Manhattan (extend a bit higher?)
    min_lat = 40.57384924257281
    max_lat = 40.92
    min_lon = -74.0481110602903
    max_lon = -73.84627819243957
    

    for file_name in os.listdir(input_dir):
        if file_name.endswith('.nc'):

            file_path = os.path.join(input_dir, file_name)
            dataset = xr.open_dataset(file_path)
            
            #Extract latitude and longitude variables
            lat_var = 'latitude'
            lon_var = 'longitude'
            lat = dataset[lat_var].values
            lon = dataset[lon_var].values
            
            #Filter the data based off of the spatial bounds
            filtered_data = dataset.where(
                (dataset[lat_var] >= min_lat) & (dataset[lat_var] <= max_lat) &
                (dataset[lon_var] >= min_lon) & (dataset[lon_var] <= max_lon), drop=True)

            output_file_path = os.path.join(output_dir, file_name)
            filtered_data.to_netcdf(output_file_path)
            dataset.close()
            
    print('Done spatially filtering files!')


In [391]:
def NAM_combine_seq(input_dir, output_dir):
    
    os.makedirs(output_dir, exist_ok=True)

    #Extract the date from the input directory name
    dir_name = os.path.basename(input_dir)
    date_str = dir_name.split('_')[1] 
    
    nc_files = [os.path.join(input_dir, file) for file in os.listdir(input_dir) if file.endswith('.nc')]
    nc_files.sort()
    
    datasets = [xr.open_dataset(nc_file) for nc_file in nc_files]
    combined_dataset = xr.concat(datasets, dim='time')
    

    output_file_name = f'NAM_final_{date_str}.nc'
    output_file_path = os.path.join(output_dir, output_file_name)
    combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': 'hours since 2019-10-11'}})
    
    for ds in datasets:
        ds.close()
    
    print(f'Combined dataset saved to {output_file_path}')


## Running functions to preprocess the NAM data:

In [392]:
#STEP 1:

input_dir = '/Users/gabbyvaillant/Downloads/BNL/0000_2019-10-11-files' 
output_dir = '/Users/gabbyvaillant/Downloads/BNL/0000_2019-10-11-files_filtered_test'
variables = {'TMP_2maboveground': 'T2', 'UGRD_10maboveground': 'U10', 'VGRD_10maboveground': 'V10', 'PRES_surface': 'PSFC'}

#Uncomment to run:
#NAM_filter_vars(input_dir, output_dir, variables)

In [337]:
#STEP 2:

input_dir = '/Users/gabbyvaillant/Downloads/BNL/0000_2019-10-11-files_filtered_test'
output_dir = '/Users/gabbyvaillant/Downloads/BNL/0000_2019-10-11-files_fixed_dims'

#Uncomment to run:
#NAM_match_dims(input_dir, output_dir)

Done filtering files!


In [393]:
#STEP 3:

input_dir = '/Users/gabbyvaillant/Downloads/BNL/0000_2019-10-11-files_fixed_dims'
output_dir = '/Users/gabbyvaillant/Downloads/BNL/0000_2019-10-11-stage1'

#Uncomment to run:
#NAM_spatial_filter(input_dir, output_dir)

Done spatially filtering files!


In [394]:
#STEP 4:

input_dir = '/Users/gabbyvaillant/Downloads/BNL/0000_2019-10-11-stage1'
output_dir = '/Users/gabbyvaillant/Downloads/BNL/0000_2019-10-11-final-NAM'

#Uncomment to run:
#NAM_combine_seq(input_dir, output_dir)

Combined dataset saved to /Users/gabbyvaillant/Downloads/BNL/0000_2019-10-11-final-NAM/NAM_final_2019-10-11-stage1.nc
