# Preprocessing NAM-NMM Dataset

## Step 1:

Retrieve the NAM-NMM dataset from BNL's remote servers using the `scp` command.

## Step 2:
Use the `filter_vars` function to filter the dataset down to the variables of interest:
* TMP_2maboveground
* UGRD_10maboveground
* VGRD_10maboveground
* PRES_surface

## Step 3: 
Filter the dataset spatially so that it only includes the area covering Manhattan.

## Step 4:
Combine the files for each day into a single file, ordered sequentially along the time dimension.

In [28]:
import glob
import os

import netCDF4
import xarray as xarray
#'xr' is the alias the cells below actually use
import xarray as xr

In [22]:
#STEP 2:
def filter_vars(input_dir, output_dir, variables):

    """

    Filter netCDF files down to contain variables of interest

    Every '*.nc' file found directly inside input_dir is opened, reduced to
    the requested variables and written to output_dir under the same file name.

    Args:
    input_dir: directory on computer holding original netCDF files
    output_dir: directory on computer where you want the filtered datasets to be stored
                (created if it does not exist yet)
    variables: list of variables to keep after filtering

    Returns:

    None. Filtered datasets placed in the specified output_dir.

    Raises:
    ValueError: if input_dir and output_dir resolve to the same directory,
                because each output file would then be written on top of the
                input file that is still open for reading.

    """

    #Refuse to write the filtered copy over the file it is being read from
    if os.path.realpath(input_dir) == os.path.realpath(output_dir):
        raise ValueError('input_dir and output_dir must be different directories')

    os.makedirs(output_dir, exist_ok=True)

    #Get all the netCDF files in directory (sorted so the processing order is deterministic)
    input_files = sorted(glob.glob(os.path.join(input_dir, '*.nc')))

    #Loop through all the files in the input_dir
    for path in input_files:
        #Same file name, different directory
        output_file = os.path.join(output_dir, os.path.basename(path))

        #The context manager closes the source file even if selecting or writing fails
        with xr.open_dataset(path) as data:
            #Only keep the selected variables and save them to a new NetCDF file
            data[variables].to_netcdf(output_file)


    print('Done filtering files!')


#Arguments used to filter the October 11th 2019 data
input_dir = '/Users/gabbyvaillant/Downloads/BNL/0000_2019-10-11-files'
output_dir = '/Users/gabbyvaillant/Downloads/BNL/0000_2019-10-11-files_filtered'
variables = [
    'TMP_2maboveground',
    'UGRD_10maboveground',
    'VGRD_10maboveground',
    'PRES_surface',
]

#Uncomment the call below to run the filtering
#filter_vars(input_dir, output_dir, variables)


In [20]:
#STEP 3:
def stage_1_filtering(file_path, output_dir,
                      x_bounds=(572977.0, 585168.0),
                      y_bounds=(207247.0, 243820.0)):

    """

    Spatially filtering the datasets to only include data representing Manhattan (Stage 1)

    The dataset is masked to the rectangle described by x_bounds and y_bounds
    (both ends inclusive) and saved in output_dir under the same file name as
    the input file.

    Args:
    file_path: path to file that you want to spatially filter.
    output_dir: path to directory where you want the spatially filtered dataset to be stored
                (created if it does not exist yet).
    x_bounds: (x_min, x_max) pair compared against the dataset's 'x' values.
              Defaults to the Stage 1 bounds.
    y_bounds: (y_min, y_max) pair compared against the dataset's 'y' values.
              Defaults to the Stage 1 bounds.

    Returns:
    None. Spatially filtered netCDF file is placed in the specified output_dir.

    Raises:
    ValueError: if the output file would be the input file itself, because
                the result would be written on top of a file that is still
                open for reading.

    """

    #Construct the output file path
    output_file_path = os.path.join(output_dir, os.path.basename(file_path))

    #Refuse to write the filtered copy over the file it is being read from
    if os.path.realpath(output_file_path) == os.path.realpath(file_path):
        raise ValueError('output_dir must not be the directory that contains file_path')

    x_min, x_max = x_bounds
    y_min, y_max = y_bounds

    #The context manager closes the source file even if filtering or writing fails
    with xr.open_dataset(file_path) as dataset:

        os.makedirs(output_dir, exist_ok = True)

        #Extract the necessary variables
        x = dataset['x']
        y = dataset['y']

        #Filter the data based on the `(y, x)` bounds
        #NOTE(review): this masks every data variable with the same condition;
        #confirm that all variables in the file carry the x and y dimensions.
        inside_bounds = (
            (x >= x_min) & (x <= x_max) &
            (y >= y_min) & (y <= y_max)
        )
        filtered_data = dataset.where(inside_bounds, drop=True)

        #Save the filtered data to a new NetCDF file
        filtered_data.to_netcdf(output_file_path)


#Directory on my computer holding the variable-filtered oct 11 2019 files
#(same path as the output_dir used for the Step 2 filtering):
input_dir = '/Users/gabbyvaillant/Downloads/BNL/0000_2019-10-11-files_filtered'

#Where I want the spatially filtered (stage 1) files to go:
output_dir = '/Users/gabbyvaillant/Downloads/BNL/stage-1-files'

#The loop below is disabled by wrapping it in a string literal.
#Remove the surrounding triple quotes to create the files:

"""
#Loop through all files in the input directory and apply the filtering function
for filename in os.listdir(input_dir):
    if filename.endswith('.nc'):  # Ensure you are processing only NetCDF files
        file_path = os.path.join(input_dir, filename)
        stage_1_filtering(file_path, output_dir)
"""


In [None]:
#STEP 4:

#Taking all the files from stage 1 for 10/11/2019 and combining them into one file in sequential order.
#Dimension for stage 1: (time: 29, y: 4, x: 2)

#Directory containing the stage 1 files:
stage1_file_dir = '/Users/gabbyvaillant/Downloads/BNL/stage-1-files'
output_file_path = '/Users/gabbyvaillant/Downloads/BNL/stage1-sequential/stage1_20191011_seq.nc'
#Unused path template kept for reference (it was a bare string expression with no effect):
#'/Users/gabbyvaillant/Downloads/BNL/final-files/NAM/NAM_2019{}{}_final.nc'


#Sort files in sequential order
#NOTE: sorting only reorders the list, it never removes a file. The order is
#lexicographic, so it matches the forecast sequence only if the numeric parts
#of the file names are zero-padded -- TODO confirm against the actual file names.
nc_files = sorted(
    os.path.join(stage1_file_dir, name)
    for name in os.listdir(stage1_file_dir)
    if name.endswith('.nc')
)

#Fail with a clear message instead of an obscure error from concatenating nothing
if not nc_files:
    raise FileNotFoundError(f'No .nc files found in {stage1_file_dir}')

#Make sure the destination directory exists before writing
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

datasets = []
try:
    # Open all files as xarray datasets and combine them along the time dimension
    for nc_file in nc_files:
        datasets.append(xr.open_dataset(nc_file))

    #Merge on time dimension
    combined_dataset = xr.concat(datasets, dim='time')
    combined_dataset.to_netcdf(output_file_path)
finally:
    #Close every file that was opened, even if combining or writing failed
    for ds in datasets:
        ds.close()

print(f'Combined dataset saved to {output_file_path}')