# Preprocessing uWRF Dataset

## Step 1:

Retrieve uWRF data from BNL's remote servers.

**NOTE:** Currently only working with the data for October 11th, 2019.

## Step 2:
Use the filter_vars function to filter the dataset down to the variables of interest:

* T2 (Temperature 2m above surface)
* U10 (U component of wind 10m above surface)
* V10 (V component of wind 10m above surface)
* PSFC (Pressure at the surface)


## Step 3: 
Filter spatially to include only the area covering Manhattan.

## Step 4:
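Combine each day of data into a sequential format. Number of time steps per domain:

* d02: 29 (3-hourly)
* d03: 85 (hourly)

NOTE: In the code below, the files are combined first (cell labeled STEP 3) and the spatial filter is applied afterwards (cell labeled STEP 4).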

In [97]:
import glob
import os

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import netCDF4
import xarray as xarray
import xarray as xr
from netCDF4 import Dataset

In [77]:
#STEP 2:
def filter_vars(input_dir, output_dir, variables):

    """

    Filter netCDF files down to contain variables of interest

    Every regular file found directly inside input_dir is opened with xarray,
    reduced to the requested variables and written to output_dir under the
    same file name. Sub-directories of input_dir are skipped.

    Args:
    input_dir: directory on computer holding original netCDF files
    output_dir: directory on computer where you want the filtered datasets to be stored
    variables: list of variables to keep after filtering

    Returns:

    None. Filtered datasets are written to the specified output_dir

    """

    #Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    #Only regular files can be opened as datasets; the '*' pattern also
    #matches sub-directories, so drop those. Sorting keeps the processing
    #order reproducible between runs.
    input_files = sorted(
        path for path in glob.glob(os.path.join(input_dir, '*'))
        if os.path.isfile(path)
    )

    #Loop through all the files in the input_dir
    for file in input_files:
        #The context manager closes the file even if selecting or writing
        #the variables fails part-way through
        with xr.open_dataset(file) as data:
            #Only keep the selected variables
            data_filtered = data[variables]

            #Save under the original file name inside output_dir
            output_file = os.path.join(output_dir, os.path.basename(file))
            data_filtered.to_netcdf(output_file)

    print('done filtering files')


#Where the raw uWRF output lives and where the filtered copies are written
input_dir = '/Users/gabbyvaillant/Downloads/BNL/NYC_wrfout_20191011'
output_dir = '/Users/gabbyvaillant/Downloads/BNL/uWRF-20191011-filtered'

#Variables kept by filter_vars
variables = [
    'T2',
    'U10',
    'V10',
    'PSFC',
]

#Uncomment to do filtering
#filter_vars(input_dir, output_dir, variables)

In [74]:
#STEP 3:

"""

Taking all the uWRF files for 10/11/2019 and combining them into one file in sequential order.
Edit `domain` to work for either domain 2 ('d02') or domain 3 ('d03').

"""

#WRF domain to combine; drives both the input file prefix and the output file name
domain = 'd02'

#Directory containing the stage 1 files:
stage1_file_dir = '/Users/gabbyvaillant/Downloads/BNL/uWRF-20191011-filtered'

#The '.nc' suffix matters: the spatial filtering loop only picks up files ending in '.nc'
output_file_path = f'/Users/gabbyvaillant/Downloads/BNL/uwrf-sequential/uwrf_{domain}_20191011_seq.nc'

#Create the directory the combined file is actually written to
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

#List all files in the directory that match the naming format for the domain.
#Sorting by name puts them in sequential order (assumes the file names sort
#chronologically - TODO confirm against the wrfout naming scheme).
nc_files = sorted(
    os.path.join(stage1_file_dir, file)
    for file in os.listdir(stage1_file_dir)
    if file.startswith(f'wrfout_{domain}_')
)

#Fail with a clear message instead of an obscure concat error
if not nc_files:
    raise FileNotFoundError(
        f"No files starting with 'wrfout_{domain}_' found in {stage1_file_dir}"
    )

datasets = []
try:
    #Open all files as xarray datasets
    for nc_file in nc_files:
        datasets.append(xr.open_dataset(nc_file))

    #Merge on time dimension
    combined_dataset = xr.concat(datasets, dim='Time')
    combined_dataset.to_netcdf(output_file_path)
finally:
    #Release every file that was opened, even if combining or writing failed
    for ds in datasets:
        ds.close()

print(f'Combined dataset saved to {output_file_path}')

Combined dataset saved to /Users/gabbyvaillant/Downloads/BNL/uwrf-sequential/uwrf_d02_20191011_seq


In [100]:
#STEP 4:
def stage_1_filtering(file_path, output_directory, *,
                      min_lat=40.57384924257281, max_lat=40.90231421796557,
                      min_lon=-74.0481110602903, max_lon=-73.84627819243957):

    """

    Crop a netCDF dataset to a latitude/longitude bounding box

    The dataset is opened with xarray, masked with the bounding box using the
    'XLAT' and 'XLONG' variables, and written to output_directory under the
    same file name as the input. A summary of the dataset and the bounds used
    is printed along the way.

    Args:
    file_path: path of the netCDF file to crop
    output_directory: existing directory where the cropped file is stored
    min_lat, max_lat: latitude bounds (inclusive), keyword-only
    min_lon, max_lon: longitude bounds (inclusive), keyword-only
        The defaults are the bounds originally hard-coded in this function.

    Returns:

    None. The cropped dataset is written to output_directory

    Raises:
    ValueError: if a minimum bound exceeds its maximum, or if the dataset
        does not contain the 'XLAT' / 'XLONG' variables

    """

    #Reject an inverted box up front; it would otherwise select nothing
    if min_lat > max_lat or min_lon > max_lon:
        raise ValueError(
            f"Invalid bounds: latitude {min_lat} to {max_lat}, "
            f"longitude {min_lon} to {max_lon}"
        )

    lat_var = 'XLAT'
    lon_var = 'XLONG'

    #The context manager closes the file on every path, including the
    #ValueError raised below
    with xr.open_dataset(file_path) as ds:
        print("Dataset variables and dimensions:")
        print(ds)

        if lat_var not in ds.variables or lon_var not in ds.variables:
            #Print available variables if defaults are not found
            print(f"Available variables: {list(ds.variables)}")
            raise ValueError(f"Dataset does not contain '{lat_var}' or '{lon_var}' variables")

        print(f"Latitude bounds: {min_lat} to {max_lat}")
        print(f"Longitude bounds: {min_lon} to {max_lon}")

        #Filter the data based on the lat and lon bounds
        inside_box = (
            (ds[lat_var] >= min_lat) & (ds[lat_var] <= max_lat) &
            (ds[lon_var] >= min_lon) & (ds[lon_var] <= max_lon)
        )
        filtered_data = ds.where(inside_box, drop=True)

        #Construct the output file path
        output_file_path = os.path.join(output_directory, os.path.basename(file_path))

        filtered_data.to_netcdf(output_file_path)

#Source directory (combined files) and destination directory (cropped files)
input_directory = '/Users/gabbyvaillant/Downloads/BNL/uwrf-sequential/'
output_directory = '/Users/gabbyvaillant/Downloads/BNL/uWRF-final-files/'

#Create the destination directory when it is missing
os.makedirs(output_directory, exist_ok=True)

#Apply the filtering function to every '.nc' entry of the source directory
for filename in os.listdir(input_directory):
    file_path = os.path.join(input_directory, filename)
    if not filename.endswith('.nc'):
        #Anything that is not a netCDF file is ignored
        continue
    stage_1_filtering(file_path, output_directory)


Dataset variables and dimensions:
<xarray.Dataset> Size: 14MB
Dimensions:  (Time: 85, south_north: 81, west_east: 84)
Coordinates:
    XLAT     (Time, south_north, west_east) float32 2MB ...
    XLONG    (Time, south_north, west_east) float32 2MB ...
    XTIME    (Time) datetime64[ns] 680B ...
Dimensions without coordinates: Time, south_north, west_east
Data variables:
    T2       (Time, south_north, west_east) float32 2MB ...
    U10      (Time, south_north, west_east) float32 2MB ...
    V10      (Time, south_north, west_east) float32 2MB ...
    PSFC     (Time, south_north, west_east) float32 2MB ...
Attributes: (12/119)
    TITLE:                            OUTPUT FROM WRF V3.9.1.1 MODEL
    START_DATE:                      2019-10-11_00:00:00
    SIMULATION_START_DATE:           2019-10-11_00:00:00
    WEST-EAST_GRID_DIMENSION:        85
    SOUTH-NORTH_GRID_DIMENSION:      82
    BOTTOM-TOP_GRID_DIMENSION:       51
    ...                              ...
    ISLAKE:            