# Preprocessing uWRF Dataset

Pre-req: Download uWRF output files from BNL's remote server (using command terminal).

NOTE: Using domain 02 (3 km, 3-hourly) because it has the same temporal resolution as NAM-NMM.

## Step 1:
Use the uWRF_filter_vars function to filter the dataset down to the 4 variables of interest:

* T2 (Temperature 2m above surface)
* U10 (U component of wind 10m above surface)
* V10 (V component of wind 10m above surface)
* PSFC (Pressure at the surface)

This function also changes the names of 'XLAT', 'XLONG', 'XTIME' to 'latitude', 'longitude' and 'time'.

## Step 2:

Change the dimensions of the dataset from 'Time', 'north_south', 'west_east' to 'time', 'latitude', 'longitude' by linearly interpolating every variable onto a regular latitude/longitude grid.

## Step 3: 
Filter spatially to include only the area covering Manhattan.

min_lat = 40.57384924257281

max_lat = 40.92

min_lon = -74.0481110602903

max_lon = -73.84627819243957

## Step 4:
Combine each day of data into a sequential format

NOTE: 

Domain 2 (d02):

time: 29 files up to 84 forecast hours (every 3 hours)

spatial = 3 km 

In [402]:
import glob
import os
from datetime import datetime

import netCDF4
import numpy as np
import xarray as xarray
import xarray as xr  # the functions below refer to xarray through the `xr` alias
from netCDF4 import Dataset
from scipy.interpolate import LinearNDInterpolator
from scipy.interpolate import griddata
from scipy.spatial import Delaunay

In [403]:
def uWRF_filter_vars(input_dir, output_dir, variables):
    """Write a copy of every Domain 2 uWRF file reduced to ``variables``.

    Every entry of ``input_dir`` whose file name contains ``'d02'`` is opened,
    reduced to the requested variables and saved under the same file name in
    ``output_dir`` (created if it does not exist). In the saved copy 'XLAT',
    'XLONG' and 'XTIME' are renamed to 'latitude', 'longitude' and 'time'.

    Args:
        input_dir: Directory holding the raw uWRF output files.
        output_dir: Directory the filtered files are written to.
        variables: List of variable names to keep,
            e.g. ``['T2', 'U10', 'V10', 'PSFC']``.
    """
    os.makedirs(output_dir, exist_ok=True)

    #Only using the files for Domain 2
    input_files = [
        path for path in glob.glob(os.path.join(input_dir, '*'))
        if 'd02' in os.path.basename(path)
    ]

    for path in input_files:
        #The context manager closes the file even if filtering or writing fails
        with xr.open_dataset(path) as ds:
            #Filter out the variables and change XLAT, XLONG, XTIME in one pass
            ds_filtered = ds[variables].rename(
                {'XLAT': 'latitude', 'XLONG': 'longitude', 'XTIME': 'time'}
            )

            #Save under the original file name
            output_file = os.path.join(output_dir, os.path.basename(path))
            ds_filtered.to_netcdf(output_file)

    print('Done filtering files')

In [413]:
def uWRF_match_dims(input_dir, output_dir):
    """Regrid every uWRF file onto a regular latitude/longitude grid.

    Each file in ``input_dir`` must provide ``latitude`` and ``longitude``
    fields indexed as ``[time step, row, column]``, a ``time`` coordinate, and
    data variables with a ``Time`` dimension. Every data variable is linearly
    interpolated, one time step at a time, onto a regular grid that spans the
    min/max latitude and longitude of the first time step and has the same
    number of rows and columns as the source grid. The result has the
    dimensions ('time', 'latitude', 'longitude') and is written under the same
    file name to ``output_dir`` (created if it does not exist). Variable,
    coordinate and global attributes are copied over.

    Target points outside the convex hull of the source points receive NaN
    (SciPy's default fill value for linear interpolation).

    Args:
        input_dir: Directory holding the variable-filtered uWRF files.
        output_dir: Directory the regridded files are written to.
    """
    os.makedirs(output_dir, exist_ok=True)

    input_files = glob.glob(os.path.join(input_dir, '*'))  # Handle all uWRF files

    for file_name in input_files:
        #The context manager closes the file even if regridding or writing fails
        with xr.open_dataset(file_name) as ds:
            #Take all the lat/lon values from the first time step
            latitudes = ds['latitude'].values[0, :, :]
            longitudes = ds['longitude'].values[0, :, :]

            #(lon, lat) pair of every source grid point, one row per point
            points = np.column_stack((longitudes.ravel(), latitudes.ravel()))

            #Define the new latitude and longitude grid
            new_latitudes = np.linspace(
                np.min(latitudes), np.max(latitudes), num=latitudes.shape[0])
            new_longitudes = np.linspace(
                np.min(longitudes), np.max(longitudes), num=longitudes.shape[1])
            new_lon_grid, new_lat_grid = np.meshgrid(new_longitudes, new_latitudes)

            #The source points are identical for every variable and time step,
            #so triangulate them once instead of once per interpolation call
            triangulation = Delaunay(points)

            new_vars = {}
            for var_name in ds.data_vars:
                var = ds[var_name]

                #Interpolate each time step onto the new grid
                frames = []
                for t in range(var.sizes['Time']):
                    values = var.isel(Time=t).values.ravel()
                    interpolator = LinearNDInterpolator(triangulation, values)
                    frames.append(interpolator(new_lon_grid, new_lat_grid))

                new_vars[var_name] = (
                    ['time', 'latitude', 'longitude'], np.stack(frames))

            #new_ds only has the dimensions 'time', 'latitude' and 'longitude',
            #so there is no leftover 'Time' dimension to drop afterwards
            new_ds = xr.Dataset(
                new_vars, coords={'latitude': new_latitudes,
                                  'longitude': new_longitudes,
                                  'time': ds['time'].values})

            #Carry the metadata over from the source file
            new_ds['latitude'].attrs.update(ds['latitude'].attrs)
            new_ds['longitude'].attrs.update(ds['longitude'].attrs)
            new_ds['time'].attrs.update(ds['time'].attrs)
            for var_name in ds.data_vars:
                new_ds[var_name].attrs.update(ds[var_name].attrs)
            new_ds.attrs.update(ds.attrs)

            output_file_path = os.path.join(output_dir, os.path.basename(file_name))
            new_ds.to_netcdf(output_file_path)

    print('Done regridding uWRF files!')

In [405]:
def uWRF_spatial_filtering(input_dir, output_dir, *,
                           min_lat=40.57384924257281,
                           max_lat=40.92,
                           min_lon=-74.0481110602903,
                           max_lon=-73.84627819243957):
    """Crop every uWRF file to a latitude/longitude bounding box.

    Each file in ``input_dir`` is masked to the grid points whose 'latitude'
    and 'longitude' lie inside the inclusive bounds; coordinate labels with no
    remaining points are dropped. The cropped dataset is written under the
    same file name to ``output_dir`` (created if it does not exist).

    Args:
        input_dir: Directory holding the regridded uWRF files.
        output_dir: Directory the cropped files are written to.
        min_lat: Southern bound in degrees (inclusive).
        max_lat: Northern bound in degrees (inclusive).
        min_lon: Western bound in degrees (inclusive).
        max_lon: Eastern bound in degrees (inclusive).

    The default bounds cover Manhattan.
    """
    os.makedirs(output_dir, exist_ok=True)

    #don't specify .nc bc for some reason uWRF aren't .nc files
    input_files = glob.glob(os.path.join(input_dir, '*'))

    for path in input_files:
        #The context manager closes the file even if filtering or writing fails
        with xr.open_dataset(path) as ds:
            inside_box = (
                (ds['latitude'] >= min_lat) & (ds['latitude'] <= max_lat) &
                (ds['longitude'] >= min_lon) & (ds['longitude'] <= max_lon)
            )

            #Filter the data based off of the spatial bounds
            filtered_data = ds.where(inside_box, drop=True)

            output_file_path = os.path.join(output_dir, os.path.basename(path))
            filtered_data.to_netcdf(output_file_path)

    print('Done spatially filtering files!')

In [406]:
def uWRF_combine_seq(input_dir, output_dir):
    """Concatenate all files of one uWRF run into a single NetCDF file.

    The files in ``input_dir`` are opened in sorted file-name order and
    concatenated along 'time'. The result is written to
    ``<output_dir>/uWRF_final_<date>.nc``, where ``<date>`` is the second
    '-'-separated token of the input directory name (e.g. '20191011' for
    'uWRF-20191011-spatial').

    When ``<date>`` is a valid YYYYMMDD date, 'time' is encoded as
    'hours since YYYY-MM-DD'; otherwise xarray picks the time encoding.

    Args:
        input_dir: Directory holding the spatially filtered files of one run.
        output_dir: Directory the combined file is written to (created if it
            does not exist).

    Raises:
        IndexError: If the input directory name contains no '-'.
        ValueError: If ``input_dir`` contains no files.
    """
    os.makedirs(output_dir, exist_ok=True)

    #normpath strips a trailing separator, which would make basename() return ''
    dir_name = os.path.basename(os.path.normpath(input_dir))
    date_str = dir_name.split('-')[1]

    # Don't specify .nc because uWRF files aren't .nc files
    input_files = sorted(glob.glob(os.path.join(input_dir, '*')))
    if not input_files:
        raise ValueError(f'No uWRF files found in {input_dir}')

    #Use the run date as the reference date of the time axis
    try:
        run_date = datetime.strptime(date_str, '%Y%m%d')
    except ValueError:
        encoding = {}
    else:
        encoding = {'time': {'units': f'hours since {run_date:%Y-%m-%d}'}}

    output_file_name = f'uWRF_final_{date_str}.nc'
    output_file_path = os.path.join(output_dir, output_file_name)

    datasets = []
    try:
        for path in input_files:
            datasets.append(xr.open_dataset(path))

        #Concatenate datasets along the shared 'time' dimension
        combined_dataset = xr.concat(datasets, dim='time')
        combined_dataset.to_netcdf(output_file_path, encoding=encoding)
        combined_dataset.close()
    finally:
        #Close every opened file, even when concatenating or writing fails
        for ds in datasets:
            ds.close()

    print(f'Combined dataset saved to {output_file_path}')

## Running functions to preprocess the uWRF data:

In [407]:
#STEP 1: reduce the raw Domain 2 files to the variables of interest
variables = ['T2', 'U10', 'V10', 'PSFC']
input_dir = '/Users/gabbyvaillant/Downloads/BNL/og_uWRF_files/NYC_wrfout_20191011'
output_dir = '/Users/gabbyvaillant/Downloads/BNL/uWRF-20191011-filtered'

#Uncomment the call below to run the variable filtering
#uWRF_filter_vars(input_dir, output_dir, variables)

Done filtering files


In [408]:
#STEP 2: regrid the filtered files onto ('time', 'latitude', 'longitude')
output_dir = '/Users/gabbyvaillant/Downloads/BNL/uWRF-20191011-fixed_dims'
input_dir = '/Users/gabbyvaillant/Downloads/BNL/uWRF-20191011-filtered'

#Uncomment the call below to run the regridding
#uWRF_match_dims(input_dir, output_dir)

Done regridding uWRF files!


In [409]:
#STEP 3: crop the regridded files to the area covering Manhattan
output_dir = '/Users/gabbyvaillant/Downloads/BNL/uWRF-20191011-spatial'
input_dir = '/Users/gabbyvaillant/Downloads/BNL/uWRF-20191011-fixed_dims'

#Uncomment the call below to run the spatial filtering
#uWRF_spatial_filtering(input_dir, output_dir)

Done spatially filtering files!


In [411]:
#STEP 4: combine the cropped files of the run into one sequential file
output_dir = '/Users/gabbyvaillant/Downloads/BNL/final-uWRF-files'
input_dir = '/Users/gabbyvaillant/Downloads/BNL/uWRF-20191011-spatial'

#Uncomment the call below to combine the files
#uWRF_combine_seq(input_dir, output_dir)

Combined dataset saved to /Users/gabbyvaillant/Downloads/BNL/final-uWRF-files/uWRF_final_20191011.nc
