# Downscaling another variable of interest: Wind Speed

In order to downscale wind speed, we need the same variable present in both the NAM and uWRF datasets. 

We are mostly interested in wind speed at the height of wind turbines. However, only the NAM dataset provides wind speed at that altitude; the uWRF dataset does not. Instead, we will use the U and V components of the wind at 10 meters to calculate the wind speed. 

In [7]:
import os
import glob
import xarray as xr
import numpy as np
from scipy.interpolate import griddata

In [15]:
def uWRF_filter_vars_with_pred(input_dir, output_dir, variables):
    """
    Reduce every uWRF file in a directory to the requested variables.

    Each file is opened, subset to ``variables``, has XLAT/XLONG/XTIME renamed
    to latitude/longitude/time, and is written under the same file name in
    ``output_dir``.

    Parameters:
        input_dir (str): Directory containing the uWRF files to process.
        output_dir (str): Directory that receives the filtered files (created if missing).
        variables (list[str]): Variables to keep. This should be the variable
            being downscaled and any associated predictor variable.
    """
    # Create the destination up front so to_netcdf() does not fail on a fresh tree
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the log follows the timestamp order embedded in the file names
    input_files = sorted(glob.glob(os.path.join(input_dir, '*')))

    for file in input_files:

        print(f"Processing file: {file}")

        # The context manager releases the file handle even if subsetting or writing fails
        with xr.open_dataset(file) as ds:

            # Keep the variables chosen by the user
            ds_filtered = ds[variables]

            # Rename the WRF coordinate names to the names used by the later stages
            ds_filtered = ds_filtered.rename({'XLAT': 'latitude', 'XLONG': 'longitude', 'XTIME': 'time'})

            filename = os.path.basename(file)
            output_file = os.path.join(output_dir, filename)
            print(f"Saving file to: {output_file}")
            ds_filtered.to_netcdf(output_file)
       
def main():
    """Run the stage-1 variable filter on the d02 files of days 01-31 of month 03."""
    kept_variables = ['U10', 'V10', 'PSFC']  # User chooses the variables

    for day_number in range(1, 32):
        day = f"{day_number:02d}"

        # Use either d02 files or d03 files
        remote_input_dir = f"/D4/data/gvaillant/uwrf/03/{day}/d02_files"
        remote_output_dir = f"/D4/data/gvaillant/prep-uwrf/d02/wind-stage1/03/{day}"

        print(f"Processing directory: {remote_input_dir}")
        print(f"Output directory: {remote_output_dir}")

        uWRF_filter_vars_with_pred(remote_input_dir, remote_output_dir, kept_variables)

    print("Done processing stage1 uWRF files with predictor!")


# Run stage 1 now; comment out the call below to skip this step.
main()

Processing directory: /D4/data/gvaillant/uwrf/03/01/d02_files
Output directory: /D4/data/gvaillant/prep-uwrf/d02/wind-stage1/03/01
Processing file: /D4/data/gvaillant/uwrf/03/01/d02_files/wrfout_d02_2019-03-01_00:00:00
Saving file to: /D4/data/gvaillant/prep-uwrf/d02/wind-stage1/03/01/wrfout_d02_2019-03-01_00:00:00
Processing file: /D4/data/gvaillant/uwrf/03/01/d02_files/wrfout_d02_2019-03-01_03:00:00
Saving file to: /D4/data/gvaillant/prep-uwrf/d02/wind-stage1/03/01/wrfout_d02_2019-03-01_03:00:00
Processing file: /D4/data/gvaillant/uwrf/03/01/d02_files/wrfout_d02_2019-03-01_06:00:00
Saving file to: /D4/data/gvaillant/prep-uwrf/d02/wind-stage1/03/01/wrfout_d02_2019-03-01_06:00:00
Processing file: /D4/data/gvaillant/uwrf/03/01/d02_files/wrfout_d02_2019-03-01_09:00:00
Saving file to: /D4/data/gvaillant/prep-uwrf/d02/wind-stage1/03/01/wrfout_d02_2019-03-01_09:00:00
Processing file: /D4/data/gvaillant/uwrf/03/01/d02_files/wrfout_d02_2019-03-01_12:00:00
Saving file to: /D4/data/gvaillant/pr

In [13]:
# Open one stage-1 output file and display it (notebook inspection cell).
ds = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/wind-stage1/01/31/wrfout_d02_2019-02-03_12:00:00')

ds

In [32]:
import os
import glob
import xarray as xr
import numpy as np

def wind_speed(input_dir, output_dir):
    """
    Calculates wind speed from U10 and V10 components for NetCDF files in the input directory,
    adds it as a new variable ('WS'), keeps only PSFC and WS, and saves the modified files to the output directory.

    Processing is best-effort per file: a failure on one file is reported and
    the remaining files are still processed.

    Parameters:
        input_dir (str): Path to the directory containing input NetCDF files.
        output_dir (str): Path to the directory to save the modified NetCDF files (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Get list of all files in the input directory
    input_files = sorted(glob.glob(os.path.join(input_dir, '*')))

    for file_name in input_files:
        print(f"Processing file: {file_name}")
        try:
            # The context manager closes the file even when a step below raises
            with xr.open_dataset(file_name) as ds:
                u10 = ds['U10']
                u = u10.data
                v = ds['V10'].data

                # Magnitude of the horizontal wind vector
                speed = np.sqrt(u**2 + v**2)

                # Reuse U10's dimension names instead of hard-coding them
                ds['WS'] = (u10.dims, speed)
                ds['WS'].attrs['units'] = 'm/s'
                ds['WS'].attrs['description'] = 'Calculated wind speed from U10 and V10'

                # Keep only PSFC and WS
                new_ds = ds[['PSFC', 'WS']]

                # Save the modified dataset to the output directory
                output_file_name = os.path.basename(file_name)
                output_file_path = os.path.join(output_dir, output_file_name)
                new_ds.to_netcdf(output_file_path)

            print(f"Saved file to: {output_file_path}")

        except Exception as e:
            # Deliberately broad: report the failing file and continue with the rest
            print(f"Error processing file {file_name}: {e}")

def main():
    """Add the wind-speed variable to the stage-1 files of day 31 of month 01."""
    stage1_root = "/D4/data/gvaillant/prep-uwrf/d02/wind-stage1/01"
    windspeed_root = "/D4/data/gvaillant/prep-uwrf/d02/windspeed/01"

    for day_number in range(31, 32):
        day = f"{day_number:02d}"
        input_dir = f"{stage1_root}/{day}"
        output_dir = f"{windspeed_root}/{day}"

        print(f"Processing directory: {input_dir}")
        print(f"Output directory: {output_dir}")

        wind_speed(input_dir, output_dir)
        print(f"Done processing directory: {input_dir}")

    print("Done adding wind speed variable to uWRF files!")

# Run the wind-speed step defined above.
main()

Processing directory: /D4/data/gvaillant/prep-uwrf/d02/wind-stage1/01/31
Output directory: /D4/data/gvaillant/prep-uwrf/d02/windspeed/01/31
Processing file: /D4/data/gvaillant/prep-uwrf/d02/wind-stage1/01/31/wrfout_d02_2019-01-31_00:00:00
Saved file to: /D4/data/gvaillant/prep-uwrf/d02/windspeed/01/31/wrfout_d02_2019-01-31_00:00:00
Processing file: /D4/data/gvaillant/prep-uwrf/d02/wind-stage1/01/31/wrfout_d02_2019-01-31_03:00:00
Saved file to: /D4/data/gvaillant/prep-uwrf/d02/windspeed/01/31/wrfout_d02_2019-01-31_03:00:00
Processing file: /D4/data/gvaillant/prep-uwrf/d02/wind-stage1/01/31/wrfout_d02_2019-01-31_06:00:00
Saved file to: /D4/data/gvaillant/prep-uwrf/d02/windspeed/01/31/wrfout_d02_2019-01-31_06:00:00
Processing file: /D4/data/gvaillant/prep-uwrf/d02/wind-stage1/01/31/wrfout_d02_2019-01-31_09:00:00
Saved file to: /D4/data/gvaillant/prep-uwrf/d02/windspeed/01/31/wrfout_d02_2019-01-31_09:00:00
Processing file: /D4/data/gvaillant/prep-uwrf/d02/wind-stage1/01/31/wrfout_d02_2019-

In [26]:
# Open one wind-speed output file and display it (notebook inspection cell).
ds = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/windspeed/03/30/wrfout_d02_2019-04-02_12:00:00')

ds

In [36]:
def uWRF_match_dims_with_pred(input_dir, output_dir):
    """
    Rebuild every uWRF file in a directory on (time, latitude, longitude) dimensions.

    The latitude/longitude arrays of the first timestep are reduced to 1-D
    coordinates, every data variable is re-labelled with the new dimension
    names, and coordinate, variable and global attributes are carried over.

    Parameters:
        input_dir (str): Directory containing the files to convert.
        output_dir (str): Directory that receives the converted files (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)

    input_files = sorted(glob.glob(os.path.join(input_dir, '*')))

    for file_name in input_files:

        print(f"Processing file: {file_name}")

        # The context manager releases the file handle even if the conversion fails
        with xr.open_dataset(file_name) as ds:

            # Latitude/longitude of the first timestep; indexed as [time, row, column]
            lat_values = ds['latitude'].values[0, :, :]
            lon_values = ds['longitude'].values[0, :, :]
            time = ds['time']

            # Re-label every data variable with the new dimension names
            new_vars = {
                var_name: (['time', 'latitude', 'longitude'], ds[var_name].values)
                for var_name in ds.data_vars
            }

            # NOTE(review): taking the first column of latitudes and the first row of
            # longitudes treats the grid as rectilinear -- confirm this is acceptable
            # for the uWRF grid.
            new_ds = xr.Dataset(
                new_vars,
                coords={
                    'latitude': (['latitude'], lat_values[:, 0]),  # Convert to 1D
                    'longitude': (['longitude'], lon_values[0, :]),  # Convert to 1D
                    'time': time.values
                }
            )

            # Preserve coordinate attributes
            new_ds['latitude'].attrs.update(ds['latitude'].attrs)
            new_ds['longitude'].attrs.update(ds['longitude'].attrs)
            new_ds['time'].attrs.update(ds['time'].attrs)

            # Preserve variable attributes
            for var_name in ds.data_vars:
                new_ds[var_name].attrs.update(ds[var_name].attrs)

            # Preserve global attributes
            new_ds.attrs.update(ds.attrs)

            output_file_name = os.path.basename(file_name)
            output_file_path = os.path.join(output_dir, output_file_name)
            new_ds.to_netcdf(output_file_path)

        print(f"Saved file to: {output_file_path}")

    print("Done with uWRF dimension adjustment!")


def main():
    """Run the stage-2 dimension adjustment on days 01-31 of month 03."""
    windspeed_root = "/D4/data/gvaillant/prep-uwrf/d02/windspeed/03"
    stage2_root = "/D4/data/gvaillant/prep-uwrf/d02/wind-stage2/03"

    for day_number in range(1, 32):
        day = f"{day_number:02d}"
        input_dir = f"{windspeed_root}/{day}"
        output_dir = f"{stage2_root}/{day}"

        print(f"Processing directory: {input_dir}")
        print(f"Output directory: {output_dir}")

        uWRF_match_dims_with_pred(input_dir, output_dir)
        print(f"Done processing directory: {input_dir}")

    print("Done processing stage2 uWRF files!")

#Uncomment below to run:
#main()

In [39]:
def adjust_bounds_to_match(lat_values, lon_values, min_lat, max_lat, min_lon, max_lon):
    """
    Widen the upper bounds until both axes select the same number of points
    and that number is divisible by 4.

    Only ``max_lat`` and ``max_lon`` are ever changed; each is raised by the
    spacing between the first two coordinate values. If no point falls inside
    the initial bounds on either axis, the bounds are returned unchanged
    (0 == 0 and 0 is divisible by 4).

    Parameters:
        lat_values, lon_values: 1-D coordinate vectors with increasing values.
        min_lat, max_lat, min_lon, max_lon: Initial inclusive bounds.

    Returns:
        tuple: (min_lat, max_lat, min_lon, max_lon) with the adjusted upper bounds.

    Raises:
        ValueError: If a coordinate vector is not 1-D with at least two values,
            if its spacing is not positive, or if the data runs out before the
            counts can be made equal and divisible by 4.
    """
    lat_values = np.asarray(lat_values)
    lon_values = np.asarray(lon_values)

    for axis_name, values in (("latitude", lat_values), ("longitude", lon_values)):
        if values.ndim != 1 or values.size < 2:
            raise ValueError(f"{axis_name} must be a 1-D array with at least two values")

    lat_step = lat_values[1] - lat_values[0]
    lon_step = lon_values[1] - lon_values[0]
    if lat_step <= 0 or lon_step <= 0:
        # A zero or negative step would never enlarge the selection
        raise ValueError("latitude and longitude values must be increasing")

    # Once a bound reaches the largest coordinate, raising it cannot add points
    lat_limit = lat_values.max()
    lon_limit = lon_values.max()

    while True:
        lat_count = np.count_nonzero((lat_values >= min_lat) & (lat_values <= max_lat))
        lon_count = np.count_nonzero((lon_values >= min_lon) & (lon_values <= max_lon))

        if lat_count == lon_count and lat_count % 4 == 0:
            break  # Conditions satisfied: same count, divisible by 4

        # The axis with fewer points grows; on a tie both grow together
        grow_lat = lat_count <= lon_count
        grow_lon = lon_count <= lat_count

        if (grow_lat and max_lat >= lat_limit) or (grow_lon and max_lon >= lon_limit):
            # Without this guard the loop would spin forever with unchanged counts
            raise ValueError(
                "Cannot match latitude/longitude counts: the bounds already "
                "cover all available coordinate values"
            )

        if grow_lat:
            max_lat += lat_step
        if grow_lon:
            max_lon += lon_step

    return min_lat, max_lat, min_lon, max_lon


def uWRF_spatial_cut(input_dir, output_dir, min_lat, max_lat, min_lon, max_lon):
    """
    Spatially filter every uWRF file in a directory to the given bounding box.

    For 1-D latitude/longitude coordinates the upper bounds are first widened
    by ``adjust_bounds_to_match`` so both axes keep the same number of points,
    divisible by 4, and the box is then selected by coordinate slice. For 2-D
    coordinates the box is applied as a mask and fully masked rows/columns are
    dropped.

    Parameters:
        input_dir (str): Directory containing the files to cut.
        output_dir (str): Directory that receives the cut files (created if missing).
        min_lat, max_lat, min_lon, max_lon (float): Requested bounding box.
    """
    os.makedirs(output_dir, exist_ok=True)

    input_files = sorted(glob.glob(os.path.join(input_dir, '*')))

    for file in input_files:
        print(f"Processing file: {file}")

        # The context manager releases the file handle even if the cut or the write fails
        with xr.open_dataset(file) as ds:
            lat = ds['latitude']
            lon = ds['longitude']

            # Handle 1D or 2D coordinates dynamically
            if lat.ndim == 1 and lon.ndim == 1:
                # Adjusted bounds stay local so one file's adjustment never
                # changes the box requested for the next file
                lat_lo, lat_hi, lon_lo, lon_hi = adjust_bounds_to_match(
                    lat.values, lon.values, min_lat, max_lat, min_lon, max_lon
                )

                filtered_data = ds.sel(
                    latitude=slice(lat_lo, lat_hi),
                    longitude=slice(lon_lo, lon_hi)
                )
            else:
                # 2D coordinates: mask the box. No bound adjustment is attempted here;
                # the earlier post-filter adjustment never affected the data written.
                lat_mask = (lat >= min_lat) & (lat <= max_lat)
                lon_mask = (lon >= min_lon) & (lon <= max_lon)
                combined_mask = lat_mask & lon_mask

                filtered_data = ds.where(combined_mask, drop=True)

            # Save the output
            filename = os.path.basename(file)
            output_file = os.path.join(output_dir, filename)
            print(f"Saving file to: {output_file}")
            filtered_data.to_netcdf(output_file)


def main():
    """Cut the stage-2 files of days 01-31 of month 03 down to the NYC bounding box."""
    # Bounds to cover NYC and Boroughs
    min_lat, max_lat = 40.4774, 40.9176
    min_lon, max_lon = -74.2591, -73.7004

    stage2_root = "/D4/data/gvaillant/prep-uwrf/d02/wind-stage2/03"
    stage3_root = "/D4/data/gvaillant/prep-uwrf/d02/wind-stage3/03"

    for day_number in range(1, 32):
        day = f"{day_number:02d}"
        input_dir = f"{stage2_root}/{day}"
        output_dir = f"{stage3_root}/{day}"

        print(f"Processing directory: {input_dir}")
        print(f"Output directory: {output_dir}")

        uWRF_spatial_cut(input_dir, output_dir, min_lat, max_lat, min_lon, max_lon)
        print("Done spatially filtering the uWRF files to NYC!")


#main()

Processing directory: /D4/data/gvaillant/prep-uwrf/d02/wind-stage2/03/01
Output directory: /D4/data/gvaillant/prep-uwrf/d02/wind-stage3/03/01
Processing file: /D4/data/gvaillant/prep-uwrf/d02/wind-stage2/03/01/wrfout_d02_2019-03-01_00:00:00
Saving file to: /D4/data/gvaillant/prep-uwrf/d02/wind-stage3/03/01/wrfout_d02_2019-03-01_00:00:00
Processing file: /D4/data/gvaillant/prep-uwrf/d02/wind-stage2/03/01/wrfout_d02_2019-03-01_03:00:00
Saving file to: /D4/data/gvaillant/prep-uwrf/d02/wind-stage3/03/01/wrfout_d02_2019-03-01_03:00:00
Processing file: /D4/data/gvaillant/prep-uwrf/d02/wind-stage2/03/01/wrfout_d02_2019-03-01_06:00:00
Saving file to: /D4/data/gvaillant/prep-uwrf/d02/wind-stage3/03/01/wrfout_d02_2019-03-01_06:00:00
Processing file: /D4/data/gvaillant/prep-uwrf/d02/wind-stage2/03/01/wrfout_d02_2019-03-01_09:00:00
Saving file to: /D4/data/gvaillant/prep-uwrf/d02/wind-stage3/03/01/wrfout_d02_2019-03-01_09:00:00
Processing file: /D4/data/gvaillant/prep-uwrf/d02/wind-stage2/03/01/wr

In [40]:
import os
import glob
import xarray as xr

def uWRF_combine_seq(input_dirs, output_dir):
    """
    Concatenate all daily uWRF files of the given month directories along 'time'.

    Every month directory is expected to contain one subdirectory per day.
    Days and files are visited in sorted order, and the result is written to
    ``uWRF_final_<months>.nc`` in ``output_dir``, where ``<months>`` joins the
    month directory names with '-'.

    Parameters:
        input_dirs (list[str]): Month directories to combine, in output order.
        output_dir (str): Directory that receives the combined file (created if missing).

    Raises:
        FileNotFoundError: If a month directory does not exist.
        ValueError: If no files are found under ``input_dirs``.
    """
    opened = []

    try:
        for input_dir in input_dirs:
            # Loop through each day's subdirectory within the monthly directory
            day_dirs = sorted(entry.name for entry in os.scandir(input_dir) if entry.is_dir())
            for day_dir in day_dirs:
                day_path = os.path.join(input_dir, day_dir)
                for file in sorted(glob.glob(os.path.join(day_path, '*'))):
                    opened.append(xr.open_dataset(file))

        if not opened:
            raise ValueError(f"No uWRF files found under: {input_dirs}")

        # Concatenate all datasets along the 'time' dimension
        combined_dataset = xr.concat(opened, dim='time')

        # normpath drops a trailing slash so basename still yields the month name
        month_range = "-".join(os.path.basename(os.path.normpath(month_dir)) for month_dir in input_dirs)
        output_file_name = f'uWRF_final_{month_range}.nc'

        os.makedirs(output_dir, exist_ok=True)
        output_file_path = os.path.join(output_dir, output_file_name)

        # NOTE(review): the reference date below was flagged by the author as not
        # correct; it only sets the on-disk time encoding. Confirm the intended epoch.
        combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': 'hours since 2019-10-11'}})
    finally:
        # Release every source file handle, including on failure
        for dataset in opened:
            dataset.close()

    print(f'Combined dataset saved to {output_file_path}')

def main():
    """Combine the stage-3 files of months 01 and 02 into the training file."""
    stage3_root = "/D4/data/gvaillant/prep-uwrf/d02/wind-stage3"
    input_dirs = [f"{stage3_root}/{month}" for month in ("01", "02")]

    output_dir = '/D4/data/gvaillant/prep-uwrf/d02/wind-NYC-split/train'
    print(f"Output directory: {output_dir}")

    uWRF_combine_seq(input_dirs, output_dir)
    print("Done combining uWRF files!")

#Uncomment below to run:
#main()

Output directory: /D4/data/gvaillant/prep-uwrf/d02/wind-NYC-split/train
Combined dataset saved to /D4/data/gvaillant/prep-uwrf/d02/wind-NYC-split/train/uWRF_final_01-02.nc
Done combining uWRF files!


In [42]:
def uWRF_val_test(input_dirs, output_dir, split='val'):
    """
    Concatenate one half of each day's uWRF files along 'time'.

    Every month directory is expected to contain one subdirectory per day.
    Within each day the sorted files are split in two: the first half is the
    validation set and the second half is the testing set. The result is
    written to ``uWRF_final_<months>.nc`` in ``output_dir``.

    Parameters:
        input_dirs (list[str]): Month directories to read, in output order.
        output_dir (str): Directory that receives the combined file (created if missing).
        split (str): 'val' (default) for the first half of each day,
            'test' for the second half.

    Raises:
        ValueError: If ``split`` is not 'val' or 'test', or no files are selected.
        FileNotFoundError: If a month directory does not exist.
    """
    if split not in ('val', 'test'):
        raise ValueError(f"split must be 'val' or 'test', got {split!r}")

    opened = []

    try:
        for input_dir in input_dirs:
            # Loop through each day's subdirectory within the monthly directory
            day_dirs = sorted(entry.name for entry in os.scandir(input_dir) if entry.is_dir())
            for day_dir in day_dirs:
                day_path = os.path.join(input_dir, day_dir)
                nc_files = sorted(glob.glob(os.path.join(day_path, '*')))

                # With an odd number of files the extra one goes to the second (test) half
                half_length = len(nc_files) // 2
                if split == 'val':
                    selected_files = nc_files[:half_length]
                else:
                    selected_files = nc_files[half_length:]

                for file in selected_files:
                    opened.append(xr.open_dataset(file))

        if not opened:
            raise ValueError(f"No uWRF files selected under: {input_dirs}")

        # Concatenate all datasets along the 'time' dimension
        combined_dataset = xr.concat(opened, dim='time')

        # normpath drops a trailing slash so basename still yields the month name
        month_range = "-".join(os.path.basename(os.path.normpath(month_dir)) for month_dir in input_dirs)
        output_file_name = f'uWRF_final_{month_range}.nc'

        os.makedirs(output_dir, exist_ok=True)
        output_file_path = os.path.join(output_dir, output_file_name)

        # NOTE(review): this reference date only sets the on-disk time encoding;
        # confirm the intended epoch.
        combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': 'hours since 2019-10-11'}})
    finally:
        # Release every source file handle, including on failure
        for dataset in opened:
            dataset.close()

    print(f'Combined dataset saved to {output_file_path}')

def main():
    """Build the validation file for month 03 from the stage-3 files."""
    input_dirs = [
        "/D4/data/gvaillant/prep-uwrf/d02/wind-stage3/03"
    ]
    output_dir = '/D4/data/gvaillant/prep-uwrf/d02/wind-NYC-split/val' #change to val or test
    print(f"Output directory: {output_dir}")

    # Use uWRF_val_test, not uWRF_combine_seq: combining the whole month here
    # would put every file of month 03 into the validation set.
    uWRF_val_test(input_dirs, output_dir)
    print("Done combining uWRF files!")

#Uncomment below to run:
#main()

Output directory: /D4/data/gvaillant/prep-uwrf/d02/wind-NYC-split/val
Combined dataset saved to /D4/data/gvaillant/prep-uwrf/d02/wind-NYC-split/val/uWRF_final_03.nc
Done combining uWRF files!


# NAM functions: 

In [None]:
#Fix this one for wind speed
import netCDF4
import xarray as xr
import os
import glob
import numpy as np
from scipy.interpolate import griddata

def NAM_filter_and_match_dims(input_dir, output_dir, variables, grid_size=67):
    """
    Filter NAM files to the requested variables and regrid them onto a regular
    latitude/longitude grid.

    For every ``.nc`` file in ``input_dir`` the requested variables are kept
    and renamed, 'time' is moved to the first dimension, longitudes above 180
    are shifted by -360, and each time step is linearly interpolated onto a
    ``grid_size`` x ``grid_size`` grid spanning the original latitude and
    longitude range. Files that fail during the filtering step are reported
    and skipped.

    Parameters:
        input_dir (str): Directory containing the NAM NetCDF files.
        output_dir (str): Directory that receives the regridded files (created if missing).
        variables (dict[str, str]): Maps original variable names to new names.
            Names that are absent from a file are ignored for that file.
        grid_size (int): Number of points along each axis of the new grid.
    """
    os.makedirs(output_dir, exist_ok=True)

    file_names = sorted(name for name in os.listdir(input_dir) if name.endswith('.nc'))

    for file_name in file_names:
        file_path = os.path.join(input_dir, file_name)

        try:
            # Step 1: filter and rename the variables
            with xr.open_dataset(file_path) as ds:
                # Rename only what this file has; a rename map containing a
                # missing name would make the whole file fail
                present = {orig: new for orig, new in variables.items() if orig in ds}
                if not present:
                    print(f"No matching variables found in {file_name}.")
                    continue

                ds_filtered = xr.Dataset({orig: ds[orig] for orig in present}).rename(present)

                # Put 'time' first so each time step is a leading-axis slice
                for var in ds_filtered.data_vars:
                    if 'time' in ds_filtered[var].dims:
                        dims = ('time',) + tuple(d for d in ds_filtered[var].dims if d != 'time')
                        ds_filtered[var] = ds_filtered[var].transpose(*dims)

                for orig_var, new_var in present.items():
                    ds_filtered[new_var].attrs = ds[orig_var].attrs

                ds_filtered.attrs = ds.attrs

                # Shift longitudes above 180 into the negative (western) range
                # NOTE(review): the values become negative while the unit is
                # labelled 'degrees_west' -- confirm the intended convention.
                if 'longitude' in ds_filtered:
                    lon = ds_filtered['longitude'].values
                    lon = np.where(lon > 180, lon - 360, lon)
                    ds_filtered['longitude'].values = lon
                    ds_filtered['longitude'].attrs['units'] = 'degrees_west'

                # Read everything into memory before the file is closed;
                # step 2 runs after the 'with' block has released the file
                ds_filtered = ds_filtered.load()

        except Exception as e:
            print(f"Error processing file {file_name}: {e}")
            continue

        # Step 2: interpolate every variable onto the regular grid
        latitudes = ds_filtered['latitude'].values
        longitudes = ds_filtered['longitude'].values
        time = ds_filtered['time']

        # Source points as (lon, lat) pairs, in the same row-major order as the data
        points = np.column_stack((longitudes.ravel(), latitudes.ravel()))

        # Define the new latitude and longitude grid
        new_latitudes = np.linspace(np.min(latitudes), np.max(latitudes), num=grid_size)
        new_longitudes = np.linspace(np.min(longitudes), np.max(longitudes), num=grid_size)
        new_lon_grid, new_lat_grid = np.meshgrid(new_longitudes, new_latitudes)

        new_vars = {}
        for var_name in ds_filtered.data_vars:
            # One 2-D field per time step (time is the leading axis)
            regridded = [
                griddata(points, field.ravel(), (new_lon_grid, new_lat_grid), method='linear')
                for field in ds_filtered[var_name].values
            ]
            new_vars[var_name] = (['time', 'latitude', 'longitude'], np.stack(regridded))

        # Create a new xarray Dataset
        new_ds = xr.Dataset(
            new_vars, coords={'latitude': new_latitudes,
                              'longitude': new_longitudes,
                              'time': time.values})

        # Carry over coordinate, variable and global attributes
        new_ds['time'].attrs.update(ds_filtered['time'].attrs)
        new_ds['latitude'].attrs.update(ds_filtered['latitude'].attrs)
        new_ds['longitude'].attrs.update(ds_filtered['longitude'].attrs)
        for var_name in ds_filtered.data_vars:
            new_ds[var_name].attrs.update(ds_filtered[var_name].attrs)
        new_ds.attrs.update(ds_filtered.attrs)

        # Saving files
        output_file = os.path.join(output_dir, file_name)
        print(f"Saving file to: {output_file}")  # Print the output file path
        new_ds.to_netcdf(output_file)

def main():
    """Filter and regrid the NAM files of month 03.

    NOTE(review): 'TMP_2maboveground' is renamed to 'U10' below; judging by its
    name this looks like a temperature field rather than a wind component --
    confirm the NAM variable names before using the output for wind speed.
    """
    nam_input_dir = "/D4/data/gvaillant/NAM-2019-netcdf/03"
    nam_output_dir = "/D4/data/gvaillant/NAM/2019/match-pred/03"

    # Original NAM variable name -> name used in the output files
    rename_map = {
        'TMP_2maboveground': 'U10',
        'PRES_surface': 'PSFC',
    }

    print(f"Output directory: {nam_output_dir}")

    NAM_filter_and_match_dims(nam_input_dir, nam_output_dir, rename_map)

    print("Done processing stage1 NAM files!")

#main()

In [43]:
# Open one raw NAM file and display it (notebook inspection cell).
ds = xr.open_dataset('/D4/data/gvaillant/NAM-2019-netcdf/01/domnys-nam_218_20190101_0000_000.nc')
# TODO: find the NAM variables that correspond to U10 and V10
ds

In [None]:
# FINAL uWRF files for wind speed, with surface pressure as a predictor.
# Train is the combined 01-02 file; val and test are both month-03 files.

uwrf_train_data = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/wind-NYC-split/train/uWRF_final_01-02.nc')
uwrf_val_data = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/wind-NYC-split/val/uWRF_final_03.nc')
uwrf_test_data = xr.open_dataset('/D4/data/gvaillant/prep-uwrf/d02/wind-NYC-split/test/uWRF_final_03.nc')

# ---
# NAM: