In [None]:
import os
import glob
import xarray as xr
import numpy as np
from scipy.interpolate import griddata

In [5]:
def uWRF_filter_vars(input_dir, output_dir, variables):
    """Write a copy of every file in ``input_dir`` that keeps only ``variables``.

    Each input file is opened with xarray, subset to ``variables``, and has
    ``XLAT``/``XLONG``/``XTIME`` renamed to ``latitude``/``longitude``/``time``
    before being saved as NetCDF in ``output_dir`` under the same file name.

    Parameters
    ----------
    input_dir : str
        Directory whose files are read (no extension filter is applied).
    output_dir : str
        Directory the filtered files are written to; created if missing.
    variables : list of str
        Names of the variables to keep.
    """
    os.makedirs(output_dir, exist_ok=True)

    # '*' also matches sub-directories; keep regular files only and sort them
    # so the processing order is deterministic.
    input_files = sorted(
        path for path in glob.glob(os.path.join(input_dir, '*'))
        if os.path.isfile(path)
    )

    for file in input_files:
        # The context manager releases the file handle even if writing fails.
        with xr.open_dataset(file) as ds:
            ds_filtered = ds[variables]

            #Rename specific variables
            ds_filtered = ds_filtered.rename({'XLAT': 'latitude', 'XLONG': 'longitude', 'XTIME': 'time'})

            filename = os.path.basename(file)
            output_file = os.path.join(output_dir, filename)
            print(f"Saving file to: {output_file}")
            ds_filtered.to_netcdf(output_file)
       
def main():
    """Run stage 1 (``uWRF_filter_vars``, keeping ``T2``) for days 01-31 of month 01."""
    for day_number in range(1, 32):
        day = str(day_number).zfill(2)

        #CHANGED TO DOMAIN 2 FOR ONLY SPATIAL DOWNSCALING
        source_dir = f"/D4/data/gvaillant/uwrf/01/{day}/d02_files"
        print(f"Processing directory: {source_dir}")

        target_dir = f"/D4/data/gvaillant/prep-uwrf/12stage1/01/{day}"
        print(f"Output directory: {target_dir}")

        uWRF_filter_vars(source_dir, target_dir, ['T2'])

    print("Done processing stage1 uWRF files!")

#main()

In [3]:
def uWRF_match_dims(input_dir, output_dir):
    """Regrid every file in ``input_dir`` onto a regular latitude/longitude grid.

    For each file, the ``latitude``/``longitude`` fields of the first time step
    are used as the source points. Every data variable is linearly
    interpolated, one ``Time`` step at a time, onto an evenly spaced grid that
    spans the same min/max latitude and longitude and has the same number of
    rows and columns as the source fields. The result has dimensions
    ``(time, latitude, longitude)``, carries over the variable, coordinate and
    global attributes, and is written to ``output_dir`` under the original
    file name.

    Parameters
    ----------
    input_dir : str
        Directory whose files are read (no extension filter is applied).
    output_dir : str
        Directory the regridded files are written to; created if missing.

    Notes
    -----
    Assumes every data variable has a ``Time`` dimension and, per time step,
    the same shape as the latitude/longitude fields -- TODO confirm for
    variables other than the ones produced by the previous stage.
    Target cells that fall outside the convex hull of the source points get
    ``griddata``'s default fill value (NaN).
    """
    os.makedirs(output_dir, exist_ok=True)

    # '*' also matches sub-directories; keep regular files only, in sorted order.
    input_files = sorted(
        path for path in glob.glob(os.path.join(input_dir, '*'))
        if os.path.isfile(path)
    )

    for file_name in input_files:

        print(f"Processing file: {file_name}")
        # The context manager releases the file handle even if regridding fails.
        with xr.open_dataset(file_name) as ds:

            #take all the lat/lon values from the first time step
            latitudes = ds['latitude'].values[0, :, :]
            longitudes = ds['longitude'].values[0, :, :]

            #Source points as (lon, lat) rows, built without a Python-level loop
            points = np.column_stack((longitudes.ravel(), latitudes.ravel()))

            #Define the new latitude and longitude grid
            new_latitudes = np.linspace(np.min(latitudes), np.max(latitudes), num=latitudes.shape[0])
            new_longitudes = np.linspace(np.min(longitudes), np.max(longitudes), num=longitudes.shape[1])

            new_lon_grid, new_lat_grid = np.meshgrid(new_longitudes, new_latitudes)

            new_vars = {}

            for var_name in ds.data_vars:
                var = ds[var_name]

                #Interpolate each time step onto the new grid
                new_var_list = [
                    griddata(points,
                             var.isel(Time=t).values.ravel(),
                             (new_lon_grid, new_lat_grid),
                             method='linear')
                    for t in range(var.sizes['Time'])
                ]

                new_vars[var_name] = (['time', 'latitude', 'longitude'], np.stack(new_var_list))

            # The new dataset is built from scratch, so it only ever has the
            # 'time', 'latitude' and 'longitude' dimensions.
            new_ds = xr.Dataset(
                new_vars, coords={'latitude': new_latitudes,
                                  'longitude': new_longitudes,
                                  'time': ds['time'].values})

            new_ds['latitude'].attrs.update(ds['latitude'].attrs)
            new_ds['longitude'].attrs.update(ds['longitude'].attrs)
            new_ds['time'].attrs.update(ds['time'].attrs)

            for var_name in ds.data_vars:
                new_ds[var_name].attrs.update(ds[var_name].attrs)

            new_ds.attrs.update(ds.attrs)

            output_file_name = os.path.basename(file_name)
            output_file_path = os.path.join(output_dir, output_file_name)
            new_ds.to_netcdf(output_file_path)


def main():
    """Run stage 2 (``uWRF_match_dims``) for days 01-31 of month 01."""
    for i in range(1, 32):
        day = str(i).zfill(2)

        input_dir = f"/D4/data/gvaillant/prep-uwrf/d02/stage1/01/{day}"
        print(f"Processing directory: {input_dir}")

        output_dir = f"/D4/data/gvaillant/prep-uwrf/d02/stage2/01/{day}"
        print(f"Output directory: {output_dir}")

        uWRF_match_dims(input_dir, output_dir)
        # Report the directory that was actually processed rather than a
        # separately hard-coded path that can drift out of sync.
        print(f"Done processing directory: {input_dir}")

    print("Done processing stage2 uWRF files!")

#main()

In [6]:
def uWRF_spatial_filtering(input_dir, output_dir,
                           min_lat=40.533801, max_lat=40.955109,
                           min_lon=-74.131557, max_lon=-73.762832):
    """Crop every file in ``input_dir`` to a latitude/longitude bounding box.

    Each file is opened with xarray, masked to the points whose ``latitude``
    and ``longitude`` lie inside the (inclusive) bounds, with fully masked
    rows/columns dropped, and written to ``output_dir`` under the same name.

    Parameters
    ----------
    input_dir : str
        Directory whose files are read (no extension filter is applied).
    output_dir : str
        Directory the cropped files are written to; created if missing.
    min_lat, max_lat, min_lon, max_lon : float, optional
        Inclusive bounds of the box. The defaults are the bounds chosen to
        cover Manhattan.
    """
    # Alternative bounds previously noted in this function:
    #   min_lat = 40.57384924257281, max_lat = 40.92
    #   min_lon = -74.0481110602903, max_lon = -73.84627819243957

    lat_var = 'latitude'
    lon_var = 'longitude'

    os.makedirs(output_dir, exist_ok=True)

    # '*' = not specifying file format; keep regular files only, in sorted order.
    input_files = sorted(
        path for path in glob.glob(os.path.join(input_dir, '*'))
        if os.path.isfile(path)
    )

    for file in input_files:
        print(f"Processing file: {file}")
        # The context manager releases the file handle even if writing fails.
        with xr.open_dataset(file) as ds:

            #Filter the data based off of the spatial bounds
            inside_box = (
                (ds[lat_var] >= min_lat) & (ds[lat_var] <= max_lat) &
                (ds[lon_var] >= min_lon) & (ds[lon_var] <= max_lon)
            )
            filtered_data = ds.where(inside_box, drop=True)

            filename = os.path.basename(file)
            output_file_path = os.path.join(output_dir, filename)
            filtered_data.to_netcdf(output_file_path)

def main():
    """Run stage 3 (``uWRF_spatial_filtering``) for days 01-31 of month 01."""
    for i in range(1, 32):
        day = str(i).zfill(2)

        input_dir = f"/D4/data/gvaillant/prep-uwrf/stage2/01/{day}"
        print(f"Processing directory: {input_dir}")

        output_dir = f"/D4/data/gvaillant/prep-uwrf/12stage3/01/{day}"
        print(f"Output directory: {output_dir}")

        uWRF_spatial_filtering(input_dir, output_dir)
        # Report the directory that was actually processed rather than a
        # separately hard-coded path that can drift out of sync.
        print(f"Done processing directory: {input_dir}")

    print("Done processing stage3 uWRF files!")
        

#main()

In [2]:
def uWRF_combine_seq(input_dirs, output_dir):
    """Concatenate all files under the given monthly directories along ``time``.

    Every entry of ``input_dirs`` is expected to contain one sub-directory per
    day. Days are visited in sorted order, and the files inside each day are
    also sorted, so the datasets are concatenated in that order. The result is
    saved as ``uWRF_final_<months>.nc`` in ``output_dir``, where ``<months>``
    is the last path component of each input directory joined with ``-``.

    Parameters
    ----------
    input_dirs : list of str
        Monthly directories, each holding daily sub-directories of files.
    output_dir : str
        Directory the combined file is written to; created if missing.

    Raises
    ------
    FileNotFoundError
        If one of ``input_dirs`` does not exist.
    ValueError
        If no input file is found in any of the daily sub-directories.
    """
    input_files = []
    for input_dir in input_dirs:
        # Loop through each day's subdirectory within the monthly directory
        with os.scandir(input_dir) as entries:
            day_dirs = sorted(entry.name for entry in entries if entry.is_dir())
        for day_dir in day_dirs:
            day_path = os.path.join(input_dir, day_dir)  # Full path to each day's subdirectory
            input_files.extend(sorted(glob.glob(os.path.join(day_path, '*'))))

    if not input_files:
        # Fail with an explicit message instead of an opaque concat error.
        raise ValueError(f"No input files found under: {list(input_dirs)}")

    # Generate output filename based on month range from input directories;
    # normpath keeps the month label intact when a path ends with a separator.
    month_range = "-".join(os.path.basename(os.path.normpath(month_dir)) for month_dir in input_dirs)
    output_file_name = f'uWRF_final_{month_range}.nc'
    output_file_path = os.path.join(output_dir, output_file_name)
    os.makedirs(output_dir, exist_ok=True)

    combined_datasets = []
    try:
        for file in input_files:
            combined_datasets.append(xr.open_dataset(file))

        # Concatenate all datasets along the 'time' dimension
        combined_dataset = xr.concat(combined_datasets, dim='time')

        # Save the concatenated dataset to a NetCDF file
        combined_dataset.to_netcdf(output_file_path)
    finally:
        # Release every file handle, whether or not the write succeeded.
        for dataset in combined_datasets:
            dataset.close()

    print(f'Combined dataset saved to {output_file_path}')

def main():
    """Combine the files of months 01 and 02 into the ``pred-split/train`` directory."""
    train_dir = '/D4/data/gvaillant/prep-uwrf/d02/pred-split/train'
    month_dirs = [
        "/D4/data/gvaillant/prep-uwrf/d02/pred-stage2/01",
        "/D4/data/gvaillant/prep-uwrf/d02/pred-stage2/02",
    ]

    print(f"Output directory: {train_dir}")
    uWRF_combine_seq(month_dirs, train_dir)

    print("Done combining uWRF files!")


#main()

In [1]:
def uWRF_val_test(input_dirs, output_dir, split='test'):
    """Concatenate one half of each day's files along ``time``.

    Every entry of ``input_dirs`` is expected to contain one sub-directory per
    day. Within each day the files are sorted and split at ``len(files) // 2``:
    the first half is the validation portion, the remainder is the testing
    portion. The selected files of all days are concatenated in order and
    saved as ``uWRF_final_<months>.nc`` in ``output_dir``, where ``<months>``
    is the last path component of each input directory joined with ``-``.

    Parameters
    ----------
    input_dirs : list of str
        Monthly directories, each holding daily sub-directories of files.
    output_dir : str
        Directory the combined file is written to; created if missing.
    split : {'test', 'val'}, optional
        Which half of each day to keep: ``'val'`` selects the first half,
        ``'test'`` (the default) selects the second half.

    Raises
    ------
    FileNotFoundError
        If one of ``input_dirs`` does not exist.
    ValueError
        If ``split`` is not ``'val'`` or ``'test'``, or if no file is selected.
    """
    if split not in ('val', 'test'):
        raise ValueError(f"split must be 'val' or 'test', got {split!r}")

    selected_files = []
    for input_dir in input_dirs:
        #Loop through each day's subdirectory within the monthly directory
        with os.scandir(input_dir) as entries:
            day_dirs = sorted(entry.name for entry in entries if entry.is_dir())
        for day_dir in day_dirs:
            day_path = os.path.join(input_dir, day_dir)  # Full path to each day's subdirectory
            nc_files = sorted(glob.glob(os.path.join(day_path, '*')))

            #Take the first half of the files (VALIDATION) or the second half (TESTING)
            half_length = len(nc_files) // 2
            if split == 'val':
                selected_files.extend(nc_files[:half_length])
            else:
                selected_files.extend(nc_files[half_length:])

    if not selected_files:
        # Fail with an explicit message instead of an opaque concat error.
        raise ValueError(f"No input files selected under: {list(input_dirs)}")

    #Generate output filename based on month range from input directories;
    #normpath keeps the month label intact when a path ends with a separator.
    month_range = "-".join(os.path.basename(os.path.normpath(month_dir)) for month_dir in input_dirs)
    output_file_name = f'uWRF_final_{month_range}.nc'
    output_file_path = os.path.join(output_dir, output_file_name)
    os.makedirs(output_dir, exist_ok=True)

    combined_datasets = []
    try:
        for file in selected_files:
            combined_datasets.append(xr.open_dataset(file))

        #Concatenate all datasets along the 'time' dimension
        combined_dataset = xr.concat(combined_datasets, dim='time')
        combined_dataset.to_netcdf(output_file_path)
    finally:
        # Release every file handle, whether or not the write succeeded.
        for dataset in combined_datasets:
            dataset.close()

    print(f'Combined dataset saved to {output_file_path}')

def main():
    """Build the split file for month 03 with ``uWRF_val_test``."""
    input_dirs = [
        "/D4/data/gvaillant/prep-uwrf/d02/pred-stage2/03"
    ]
    output_dir = '/D4/data/gvaillant/prep-uwrf/d02/pred-split/test' #EDIT: Change to test or val
    print(f"Output directory: {output_dir}")

    # Call the half-per-day splitter defined in this cell; uWRF_combine_seq
    # would combine every file of the month instead of one half of each day.
    uWRF_val_test(input_dirs, output_dir)
    print("Done combining uWRF files!")


#main()