# Fixing Time Duplication Problem (In progress)

Instead of inputting one large sequential file into the model at once we instead input multiple, smaller, sequential files into the model. Currently, we input about 3 months of sequential training data into the model, but now we will input each day at a time into the model for the same 3 months. Each day file will consist of 84 hours. We try this approach to avoid the time duplication problem we experience when combining all the days together into one file, because there are overlapping forecasts.


Day files are now found in /D4/data/gvaillant/prep-uwrf/d02/day_by_day

TO DO:


In the Jupyter notebooks that contain each model, we need to apply the scaling and add-channel-dimension steps to each file in that directory. I think we can write the results to a new directory named 'uwrf_train'. Then all the data will be ready for the model.

For the 'data_train' argument in the model, we need to replace uwrf_train with something like [file for file in uwrf_train] so there is a list of data files the model will iterate through. Adjust the batch size accordingly... maybe 84 for the 84 forecast hours?

Dimensions for each NAM training file:
* time = 29 (84 total forecast hours, every 3 hours)
* lat = 5
* lon = 5
* channel = 1


This method allows us to keep all 84 forecast hours while also avoiding the time duplication.
I guess each training dataset can be thought of as a day file OR a forecast initialization, since there is one model initialization per day.



# Reorganizing uWRF files

In [None]:
import os
import glob
import xarray as xr
import numpy as np
from datetime import datetime, timedelta
#Correct code for making the dummy filler files for the missing ones!
def create_netcdf_for_missing_day(reference_file_path, missing_day_dir_path, start_date, duration_hours=84):
    """
    Create placeholder NetCDF files for a missing day, one file every 3 hours.

    Every generated file holds the first-timestep T2 and PSFC fields of the
    reference file, stamped with a single synthetic time value. Nothing is
    written when the target directory already contains ``wrfout_d02_*`` files.

    Parameters:
    - reference_file_path: Path to the reference NetCDF file
    - missing_day_dir_path: Directory path for the missing day (created if absent)
    - start_date: Start of the missing period, formatted "%Y-%m-%dT%H:%M:%S"
    - duration_hours: Total duration covered by the files (default 84 hours);
      both end points are written, so the default produces 29 files
    """
    step = timedelta(hours=3)

    # Ensure the missing day directory exists
    os.makedirs(missing_day_dir_path, exist_ok=True)

    # Never overwrite a day that already has forecast files
    if glob.glob(os.path.join(missing_day_dir_path, 'wrfout_d02_*')):
        print(f"Directory {missing_day_dir_path} is not empty. Skipping.")
        return

    # Parse the dates before touching the reference file so that a malformed
    # start_date cannot leave an open file handle behind
    current_date = datetime.strptime(start_date, "%Y-%m-%dT%H:%M:%S")
    end_date = current_date + timedelta(hours=duration_hours)

    # The context manager closes the reference file even if a write fails
    with xr.open_dataset(reference_file_path) as ds_ref:
        lat = ds_ref['latitude'].values
        lon = ds_ref['longitude'].values

        # First-timestep fields, trimmed so their shape matches the coordinates
        T2_data = ds_ref['T2'].isel(time=0).values[:len(lat), :len(lon)]
        PSFC_data = ds_ref['PSFC'].isel(time=0).values[:len(lat), :len(lon)]

        # Snapshot the global attributes once; every output file gets a copy
        reference_attrs = dict(ds_ref.attrs)

        while current_date <= end_date:
            output_path = os.path.join(
                missing_day_dir_path,
                f'wrfout_d02_{current_date.strftime("%Y-%m-%d_%H:%M:%S")}'
            )

            ds = xr.Dataset(
                {
                    "T2": (["latitude", "longitude"], T2_data),
                    "PSFC": (["latitude", "longitude"], PSFC_data),
                },
                coords={
                    "latitude": lat,
                    "longitude": lon,
                    "time": [np.datetime64(current_date)]
                },
                attrs=reference_attrs
            )

            ds.to_netcdf(output_path)
            print(f"Created file: {output_path}")

            current_date += step

def generate_missing_files():
    """Create filler forecast files for every day listed below as missing."""
    # All January gaps are filled from the same reference forecast (Jan 2)
    january_reference = '/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/02/wrfout_d02_2019-01-02_00:00:00'
    january_gaps = [
        ('/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/03', '2019-01-03T00:00:00'),
        ('/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/04', '2019-01-04T00:00:00'),
        ('/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/05', '2019-01-05T00:00:00'),
        ('/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/06', '2019-01-06T00:00:00'),
        ('/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/01/07', '2019-01-07T00:00:00'),
    ]

    # (target directory, reference file, start time) for each missing day
    jobs = [(target, january_reference, start) for target, start in january_gaps]

    # The single February gap is filled from the preceding day (Feb 5)
    jobs.append((
        '/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/02/06',
        '/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/02/05/wrfout_d02_2019-02-05_00:00:00',
        '2019-02-06T00:00:00',
    ))

    for target, reference, start in jobs:
        create_netcdf_for_missing_day(
            reference_file_path=reference,
            missing_day_dir_path=target,
            start_date=start
        )


if __name__ == "__main__":
    generate_missing_files()

In [None]:
import glob
import xarray as xr
import os

output_dir = "/D4/data/gvaillant/prep-uwrf/d02/day_by_day/02" # **CHANGE DEPENDING ON MONTH**

# to_netcdf does not create missing directories, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)

for i in range(1, 29):

    # Take each day directory in the original directory
    day_dir_path = f"/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/02/{str(i).zfill(2)}" # ** CHANGE **
    input_files = sorted(glob.glob(os.path.join(day_dir_path, 'wrfout_d02_*')))  # Gather all files in the daily directory

    if not input_files:
        print(f"No files found for day {i} in directory {day_dir_path} Skipping...")
        continue  # Skip the current day if no files are found

    # Print the files being found (debugging step)
    print(f"Files found for day {i}: {input_files}")

    # Construct the output file name
    output_file_name = f'uwrf_2019_02_{str(i).zfill(2)}.nc' # ** CHANGE **
    output_file_path = os.path.join(output_dir, output_file_name)

    # Encode time relative to the start of this forecast day
    time_origin = f"hours since 2019-02-{str(i).zfill(2)} 00:00:00" # ** CHANGE **

    datasets = []
    try:
        # Load each file into a dataset and add it to the list
        for file in input_files:
            datasets.append(xr.open_dataset(file))

        # Combine the per-timestep files of this day along the time dimension
        combined_dataset = xr.concat(datasets, dim='time')

        # Now all the forecasts that made up one day are combined into one file!
        combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': time_origin}})
    finally:
        # Release the file handles of this day before moving on to the next
        # one; otherwise every input file of the month stays open
        for ds in datasets:
            ds.close()

    print(f'Combined dataset for day {i} saved to {output_file_path}')

# Now that we have the uWRF data in the correct sequential format, we need to decide which files are used for training, validation, and testing

In [None]:
import os
import glob
import shutil

# Define the input directories and the output directory
input_dirs = [
    '/D4/data/gvaillant/prep-uwrf/d02/day_by_day/01',
    '/D4/data/gvaillant/prep-uwrf/d02/day_by_day/02'
]
output_dir = '/D4/data/gvaillant/prep-uwrf/d02/day-final/train'

# Create the output directory if it doesn't exist (shutil.copy will not)
os.makedirs(output_dir, exist_ok=True)

# Iterate over input directories and copy files to the output directory
for input_dir in input_dirs:
    # Get sorted list of files in the input directory
    files = sorted(glob.glob(os.path.join(input_dir, '*')))

    for file in files:
        # Keep the original file name inside the output directory
        filename = os.path.basename(file)
        target_path = os.path.join(output_dir, filename)

        # Copy the file to the output directory
        shutil.copy(file, target_path)
        print(f"Copied: {file} -> {target_path}")


In [None]:
import os
import glob
import shutil

# Define the input directories and the output directory
input_dirs = [
    '/D4/data/gvaillant/prep-uwrf/d02/day_by_day/03'
]

output_dir = '/D4/data/gvaillant/prep-uwrf/d02/day-final/val'

# The destination has to exist before anything is copied into it
os.makedirs(output_dir, exist_ok=True)

for input_dir in input_dirs:
    # Everything in the month directory, in name order
    candidates = glob.glob(os.path.join(input_dir, '*'))
    candidates.sort()

    # The first half of the March files is the validation split
    cutoff = len(candidates) // 2

    for file in candidates[:cutoff]:
        # Same file name, new parent directory
        target_path = os.path.join(output_dir, os.path.basename(file))
        shutil.copy(file, target_path)
        print(f"Copied: {file} -> {target_path}")


In [None]:
import os
import glob
import shutil

# Define the input directories and the output directory
input_dirs = [
    '/D4/data/gvaillant/prep-uwrf/d02/day_by_day/03'
]

output_dir = '/D4/data/gvaillant/prep-uwrf/d02/day-final/test'

# Create the output directory if it doesn't exist (shutil.copy will not)
os.makedirs(output_dir, exist_ok=True)

# Iterate over input directories and copy files to the output directory
for input_dir in input_dirs:
    # Get sorted list of files in the input directory
    files = sorted(glob.glob(os.path.join(input_dir, '*')))

    # Second half of the March files will be test
    half_length = len(files) // 2
    selected_files = files[half_length:]

    for file in selected_files:
        # Keep the original file name inside the output directory
        filename = os.path.basename(file)
        target_path = os.path.join(output_dir, filename)

        # Copy the file to the output directory
        shutil.copy(file, target_path)
        print(f"Copied: {file} -> {target_path}")

# Reorganizing NAM files

In [15]:
## check for missing files for each month and then create them..
import os
import re
import pandas as pd
import xarray as xr


source_dir = '/D4/data/gvaillant/NAM/2019/match-NYC-cut/03'
output_dir = '/D4/data/gvaillant/NAM/2019/match-NYC-cut/03'

# File names are matched against this pattern: group 1 is the YYYYMMDD date,
# group 2 a four-digit field, and group 3 the last two digits of the trailing
# three-digit field (used below as the hour of the file).
file_pattern = re.compile(r"domnys-nam_218_(\d{8})_(\d{4})_0(\d{2})\.nc")

# Collect, for every date, the two-digit hour suffix of each matching file
files_by_date = {}
for filename in os.listdir(source_dir):
    if not filename.endswith('.nc'):
        continue
    match = file_pattern.match(filename)
    if match is None:
        continue
    date, time = match.group(1), match.group(3)
    files_by_date.setdefault(date, []).append(time)

# A complete date has 29 files; report and remember every date that does not
date_with_missing_files = []
for date, times in files_by_date.items():
    if len(times) == 29:
        continue
    date_with_missing_files.append(date)
    print(f"{date} does not have all files")

# Hours a complete date must cover: '00', '03', ..., '84'
reference_hours = ["%02d" % h for h in range(0, 85, 3)]

# For every incomplete date, work out exactly which hours are absent
actual_missing_files = []
for date in date_with_missing_files:
    present_hours = set(files_by_date[date])
    missing_hours = [hour for hour in reference_hours if hour not in present_hours]

    if missing_hours:
        actual_missing_files.append((date, missing_hours))

print("Missing Files to Create:")
print(actual_missing_files)


def get_timestamp(date, hour):
    """
    Return the timestamp that lies `hour` hours after midnight of `date`.

    Parameters:
    - date: Date in YYYYMMDD format
    - hour: Hour offset as an int or digit string (e.g. "03", "27", 84)

    The hour is applied as an offset rather than formatted into an
    "HH:00:00" string, so values of 24 or more roll over into the following
    days instead of failing to parse.
    """
    base_date = pd.to_datetime(date, format='%Y%m%d')
    return base_date + pd.Timedelta(hours=int(hour))

def create_missing_files(missing_data, source_dir, output_dir, reference_file):
    """
    Write a placeholder NetCDF file for every missing (date, hour) entry.

    Each file contains the first-timestep T2 and PRES fields of the reference
    file and a single time value equal to the date plus the forecast hour.

    Parameters:
    - missing_data: Iterable of (date, hours) pairs; date is YYYYMMDD and
      hours is a list of two-digit hour strings
    - source_dir: Not used by this function; kept so existing calls still work
    - output_dir: Directory the placeholder files are written to
    - reference_file: Path to the NetCDF file the fields are copied from
    """
    # Nothing to create: do not require the reference file to exist at all
    if not missing_data:
        return

    # The context manager closes the reference file even if a write fails
    with xr.open_dataset(reference_file) as ds_ref:
        lat = ds_ref['latitude'].values
        lon = ds_ref['longitude'].values

        # The reference fields are identical for every output file, so read
        # them once; trimmed so their shape matches the coordinates
        T2_data = ds_ref['T2'].isel(time=0).values[:len(lat), :len(lon)]
        PRES_data = ds_ref['PRES'].isel(time=0).values[:len(lat), :len(lon)]

        for date, hours in missing_data:
            base_date = pd.to_datetime(date, format='%Y%m%d')

            for hour in hours:
                # Generate the missing filename
                filename = f"domnys-nam_218_{date}_0000_0{hour}.nc"
                output_path = os.path.join(output_dir, filename)

                # Adding the hour as an offset rolls hours >= 24 over into
                # the following days
                timestamp = base_date + pd.Timedelta(hours=int(hour))

                ds = xr.Dataset(
                    {
                        "T2": (["latitude", "longitude"], T2_data),
                        "PRES": (["latitude", "longitude"], PRES_data),
                    },
                    coords={
                        "latitude": lat,
                        "longitude": lon,
                        "time": [timestamp]
                    },
                )

                # Save the dataset to a NetCDF file
                ds.to_netcdf(output_path)
                print(f"Created file: {output_path} with timestamp {timestamp}")



# TODO: pick the reference file automatically instead of choosing it by hand
# for every month.
reference_file = '/D4/data/gvaillant/NAM/2019/match-NYC-cut/03/domnys-nam_218_20190301_0000_000.nc'

create_missing_files(
    missing_data=actual_missing_files,
    source_dir=source_dir,
    output_dir=output_dir,
    reference_file=reference_file,
)

Missing Files to Create:
[]


In [None]:
import os
import xarray as xr
import re

# Define directories
source_dir = '/D4/data/gvaillant/NAM/2019/match-NYC-cut/03' # ** CHANGE **
target_dir = '/D4/data/gvaillant/NAM/2019/day-by-day/03' # ** CHANGE **

# to_netcdf does not create missing directories, so make sure the target exists
os.makedirs(target_dir, exist_ok=True)

# File naming pattern to extract the date
file_pattern = re.compile(r"domnys-nam_218_(\d{8})_\d{4}_\d{3}\.nc")

# Organize files by date based on the filename
files_by_date = {}
for filename in os.listdir(source_dir):
    if filename.endswith('.nc'):
        match = file_pattern.match(filename)
        if match:
            date = match.group(1)  # Extract the date (YYYYMMDD)
            files_by_date.setdefault(date, []).append(os.path.join(source_dir, filename))

# Process files for each date, in chronological order
for date, file_list in sorted(files_by_date.items()):
    print(f"Processing {len(file_list)} files for date {date}...")

    output_file = os.path.join(target_dir, f"domnys-nam_218_{date}.nc")

    datasets = []
    try:
        # Open the files of this date in name order
        for filepath in sorted(file_list):
            datasets.append(xr.open_dataset(filepath))

        # Combine datasets along the time dimension, ensuring no artificial time creation
        combined_dataset = xr.concat(datasets, dim="time")

        # Save the combined dataset for this date
        combined_dataset.to_netcdf(output_file, encoding={'time': {'dtype': 'float64'}})
    finally:
        # Close the inputs even when concatenation or writing fails
        for ds in datasets:
            ds.close()

    print(f"Saved combined dataset to {output_file}")


# MAKE INTO TRAIN TEST AND VAL BEFORE REGRIDDING ? 

In [None]:
import os
import glob
import shutil

# Define the input directories and the output directory
input_dirs = [
    '/D4/data/gvaillant/NAM/2019/day-by-day/01',
    '/D4/data/gvaillant/NAM/2019/day-by-day/02'
]
output_dir = '/D4/data/gvaillant/NAM/2019/day-final/train'

# Create the output directory if it doesn't exist (shutil.copy will not)
os.makedirs(output_dir, exist_ok=True)

# Iterate over input directories and copy files to the output directory
for input_dir in input_dirs:
    # Get sorted list of files in the input directory
    files = sorted(glob.glob(os.path.join(input_dir, '*')))

    for file in files:
        # Keep the original file name inside the output directory
        filename = os.path.basename(file)
        target_path = os.path.join(output_dir, filename)

        # Copy the file to the output directory
        shutil.copy(file, target_path)
        print(f"Copied: {file} -> {target_path}")


In [None]:
import os
import glob
import shutil

# Define the input directories and the output directory
input_dirs = [
    '/D4/data/gvaillant/NAM/2019/day-by-day/03'
]

output_dir = '/D4/data/gvaillant/NAM/2019/day-final/val'

# The destination has to exist before anything is copied into it
os.makedirs(output_dir, exist_ok=True)

for input_dir in input_dirs:
    # Everything in the month directory, in name order
    candidates = glob.glob(os.path.join(input_dir, '*'))
    candidates.sort()

    # The first half of the March files is the validation split
    cutoff = len(candidates) // 2

    for file in candidates[:cutoff]:
        # Same file name, new parent directory
        target_path = os.path.join(output_dir, os.path.basename(file))
        shutil.copy(file, target_path)
        print(f"Copied: {file} -> {target_path}")

In [None]:
import os
import glob
import shutil


input_dirs = ['/D4/data/gvaillant/NAM/2019/day-by-day/03']

output_dir = '/D4/data/gvaillant/NAM/2019/day-final/test'

# Create the output directory if it doesn't exist (shutil.copy will not)
os.makedirs(output_dir, exist_ok=True)

# Iterate over input directories and copy files to the output directory
for input_dir in input_dirs:
    # Get sorted list of files in the input directory
    files = sorted(glob.glob(os.path.join(input_dir, '*')))

    # Second half of the March files will be test
    half_length = len(files) // 2
    selected_files = files[half_length:]

    for file in selected_files:
        # Keep the original file name inside the output directory
        filename = os.path.basename(file)
        target_path = os.path.join(output_dir, filename)

        # Copy the file to the output directory
        shutil.copy(file, target_path)
        print(f"Copied: {file} -> {target_path}")

# Apply regridding to NAM

In [None]:
#ALMOST CORRECT BUT NEED TO FIX NUMBER OF UWRF FILES BC SOME MISSING
import xarray as xr
import numpy as np
import os
import glob


def align_datasets(uwrf_data, nam_data):
    """
    Build a coarse-grid dataset by averaging uWRF fields over 4x4 cell blocks.

    Parameters:
    - uwrf_data: uWRF dataset with T2 and PSFC variables and latitude,
      longitude and time coordinates
    - nam_data: NAM dataset; it must contain T2 and PSFC and supplies the
      time coordinate and the global attributes of the result

    Returns:
    - Dataset with T2 and PSFC on (time, latitude, longitude), where each
      value is the mean of one 4x4 block of uWRF cells and each coordinate is
      the first uWRF coordinate of its block

    Raises:
    - ValueError: if T2 or PSFC is missing from either dataset

    NOTE(review): the output values come from the uWRF fields only; the NAM
    values themselves are never read. Confirm that this is intended.
    """
    # Number of uWRF cells per NAM cell
    uwrf_cells_per_lon = 4
    uwrf_cells_per_lat = 4

    # assumes T2 is ordered (time, latitude, longitude) - TODO confirm
    uwrf_shape = uwrf_data.T2.shape

    # Number of complete blocks that fit along each axis
    new_nam_lat_count = uwrf_shape[1] // uwrf_cells_per_lat
    new_nam_lon_count = uwrf_shape[2] // uwrf_cells_per_lon

    # Extent of the uWRF grid that is covered by complete blocks
    used_lat = new_nam_lat_count * uwrf_cells_per_lat
    used_lon = new_nam_lon_count * uwrf_cells_per_lon

    def aggregate_4x4_grid(data):
        """Average `data` (time, lat, lon) over complete 4x4 lat/lon blocks."""
        # Drop trailing rows/columns that do not fill a whole block; without
        # this the reshape fails whenever a grid size is not a multiple of 4
        trimmed = data[:, :used_lat, :used_lon]
        reshaped = trimmed.reshape(
            trimmed.shape[0],  # Time dimension remains unchanged
            new_nam_lat_count, uwrf_cells_per_lat,
            new_nam_lon_count, uwrf_cells_per_lon
        )
        return reshaped.mean(axis=(2, 4))  # Average within each block

    # Use only the time steps that both datasets have
    min_time_steps = min(nam_data.time.size, uwrf_data.time.size)
    nam_data_sliced = nam_data.isel(time=slice(0, min_time_steps))
    uwrf_data_sliced = uwrf_data.isel(time=slice(0, min_time_steps))

    # Aggregate both T2 and PSFC
    aligned_data = {}
    for var_name in ['T2', 'PSFC']:
        if var_name not in uwrf_data_sliced or var_name not in nam_data_sliced:
            raise ValueError(f"Variable '{var_name}' not found in one of the datasets.")
        aggregated_var = aggregate_4x4_grid(uwrf_data_sliced[var_name].values)
        aligned_data[var_name] = (['time', 'latitude', 'longitude'], aggregated_var)

    # Every 4th uWRF coordinate labels a block; the extra slice removes the
    # label of a trailing partial block, if there is one
    aligned_nam = xr.Dataset(
        data_vars=aligned_data,
        coords={
            'time': nam_data_sliced.time,
            'latitude': uwrf_data.latitude[::uwrf_cells_per_lat][:new_nam_lat_count],
            'longitude': uwrf_data.longitude[::uwrf_cells_per_lon][:new_nam_lon_count]
        },
        attrs=nam_data.attrs
    )

    return aligned_nam



def _forecast_date_key(path):
    """Return the YYYYMMDD date that ends a daily file name, or None."""
    import re  # imported here because this cell does not import re itself

    # Accepts both "uwrf_2019_02_05.nc" and "domnys-nam_218_20190205.nc"
    match = re.search(r"(\d{4})_?(\d{2})_?(\d{2})\.nc$", os.path.basename(path))
    return "".join(match.groups()) if match else None


def _index_files_by_date(directory, label):
    """Map YYYYMMDD -> file path for every dated file in `directory`."""
    by_date = {}
    for path in sorted(glob.glob(os.path.join(directory, '*'))):
        key = _forecast_date_key(path)
        if key is None:
            print(f"Skipping {label} file without a date in its name: {path}")
            continue
        by_date[key] = path
    return by_date


def process_and_save(uwrf_dir, nam_dir, save_dir, dataset_type):
    """
    Align every NAM day file with the uWRF file of the same date and save it.

    Parameters:
    - uwrf_dir: Directory of daily uWRF files (names ending in YYYY_MM_DD.nc)
    - nam_dir: Directory of daily NAM files (names ending in YYYYMMDD.nc)
    - save_dir: Directory for the aligned output (created if absent); each
      output keeps the file name of its NAM input
    - dataset_type: Split name ('train', 'val', 'test'); used in log messages

    Files are paired by the date in their names rather than by position, so a
    day that is missing from one directory is reported and skipped instead of
    shifting every later pair by one.
    """
    os.makedirs(save_dir, exist_ok=True)

    uwrf_by_date = _index_files_by_date(uwrf_dir, 'uWRF')
    nam_by_date = _index_files_by_date(nam_dir, 'NAM')

    # Report days that cannot be paired
    for day in sorted(set(uwrf_by_date) ^ set(nam_by_date)):
        present = 'uWRF' if day in uwrf_by_date else 'NAM'
        print(f"[{dataset_type}] {day}: only a {present} file exists. Skipping...")

    for day in sorted(set(uwrf_by_date) & set(nam_by_date)):
        uwrf_file = uwrf_by_date[day]
        nam_file = nam_by_date[day]
        output_file = os.path.join(save_dir, os.path.basename(nam_file))

        # Context managers release both input files once the output is written
        with xr.open_dataset(uwrf_file) as uwrf_data, xr.open_dataset(nam_file) as nam_raw:
            # Give the NAM pressure field the same name as the uWRF one
            nam_data = nam_raw.rename({'PRES': 'PSFC'})
            aligned_nam = align_datasets(uwrf_data, nam_data)
            aligned_nam.to_netcdf(output_file)

        print(f"Saved aligned dataset: {output_file}")



# One row per split: (split name, uWRF input, NAM input, aligned output)
split_jobs = (
    (
        'train',
        '/D4/data/gvaillant/prep-uwrf/d02/day-final/train',
        '/D4/data/gvaillant/NAM/2019/day-final/train',
        '/D4/data/gvaillant/NAM/2019/aligned/train',
    ),
    (
        'val',
        '/D4/data/gvaillant/prep-uwrf/d02/day-final/val',
        '/D4/data/gvaillant/NAM/2019/day-final/val',
        '/D4/data/gvaillant/NAM/2019/aligned/val',
    ),
    (
        'test',
        '/D4/data/gvaillant/prep-uwrf/d02/day-final/test',
        '/D4/data/gvaillant/NAM/2019/day-final/test',
        '/D4/data/gvaillant/NAM/2019/aligned/test',
    ),
)

# Align the splits one after another: train, then val, then test
for split_name, uwrf_path, nam_path, aligned_path in split_jobs:
    process_and_save(
        uwrf_dir=uwrf_path,
        nam_dir=nam_path,
        save_dir=aligned_path,
        dataset_type=split_name
    )