# Fixing Time Duplication Problem (In progress)

Instead of inputting one large sequential file into the model at once, we input multiple smaller sequential files. Currently, we input about 3 months of sequential training data into the model as a single file; now we will input one day at a time for the same 3 months. Each day file will cover 84 forecast hours. We try this approach to avoid the time duplication problem we experience when combining all the days together into one file, which happens because the forecasts overlap.

Missing data:
once the data is in day files, create code to check which files are missing.

Day files are now found in /D4/data/gvaillant/prep-uwrf/d02/day_by_day

TO DO:

Finished making the uWRF files in the correct day format; apply the same changes to the NAM data.

In the Jupyter notebooks that contain each model, we need to apply the scaling and add-channel-dimension steps to each file in that directory. I think we can create another new directory named 'uwrf_train'. Then all the data will be ready for the model.

For the 'data_train' argument in the model, we need to replace uwrf_train with something like [for file in uwrf_train] so there is a list of data files the model will iterate through. Adjust the batch size accordingly... maybe 84 for the 84 forecast hours?

Dimensions for each training file:
* time = 29 (84 total forecast hours, every 3 hours)
* lat = 20
* lon = 20
* channel = 1


This method allows us to keep all 84 forecast hours while also avoiding the time duplication.
I guess each training dataset can be thought of as either a day file OR a forecast initialization, since there is one model initialization per day.



# Reorganizing uWRF files

In [None]:
import glob
import xarray as xr
import os

# Combine the uWRF forecast files found in each day directory of one month
# into a single NetCDF file per day, concatenated along the 'time' dimension.

year = "2019"
month = "03"  # **CHANGE DEPENDING ON MONTH**

input_root = f"/D4/data/gvaillant/prep-uwrf/d02/pred-stage3/{month}"
output_dir = f"/D4/data/gvaillant/prep-uwrf/d02/day_by_day/{month}"

# Make sure the destination exists before any day file is written.
os.makedirs(output_dir, exist_ok=True)

# Days 1..31 are always tried; days without a directory (or without files)
# are reported and skipped, so shorter months need no special handling.
for i in range(1, 32):
    day = str(i).zfill(2)

    # Take each day directory in the original directory
    day_dir_path = os.path.join(input_root, day)
    # Gather all files in the daily directory (sorted by file name)
    input_files = sorted(glob.glob(os.path.join(day_dir_path, 'wrfout_d02_*')))

    if not input_files:
        print(f"No files found for day {i} in directory {day_dir_path} Skipping...")
        continue  # Skip the current day if no files are found

    # Print the files being found (debugging step)
    print(f"Files found for day {i}: {input_files}")

    # Load each file into a dataset and add to the list
    datasets = [xr.open_dataset(file) for file in input_files]
    try:
        # Combine datasets along the time dimension, so that all the
        # forecasts that made up one day end up in one file.
        combined_dataset = xr.concat(datasets, dim='time')

        # Construct the output file name
        output_file_name = f'uwrf_{year}_{month}_{day}.nc'
        output_file_path = os.path.join(output_dir, output_file_name)

        # Encode time relative to midnight of the day being written
        time_origin = f"hours since {year}-{month}-{day} 00:00:00"

        combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': time_origin}})
    finally:
        # Release the input file handles once the day has been written
        # (or has failed), instead of keeping every file of the month open.
        for dataset in datasets:
            dataset.close()

    print(f'Combined dataset for day {i} saved to {output_file_path}')

# Reorganizing NAM files

In [47]:
# Combine the NAM files of one month into one NetCDF file per day.

# Location for files before they get combined sequentially:
input_dir = '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01' # ** change depending on the month we are prepping **

# Where the new day files will end up:
output_dir = '/D4/data/gvaillant/NAM/2019/day-by-day/01'
os.makedirs(output_dir, exist_ok=True)

# Get all the files in that specific month's directory
nc_files = sorted(os.path.join(input_dir, file) for file in os.listdir(input_dir) if file.endswith('.nc'))

# Separate the files by day so that a whole month is never concatenated into
# one file. The day is the third "_"-separated field of the base name,
# e.g. domnys-nam_218_20190101_0000_000.nc -> 20190101.
files_by_day = {}
for nc_file in nc_files:
    name_fields = os.path.basename(nc_file).split('_')
    date = name_fields[2] if len(name_fields) > 2 else ''
    if len(date) != 8 or not date.isdigit():
        print(f'Skipping {nc_file}: could not find a YYYYMMDD date in the file name')
        continue
    files_by_day.setdefault(date, []).append(nc_file)

for date, day_files in files_by_day.items():
    year, month, day = date[:4], date[4:6], date[6:]

    # Open only the current day's files
    datasets = [xr.open_dataset(nc_file) for nc_file in day_files]
    try:
        # Concatenate along the time dimension
        combined_dataset = xr.concat(datasets, dim='time')

        # Construct the output file name from the date found in the input
        # names, not from a loop variable defined in another cell.
        output_file_name = f'nam_{year}_{month}_{day}.nc'
        output_file_path = os.path.join(output_dir, output_file_name)

        time_origin = f"hours since {year}-{month}-{day} 00:00:00"

        combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': time_origin}})
    finally:
        # Release the input file handles before moving to the next day
        for dataset in datasets:
            dataset.close()

    print(f'Combined dataset for day {date} saved to {output_file_path}')

In [None]:
# Right now all the forecast files sit together in their month's directory; they are not split up into days.
#example of file names

# this is just for the first day
domnys-nam_218_20190101_0000_000.nc
domnys-nam_218_20190101_0000_003.nc
domnys-nam_218_20190101_0000_006.nc
domnys-nam_218_20190101_0000_009.nc
domnys-nam_218_20190101_0000_012.nc
domnys-nam_218_20190101_0000_015.nc
domnys-nam_218_20190101_0000_018.nc
domnys-nam_218_20190101_0000_021.nc
domnys-nam_218_20190101_0000_024.nc
domnys-nam_218_20190101_0000_027.nc
domnys-nam_218_20190101_0000_030.nc
domnys-nam_218_20190101_0000_033.nc
domnys-nam_218_20190101_0000_036.nc
domnys-nam_218_20190101_0000_039.nc
domnys-nam_218_20190101_0000_042.nc
domnys-nam_218_20190101_0000_045.nc
domnys-nam_218_20190101_0000_048.nc
domnys-nam_218_20190101_0000_051.nc
domnys-nam_218_20190101_0000_054.nc
domnys-nam_218_20190101_0000_057.nc
domnys-nam_218_20190101_0000_060.nc
domnys-nam_218_20190101_0000_063.nc
domnys-nam_218_20190101_0000_066.nc
domnys-nam_218_20190101_0000_069.nc
domnys-nam_218_20190101_0000_072.nc
domnys-nam_218_20190101_0000_075.nc
domnys-nam_218_20190101_0000_078.nc
domnys-nam_218_20190101_0000_081.nc
domnys-nam_218_20190101_0000_084.nc

In [65]:
import os
import xarray as xr
from collections import defaultdict

# Set up the NAM input/output locations for one month and list the input files.

# Location for files before they get combined sequentially
input_dir = '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01'  # ** Change depending on the month we are prepping **

# Where the new day files will end up
output_dir = '/D4/data/gvaillant/NAM/2019/day-by-day/01'

# Ensure output directory exists; exist_ok avoids a separate existence check
# that could race with another process creating the same directory.
os.makedirs(output_dir, exist_ok=True)

# Get all the .nc files in that specific month's directory, as full paths
# sorted by name
nc_files = sorted(os.path.join(input_dir, file) for file in os.listdir(input_dir) if file.endswith('.nc'))

# Bare expression so the notebook displays the list as the cell output
nc_files


['/D4/data/gvaillant/NAM/2019/match-NYC-cut/01/domnys-nam_218_20190101_0000_000.nc',
 '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01/domnys-nam_218_20190101_0000_003.nc',
 '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01/domnys-nam_218_20190101_0000_006.nc',
 '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01/domnys-nam_218_20190101_0000_009.nc',
 '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01/domnys-nam_218_20190101_0000_012.nc',
 '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01/domnys-nam_218_20190101_0000_015.nc',
 '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01/domnys-nam_218_20190101_0000_018.nc',
 '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01/domnys-nam_218_20190101_0000_021.nc',
 '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01/domnys-nam_218_20190101_0000_024.nc',
 '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01/domnys-nam_218_20190101_0000_027.nc',
 '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01/domnys-nam_218_20190101_0000_030.nc',
 '/D4/data/gvaillant/NAM/2019/match-NYC-cut/01/domnys-nam_218_201

In [None]:
# Group the NAM files by the date in their file name, then write one combined
# NetCDF file per day.
files_by_day = defaultdict(list)
for file in nc_files:
    # Read the date from the base name's third "_"-separated field
    # (e.g. domnys-nam_218_20190101_0000_000.nc -> 20190101). A fixed character
    # offset into the full path shifts whenever input_dir changes and does not
    # land on the date for the input_dir used in this notebook.
    name_fields = os.path.basename(file).split('_')
    date = name_fields[2] if len(name_fields) > 2 else ''
    if len(date) != 8 or not date.isdigit():
        print(f'Skipping {file}: could not find a YYYYMMDD date in the file name')
        continue
    files_by_day[date].append(file)

# Process files day by day
for date, day_files in files_by_day.items():
    year, month, day = date[:4], date[4:6], date[6:]

    # Open datasets for the current day
    datasets = [xr.open_dataset(f) for f in day_files]
    try:
        # Concatenate along the time dimension
        combined_dataset = xr.concat(datasets, dim='time')

        # Construct the output file name from the parsed date so the name
        # stays correct when a different month is processed
        output_file_name = f'nam_{year}_{month}_{day}.nc'
        output_file_path = os.path.join(output_dir, output_file_name)

        # Time origin for the current day
        time_origin = f"hours since {year}-{month}-{day} 00:00:00"

        # Save the combined dataset
        combined_dataset.to_netcdf(output_file_path, encoding={'time': {'units': time_origin}})
    finally:
        # Release the input file handles before moving to the next day
        for dataset in datasets:
            dataset.close()

    print(f'Combined dataset for day {date} saved to {output_file_path}')
