In [7]:
import glob
import os
import re
import datetime
import numpy as np
import multiprocessing
import xarray as xr
from netCDF4 import Dataset

import sys
sys.path.insert(1, "../inversion_scripts/operators/")
sys.path.insert(1, "../inversion_scripts")

import yaml
# Load the run settings (dates, cache locations, working directory) read by the cells below
with open("config_write_BCs.yml", "r") as config_file:
    config = yaml.safe_load(config_file)

from operator_utilities import nearest_loc
from TROPOMI_operator import apply_tropomi_operator
from utils import save_obj, load_obj

#### Get a list of all TROPOMI files that intersect our time period of interest

In [2]:
# Parse the configured start and end of the period (YYYYMMDDThhmmss strings)
# into numpy datetimes; both config entries go through the same conversion
start_time_of_interest, end_time_of_interest = (
    np.datetime64(datetime.datetime.strptime(config[bound], "%Y%m%dT%H%M%S"))
    for bound in ("startdate", "enddate")
)

def get_TROPOMI_times(filename):
    """
    Extract the start and end timestamps embedded in a TROPOMI file name.

    The file name must contain two timestamps formatted as YYYYMMDDThhmmss
    and joined by an underscore (<start>_<end>).

    Arguments
        filename [str] : name of (or path to) a TROPOMI file

    Returns
        (start, end) [np.datetime64, np.datetime64] : the two parsed timestamps

    Raises
        ValueError : if the file name contains no <start>_<end> timestamp pair
    """
    # Search the file name only, so that timestamp-like directory names in
    # the path cannot be mistaken for the file's own timestamps
    file_times = re.search(r'(\d{8}T\d{6})_(\d{8}T\d{6})', os.path.basename(filename))
    if file_times is None:
        # Fail with a message that names the offending file instead of an
        # AttributeError on None
        raise ValueError(
            "Could not find start/end timestamps in TROPOMI file name: {}".format(filename)
        )
    start_TROPOMI_time = np.datetime64(datetime.datetime.strptime(file_times.group(1), "%Y%m%dT%H%M%S"))
    end_TROPOMI_time = np.datetime64(datetime.datetime.strptime(file_times.group(2), "%Y%m%dT%H%M%S"))
    return start_TROPOMI_time, end_TROPOMI_time
    
# Keep the cached TROPOMI files whose start AND end timestamps both lie inside
# the period of interest.
# NOTE(review): this keeps only orbits fully contained in the period; orbits
# that straddle either boundary are dropped - confirm that is intended.
TROPOMI_files = []
for candidate in glob.glob(os.path.join(config["tropomi_cache"], "*.nc")):
    orbit_start, orbit_end = get_TROPOMI_times(candidate)
    starts_inside = start_time_of_interest <= orbit_start <= end_time_of_interest
    ends_inside = start_time_of_interest <= orbit_end <= end_time_of_interest
    if starts_inside and ends_inside:
        TROPOMI_files.append(candidate)

TROPOMI_files.sort()

#### Get what TROPOMI would have seen looking at GEOS-Chem and save one pkl file per TROPOMI file (i.e., Step1_convert_GC)

In [5]:
def apply_tropomi_operator_to_one_tropomi_file(TROPOMI_file):
    """
    Run apply_tropomi_operator on one TROPOMI file and pickle the result.

    The operator is called for the full globe (lon -180..180, lat -90..90)
    and the notebook's period of interest, with build_jacobian=False. The
    result is saved as
    <workdir>/step1/<TROPOMI file name with ".nc" replaced by "_GCtoTROPOMI.pkl">.

    Arguments
        TROPOMI_file [str] : path to the TROPOMI netCDF file to process

    Returns
        None : the result is written to disk rather than returned
    """
    result = apply_tropomi_operator(
        filename=TROPOMI_file,
        n_elements=False,
        gc_startdate=start_time_of_interest,
        gc_enddate=end_time_of_interest,
        xlim=[-180, 180],
        ylim=[-90, 90],
        gc_cache=config["gccache"],
        build_jacobian=False,
        sensi_cache=False,
    )

    # Make sure the output directory exists before writing, so a fresh working
    # directory does not make the save fail. exist_ok=True keeps this safe when
    # several worker processes reach this line at the same time.
    # NOTE(review): assumes save_obj does not create missing directories itself.
    step1_dir = os.path.join(config["workdir"], "step1")
    os.makedirs(step1_dir, exist_ok=True)

    output_name = os.path.basename(TROPOMI_file).replace(".nc", "_GCtoTROPOMI.pkl")
    save_obj(result, os.path.join(step1_dir, output_name))

# Spread the TROPOMI files over one worker process per CPU reported by the
# system; pool.map only returns once every file has been processed
num_processes = multiprocessing.cpu_count()
with multiprocessing.Pool(num_processes) as pool:
    pool.map(apply_tropomi_operator_to_one_tropomi_file, TROPOMI_files)
    pool.close()
    pool.join()

#### Regrid the TROPOMI and GEOS-Chem data to daily averages on the 4 x 5 grid (i.e., Step2_regrid_fast)

In [8]:
# Read any of the GEOS-Chem files to get the lat/lon grid
gc_species_files = sorted(glob.glob(os.path.join(config["gccache"], "GEOSChem.SpeciesConc*.nc4")))
if not gc_species_files:
    # Fail with a descriptive message instead of a bare IndexError from [0]
    raise FileNotFoundError(
        "No GEOSChem.SpeciesConc*.nc4 files found in {}".format(config["gccache"])
    )
# Any file will do; taking the first one in sorted order keeps the choice reproducible
with xr.open_dataset(gc_species_files[0]) as data:
    LON = data["lon"].values
    LAT = data["lat"].values
    
# Every calendar day in the period of interest as a YYYYMMDD string
# (the stop value is one day past the end because np.arange excludes it)
days_of_interest = np.arange(start_time_of_interest, end_time_of_interest + np.timedelta64(1, 'D'), dtype='datetime64[D]')
alldates = [day.astype(datetime.datetime).strftime("%Y%m%d") for day in days_of_interest]

# Accumulators on the (lon, lat, day) grid: running sums of the TROPOMI and
# GEOS-Chem values, and the number of observations added to each cell
accumulator_shape = (len(LON), len(LAT), len(alldates))
daily_TROPOMI = np.zeros(accumulator_shape)
daily_GC = np.zeros(accumulator_shape)
daily_count = np.zeros(accumulator_shape)

# Step 1 output files that supply the data to regrid, in sorted order
files_from_step1 = sorted(glob.glob(os.path.join(config["workdir"], "step1", "*.pkl")))

# Perform regridding
for file in files_from_step1:
    obs_GC = load_obj(file)["obs_GC"]
    NN = obs_GC.shape[0]
    if NN == 0:
        continue

    # Which day are we on (this is not perfect right now because orbits can cross from one day to the next...
    # but it is the best we can do right now without changing apply_tropomi_operator)
    # The day comes from the first timestamp in the file name, so it is the same for every
    # observation in the file: work it out once per file instead of once per observation
    file_times = re.search(r'(\d{8}T\d{6})_(\d{8}T\d{6})', file)
    date = datetime.datetime.strptime(file_times.group(1), "%Y%m%dT%H%M%S").strftime("%Y%m%d")
    time_ind = alldates.index(date)

    # For each TROPOMI observation, assign it to a GEOS-Chem grid cell
    for iNN in range(NN):

        # The first four columns of a row are unpacked as TROPOMI value, GEOS-Chem value, lon, lat
        c_TROPOMI, c_GC, lon0, lat0 = obs_GC[iNN, :4]
        ii = nearest_loc(lon0, LON, tolerance=5)
        jj = nearest_loc(lat0, LAT, tolerance=4)

        # Accumulate sums and counts; they are turned into means after the loop
        daily_TROPOMI[ii, jj, time_ind] += c_TROPOMI
        daily_GC[ii, jj, time_ind] += c_GC
        daily_count[ii, jj, time_ind] += 1

# Finish the regridding by turning the per-cell sums into per-cell means.
# Cells that received no observations get a count of NaN, so their mean is NaN too.
empty_cells = daily_count == 0
daily_count[empty_cells] = np.nan
daily_TROPOMI, daily_GC = daily_TROPOMI / daily_count, daily_GC / daily_count

# Reverse the axis order: (lon, lat, time) -> (time, lat, lon)
regrid_CH4 = daily_TROPOMI.transpose(2, 1, 0)
regrid_GC = daily_GC.transpose(2, 1, 0)

# Write the netCDF file
outputname = os.path.join(config["workdir"], "step2", "Daily_CH4.nc")

# Create the step2 directory if it is missing so that opening the file for
# writing does not fail on a fresh working directory
os.makedirs(os.path.dirname(outputname), exist_ok=True)

with Dataset(outputname, "w", format="NETCDF4_CLASSIC") as dataset:

    # Dimensions of the regridded daily fields
    dataset.createDimension("lat", len(LAT))
    dataset.createDimension("lon", len(LON))
    dataset.createDimension("time", len(alldates))

    # Coordinate variables; "date" holds one integer per day along the time dimension
    latitudes = dataset.createVariable("lat", "f8", ("lat",))
    longitudes = dataset.createVariable("lon", "f8", ("lon",))
    dates = dataset.createVariable("date", "i", ("time",))

    # Daily mean fields on the (time, lat, lon) grid
    nc_CH4 = dataset.createVariable("CH4", "f8", ("time", "lat", "lon"))
    nc_GC = dataset.createVariable("GC", "f8", ("time", "lat", "lon"))

    latitudes[:] = LAT
    longitudes[:] = LON
    # alldates holds YYYYMMDD strings; convert them explicitly to integers to
    # match the integer "date" variable instead of relying on an implicit cast
    dates[:] = [int(day) for day in alldates]
    nc_CH4[:, :, :] = regrid_CH4
    nc_GC[:, :, :] = regrid_GC