Import packages and set the directory to load data

In [1]:
# work with jl2815 environment
import xarray as xr # for netCDF4 
import pandas as pd
import numpy as np
from netCDF4 import Dataset
from matplotlib import pyplot as plt
import os


In [2]:
# Remember where the notebook was started and where the unzipped GEMS files live.
current_directory = os.getcwd()
new_directory = "D:\\GEMS_UNZIPPED"
print("Current working directory:", current_directory)

# Relative file names built in later cells are resolved against this directory.
os.chdir(new_directory)
updated_directory = os.getcwd()
print("Updated working directory:", updated_directory)

Current working directory: c:\Users\joonw\TCO\newpipeline
Updated working directory: D:\GEMS_UNZIPPED


2024 01: hours: 00 to 05   N3035_E100110
2023 04: hours: 00 to 07   N3035_E100110
2023 01: hours: 00 to 05   N3035_E100110

2023: y23m04day20_8 and y23m07day13_8 are missing for N3035_E100110 !!
2023: y23m04day20_8 is missing for N3035_E110120 !!


2023 04: hours: 00 to 07   N3035_E110120           
2023 07: hours: 00 to 07   N3035_E110120
2024 04: hours: 00 to 07   N3035_E110120
2024 07: hours: 00 to 07   N3035_E110120

Class that loads one GEMS file and extracts the region bounded by lat_s, lat_e, lon_s, lon_e (here 5, 10, 110, 120).

In [20]:
class gems_loader_2023:
    """Load one NetCDF file and keep the observations inside a lat/lon box.

    The file is expected to contain the groups 'Geolocation Fields'
    (Latitude, Longitude, Time) and 'Data Fields' (ColumnAmountO3,
    FinalAlgorithmFlags).

    Parameters
    ----------
    file_path : str
        Path of the NetCDF file to read.
    lat_s, lat_e : float
        Lower and upper latitude bounds (both inclusive).
    lon_s, lon_e : float
        Lower and upper longitude bounds (both inclusive).
    """

    LOCATION_VARIABLES = ['Latitude', 'Longitude', 'Time']
    DATA_VARIABLES = ['ColumnAmountO3', 'FinalAlgorithmFlags']

    def __init__(self, file_path, lat_s, lat_e, lon_s, lon_e):
        self.file_path = file_path
        self.lat_s = lat_s
        self.lat_e = lat_e
        self.lon_s = lon_s
        self.lon_e = lon_e

    def extract_data(self, file_path):
        """Read the location and data variables of ``file_path`` into one frame.

        Returns
        -------
        pandas.DataFrame
            Columns Latitude, Longitude, Time, ColumnAmountO3 and
            FinalAlgorithmFlags, in that order.

        Raises
        ------
        Any exception raised by ``xr.open_dataset`` propagates unchanged
        (callers in this notebook catch FileNotFoundError for missing files).
        """
        # The context managers close both groups even if reading fails midway.
        with xr.open_dataset(file_path, group='Geolocation Fields') as location, \
                xr.open_dataset(file_path, group='Data Fields') as Z:
            # Convert each xarray.Dataset to a flat pandas DataFrame and drop
            # the dimension columns introduced by reset_index().
            location_df = location[self.LOCATION_VARIABLES].to_dataframe().reset_index()
            location_df = location_df[self.LOCATION_VARIABLES]

            Z_df = Z[self.DATA_VARIABLES].to_dataframe().reset_index()
            Z_df = Z_df[self.DATA_VARIABLES]

        # Both frames carry a fresh RangeIndex, so they line up row by row.
        mydata = pd.concat([location_df, Z_df], axis=1)
        return mydata[self.LOCATION_VARIABLES + self.DATA_VARIABLES]

    def dropna(self):
        """Return the data of ``self.file_path`` without rows that have a NaN in any column."""
        mydata = self.extract_data(self.file_path)
        return mydata.dropna(subset=self.LOCATION_VARIABLES + self.DATA_VARIABLES)

    def result(self):
        """Return the valid observations inside the configured lat/lon box.

        Rows are kept when they lie inside the box (bounds inclusive) and
        their ColumnAmountO3 is below 1000 (larger values are treated as
        missing). Every remaining row gets the same 'Time': the mean of the
        remaining Time values, interpreted as hours since the Unix epoch and
        floored to the minute.
        """
        df = self.dropna()

        inside_box = (
            df['Latitude'].between(self.lat_s, self.lat_e)
            & df['Longitude'].between(self.lon_s, self.lon_e)
        )
        valid_ozone = df['ColumnAmountO3'] < 1000    # Cut off missing values

        # Work on an explicit copy: assigning into a filtered slice triggers
        # SettingWithCopyWarning and is not guaranteed to behave consistently.
        df2 = df.loc[inside_box & valid_ozone].copy()

        # One representative timestamp per file.
        df2['Time'] = df2['Time'].mean()
        df2['Time'] = pd.to_datetime(df2['Time'], unit='h').dt.floor('min')

        return df2

Create a CSV file by looping over the days of a given month and year.

In [None]:
# Ignore warnings due to duplicated dimension names
import calendar
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="xarray")


def makefilenames(year, month, hours=range(0, 8)):  # e.g. year 2024, month 7 (integers)
    """Return the relative paths of all expected files of one month.

    Paths have the form ``<YYYY><MM>01<last day>/<YYYY><MM><DD>_<HH>45.nc``,
    ordered by day and then by hour.

    Parameters
    ----------
    year, month : int
        Calendar year and month.
    hours : iterable of int, optional
        Hours of the day to include. Defaults to 0..7; the notebook notes
        say January only has hours 0..5.
    """
    # calendar.monthrange handles leap years; the folder name is assumed to
    # follow the real last day of the month (e.g. 2024020129) — TODO confirm
    # against the folders on disk.
    last_day = calendar.monthrange(year, month)[1]
    base_directory = f'{year}{month:02d}01{last_day:02d}/'
    return [
        f"{base_directory}{year}{month:02d}{day:02d}_{hour:02d}45.nc"
        for day in range(1, last_day + 1)
        for hour in hours
    ]


year = 2023
lat_s, lat_e, lon_s, lon_e = 5, 10, 110, 120

for month in range(9, 10):  # September only; widen the range to process more months
    last_day_range = calendar.monthrange(year, month)[1] + 1
    day_str = f"01{last_day_range - 1:02d}"
    filenames = makefilenames(year, month)

    # Collect the per-file frames and concatenate once; growing a DataFrame
    # with concat inside the loop copies all previous rows on every iteration.
    frames = []
    for filename in filenames:
        try:
            # Attempt to load the data
            my_loader = gems_loader_2023(filename, lat_s, lat_e, lon_s, lon_e)
            frames.append(my_loader.result())
        except FileNotFoundError:
            # Log a warning for the missing file and continue
            print(f"Warning: File not found - {filename}. Skipping this file.")

    if not frames:
        # Without any file there is no 'Time' column to work with.
        print(f"Warning: no file could be loaded for {year}-{month:02d}. No CSV written.")
        continue

    data = pd.concat(frames, ignore_index=True)

    # Whole seconds since the Unix epoch, expressed in hours.
    data['Hours_elapsed'] = (data['Time'].astype('int64') // 10**9) / 3600
    # Keep only rows whose FinalAlgorithmFlags value is at most 2.
    gqdata = data[data['FinalAlgorithmFlags'] <= 2]

    tmp_path = f'data_{int(str(year)[2:4])}_{month:02d}_{day_str}_N{str(lat_s)+str(lat_e)}_E{str(lon_s)+str(lon_e)}.csv'
    print(tmp_path)
    csv_file_path = os.path.join(r"C:\\Users\\joonw\tco\\data_engineering", tmp_path)
    gqdata.to_csv(csv_file_path, index=False)

31
data_23_09_0130_N510_E110120.csv


# You may ignore everything below:

#### The cells below contain earlier versions of the code, which can be used for debugging errors.

In [None]:
# change year, month and hour range
year=2024
month=12
days = '0131'
last_day_range = int(days[2:])+1
print(last_day_range)
def makefilenames(year,month): #year 2024 month 7 integer
    base_directory = f'{year}{month:02d}{days}/'

    file_prefixes = []
    for i in range(1,last_day_range):
        file_prefixes.append(f'{year}{month:02d}{i:02d}_')
    
    filenames = [f"{base_directory}{prefix}{hour:02d}45.nc" for prefix in file_prefixes for hour in range(0, 8)] # 6 for january 8 for else

    return filenames
filenames = makefilenames(year,month)


In [None]:
lat_s, lat_e, lon_s, lon_e = 5, 10, 110, 120

# Load every file in `filenames`, keeping one frame per file. The frames are
# concatenated once at the end; growing a DataFrame with concat inside the
# loop copies all previously loaded rows on every iteration.
frames = []
for filename in filenames:
    try:
        # Attempt to load the data
        my_loader = gems_loader_2023(filename, lat_s, lat_e, lon_s, lon_e)
        frames.append(my_loader.result())
    except FileNotFoundError:
        # Log a warning for the missing file and continue
        print(f"Warning: File not found - {filename}. Skipping this file.")

# `data` is an empty DataFrame when no file could be loaded.
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


Data quality control

In [None]:
# Whole seconds since the Unix epoch, expressed in hours.
data['Hours_elapsed'] = (data['Time'].astype('int64') // 10**9) / 3600

# Distribution of the quality flags before any filtering.
frequency_table = data['FinalAlgorithmFlags'].value_counts()
print(frequency_table)

# Flag 0 is the best. Values up to 2^1 + 2^2 + 2^7 = 134 are still kept here;
# anything above 134 is dropped.
data = data.loc[data['FinalAlgorithmFlags'].le(134)]

frequency_table2 = data['FinalAlgorithmFlags'].value_counts()
print(frequency_table2)

# First candidate filter: everything except flag 3 (only its counts are inspected).
gqdata = data.loc[data['FinalAlgorithmFlags'].ne(3)]
frequency_table3 = gqdata['FinalAlgorithmFlags'].value_counts()
print(frequency_table3)

# Final filter, used by the following cells: flags 0, 1 and 2 only.
gqdata = data.loc[data['FinalAlgorithmFlags'].le(2)]
frequency_table3 = gqdata['FinalAlgorithmFlags'].value_counts()
print(frequency_table3)

Verify that the region I chose (N05-N10, E110-E120) is consistently available over time.

In [None]:
# Build an orbit map from the quality-filtered data with the project-local
# GEMS_TCO helpers and inspect which orbits are present.
from GEMS_TCO import orbitmap
# from GEMS_TCO import smoothspace
# NOTE(review): space_average is imported but not used in the cells shown here.
from GEMS_TCO.smoothspace import space_average

# NOTE: df is an alias of gqdata, not a copy, so the string conversion below
# (and any in-place change made by the helpers) is also visible through gqdata.
df = gqdata
df['Time'] = df['Time'].astype(str)
# Passed twice to MakeOrbitdata below; presumably the grid spacing for
# latitude and longitude — TODO confirm against GEMS_TCO.orbitmap.
resolution = 0.4 

instance = orbitmap.MakeOrbitdata(df,lat_s,lat_e,lon_s,lon_e,resolution,resolution)   # arguments: data, lat_s, lat_e, lon_s, lon_e, then resolution twice
orbit_map24_7 = instance.makeorbitmap()

# Not the last expression of the cell, so this value is not displayed.
len(sorted(orbit_map24_7))

# The 'Orbit' column is not created in this notebook; presumably it is added
# to df by the orbitmap helpers above — verify.
df['Orbit'].unique() 

This shows that, for the data from 2024/07/01 to 2024/07/09, the 8th orbit is missing, so we have

240 - 9 = 231 orbits in total.

save data

In [9]:
# File name encodes the two-digit year, the month, the day span and the
# latitude/longitude bounds of the extracted region.
tmp_path = f'data_{int(str(year)[2:4])}_{month:02d}_{days}_N{str(lat_s)+str(lat_e)}_E{str(lon_s)+str(lon_e)}.csv'
csv_file_path = os.path.join(
    r"C:\\Users\\joonw\tco\\data_engineering",
    tmp_path,
)
# Write the quality-filtered rows to that CSV, leaving out the row index.
gqdata.to_csv(path_or_buf=csv_file_path, index=False)

#### Check if there is an error in opening a file

In [None]:
# Open both groups of a single file directly; an exception here points at a
# problem with the file itself rather than with the loader class.
file_path = 'D:\\GEMS_UNZIPPED\\2023080131\\20230818_0745.nc'
location, Z = (
    xr.open_dataset(file_path, group=group_name)
    for group_name in ('Geolocation Fields', 'Data Fields')
)


