In [1]:
# work with jl2815 environment

import xarray as xr # for netCDF4 
import pandas as pd
import numpy as np
from netCDF4 import Dataset
from matplotlib import pyplot as plt
import os

# !pip install xarray

In [2]:
current_directory = os.getcwd()
print("Current working directory:", current_directory)
new_directory = "D:\\GEMS_UNZIPPED"
os.chdir(new_directory)
updated_directory = os.getcwd()
print("Updated working directory:", updated_directory)

Current working directory: c:\Users\joonw\Downloads\TCO_analysis\data_engineering
Updated working directory: D:\GEMS_UNZIPPED


In [76]:
class gems_loader_2022:          
    def __init__(self, file_path = '22070106/20220701_0045.nc'):
        self.file_path = file_path                                    
  
    def extract_data(self,file_path):
        location = xr.open_dataset(file_path, group='Geolocation Fields')
        Z = xr.open_dataset(file_path, group='Data Fields')
        
        location_variables = ['Latitude', 'Longitude', 'Time']
        tmp1 = location[location_variables]

        # Convert xarray.Dataset to pandas DataFrame
        location_df = tmp1.to_dataframe().reset_index()
        location_df = location_df[location_variables]  # remove first (spatial) and second(image) column
        
        Z_variables = ['ColumnAmountO3','FinalAlgorithmFlags']
        tmp2 = Z[Z_variables]

        Z_df = tmp2.to_dataframe().reset_index()
        Z_df = Z_df[Z_variables]

        mydata = pd.concat([location_df, Z_df], axis=1)
        mydata = mydata[ location_variables + Z_variables ]
        
        print(f'Dimension of data from single orbit around {self.file_path} is {mydata.shape}')
        
        # Close the NetCDF file
        location.close()
        Z.close()
        return mydata
    
    def dropna(self):
        mydata = self.extract_data(self.file_path)
        mydata = mydata.dropna(subset=['Latitude', 'Longitude','Time','ColumnAmountO3','FinalAlgorithmFlags'])
        
        return mydata

    def result(self):
        # self.mydata =  self.extract_data(self.file_path)

        df = self.dropna()

        df2 = df[ (df['Latitude']<=10) & (df['Latitude']>=5) & (df['Longitude']>=125) & (df['Longitude']<=130) ]
        
        df2 = df2[df2.iloc[:,3]<1000]    # Cut off missing values
        # Time is elapsed seconds from 2000.1.1. 12:00 not 1970.1.1. 00:00. Also should consider leap years. For 30 years, 30/4 ~ 7 leap years
        df2['Time'] = np.mean(df2.iloc[:,2])/3600 + 24*365*30+12 + 24*7   # consider leap years, so added 24*7

        df2['Time'] = pd.to_datetime(df2['Time'], unit='h')
        df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
        
        return df2
    

Compare with hours

In [24]:
my_loader= gems_loader_2022('2022070116\\20220701_0045.nc')
df1 = my_loader.result()
df1.head()

# Round timestamps to the closest minute
df1['Time'] = df1['Time'].dt.round('min')
print(df1['Time'])
df1.head()
# Convert timestamps to seconds (since 1970 1.1,  //10**9 converts nanoseconds to seconds, /3600 this division converts seconds to hours)
df1['hours_slapsed'] = df1['Time'].astype('int64') // 10**9/3600
# 197196+ 262980 = 461076
df1.head()



Dimension of data from single orbit around 2022070116\20220701_0045.nc is (1423360, 5)
Series([], Name: Time, dtype: datetime64[ns])


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Unnamed: 0,Latitude,Longitude,Time,ColumnAmountO3,FinalAlgorithmFlags,hours_slapsed


In [80]:
# Define the base directories and file prefixes for both years
base_directories_2022 = '2022070116/'
base_directories_2023 = '2023070118/'
base_directories_2024 = '2024070112/'
file_prefixes_2022 = ['20220701_', '20220702_', '20220703_', '20220704_', '20220705_','20220706_','20220707_','20220708_','20220709_',
                      '20220710_','20220711_','20220712_','20220713_','20220714_','20220715_']
file_prefixes_2023 = ['20230701_', '20230702_', '20230703_', '20230704_', '20230705_','20230706_','20230707_','20230708_','20230709_',
                      '20230710_','20230711_','20230712_','20230713_','20230714_','20230715_']


file_prefixes_2024 = ['20240701_', '20240702_', '20240703_', '20240704_', '20240705_','20240706_','20240707_','20240708_','20240709_',
                      '20240710_', '20240711_','20240712_' ]

# Combine the prefixes for both years
file_prefixes = file_prefixes_2022 + file_prefixes_2023 + file_prefixes_2024

# hour:02d  0 means 0 should be added if smaller than 10 and 2 means 2 digits and d means integer
# Generate the list of filenames using nested list comprehensions
filenames_2022 = [f"{base_directories_2022}{prefix}{hour:02d}45.nc" for prefix in file_prefixes_2022 for hour in range(0, 8)]
filenames_2023 = [f"{base_directories_2023}{prefix}{hour:02d}45.nc" for prefix in file_prefixes_2023 for hour in range(0, 8)]
filenames_2024 = [f"{base_directories_2024}{prefix}{hour:02d}45.nc" for prefix in file_prefixes_2024 for hour in range(0, 8)]

filenames=filenames_2022 + filenames_2023 + filenames_2024

# Print the generated list
print(filenames)

['2022070116/20220701_0045.nc', '2022070116/20220701_0145.nc', '2022070116/20220701_0245.nc', '2022070116/20220701_0345.nc', '2022070116/20220701_0445.nc', '2022070116/20220701_0545.nc', '2022070116/20220701_0645.nc', '2022070116/20220701_0745.nc', '2022070116/20220702_0045.nc', '2022070116/20220702_0145.nc', '2022070116/20220702_0245.nc', '2022070116/20220702_0345.nc', '2022070116/20220702_0445.nc', '2022070116/20220702_0545.nc', '2022070116/20220702_0645.nc', '2022070116/20220702_0745.nc', '2022070116/20220703_0045.nc', '2022070116/20220703_0145.nc', '2022070116/20220703_0245.nc', '2022070116/20220703_0345.nc', '2022070116/20220703_0445.nc', '2022070116/20220703_0545.nc', '2022070116/20220703_0645.nc', '2022070116/20220703_0745.nc', '2022070116/20220704_0045.nc', '2022070116/20220704_0145.nc', '2022070116/20220704_0245.nc', '2022070116/20220704_0345.nc', '2022070116/20220704_0445.nc', '2022070116/20220704_0545.nc', '2022070116/20220704_0645.nc', '2022070116/20220704_0745.nc', '202207

# Merge 2022 data 

In [10]:
data = pd.DataFrame()
for i in range(len(filenames_2022)):
    my_loader= gems_loader_2022(filenames_2022[i])
    cur_data = my_loader.result()
    
    if  data.empty:
        data = cur_data
    else:
        data = pd.concat([data, cur_data], ignore_index=True)
    

data2022070115 = data




Dimension of data from single orbit around 2022070116/20220701_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220701_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220701_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220701_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220701_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220701_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220701_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220701_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220702_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220702_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220702_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220702_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220702_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220702_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220702_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220702_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220703_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220703_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220703_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220703_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220703_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220703_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220703_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220703_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220704_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220704_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220704_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220704_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220704_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220704_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220704_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220704_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220705_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220705_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220705_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220705_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220705_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220705_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220705_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220705_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220706_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220706_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220706_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220706_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220706_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220706_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220706_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220706_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220707_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220707_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220707_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220707_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220707_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220707_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220707_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220707_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220708_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220708_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220708_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220708_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220708_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220708_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220708_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220708_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220709_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220709_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220709_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220709_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220709_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220709_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220709_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220709_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220710_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220710_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220710_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220710_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220710_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220710_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220710_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220710_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220711_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220711_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220711_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220711_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220711_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220711_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220711_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220711_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220712_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220712_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220712_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220712_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220712_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220712_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220712_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220712_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220713_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220713_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220713_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220713_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220713_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220713_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220713_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220713_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220714_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220714_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220714_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220714_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220714_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220714_0545.nc is (1280000, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220714_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220714_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220715_0045.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220715_0145.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220715_0245.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220715_0345.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220715_0445.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220715_0545.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220715_0645.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


Dimension of data from single orbit around 2022070116/20220715_0745.nc is (1423360, 5)


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute


# MERGE 2023

I need separate loader because 'time' variable is date type in 2023.

In [107]:
class gems_loader_2023:          
    def __init__(self, file_path = '240307/20240306_2345.nc'):
        self.file_path = file_path                                    
  
    def extract_data(self,file_path):
        location = xr.open_dataset(file_path, group='Geolocation Fields')
        Z = xr.open_dataset(file_path, group='Data Fields')
        
        location_variables = ['Latitude', 'Longitude', 'Time']
        tmp1 = location[location_variables]

        # Convert xarray.Dataset to pandas DataFrame
        location_df = tmp1.to_dataframe().reset_index()
        location_df = location_df[location_variables]

        Z_variables = ['ColumnAmountO3','FinalAlgorithmFlags']
        tmp2 = Z[Z_variables]

        Z_df = tmp2.to_dataframe().reset_index()
        Z_df = Z_df[Z_variables]

        mydata = pd.concat([location_df, Z_df], axis=1)
        mydata = mydata[ location_variables + Z_variables ]
        
        # Close the NetCDF file
        location.close()
        Z.close()
        return mydata
    
    def dropna(self):
        mydata = self.extract_data(self.file_path)
        mydata = mydata.dropna(subset=['Latitude', 'Longitude','Time','ColumnAmountO3','FinalAlgorithmFlags'])
        
        return mydata

    def result(self):
        # self.mydata =  self.extract_data(self.file_path)

        df = self.dropna()
  
        df2 = df[ (df['Latitude']<= 10) & (df['Latitude']>= 5) & (df['Longitude']>=125) & (df['Longitude']<=130) ]
        
        df2 = df2[df2.iloc[:,3]<1000]    # Cut off missing values

        df2['Time'] = np.mean(df2.iloc[:,2])

        # Convert 'Time' column to datetime type
        # print(df2['Time'])

        df2['Time'] = pd.to_datetime(df2['Time'], unit='h')
        df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
        
        return df2
    

# Merge 2023 data 

In [13]:
data = pd.DataFrame()
for i in range(len(filenames_2023)):
    my_loader= gems_loader_2023(filenames_2023[i])
    cur_data = my_loader.result()
    
    if  data.empty:
        data = cur_data
    else:
        data = pd.concat([data, cur_data], ignore_index=True)
data2023070115 = data


  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time

# Merge 2024 data

In [19]:
data = pd.DataFrame()
for i in range(len(filenames_2024)):
    my_loader= gems_loader_2023(filenames_2024[i])
    cur_data = my_loader.result()
    
    if  data.empty:
        data = cur_data
    else:
        data = pd.concat([data, cur_data], ignore_index=True)
data2024070112 = data

  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time'].dt.floor('T')  # T represents minute
  df2['Time'] = df2['Time

# Merge 2022, 2023, and 2024 data

In [20]:

data = pd.concat([data2022070115, data2023070115], ignore_index=True)
data  = pd.DataFrame(data)
data.shape
data = pd.concat( [data, data2024070112], ignore_index=True )

data['Hours_elapsed'] = data['Time'].astype('int64') // 10**9/3600
data

Unnamed: 0,Latitude,Longitude,Time,ColumnAmountO3,FinalAlgorithmFlags,Hours_elapsed
0,9.990139,129.960587,2022-07-01 00:52:00,259.680634,3.0,460176.866667
1,9.990149,129.897293,2022-07-01 00:52:00,259.316742,3.0,460176.866667
2,9.990168,129.834076,2022-07-01 00:52:00,259.900146,3.0,460176.866667
3,9.990234,129.770798,2022-07-01 00:52:00,259.457855,3.0,460176.866667
4,9.990213,129.707611,2022-07-01 00:52:00,258.672577,3.0,460176.866667
...,...,...,...,...,...,...
6065253,5.019251,125.288757,2024-07-12 07:47:00,268.250549,128.0,477991.783333
6065254,5.019522,125.226509,2024-07-12 07:47:00,267.652496,0.0,477991.783333
6065255,5.019815,125.164055,2024-07-12 07:47:00,269.644623,0.0,477991.783333
6065256,5.019929,125.101486,2024-07-12 07:47:00,268.215210,0.0,477991.783333


# Save data in csv file.

In [22]:
csv_file_path = os.path.join(r"C:\\Users\\joonw\\Downloads\\TCO_analysis\\data_engineering", 'data_222324_07_0115.csv')
# Save the DataFrame to a CSV file in the specified directory
data.to_csv(csv_file_path, index=False)

# Read csv file in Python

In [23]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv('C:\\Users\\joonw\\Downloads\\TCO_analysis\\data_engineering\\data_222324_07_0115.csv')

df.head()

  df = pd.read_csv('C:\\Users\\joonw\Downloads\\TCO_analysis\\data_engineering\\data_222324_07_0115.csv')


Unnamed: 0,Latitude,Longitude,Time,ColumnAmountO3,FinalAlgorithmFlags,Hours_elapsed
0,9.990139,129.96059,2022-07-01 00:52:00,259.68063,3.0,460176.866667
1,9.990149,129.8973,2022-07-01 00:52:00,259.31674,3.0,460176.866667
2,9.990168,129.83408,2022-07-01 00:52:00,259.90015,3.0,460176.866667
3,9.990234,129.7708,2022-07-01 00:52:00,259.45786,3.0,460176.866667
4,9.990213,129.70761,2022-07-01 00:52:00,258.67258,3.0,460176.866667


# Control data quality

In [27]:
data = df

frequency_table = data['FinalAlgorithmFlags'].value_counts()
print(frequency_table)

# 0's are best but consider 2^1 + 2^2 + 2^7

# OVER 2^1+2^2+2^7 = 134 is out.
data = data[data['FinalAlgorithmFlags']<=134]

frequency_table2 = data['FinalAlgorithmFlags'].value_counts()
print(frequency_table2)

gqdata = data[data['FinalAlgorithmFlags']!=3]
frequency_table3= gqdata['FinalAlgorithmFlags'].value_counts()
print(frequency_table3)

FinalAlgorithmFlags
2.0       2326228
0.0       2014134
2177.0     559159
3.0        458775
2179.0     340026
130.0      157305
128.0       86959
2193.0      63972
145.0       58246
2225.0        167
2195.0        159
1154.0         84
177.0          17
2209.0         14
1026.0          7
161.0           5
163.0           1
Name: count, dtype: int64
FinalAlgorithmFlags
2.0      2326228
0.0      2014134
3.0       458775
130.0     157305
128.0      86959
Name: count, dtype: int64
FinalAlgorithmFlags
2.0      2326228
0.0      2014134
130.0     157305
128.0      86959
Name: count, dtype: int64


In [28]:
csv_file_path = os.path.join(r"C:\\Users\\joonw\Downloads\\TCO_analysis\\data_engineering", 'gq_data_222324_07_0115.csv')
# Save the DataFrame to a CSV file in the specified directory
gqdata.to_csv(csv_file_path, index=False)
