In [None]:
# default_exp nc4_convert

# NASA NC4 Loader

This module allows you to:
- load Level 2 NC4 files from disk
- read the data with the netCDF4 library
- save them to CSV

Whole directories are processed in one go, one file-name pattern at a time.

In [None]:
#hide
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
#%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
#export
import os
import glob
from netCDF4 import Dataset
import numpy as np
import pandas as pd
from fastprogress.fastprogress import master_bar, progress_bar

## Get the content of a directory
A glob pattern lets you select only some of the files, which is useful to process a single month, for example.

In [None]:
#export
def get_file_list(nc4_directory, pattern='*.nc4'):
    '''
    List the files of `nc4_directory` whose name matches the glob `pattern`.

    The directory is resolved with `os.path.realpath` first, so the returned
    paths are absolute. The result is sorted: `glob.glob` returns entries in
    arbitrary order, and callers index the list (e.g. `[-1]` for the last file).
    Returns an empty list when nothing matches.
    '''
    nc4_directory = os.path.realpath(nc4_directory)
    return sorted(glob.glob(os.path.join(nc4_directory, pattern)))

In [None]:
# NOTE(review): `[-1]` is only the most recent file if get_file_list returns a
# sorted list; glob.glob on its own gives no ordering guarantee.
last_nc4 = get_file_list('/media/NAS-Divers/dev/datasets/OCO2/nc4-v9/')[-1]
last_nc4
# The v10 lookup overwrites `last_nc4`, so the cells below work on the v10 file.
last_nc4 = get_file_list('/media/NAS-Divers/dev/datasets/OCO2/nc4-v10/')[-1]
last_nc4

'/media/NAS-Divers/dev/datasets/OCO2/nc4-v9/oco2_LtCO2_191231_B9003r_200130003035s.nc4'

'/media/NAS-Divers/dev/datasets/OCO2/nc4-v10/oco2_LtCO2_20200407_B10offline.nc4'

## Open the file with netCDF4

In [None]:
#export
def get_nc4_raw_content(one_file):
    '''
    Open `one_file` read-only with netCDF4 and return the `Dataset` handle.

    The handle is returned open; closing it is left to the caller.
    '''
    nc4_handle = Dataset(one_file, mode='r')
    return nc4_handle

This gives you direct access to the file, so you can inspect its content yourself.

In [None]:
# Show the root group, then the three sub-groups read by get_np_table.
# NOTE(review): every call below opens the file again and none of the returned
# Dataset handles is closed explicitly.
get_nc4_raw_content(last_nc4)
get_nc4_raw_content(last_nc4).groups['Sounding']
get_nc4_raw_content(last_nc4).groups['Meteorology']
get_nc4_raw_content(last_nc4).groups['Retrieval']



<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    filter_function: oco2_xco2_quality_flag_b10
    bc_function: oco2_xco2_bias_correct_b10
    warn_level_function: From_L2Std
    Bias_Correction_land: XCO2_Bias_Corrected = (XCO2_Raw + 0.855*(dpfrac + 0.0) + 0.0335*(co2_grad_del - 5.00) + 0.335*(logDWS + 5.00) - 5.20*(aod_fine - 0.0300) - footprint_bias)/0.99580
    Bias_Correction_oceanGL: XCO2_Bias_Corrected = (XCO2_Raw + 0.213*(dp_sco2 + 0.0) - 0.0870*((co2_grad_del<(0.)) + 6.00) - footprint_bias)/0.99560
    Bias_Correction_oceanND: XCO2_Bias_Corrected = (XCO2_Raw + 0.213*(dp_sco2 + 0.0) - 0.0870*((co2_grad_del<(0.)) + 6.00) - footprint_bias)/0.99560
    Footprint_bias_land: Assumed footprint biases in xco2 [ppm] for footprints 1-8: -0.51, -0.24, -0.20, -0.15, 0.11, 0.40, 0.20, 0.39
    Footprint_bias_oceanGL: Assumed footprint biases in xco2 [ppm] for footprints 1-8: -0.37, -0.10, -0.15, -0.10, 0.04, 0.28, 0.11, 0.29
    Footprint_bias_oceanND

<class 'netCDF4._netCDF4.Group'>
group /Sounding:
    dimensions(sizes): 
    variables(dimensions): float32 solar_azimuth_angle(sounding_id), float32 sensor_azimuth_angle(sounding_id), float32 polarization_angle(sounding_id), int8 land_fraction(sounding_id), float32 glint_angle(sounding_id), float32 airmass(sounding_id), float32 snr_o2a(sounding_id), float32 snr_wco2(sounding_id), float32 snr_sco2(sounding_id), int32 l1b_type(sounding_id), int8 operation_mode(sounding_id), int32 orbit(sounding_id), uint8 path(sounding_id), int8 footprint(sounding_id), int8 land_water_indicator(sounding_id), float32 altitude(sounding_id), float32 altitude_stddev(sounding_id)
    groups: 

<class 'netCDF4._netCDF4.Group'>
group /Meteorology:
    dimensions(sizes): 
    variables(dimensions): float32 psurf_apriori_o2a(sounding_id), float32 psurf_apriori_wco2(sounding_id), float32 psurf_apriori_sco2(sounding_id), float32 windspeed_u_met(sounding_id), float32 windspeed_v_met(sounding_id)
    groups: 

<class 'netCDF4._netCDF4.Group'>
group /Retrieval:
    dimensions(sizes): 
    variables(dimensions): int8 surface_type(sounding_id), float32 psurf(sounding_id), float32 SigmaB(levels), float32 windspeed(sounding_id), float32 windspeed_apriori(sounding_id), float32 psurf_apriori(sounding_id), float32 t700(sounding_id), float32 fs(sounding_id), float32 fs_rel(sounding_id), float32 tcwv(sounding_id), float32 tcwv_apriori(sounding_id), float32 tcwv_uncertainty(sounding_id), float32 xco2_raw(sounding_id), float32 dp(sounding_id), float32 dp_o2a(sounding_id), float32 dp_sco2(sounding_id), float32 dpfrac(sounding_id), float32 s31(sounding_id), float32 s32(sounding_id), float32 co2_grad_del(sounding_id), float32 dws(sounding_id), float32 eof3_3_rel(sounding_id), int8 snow_flag(sounding_id), float32 aod_dust(sounding_id), float32 aod_bc(sounding_id), float32 aod_oc(sounding_id), float32 aod_seasalt(sounding_id), float32 aod_sulfate(sounding_id), float32 aod_strataer(sounding_id), float32 aod_w

To get the version of the NASA pipeline :

In [None]:
# BuildId is an attribute of the root group; get_np_table compares its first
# three characters with 'B10' to choose which variables to read.
get_nc4_raw_content(last_nc4).BuildId

'B10.0.04'

## Extract the useful information

In [None]:
#export
# Column names of the table built by get_np_table, in stacking order.
columns=['flag','sounding_id', 'latitude', 'longitude', 'xco2', 'xco2_uncert', 'orbit', 'windspeed_u', 'windspeed_v',
    'surface_pressure_apriori', 'surface_pressure', 'altitude', 'land_water_indicator', 'land_fraction', 'tcwv', 'tcwv_apriori', 'tcwv_uncertainty']
def get_np_table(one_file):
    '''
    Read one NC4 file and return its soundings as a 2D numpy array.

    The array has one column per entry of `columns`, in the same order.
    If the file cannot be opened, an error line is printed and an empty
    array of shape (0, len(columns)) is returned, so one unreadable file does
    not stop a batch. Errors raised while reading the variables (e.g. a missing
    variable) are not swallowed.

    Documentation of data : https://docserver.gesdisc.eosdis.nasa.gov/public/project/OCO/OCO2_DUG.V9.pdf
    '''
    try:
        file_nc = Dataset(one_file, 'r')
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts a long batch.
        print('ERROR reading', one_file)
        return np.empty((0,len(columns)))
    try:
        root = file_nc.variables
        sounding = file_nc.groups['Sounding'].variables
        meteorology = file_nc.groups['Meteorology'].variables
        retrieval = file_nc.groups['Retrieval'].variables
        # The only difference between B10 files and the others is the name of
        # the a-priori surface pressure variable.
        if file_nc.BuildId[0:3] == 'B10':
            psurf_apriori = meteorology['psurf_apriori_o2a']
        else:
            psurf_apriori = meteorology['psurf_apriori']
        # The stack must be built while the file is still open.
        return np.column_stack((
            root['xco2_quality_flag'], root['sounding_id'], root['latitude'], root['longitude'],
            root['xco2'], root['xco2_uncertainty'], sounding['orbit'],
            meteorology['windspeed_u_met'], meteorology['windspeed_v_met'],
            psurf_apriori, retrieval['psurf'],
            sounding['altitude'], sounding['land_water_indicator'], sounding['land_fraction'],
            retrieval['tcwv'], retrieval['tcwv_apriori'], retrieval['tcwv_uncertainty']))
    finally:
        # Release the file handle even when a variable is missing.
        file_nc.close()

In [None]:
# The table has one column per entry of `columns` (quality flag included).
np_table =  get_np_table(last_nc4)
np_table.shape

(202617, 17)

## Convert them to Pandas
This function takes a list of files as argument and loads them all into a single Pandas DataFrame.

In [None]:
#export
def get_dataframe(nc4_list, master_progress_bar = None):
    '''
    Load every file of `nc4_list` into a single Pandas DataFrame.

    Each file is read with `get_np_table`. Only rows whose quality flag equals 0
    are kept, and the `flag` column is then removed, so the result has every
    entry of `columns` except 'flag'. `sounding_id` and `orbit` are cast to int.
    When no data could be read, an empty DataFrame with those same columns is
    returned.

    `master_progress_bar` is an optional fastprogress master bar used as parent
    of the per-file progress bar; one is created when it is not given.
    '''
    if master_progress_bar is None:
        master_progress_bar = master_bar([0])
        # Exhaust the single-item master bar once; presumably this initialises
        # it so that the child progress bar can attach to it (TODO confirm).
        for _ in master_progress_bar: pass

    # Collect the per-file tables and concatenate once: concatenating inside
    # the loop copies all the accumulated data again for every file.
    # The leading empty table keeps the result well-defined for an empty list.
    tables = [np.empty((0, len(columns)))]
    for one_file in progress_bar(nc4_list, parent=master_progress_bar):
        tables.append(get_np_table(one_file))
    month_data = np.concatenate(tables, axis=0)

    kept_columns = [name for name in columns if name != 'flag']
    if month_data.size == 0:
        # Same columns as the non-empty result, so outputs stay consistent.
        return pd.DataFrame(columns=kept_columns)
    df = pd.DataFrame(month_data, columns=columns)
    # using dictionary to convert specific columns (https://www.geeksforgeeks.org/change-data-type-for-one-or-more-columns-in-pandas-dataframe/)
    df = df.astype({'sounding_id': int, 'orbit': int})
    # Remove bad quality rows, then the flag itself (no in-place edit of a slice).
    return df[df['flag'] == 0].drop(columns=['flag'])

In [None]:
# Rows with a non-zero quality flag are dropped by get_dataframe, which is why
# the index shown below has gaps.
df = get_dataframe([last_nc4])
df.head(3)

Unnamed: 0,sounding_id,latitude,longitude,xco2,xco2_uncert,orbit,windspeed_u,windspeed_v,surface_pressure_apriori,surface_pressure,altitude,land_water_indicator,land_fraction,tcwv,tcwv_apriori,tcwv_uncertainty
7,2020040700492638,-42.680946,-163.926727,409.077057,0.494898,30665,-8.192998,-1.161757,1005.479065,996.547852,0.0,1.0,0.0,18.573519,17.652161,0.109404
13,2020040700492703,-42.659702,-163.818771,409.052002,0.489938,30665,-7.951118,-1.333183,1005.495911,996.65625,0.0,1.0,0.0,18.361076,17.563622,0.097325
16,2020040700492707,-42.648354,-163.912247,408.720642,0.471686,30665,-8.057251,-1.286553,1005.430115,997.372009,0.0,1.0,0.0,18.235178,17.771111,0.093667


In [None]:
# Summary statistics of the rows kept after the quality-flag filter.
df.describe()

Unnamed: 0,sounding_id,latitude,longitude,xco2,xco2_uncert,orbit,windspeed_u,windspeed_v,surface_pressure_apriori,surface_pressure,altitude,land_water_indicator,land_fraction,tcwv,tcwv_apriori,tcwv_uncertainty
count,102672.0,102672.0,102672.0,102672.0,102672.0,102672.0,102672.0,102672.0,102672.0,102672.0,102672.0,102672.0,102672.0,102672.0,102672.0,102672.0
mean,2020041000000000.0,6.799648,-0.213743,412.5322,0.45339,30671.647927,-0.474404,-0.649793,996.979927,996.165778,153.193501,0.70433,29.814536,25.079223,25.38907,0.08961
std,6811327.0,27.727262,103.142932,2.655431,0.099719,4.137757,4.379928,3.129051,38.246736,37.936126,348.045096,0.462135,45.73377,13.437647,13.318149,0.047803
min,2020041000000000.0,-50.229252,-179.998718,401.247559,0.259756,30665.0,-12.165148,-14.724771,569.062927,574.740662,-10.927058,0.0,0.0,1.488046,1.375631,0.007579
25%,2020041000000000.0,-20.594182,-93.544983,409.817978,0.38622,30667.0,-4.173923,-3.127896,1002.930862,1001.288177,0.0,0.0,0.0,14.507736,15.098022,0.050798
50%,2020041000000000.0,10.888887,-5.311413,413.222961,0.432248,30672.0,-0.819255,-0.489899,1011.487122,1010.897705,0.0,1.0,0.0,23.878181,23.651602,0.085683
75%,2020041000000000.0,29.144582,93.615358,414.809303,0.495875,30675.0,3.014307,1.558952,1015.685547,1014.205765,117.972736,1.0,100.0,35.18096,35.131514,0.122442
max,2020041000000000.0,62.63401,179.999802,424.836792,1.217122,30679.0,13.394435,9.827548,1032.37561,1032.910767,4751.152344,3.0,100.0,64.513527,68.197533,0.304702


## Helper functions to process many files

In [None]:
# export
def get_pattern_yearmonth(first_year=14, last_year=20):
    '''
    Generate the list of every year+month pattern from `first_year` to
    `last_year`, both included, with the twelve months of each year.

    With the defaults this is every YYMM from 2014 to 2020 ('1401' ... '2012').
    Years are written as given, zero-padded to at least two digits, so
    four-digit years produce YYYYMM patterns (e.g. 2020 -> '202001').
    Returns an empty list when `first_year` is greater than `last_year`.
    '''
    return [f'{year:02d}{month:02d}'
            for year in range(first_year, last_year + 1)
            for month in range(1, 12 + 1)]

def process_files(input_dir, output_dir, patterns):
    '''
    Process all NC4 files corresponding to the patterns list.

    For each pattern, the files of `input_dir` named
    'oco2_LtCO2_' + pattern + '*.nc4' are loaded with `get_dataframe` and saved
    in `output_dir` as 'oco2_' + pattern + '.csv.bz2' (';' separator, bz2
    compression, no index). A pattern matching no file only writes a warning.

    Raises ValueError (an Exception subclass) when `patterns` is empty.
    '''
    if len(patterns) < 1:
        raise ValueError("ERROR You must give an array pattern !")
    master_progress_bar = master_bar(patterns)
    for pattern in master_progress_bar:
        # Get the file list in directory
        nc4_list = get_file_list(input_dir, pattern='oco2_LtCO2_'+pattern+"*.nc4")
        master_progress_bar.write(f'Files to process for {pattern} : {len(nc4_list)}')
        # `> 0`, not `> 1`: a pattern matching exactly one file must be processed too.
        if len(nc4_list) > 0:
            df = get_dataframe(nc4_list, master_progress_bar)
            master_progress_bar.write(f'Saving {pattern} to disk...')
            # os.path.join works with or without a trailing separator on output_dir.
            csv_path = os.path.join(output_dir, 'oco2_'+pattern+'.csv.bz2')
            df.to_csv(csv_path, sep=';', index=False, compression='bz2')
            # Free the frame before the next pattern is loaded.
            del df
        else:
            master_progress_bar.write(f'WARNING : No file for {pattern}')

# Example usage

In [None]:
years_months = get_pattern_yearmonth()
print(years_months)
input_dir = r'/media/NAS-Divers/dev/datasets/OCO2/nc4-v10/'
output_dir = r'/media/data-nvme/dev/datasets/OCO2/csv-v10/'
# Only one pattern ('202003') is processed here.
# NOTE(review): years_months holds YYMM patterns, while this call uses a YYYYMM
# pattern; the v10 file name shown earlier (oco2_LtCO2_20200407_...) starts with
# a four-digit year, so confirm the pattern format before passing years_months.
process_files(input_dir, output_dir, ['202003'])

['1401', '1402', '1403', '1404', '1405', '1406', '1407', '1408', '1409', '1410', '1411', '1412', '1501', '1502', '1503', '1504', '1505', '1506', '1507', '1508', '1509', '1510', '1511', '1512', '1601', '1602', '1603', '1604', '1605', '1606', '1607', '1608', '1609', '1610', '1611', '1612', '1701', '1702', '1703', '1704', '1705', '1706', '1707', '1708', '1709', '1710', '1711', '1712', '1801', '1802', '1803', '1804', '1805', '1806', '1807', '1808', '1809', '1810', '1811', '1812', '1901', '1902', '1903', '1904', '1905', '1906', '1907', '1908', '1909', '1910', '1911', '1912', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012']


In [None]:
# Same conversion for the v9 files, limited to the '1409' (YYMM) pattern.
# A file that cannot be opened is reported by get_np_table ("ERROR reading ...")
# and contributes no rows.
input_dir = r'/media/NAS-Divers/dev/datasets/OCO2/nc4-v9/'
output_dir = r'/media/data-nvme/dev/datasets/OCO2/csv-v9/'
process_files(input_dir, output_dir, ['1409'])

ERROR reading /media/NAS-Divers/dev/datasets/OCO2/nc4-v9/oco2_LtCO2_140910_B9003r_180927220447s.nc4


In [None]:
# nbdev export step: converts the project's notebooks to Python modules
# (one "Converted ..." line is printed per notebook).
from nbdev.export import notebook2script
notebook2script()

Converted 03_25_OCO2_Data_Exploration.ipynb.
Converted 04_01_OCO2_Work_Base.ipynb.
Converted 04_04_OCO2_China_Peaks.ipynb.
Converted 04_15_OCO2_Laiwu_Peak_Detection.ipynb.
Converted CO2_emissions_Inventory_data.ipynb.
Converted Christian-datasets-Distances.ipynb.
Converted Find_Peaks_with_LSTM_autoencoders.ipynb.
Converted Laiwu_Plume-more_data.ipynb.
Converted Laiwu_Plume-more_data_CD_exploration_selection_peaks.ipynb.
Converted Laiwu_Plume.ipynb.
Converted Untitled1.ipynb.
Converted WIP_OCO2_Capture.ipynb.
Converted WIP_OCO2_Peaks_Wind.ipynb.
Converted WIP_OCO2_Peaks_Wind_Visualization.ipynb.
Converted bco_playground.ipynb.
Converted find_peak_bco_test.ipynb.
Converted index.ipynb.
Converted oco2peak-datasets.ipynb.
Converted oco2peak-find_peak.ipynb.
Converted oco2peak-map.ipynb.
Converted oco2peak-nc4_convert.ipynb.
Converted oco2peak-swift_utils.ipynb.
Converted oco2peak_find_source.ipynb.
Converted show_map.ipynb.
Converted view_peak.ipynb.


In [None]:
# NOTE(review): leftover scratch computation with no stated meaning; explain
# what the two numbers are or remove the cell.
628/1354

0.4638109305760709