## Import Basic Libraries

In [1]:
import numpy as np
import pandas as pd
import glob
import sys
import h5py
#from netCDF4 import Dataset
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree

import pyarrow as pa
import pyarrow.parquet as pq

from functools import reduce
import operator
import gc

In [2]:
# Figure defaults: serif text set in Times New Roman at size 16, STIX font set for math text.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
    'font.size': 16,
    'mathtext.fontset': 'stix',
})

## Define SparkSession and sparkContext

In [3]:
# PySpark packages
from pyspark import SparkContext   
from pyspark.sql import SparkSession

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W


# Create (or reuse) the Spark session on YARN with the driver and executor
# resources requested below.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("spark-shell")
    .config("spark.driver.maxResultSize", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "14g")
    .config("spark.executor.cores", "2")
    .config("spark.executor.instances", "50")
    .getOrCreate()
)


# Session-level SQL options applied after the session exists.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.debug.maxToStringFields", 500)

# Keep a handle on the underlying SparkContext and point its checkpoints at HDFS.
sc = spark.sparkContext
sc.setCheckpointDir("hdfs://spark00:54310/tmp/checkpoints")

In [4]:
sc.getConf().getAll()[:10]

[('spark.driver.memory', '32g'),
 ('spark.driver.appUIAddress', 'http://spark00:4040'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',
  'spark09'),
 ('spark.driver.maxResultSize', '32g'),
 ('spark.master', 'yarn'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.executor.cores', '2'),
 ('spark.driver.port', '35523'),
 ('spark.serializer.objectStreamReset', '100')]

## Read h5 files 

In [5]:
h5dir = '/mnt/raid5/shong/oco2L2/'

In [6]:
!ls /mnt/raid5/shong/oco2L2/

001  021  047  067  087  107  127  147	167  187  207  227  247  269
002  022  048  068  088  108  128  148	168  188  208  228  248  270
003  023  049  069  089  109  129  149	169  189  209  229  249  271
004  024  050  070  090  110  130  150	170  190  210  230  250  272
005  025  051  071  091  111  131  151	171  191  211  231  251  273
006  026  052  072  092  112  132  152	172  192  212  232  252  allfile.list
007  027  053  073  093  113  133  153	173  193  213  233  253  etc
008  028  054  074  094  114  134  154	174  194  214  234  254  julyfile.list
009  029  055  075  095  115  135  155	175  195  215  235  255
010  030  056  076  096  116  136  156	176  196  216  236  258
011  031  057  077  097  117  137  157	177  197  217  237  259
012  032  058  078  098  118  138  158	178  198  218  238  260
013  039  059  079  099  119  139  159	179  199  219  239  261
014  040  060  080  100  120  140  160	180  200  220  240  262
015  041  061  081  101  121  141  161	181  2

In [7]:
# Build the list of absolute HDF5 paths from the July file listing.
# Every non-empty line has its first two characters dropped before being joined
# to h5dir (presumably a leading "./" written by the tool that produced the
# listing -- TODO confirm against julyfile.list).
# Blank lines are skipped so they cannot add a bogus entry equal to h5dir itself.
with open(h5dir + 'julyfile.list', "r") as f:
    h5list = [h5dir + entry[2:] for entry in map(str.strip, f) if entry]

In [8]:
h5list[:2]

['/mnt/raid5/shong/oco2L2/208/oco2_L2StdGL_37601a_210727_B10206r_211102142403.h5',
 '/mnt/raid5/shong/oco2L2/208/oco2_L2StdGL_37607a_210727_B10206r_211102160738.h5']

In [9]:
h5list[-2:]

['/mnt/raid5/shong/oco2L2/203/oco2_L2StdXS_37535a_210722_B10206r_211101181754.h5',
 '/mnt/raid5/shong/oco2L2/203/oco2_L2StdND_37532a_210722_B10206r_211101173743.h5']

In [10]:
numh5list = len(h5list)
print(numh5list)

612


In [11]:
h5py.is_hdf5(h5list[0])

True

In [12]:
inh5file = h5list[0]

In [13]:
# Open the selected granule read-only. The handle is deliberately left open
# because the following cells read from it.
try:
    h5f = h5py.File(inh5file, "r")
except IOError as e:
    print("Error opening HDF5 file:", str(e))
    # Re-raise: without a valid handle `h5f` is either undefined or a stale
    # object from an earlier run, and every later cell would misbehave.
    raise
# Don't forget h5f.close() when done!

## Explore the `h5`

In [14]:
# Walk the top-level members of the file and report what kind of object each is.
for name in h5f:
    member = h5f[name]

    if isinstance(member, h5py.Dataset):
        # Datasets additionally report their element type.
        print(f"Dataset: {name}, dtype: {member.dtype}")
    elif isinstance(member, h5py.Group):
        print(f"Group: {name}")
    else:
        print(f"Unknown item: {name}")

Group: AerosolResults
Group: AlbedoResults
Group: BRDFResults
Group: Dimensions
Group: DispersionResults
Group: FootprintMeteorology
Group: L1bScSoundingReference
Group: L1bScSpectralParameters
Group: Metadata
Group: PreprocessingResults
Group: RetrievalGeometry
Group: RetrievalHeader
Group: RetrievalResults
Group: Shapes
Group: SpectralParameters


In [15]:
# List every member of the RetrievalResults group; datasets also show dtype and shape.
results_group = h5f['RetrievalResults']
for name in results_group:
    member = results_group[name]

    if isinstance(member, h5py.Dataset):
        print(f"Dataset: {name}, dtype: {member.dtype}, shape: {member.shape}")
    elif isinstance(member, h5py.Group):
        print(f"Group: {name}")
    else:
        print(f"Unknown item: {name}")

Dataset: apriori_o2_column, dtype: float32, shape: (12952,)
Dataset: co2_profile, dtype: float32, shape: (12952, 20)
Dataset: co2_profile_apriori, dtype: float32, shape: (12952, 20)
Dataset: co2_profile_averaging_kernel_matrix, dtype: float32, shape: (12952, 20, 20)
Dataset: co2_profile_covariance_matrix, dtype: float32, shape: (12952, 20, 20)
Dataset: co2_profile_uncert, dtype: float32, shape: (12952, 20)
Dataset: co2_vertical_gradient_delta, dtype: float32, shape: (12952,)
Dataset: diverging_steps, dtype: int16, shape: (12952,)
Dataset: dof_co2_profile, dtype: float32, shape: (12952,)
Dataset: dof_full_vector, dtype: float32, shape: (12952,)
Dataset: eof_1_scale_apriori_o2, dtype: float32, shape: (12952,)
Dataset: eof_1_scale_apriori_strong_co2, dtype: float32, shape: (12952,)
Dataset: eof_1_scale_apriori_weak_co2, dtype: float32, shape: (12952,)
Dataset: eof_1_scale_o2, dtype: float32, shape: (12952,)
Dataset: eof_1_scale_strong_co2, dtype: float32, shape: (12952,)
Dataset: eof_1_sc

In [16]:
h5f['RetrievalResults/xco2'][:]

array([0.00041481, 0.00041721, 0.00041088, ..., 0.00042293, 0.00042272,
       0.00042121], dtype=float32)

In [17]:
# List every member of the RetrievalGeometry group; datasets also show dtype and shape.
geometry_group = h5f['RetrievalGeometry']
for name in geometry_group:
    member = geometry_group[name]

    if isinstance(member, h5py.Group):
        print(f"Group: {name}")
        continue
    if isinstance(member, h5py.Dataset):
        print(f"Dataset: {name}, dtype: {member.dtype}, shape: {member.shape}")
        continue
    # Anything that is neither a group nor a dataset.
    print(f"Unknown item: {name}")

Dataset: retrieval_altitude, dtype: float32, shape: (12952,)
Dataset: retrieval_altitude_per_band, dtype: float32, shape: (12952, 3)
Dataset: retrieval_altitude_uncert, dtype: float32, shape: (12952,)
Dataset: retrieval_aspect, dtype: float32, shape: (12952,)
Dataset: retrieval_azimuth, dtype: float32, shape: (12952,)
Dataset: retrieval_center_offset_o2_weak_co2, dtype: float32, shape: (12952,)
Dataset: retrieval_center_offset_strong_co2_o2, dtype: float32, shape: (12952,)
Dataset: retrieval_center_offset_weak_co2_strong_co2, dtype: float32, shape: (12952,)
Dataset: retrieval_land_fraction, dtype: float32, shape: (12952,)
Dataset: retrieval_land_water_indicator, dtype: int8, shape: (12952,)
Dataset: retrieval_latitude, dtype: float32, shape: (12952,)
Dataset: retrieval_latitude_geoid, dtype: float32, shape: (12952,)
Dataset: retrieval_longitude, dtype: float32, shape: (12952,)
Dataset: retrieval_longitude_geoid, dtype: float32, shape: (12952,)
Dataset: retrieval_los_surface_bidirection

In [18]:
h5f['RetrievalGeometry/retrieval_latitude'][:]

array([-54.449707, -54.442287, -54.431953, ...,  79.32484 ,  79.333755,
        79.33841 ], dtype=float32)

In [19]:
h5f['RetrievalGeometry/retrieval_longitude'][:]

array([ 94.60168 ,  94.63712 ,  94.59557 , ..., -25.348944, -25.363718,
       -25.472635], dtype=float32)

In [20]:
# List every member of the RetrievalHeader group; datasets also show dtype and shape.
header_group = h5f['RetrievalHeader']
for name in header_group:
    member = header_group[name]

    if isinstance(member, h5py.Dataset):
        print(f"Dataset: {name}, dtype: {member.dtype}, shape: {member.shape}")
    elif isinstance(member, h5py.Group):
        print(f"Group: {name}")
    else:
        print(f"Unknown item: {name}")

Dataset: frame_index, dtype: int32, shape: (12952,)
Dataset: retrieval_time_string, dtype: |S25, shape: (12952,)
Dataset: retrieval_time_tai93, dtype: float64, shape: (12952,)
Dataset: sounding_id, dtype: int64, shape: (12952,)
Dataset: sounding_index, dtype: int32, shape: (12952,)
Dataset: sounding_operation_mode, dtype: |S3, shape: (12952,)


In [21]:
h5f['RetrievalHeader/retrieval_time_string'][:]

array([b'2021-07-27T07:45:14.056Z', b'2021-07-27T07:45:14.363Z',
       b'2021-07-27T07:45:14.389Z', ..., b'2021-07-27T08:29:35.615Z',
       b'2021-07-27T08:29:35.975Z', b'2021-07-27T08:29:36.669Z'],
      dtype='|S25')

In [22]:
h5f['RetrievalHeader/retrieval_time_string'][0].decode()

'2021-07-27T07:45:14.056Z'

> Which groups and datasets are needed to match `xco2` with its corresponding `L1` data?

## Save selected features as a Parquet file

In [23]:
inh5file

'/mnt/raid5/shong/oco2L2/208/oco2_L2StdGL_37601a_210727_B10206r_211102142403.h5'

In [24]:
inh5file.split('oco2_')[-1]

'L2StdGL_37601a_210727_B10206r_211102142403.h5'

In [25]:
fname = inh5file.split('oco2_')[-1]

In [26]:
# Spark schema for the exported table: two string columns (filename, time_str)
# and eight float columns. Every field is declared nullable.
schema = T.StructType([
    T.StructField(colname, coltype, True)
    for colname, coltype in (
        ('filename', T.StringType()),
        ('altitude', T.FloatType()),
        ('longitude', T.FloatType()),
        ('latitude', T.FloatType()),
        ('aspect', T.FloatType()),
        ('slope', T.FloatType()),
        ('sol_az', T.FloatType()),
        ('sol_zn', T.FloatType()),
        ('xco2', T.FloatType()),
        ('time_str', T.StringType()),
    )
])

In [27]:
# Number of entries in the xco2 dataset, taken from the dataset's shape metadata
# so the values are not read from disk just to be counted.
numresults = h5f['RetrievalResults/xco2'].shape[0]
print(numresults)

12952


In [28]:
# Start every per-column accumulator as its own, independent empty list.
(
    fillist,
    altlist,
    lonlist,
    latlist,
    asplist,
    slolist,
    sazlist,
    sznlist,
    tstrlist,
    xcolist,
) = ([] for _ in range(10))

In [29]:
# Repeat the source file name once per row so it can become the `filename` column.
fillist = [fname] * numresults

In [30]:
# Read each selected dataset in full and convert it to a plain Python list.
# The unpacking order below matches the order of the dataset paths.
(
    altlist,
    lonlist,
    latlist,
    asplist,
    slolist,
    sazlist,
    sznlist,
    xcolist,
) = (
    h5f[path][:].tolist()
    for path in (
        'RetrievalGeometry/retrieval_altitude',
        'RetrievalGeometry/retrieval_longitude',
        'RetrievalGeometry/retrieval_latitude',
        'RetrievalGeometry/retrieval_aspect',
        'RetrievalGeometry/retrieval_slope',
        'RetrievalGeometry/retrieval_solar_azimuth',
        'RetrievalGeometry/retrieval_solar_zenith',
        'RetrievalResults/xco2',
    )
)

In [31]:
# The time strings are stored as bytes; decode each one into a Python str.
tstrlist = list(map(bytes.decode, h5f['RetrievalHeader/retrieval_time_string'][:].tolist()))

In [32]:
len(tstrlist)

12952

In [33]:
tstrlist[:2]

['2021-07-27T07:45:14.056Z', '2021-07-27T07:45:14.363Z']

In [34]:
tstrlist[-2:]

['2021-07-27T08:29:35.975Z', '2021-07-27T08:29:36.669Z']

In [40]:
%%time
# Combine the column lists row-wise, in schema order, and build the Spark DataFrame.
# zip() stops at the shortest list, so all lists are expected to have equal length.
sparkdf = spark.createDataFrame(
    zip(
        fillist, altlist, lonlist, latlist, asplist,
        slolist, sazlist, sznlist, xcolist, tstrlist,
    ),
    schema,
)

CPU times: user 49.4 ms, sys: 417 µs, total: 49.8 ms
Wall time: 909 ms


In [41]:
outdir = 'hdfs://spark00:54310/user/shong/data/parquet/oco2L2/'

In [42]:
# Derive the output path from the granule name: keep everything after the last
# 'oco2_' and swap only a trailing ".h5" extension for ".parquet.snappy".
# A blanket str.replace("h5", ...) would also rewrite any other "h5" substring
# that happens to occur inside the file name.
h5name = inh5file.split('oco2_')[-1]
outname = outdir + (h5name[:-len(".h5")] if h5name.endswith(".h5") else h5name) + ".parquet.snappy"
print(outname)

hdfs://spark00:54310/user/shong/data/parquet/oco2L2/L2StdGL_37601a_210727_B10206r_211102142403.parquet.snappy


In [43]:
%%time
# Write the DataFrame to `outname` as snappy-compressed Parquet, replacing any
# existing output. The format is named explicitly so the result does not depend
# on the session's default data source (spark.sql.sources.default).
(
    sparkdf.write
    .format("parquet")
    .option("compression", "snappy")
    .mode("overwrite")
    .save(outname)
)

CPU times: user 3.27 ms, sys: 1.57 ms, total: 4.84 ms
Wall time: 7.13 s


### Inspect the DataFrame that was written to Parquet

In [44]:
sparkdf.printSchema()

root
 |-- filename: string (nullable = true)
 |-- altitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- aspect: float (nullable = true)
 |-- slope: float (nullable = true)
 |-- sol_az: float (nullable = true)
 |-- sol_zn: float (nullable = true)
 |-- xco2: float (nullable = true)
 |-- time_str: string (nullable = true)



In [45]:
%%time
print(sparkdf.count())

12952
CPU times: user 3.71 ms, sys: 0 ns, total: 3.71 ms
Wall time: 635 ms


In [46]:
%%time
# Preview two rows as a transposed pandas table (one line per column).
# NOTE(review): limit(2) is applied without an ordering, so these are not
# necessarily the first rows of the file -- the sample displayed below differs
# from the rows printed by show(3).
sparkdf.limit(2).toPandas().transpose()

CPU times: user 12.5 ms, sys: 0 ns, total: 12.5 ms
Wall time: 573 ms


Unnamed: 0,0,1
filename,L2StdGL_37601a_210727_B10206r_211102142403.h5,L2StdGL_37601a_210727_B10206r_211102142403.h5
altitude,0.0,0.0
longitude,87.146935,87.130074
latitude,-24.197536,-24.194849
aspect,0.0,0.0
slope,0.0,0.0
sol_az,329.311066,329.328033
sol_zn,49.324081,49.313976
xco2,0.000412,0.000413
time_str,2021-07-27T07:54:41.228Z,2021-07-27T07:54:41.255Z


In [47]:
sparkdf.show(3,truncate=True)

+--------------------+--------+---------+----------+------+-----+---------+--------+------------+--------------------+
|            filename|altitude|longitude|  latitude|aspect|slope|   sol_az|  sol_zn|        xco2|            time_str|
+--------------------+--------+---------+----------+------+-----+---------+--------+------------+--------------------+
|L2StdGL_37601a_21...|     0.0| 94.60168|-54.449707|   0.0|  0.0|331.78232| 77.7332|4.1481422E-4|2021-07-27T07:45:...|
|L2StdGL_37601a_21...|     0.0| 94.63712|-54.442287|   0.0|  0.0| 331.7476|77.73676| 4.172136E-4|2021-07-27T07:45:...|
|L2StdGL_37601a_21...|     0.0| 94.59557|-54.431953|   0.0|  0.0|331.78485|77.71626| 4.108803E-4|2021-07-27T07:45:...|
+--------------------+--------+---------+----------+------+-----+---------+--------+------------+--------------------+
only showing top 3 rows



In [48]:
%%time
sparkdf.describe().toPandas().set_index('summary').transpose()

CPU times: user 3.91 ms, sys: 8.02 ms, total: 11.9 ms
Wall time: 2.82 s


summary,count,mean,stddev,min,max
filename,12952,,,L2StdGL_37601a_210727_B10206r_211102142403.h5,L2StdGL_37601a_210727_B10206r_211102142403.h5
altitude,12952,689.3239429860262,1305.191611339438,0.0,5795.221
longitude,12952,70.80185025956176,23.841023320777172,-25.472635,94.77584
latitude,12952,20.144116591884213,37.480726377530125,-54.449707,79.66689
aspect,12952,100.03833487890016,121.1369714622212,0.0,359.9664
slope,12952,6.770544836041603,17.722601994561163,0.0,90.0015
sol_az,12952,248.94193808453696,69.20408780473413,96.4665,332.35168
sol_zn,12952,38.79949679751688,15.097096160217664,20.080725,77.73676
xco2,12952,0.0004092351670237172,5.257489985069492e-06,3.5354478E-4,4.2750782E-4
time_str,12952,,,2021-07-27T07:45:14.056Z,2021-07-27T08:29:36.669Z
