# Cross-match L1 and L2 at the TCCON site in Paris

- Cross-match between L1 features and L2 $X_{\mathrm{CO}_2}$ labels

#### Import Basic Libraries

In [1]:
import numpy as np
import pandas as pd
import glob
import sys
import h5py
#from netCDF4 import Dataset
from datetime import datetime
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree

import pyarrow as pa
import pyarrow.parquet as pq

from functools import reduce
import operator
import gc

In [2]:
# plot settings: serif text in Times New Roman at 16 pt, STIX glyphs for math
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times New Roman',
    'font.size': 16,
    'mathtext.fontset': 'stix',
})

In [3]:
# PySpark packages
from pyspark import SparkContext   
from pyspark.sql import SparkSession

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark import Row
from pyspark.sql.window import Window as W


# Create (or reuse) the YARN-backed Spark session used throughout this notebook.
# Resources requested: 32g driver (and 32g result-size cap), 20 executors with
# 2 cores and 14g each.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("spark-shell")
    .config("spark.driver.maxResultSize", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.executor.memory", "14g")
    .config("spark.executor.cores", "2")
    .config("spark.executor.instances", "20")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and store checkpoints on HDFS.
sc = spark.sparkContext
sc.setCheckpointDir("hdfs://spark00:54310/tmp/checkpoints")

# Runtime SQL options: allow up to 500 fields in debug/plan strings, and turn on
# Arrow for Spark <-> pandas conversion (relevant to the toPandas() calls later).
spark.conf.set("spark.sql.debug.maxToStringFields", 500)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

## Read L1 and L2 in Paris

In [4]:
# List the local data directory to see which parquet files already exist.
!ls /home/ebkim/work/oco2/data/

oco2_L1_healpix_nside1024.parquet.gzip	paris_L2.parquet.snappy
oco2_L2_healpix_nside1024.parquet.gzip	paris_L2_iall.parquet.snappy
paris_L1.parquet.snappy			paris_L2_iall_pandas.parquet.snappy
paris_L1_isite.parquet.snappy		paris_L2_isite.parquet.snappy
paris_L1_isite_pandas.parquet.snappy	paris_L2_pandas.parquet.snappy
paris_L1_pandas.parquet.snappy		paris_basic_info_pandas.parquet.gzip


In [5]:
# Root of the project data directory on HDFS; edit here to retarget both inputs.
HDFS_DATA_DIR = 'hdfs://spark00:54310/user/ebkim/data'

# Paris-region parquet extracts: L1 ("isite" selection) and L2 ("iall" selection).
onefilename = HDFS_DATA_DIR + '/paris_L1_isite.parquet.snappy'
twofilename = HDFS_DATA_DIR + '/paris_L2_iall.parquet.snappy'

In [6]:
# Load the Paris L1 parquet dataset from HDFS as a Spark DataFrame.
onedf = spark.read.parquet(onefilename)

In [7]:
# Load the Paris L2 parquet dataset from HDFS as a Spark DataFrame.
twodf = spark.read.parquet(twofilename)

In [8]:
# Show the L1 column names and types.
onedf.printSchema()

root
 |-- filename: string (nullable = true)
 |-- channel_ind: integer (nullable = true)
 |-- pix_ind: integer (nullable = true)
 |-- row_ind: integer (nullable = true)
 |-- altitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- aspect: float (nullable = true)
 |-- slope: float (nullable = true)
 |-- sol_az: float (nullable = true)
 |-- sol_zn: float (nullable = true)
 |-- fo_az: float (nullable = true)
 |-- fo_zn: float (nullable = true)
 |-- flag: integer (nullable = true)
 |-- snr: float (nullable = true)
 |-- continuum: float (nullable = true)
 |-- time_str: string (nullable = true)
 |-- spectrum: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- ipx1024nested: integer (nullable = true)



In [9]:
# Show the L2 column names and types.
twodf.printSchema()

root
 |-- filename: string (nullable = true)
 |-- altitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- aspect: float (nullable = true)
 |-- slope: float (nullable = true)
 |-- sol_az: float (nullable = true)
 |-- sol_zn: float (nullable = true)
 |-- xco2: float (nullable = true)
 |-- time_str: string (nullable = true)
 |-- ipx1024nested: integer (nullable = true)



In [10]:
# L1 columns carried into the cross-match, listed by name so the selection does
# not depend on the column order of the parquet schema. Apart from channel_ind
# (L1 only), every name here also exists in the L2 schema.
onecolumns = [
    'channel_ind',
    'altitude',
    'longitude',
    'latitude',
    'aspect',
    'slope',
    'sol_az',
    'sol_zn',
    'time_str',
    'ipx1024nested',  # kept last: the preview cell drops it with [:-1]
]
print(onecolumns)

['channel_ind', 'altitude', 'longitude', 'latitude', 'aspect', 'slope', 'sol_az', 'sol_zn', 'time_str', 'ipx1024nested']


In [11]:
# Preview the first 6 L1 rows; the [:-1] slice leaves out the last selected column.
onedf.select(onecolumns[:-1]).show(6,truncate=True)

+-----------+---------+---------+---------+---------+----------+---------+---------+--------------------+
|channel_ind| altitude|longitude| latitude|   aspect|     slope|   sol_az|   sol_zn|            time_str|
+-----------+---------+---------+---------+---------+----------+---------+---------+--------------------+
|          0|42.051094|   2.3677|48.852226| 189.7125|0.49040127|207.24275|30.967089|2021-07-22T12:55:...|
|          1|42.971222|2.3672364| 48.85481|195.08481|0.10498047|207.23999| 30.96925|2021-07-22T12:55:...|
|          2| 43.06294|2.3674605|48.855442|188.61536|0.10852284|207.23982|30.969864|2021-07-22T12:55:...|
|          0|51.605095|2.3902748|48.850983|225.64519| 1.1066356| 207.2849|30.973173|2021-07-22T12:55:...|
|          1|53.896103|2.3896976| 48.85335| 245.7854| 1.1184887|207.28212|30.975107|2021-07-22T12:55:...|
|          2| 54.90566| 2.390066| 48.85391| 330.3064| 0.8225637|207.28226|  30.9757|2021-07-22T12:55:...|
+-----------+---------+---------+---------+---

In [12]:
# L2 columns carried into the cross-match (everything except `filename`), listed
# by name so the selection does not depend on the parquet column order.
twocolumns = [
    'altitude',
    'longitude',
    'latitude',
    'aspect',
    'slope',
    'sol_az',
    'sol_zn',
    'xco2',
    'time_str',
    'ipx1024nested',  # kept last: the preview cell drops it with [:-1]
]
print(twocolumns)

['altitude', 'longitude', 'latitude', 'aspect', 'slope', 'sol_az', 'sol_zn', 'xco2', 'time_str', 'ipx1024nested']


In [13]:
# Preview the first 6 L2 rows; the [:-1] slice leaves out the last selected column.
twodf.select(twocolumns[:-1]).show(6,truncate=True)

+---------+---------+---------+---------+----------+---------+---------+------------+--------------------+
| altitude|longitude| latitude|   aspect|     slope|   sol_az|   sol_zn|        xco2|            time_str|
+---------+---------+---------+---------+----------+---------+---------+------------+--------------------+
| 57.66209|2.3300796|48.842953|357.95782| 1.2404993|206.97125|30.910488| 4.093791E-4|2021-07-22T12:54:...|
|56.111057| 2.311695|48.840572| 326.1981| 1.4134638|206.94139|30.902916| 4.091359E-4|2021-07-22T12:54:...|
|43.401897| 2.293364| 48.83822|301.02423|0.86440855|206.91159|30.895386|4.0951013E-4|2021-07-22T12:54:...|
| 60.11515| 2.424551|48.853153|185.50987| 0.9138781| 207.1285|30.948095| 4.093566E-4|2021-07-22T12:54:...|
|58.226757|2.4061873| 48.85031|  165.122| 0.5481995|207.09908| 30.94009|4.0924925E-4|2021-07-22T12:54:...|
|46.581917|2.3877516|  48.8475|255.88002|0.81258553| 207.0695|30.932096| 4.086528E-4|2021-07-22T12:54:...|
+---------+---------+---------+------

#### Pandas Dataframes 

In [14]:
%%time
# Collect the selected L1 columns to the driver as a pandas DataFrame.
onepdf = onedf.select(onecolumns).toPandas()

CPU times: user 10.3 ms, sys: 0 ns, total: 10.3 ms
Wall time: 1.49 s


In [15]:
%%time
# Collect the selected L2 columns to the driver as a pandas DataFrame.
twopdf = twodf.select(twocolumns).toPandas()

CPU times: user 9.3 ms, sys: 207 µs, total: 9.5 ms
Wall time: 1.38 s


In [16]:
# Print the notebook's working directory (the output paths below are absolute).
!pwd

/home/shong/work/oco2/notebook


In [17]:
# Persist the driver-side pandas copies of the L1 and L2 selections to local disk.
onepdf.to_parquet(
    path='/home/ebkim/work/oco2/data/paris_L1_isite_pandas.parquet.snappy',
)
twopdf.to_parquet(
    path='/home/ebkim/work/oco2/data/paris_L2_iall_pandas.parquet.snappy',
)

## Cross-match L1 and L2 

In [18]:
# Summarise the L1 pandas frame: row count, dtypes, and non-null counts.
onepdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3081 entries, 0 to 3080
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   channel_ind    3081 non-null   int32  
 1   altitude       3081 non-null   float32
 2   longitude      3081 non-null   float32
 3   latitude       3081 non-null   float32
 4   aspect         3081 non-null   float32
 5   slope          3081 non-null   float32
 6   sol_az         3081 non-null   float32
 7   sol_zn         3081 non-null   float32
 8   time_str       3081 non-null   object 
 9   ipx1024nested  3081 non-null   int32  
dtypes: float32(7), int32(2), object(1)
memory usage: 132.5+ KB


In [19]:
# Summarise the L2 pandas frame: row count, dtypes, and non-null counts.
twopdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4237 entries, 0 to 4236
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   altitude       4237 non-null   float32
 1   longitude      4237 non-null   float32
 2   latitude       4237 non-null   float32
 3   aspect         4237 non-null   float32
 4   slope          4237 non-null   float32
 5   sol_az         4237 non-null   float32
 6   sol_zn         4237 non-null   float32
 7   xco2           4237 non-null   float32
 8   time_str       4237 non-null   object 
 9   ipx1024nested  4237 non-null   int32  
dtypes: float32(8), int32(1), object(1)
memory usage: 182.2+ KB
