In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from scripts import data_loader

#### Data Card
Filename: Rayleigh&Fresnel_corrected_Rrc.csv

The parameters of interest are $Rrc$, **top of atmosphere Rayleigh and Fresnel-corrected radiance* and the associated geophysical parameters, Chl and phytoplankton absorption, aph, at various wavelengths.

Phytoplankton absorption, aph, is calculated from:

$$aph = ap - ad$$

In [2]:
project_path = Path.cwd()
data_path = project_path / 'data' 
fp = 'raw/nomad_seawifs_Rayleigh&Fresnel_corrected.csv'

In [3]:
df = data_loader(fp=data_path / fp)

In [4]:
df.head()

Unnamed: 0,datetime,lat,lon,id,oisst,etopo2,chl,chl_a,kd405,kd411,...,senz,sena,cv,valid,Rrc_412,Rrc_443,Rrc_490,Rrc_510,Rrc_555,Rrc_670
0,1997-10-11 09:32:00,39.29,25.11,4069,19.57,462,0.091,,,,...,37.7,250.1,0.058,1,0.012088,0.012417,0.011739,0.010579,0.00911,0.006655
1,2000-02-22 17:00:00,-61.45,-62.299,1596,2.54,3549,0.132,0.118,,0.03851,...,53.7,293.6,0.084,1,0.010525,0.010636,0.009614,0.007913,0.006224,0.004794
2,2001-02-19 16:10:00,-61.29,-56.29,1633,0.78,330,,,,0.08031,...,27.1,57.4,0.058,1,0.004443,0.004387,0.00424,0.003686,0.002646,0.001177
3,2002-01-22 13:45:00,-60.999,-56.498,1659,1.79,2193,0.707,0.614,,0.06742,...,45.8,91.4,0.123,1,0.005869,0.005866,0.005535,0.004643,0.003326,0.001747
4,1997-09-27 11:29:00,24.1392,-20.9995,6083,24.67,4369,,0.158,,0.043,...,35.5,243.3,0.052,1,0.009464,0.008968,0.007719,0.005974,0.004161,0.002517


In [5]:
df_out = df.filter(regex='(^ad|^ap[0-9]+$)|(^chl$)|(^chl_a$)')

In [6]:
df_out.head().T

Unnamed: 0,0,1,2,3,4
chl,0.091,0.132,,0.707,
chl_a,,0.118,,0.614,0.158
ap405,,,0.0288,0.02313,
ap411,,,0.03093,0.02458,
ap443,,,0.03554,0.02654,
ap455,,,0.03254,0.02387,
ap465,,,0.03137,0.02274,
ap489,,,0.02341,0.01664,
ap510,,,0.01485,0.01087,
ap520,,,0.01196,0.00887,


In [7]:
df_in = df.filter(regex='^Rrc')

In [8]:
df_in.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Rrc_412  495 non-null    float64
 1   Rrc_443  495 non-null    float64
 2   Rrc_490  495 non-null    float64
 3   Rrc_510  495 non-null    float64
 4   Rrc_555  495 non-null    float64
 5   Rrc_670  495 non-null    float64
dtypes: float64(6)
memory usage: 23.4 KB


In [9]:
df_out.head()

Unnamed: 0,chl,chl_a,ap405,ap411,ap443,ap455,ap465,ap489,ap510,ap520,...,ad555,ad560,ad565,ad570,ad590,ad619,ad625,ad665,ad670,ad683
0,0.091,,,,,,,,,,...,,,,,,,,,,
1,0.132,0.118,,,,,,,,,...,,,,,,,,,,
2,,,0.0288,0.03093,0.03554,0.03254,0.03137,0.02341,0.01485,0.01196,...,0.00136,0.00128,0.00121,0.00114,0.00091,0.00066,0.00062,0.00039,0.00037,0.00032
3,0.707,0.614,0.02313,0.02458,0.02654,0.02387,0.02274,0.01664,0.01087,0.00887,...,0.00082,0.00076,0.00072,0.00066,0.00051,0.00035,0.00033,0.00019,0.00017,0.00015
4,,0.158,,,,,,,,,...,,,,,,,,,,


In [10]:
desired_lambda = [443, 555, 670]

In [16]:
df_out = df.filter(regex='(^ad|^ap[0-9]+$)|(^chl$)|(^chl_a$)')

In [17]:
cols = [f'{c}{i}' for c in ['ad', 'ap'] for i in desired_lambda]

In [18]:
cols

['ad443', 'ad555', 'ad670', 'ap443', 'ap555', 'ap670']

In [19]:
df_chl = df_out[['chl', 'chl_a']]
df_aph = df_out[cols]

In [20]:
df_chl.head()

Unnamed: 0,chl,chl_a
0,0.091,
1,0.132,0.118
2,,
3,0.707,0.614
4,,0.158


In [21]:
from scripts.data_utils import create_chl_and_flag

In [22]:
df_chl_ = create_chl_and_flag(df_chl)

In [23]:
df_chl_.head()

Unnamed: 0,chl,hplc_flag
0,0.091,0
1,0.118,1
2,,-1
3,0.614,1
4,0.158,1


In [24]:
df_chl_.to_parquet(data_path/'reduced_columns'/'df_chl.pqt')

In [25]:
df_out.to_parquet(data_path / 'reduced_columns'/'df_all_targets.pqt')

In [26]:
df_in.to_parquet(data_path / 'reduced_columns' / 'df_in_toa_radiance.pqt')

In [28]:
for λ in desired_lambda:
    df_aph[f'aph{λ}'] = df_aph[f'ap{λ}'] - df_aph[f'ad{λ}']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_aph[f'aph{λ}'] = df_aph[f'ap{λ}'] - df_aph[f'ad{λ}']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_aph[f'aph{λ}'] = df_aph[f'ap{λ}'] - df_aph[f'ad{λ}']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_aph[f'aph{λ}'] = df_aph[f'ap{λ}'] - df_aph[f'ad{λ}']


In [29]:
df_aph

Unnamed: 0,ad443,ad555,ad670,ap443,ap555,ap670,aph443,aph555,aph670
0,,,,,,,,,
1,,,,,,,,,
2,0.00476,0.00136,0.00037,0.03554,0.00521,0.01077,0.03078,0.00385,0.01040
3,0.00371,0.00082,0.00017,0.02654,0.00392,0.00952,0.02283,0.00310,0.00935
4,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
491,,,,,,,,,
492,0.02927,0.00466,0.00071,0.17194,0.02820,0.04674,0.14267,0.02354,0.04603
493,,,,,,,,,
494,0.00230,0.00029,0.00004,0.05141,0.00726,0.01404,0.04911,0.00697,0.01400


In [30]:
0.03554 - 0.00476

0.030780000000000002

In [31]:
df_aph.to_parquet(data_path / 'reduced_columns' / 'df_aph.pqt')