In [55]:
from pathlib import Path

import pandas as pd
import numpy as np

In [56]:
project_path = Path.cwd()
data_path_unprocessed = project_path / 'data' / '01_reduced'
data_path_preprocessed = project_path / 'data' / '02_preprocessed'
data_path_cleaned = project_path / 'data' / '03_cleaned'
data_path_transformed = project_path / 'data'/ '04_transformed'

In [57]:
df_in = pd.read_parquet(data_path_unprocessed / 'inputs.pqt')
df_out = pd.read_parquet(data_path_unprocessed / 'targets.pqt')

### Cleaning Inputs 

In [58]:
df_in.head()

Unnamed: 0,id,lat,lon,etopo2,oisst,es411,es443,es489,es510,es555,es670,lw411,lw443,lw489,lw510,lw555,lw670
0,1565,38.4279,-76.61,0.0,3.7,107.906,121.187,137.266,134.037,130.947,112.925,0.129962,0.204331,0.452029,0.541019,0.979378,0.391235
1,1566,38.368,-76.5,0.0,3.7,113.86,128.053,145.148,140.965,138.35,118.555,0.120886,0.177228,0.315395,0.352306,0.57444,0.200936
2,1567,38.3074,-76.44,1.0,3.7,114.35,128.055,146.06,142.725,140.198,119.978,0.111049,0.151807,0.269218,0.326515,0.595226,0.193438
3,1568,38.6367,-76.32,3.0,3.7,35.0441,38.1511,42.1373,40.5485,39.2907,31.7094,0.051578,0.06643,0.121242,0.14858,0.274316,0.102543
4,1559,38.3047,-76.44,1.0,22.03,61.8732,67.0335,72.2731,69.6545,77.1321,50.0766,0.056001,0.06853,0.108838,0.132581,0.216023,0.089663


In [59]:
λ = [411, 443, 489, 510, 555, 670]
for λi in λ:
    df_in[f'Rrs{λi}'] = df_in[f'lw{λi}'] / df_in[f'es{λi}']

In [60]:
df_in.head()

Unnamed: 0,id,lat,lon,etopo2,oisst,es411,es443,es489,es510,es555,...,lw489,lw510,lw555,lw670,Rrs411,Rrs443,Rrs489,Rrs510,Rrs555,Rrs670
0,1565,38.4279,-76.61,0.0,3.7,107.906,121.187,137.266,134.037,130.947,...,0.452029,0.541019,0.979378,0.391235,0.001204,0.001686,0.003293,0.004036,0.007479,0.003465
1,1566,38.368,-76.5,0.0,3.7,113.86,128.053,145.148,140.965,138.35,...,0.315395,0.352306,0.57444,0.200936,0.001062,0.001384,0.002173,0.002499,0.004152,0.001695
2,1567,38.3074,-76.44,1.0,3.7,114.35,128.055,146.06,142.725,140.198,...,0.269218,0.326515,0.595226,0.193438,0.000971,0.001185,0.001843,0.002288,0.004246,0.001612
3,1568,38.6367,-76.32,3.0,3.7,35.0441,38.1511,42.1373,40.5485,39.2907,...,0.121242,0.14858,0.274316,0.102543,0.001472,0.001741,0.002877,0.003664,0.006982,0.003234
4,1559,38.3047,-76.44,1.0,22.03,61.8732,67.0335,72.2731,69.6545,77.1321,...,0.108838,0.132581,0.216023,0.089663,0.000905,0.001022,0.001506,0.001903,0.002801,0.001791


In [61]:
df_in_nonan = df_in.dropna()[['id', 'lat', 'lon', 'etopo2', 'oisst'] + [f'Rrs{λi}' for λi in λ]]

In [62]:
df_in_nonan.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1137 entries, 0 to 4458
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      1137 non-null   int64  
 1   lat     1137 non-null   float64
 2   lon     1137 non-null   float64
 3   etopo2  1137 non-null   float64
 4   oisst   1137 non-null   float64
 5   Rrs411  1137 non-null   float64
 6   Rrs443  1137 non-null   float64
 7   Rrs489  1137 non-null   float64
 8   Rrs510  1137 non-null   float64
 9   Rrs555  1137 non-null   float64
 10  Rrs670  1137 non-null   float64
dtypes: float64(10), int64(1)
memory usage: 106.6 KB


In [63]:
df_in.filter(regex='(id)|(Rrs[0-9]+)', axis=1).to_parquet(data_path_cleaned/'df_in.pqt')

### Cleaning Output

In [64]:
df_out.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      4459 non-null   int64  
 1   chl     3392 non-null   float64
 2   chl_a   1381 non-null   float64
 3   ad411   1231 non-null   float64
 4   ad443   1238 non-null   float64
 5   ad489   1237 non-null   float64
 6   ad510   1237 non-null   float64
 7   ap411   1272 non-null   float64
 8   ap443   1279 non-null   float64
 9   ap489   1278 non-null   float64
 10  ap510   1278 non-null   float64
 11  ap555   1262 non-null   float64
 12  ap670   1270 non-null   float64
dtypes: float64(12), int64(1)
memory usage: 453.0 KB


In [65]:
df_out[['chl', 'chl_a']]

Unnamed: 0,chl,chl_a
0,38.19000,
1,35.01000,
2,26.91000,
3,47.96000,
4,23.55000,
...,...,...
4454,4.62784,
4455,2.95997,
4456,0.95114,
4457,4.54228,


In [66]:
df_chl = df_out[['id', 'chl', 'chl_a']].copy()

In [67]:
def create_chl_and_flag(df):
    """
    Creates 'chl' and 'hplc_flag' columns based on the rules provided.

    Args:
        df (pd.DataFrame): Input DataFrame with 'chl' and 'chl_a' columns.

    Returns:
        pd.DataFrame: DataFrame with new 'chl' and 'hplc_flag' columns.
    """
    new_chl = np.select(
        [
            df['chl_a'].notna(),
            df['chl'].notna() & df['chl_a'].isna(),
            df['chl'].isna() & df['chl_a'].isna()
        ],
        [
            df['chl_a'],
            df['chl'],
            np.nan
        ],
        default=np.nan  # Should not be reached based on the conditions
    )

    hplc_flag = np.select(
        [
            df['chl_a'].notna(),
            df['chl'].notna() & df['chl_a'].isna(),
            df['chl'].isna() & df['chl_a'].isna()
        ],
        ['hplc', 'fluo', 'None'],
        default='None' # Should not be reached based on the conditions
    )

    new_df = df.copy()
    new_df['chl'] = new_chl
    new_df['hplc_flag'] = hplc_flag
    return new_df[['id', 'chl', 'hplc_flag']]

In [68]:
df_chl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      4459 non-null   int64  
 1   chl     3392 non-null   float64
 2   chl_a   1381 non-null   float64
dtypes: float64(2), int64(1)
memory usage: 104.6 KB


In [69]:
new_df_chl = create_chl_and_flag(df_chl)

In [70]:
new_df_chl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         4459 non-null   int64  
 1   chl        4127 non-null   float64
 2   hplc_flag  4459 non-null   object 
dtypes: float64(1), int64(1), object(1)
memory usage: 104.6+ KB


In [71]:
new_df_chl.hplc_flag.value_counts()

hplc_flag
fluo    2746
hplc    1381
None     332
Name: count, dtype: int64

In [72]:
df_all = pd.merge(df_in_nonan, new_df_chl, how='inner', left_on='id', right_on='id')

In [73]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1213 entries, 0 to 1212
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         1213 non-null   int64  
 1   lat        1213 non-null   float64
 2   lon        1213 non-null   float64
 3   etopo2     1213 non-null   float64
 4   oisst      1213 non-null   float64
 5   Rrs411     1213 non-null   float64
 6   Rrs443     1213 non-null   float64
 7   Rrs489     1213 non-null   float64
 8   Rrs510     1213 non-null   float64
 9   Rrs555     1213 non-null   float64
 10  Rrs670     1213 non-null   float64
 11  chl        1118 non-null   float64
 12  hplc_flag  1213 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 123.3+ KB


In [74]:
df_all.describe()

Unnamed: 0,id,lat,lon,etopo2,oisst,Rrs411,Rrs443,Rrs489,Rrs510,Rrs555,Rrs670,chl
count,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1213.0,1118.0
mean,4839.014839,28.156456,-76.890971,667.57873,20.904971,0.004553,0.004563,0.005227,0.004988,0.005008,0.001419,3.723355
std,2376.889363,18.107348,35.774092,1454.185069,7.519191,0.003789,0.00341,0.003846,0.004096,0.005033,0.002277,7.444344
min,1441.0,-62.441,-179.955,0.0,0.88,5.1e-05,0.00019,0.000367,0.000497,0.000417,0.0,0.017
25%,2655.0,26.461,-83.901,10.0,13.86,0.002132,0.002417,0.002969,0.002854,0.001841,0.0002,0.3018
50%,4842.0,30.125,-81.9736,33.0,22.77,0.0033,0.003516,0.004392,0.003547,0.002901,0.000573,1.04815
75%,7084.0,39.4572,-68.204,189.0,27.57,0.0063,0.006,0.006,0.0053,0.0063,0.0017,3.267068
max,7831.0,44.015,178.867,5707.0,30.89,0.0306,0.027601,0.0302,0.0329,0.0466,0.0277,77.8648


Rrs670 has entries that are 0.0. Seems strange...

In [75]:
df_all.loc[df_all.Rrs670==0.0].info()

<class 'pandas.core.frame.DataFrame'>
Index: 36 entries, 157 to 1157
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         36 non-null     int64  
 1   lat        36 non-null     float64
 2   lon        36 non-null     float64
 3   etopo2     36 non-null     float64
 4   oisst      36 non-null     float64
 5   Rrs411     36 non-null     float64
 6   Rrs443     36 non-null     float64
 7   Rrs489     36 non-null     float64
 8   Rrs510     36 non-null     float64
 9   Rrs555     36 non-null     float64
 10  Rrs670     36 non-null     float64
 11  chl        36 non-null     float64
 12  hplc_flag  36 non-null     object 
dtypes: float64(11), int64(1), object(1)
memory usage: 3.9+ KB


36 values seems spurious. I will replace 0.0 with NaN. The idea behind replacement with NaN rather than discarding is that these data can be revisited and the missing values can be modeled. Doing so instead of discarding multiple rows where only one entry is bad in each row is a far more judicious use of precious data. Data modeling, is done during fitting and is best carried out within the Bayesian framework as it will incorporate the uncertainty of the modeled data, which will be included in the total uncertainty budget of the model output cascaded from the posterior distribution after fitting the data. 

In [76]:
df_all['Rrs670'] = df_all['Rrs670'].replace(0.0, np.nan)


In [77]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1213 entries, 0 to 1212
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         1213 non-null   int64  
 1   lat        1213 non-null   float64
 2   lon        1213 non-null   float64
 3   etopo2     1213 non-null   float64
 4   oisst      1213 non-null   float64
 5   Rrs411     1213 non-null   float64
 6   Rrs443     1213 non-null   float64
 7   Rrs489     1213 non-null   float64
 8   Rrs510     1213 non-null   float64
 9   Rrs555     1213 non-null   float64
 10  Rrs670     1177 non-null   float64
 11  chl        1118 non-null   float64
 12  hplc_flag  1213 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 123.3+ KB


In [78]:
df_all.to_parquet(data_path_cleaned / 'df_all.pqt')

### Transforming model form MBR modeling

In [79]:
df_t = df_all[
    ['id', 'lat', 'lon', 'etopo2', 'oisst'] 
    + [f'Rrs{i}' for i in λ] 
    + ['chl', 'hplc_flag']].copy()


In [80]:
df_t.loc[:, 'MBR'] = df_t.loc[
    :, [f'Rrs{i}' for i in λ[:-2]]].max(axis=1).divide(
        df_t.loc[:, ['Rrs555', 'Rrs670']].sum(axis=1)
    )
df_t.loc[:, 'MBR_flag'] = df_t.loc[:, [f'Rrs{i}' for i in λ[:-2]]].idxmax(axis=1)

In [82]:
df_t = df_t[
    [
        'id', 'lat', 'lon', 'etopo2', 'oisst',
        'Rrs411', 'Rrs443', 'Rrs489', 'Rrs510', 'Rrs555', 'Rrs670', 
        'MBR', 'MBR_flag', 'chl', 'hplc_flag'
    ]
]


In [84]:
df_t.hplc_flag.value_counts()

hplc_flag
fluo    666
hplc    452
None     95
Name: count, dtype: int64

In [87]:
df_t.insert(12, 'log_MBR', np.log10(df_t.MBR))

In [89]:
df_t.insert(15, 'log_chl', np.log10(df_t.chl))

In [96]:
df_t.sample(8).T

Unnamed: 0,981,1176,828,486,595,202,251,329
id,6814,6878,7079,3949,2741,2869,1879,1972
lat,26.1011,26.5153,37.0932,5.025,30.315,30.1378,27.3514,27.5429
lon,-83.146,-82.558,-75.7104,-154.909,-88.912,-88.4377,-83.2176,-82.8006
etopo2,9.0,22.0,12.0,4619.0,3.0,11.0,30.0,4.0
oisst,29.02,25.24,25.98,29.22,28.77,20.24,21.15,28.67
Rrs411,0.008,0.001701,0.002265,0.007257,0.0057,0.017199,0.0046,0.0079
Rrs443,0.0062,0.0019,0.002871,0.006225,0.0051,0.0151,0.0046,0.0091
Rrs489,0.0045,0.002599,0.004239,0.004338,0.0053,0.0148,0.005,0.0131
Rrs510,0.0026,0.0028,0.004228,0.002711,0.0057,0.014401,0.0035,0.0141
Rrs555,0.0012,0.003301,0.003839,0.00125,0.0072,0.0129,0.0018,0.0161


In [91]:
df_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1213 entries, 0 to 1212
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         1213 non-null   int64  
 1   lat        1213 non-null   float64
 2   lon        1213 non-null   float64
 3   etopo2     1213 non-null   float64
 4   oisst      1213 non-null   float64
 5   Rrs411     1213 non-null   float64
 6   Rrs443     1213 non-null   float64
 7   Rrs489     1213 non-null   float64
 8   Rrs510     1213 non-null   float64
 9   Rrs555     1213 non-null   float64
 10  Rrs670     1177 non-null   float64
 11  MBR        1213 non-null   float64
 12  log_MBR    1213 non-null   float64
 13  MBR_flag   1213 non-null   object 
 14  chl        1118 non-null   float64
 15  log_chl    1118 non-null   float64
 16  hplc_flag  1213 non-null   object 
dtypes: float64(14), int64(1), object(2)
memory usage: 161.2+ KB


In [92]:
df_t.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,1213.0,4839.014839,2376.889363,1441.0,2655.0,4842.0,7084.0,7831.0
lat,1213.0,28.156456,18.107348,-62.441,26.461,30.125,39.4572,44.015
lon,1213.0,-76.890971,35.774092,-179.955,-83.901,-81.9736,-68.204,178.867
etopo2,1213.0,667.57873,1454.185069,0.0,10.0,33.0,189.0,5707.0
oisst,1213.0,20.904971,7.519191,0.88,13.86,22.77,27.57,30.89
Rrs411,1213.0,0.004553,0.003789,5.1e-05,0.002132,0.0033,0.0063,0.0306
Rrs443,1213.0,0.004563,0.00341,0.00019,0.002417,0.003516,0.006,0.027601
Rrs489,1213.0,0.005227,0.003846,0.000367,0.002969,0.004392,0.006,0.0302
Rrs510,1213.0,0.004988,0.004096,0.000497,0.002854,0.003547,0.0053,0.0329
Rrs555,1213.0,0.005008,0.005033,0.000417,0.001841,0.002901,0.0063,0.0466


In [98]:
df_t.head().T

Unnamed: 0,0,1,2,3,4
id,1565,1566,1567,1568,1559
lat,38.4279,38.368,38.3074,38.6367,38.3047
lon,-76.61,-76.5,-76.44,-76.32,-76.44
etopo2,0.0,0.0,1.0,3.0,1.0
oisst,3.7,3.7,3.7,3.7,22.03
Rrs411,0.001204,0.001062,0.000971,0.001472,0.000905
Rrs443,0.001686,0.001384,0.001185,0.001741,0.001022
Rrs489,0.003293,0.002173,0.001843,0.002877,0.001506
Rrs510,0.004036,0.002499,0.002288,0.003664,0.001903
Rrs555,0.007479,0.004152,0.004246,0.006982,0.002801


In [99]:
df_t.insert(15, 'max_band_all_flag', df_t.filter(like='Rrs').idxmax(axis=1))

In [100]:
df_t.head().T

Unnamed: 0,0,1,2,3,4
id,1565,1566,1567,1568,1559
lat,38.4279,38.368,38.3074,38.6367,38.3047
lon,-76.61,-76.5,-76.44,-76.32,-76.44
etopo2,0.0,0.0,1.0,3.0,1.0
oisst,3.7,3.7,3.7,3.7,22.03
Rrs411,0.001204,0.001062,0.000971,0.001472,0.000905
Rrs443,0.001686,0.001384,0.001185,0.001741,0.001022
Rrs489,0.003293,0.002173,0.001843,0.002877,0.001506
Rrs510,0.004036,0.002499,0.002288,0.003664,0.001903
Rrs555,0.007479,0.004152,0.004246,0.006982,0.002801


In [101]:
df_t.max_band_all_flag.value_counts()

max_band_all_flag
Rrs555    498
Rrs489    305
Rrs411    285
Rrs510    110
Rrs443     13
Rrs670      2
Name: count, dtype: int64

In [102]:
df_t.to_parquet(data_path_transformed/'df_all.pqt')