In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

In [32]:
# All data locations are resolved relative to the current working directory.
project_path = Path.cwd()
data_path_unprocessed = project_path / 'data' / '01_reduced'   # read from
data_path_processed = project_path / 'data' / '02_processed'   # written to
data_path_cleaned = project_path / 'data' / '03_cleaned'       # written to

# The notebook writes parquet files into the processed/cleaned folders; create
# them up front so those writes do not fail when the folders are missing
# (e.g. on a fresh checkout). Existing folders are left untouched.
data_path_processed.mkdir(parents=True, exist_ok=True)
data_path_cleaned.mkdir(parents=True, exist_ok=True)

In [16]:
# Load the inputs and targets tables from the 01_reduced folder.
df_in = pd.read_parquet(data_path_unprocessed / 'inputs.pqt')
df_out = pd.read_parquet(data_path_unprocessed / 'targets.pqt')

### Cleaning Inputs

In [17]:
# Preview the first rows of the inputs table as loaded.
df_in.head()

Unnamed: 0,id,lat,lon,etopo2,oisst,es411,es443,es489,es510,es555,es670,lw411,lw443,lw489,lw510,lw555,lw670
0,1565,38.4279,-76.61,0.0,3.7,107.906,121.187,137.266,134.037,130.947,112.925,0.129962,0.204331,0.452029,0.541019,0.979378,0.391235
1,1566,38.368,-76.5,0.0,3.7,113.86,128.053,145.148,140.965,138.35,118.555,0.120886,0.177228,0.315395,0.352306,0.57444,0.200936
2,1567,38.3074,-76.44,1.0,3.7,114.35,128.055,146.06,142.725,140.198,119.978,0.111049,0.151807,0.269218,0.326515,0.595226,0.193438
3,1568,38.6367,-76.32,3.0,3.7,35.0441,38.1511,42.1373,40.5485,39.2907,31.7094,0.051578,0.06643,0.121242,0.14858,0.274316,0.102543
4,1559,38.3047,-76.44,1.0,22.03,61.8732,67.0335,72.2731,69.6545,77.1321,50.0766,0.056001,0.06853,0.108838,0.132581,0.216023,0.089663


In [18]:
# For every band, add a column Rrs<band> = lw<band> / es<band> to df_in.
# NOTE(review): presumably remote-sensing reflectance (water-leaving radiance
# divided by surface irradiance) — confirm against the data source.
λ = [411, 443, 489, 510, 555, 670]
for λi in λ:
    df_in[f'Rrs{λi}'] = df_in[f'lw{λi}'].div(df_in[f'es{λi}'])

In [19]:
# Preview again to check the Rrs<band> columns appear at the end of the table.
df_in.head()

Unnamed: 0,id,lat,lon,etopo2,oisst,es411,es443,es489,es510,es555,...,lw489,lw510,lw555,lw670,Rrs411,Rrs443,Rrs489,Rrs510,Rrs555,Rrs670
0,1565,38.4279,-76.61,0.0,3.7,107.906,121.187,137.266,134.037,130.947,...,0.452029,0.541019,0.979378,0.391235,0.001204,0.001686,0.003293,0.004036,0.007479,0.003465
1,1566,38.368,-76.5,0.0,3.7,113.86,128.053,145.148,140.965,138.35,...,0.315395,0.352306,0.57444,0.200936,0.001062,0.001384,0.002173,0.002499,0.004152,0.001695
2,1567,38.3074,-76.44,1.0,3.7,114.35,128.055,146.06,142.725,140.198,...,0.269218,0.326515,0.595226,0.193438,0.000971,0.001185,0.001843,0.002288,0.004246,0.001612
3,1568,38.6367,-76.32,3.0,3.7,35.0441,38.1511,42.1373,40.5485,39.2907,...,0.121242,0.14858,0.274316,0.102543,0.001472,0.001741,0.002877,0.003664,0.006982,0.003234
4,1559,38.3047,-76.44,1.0,22.03,61.8732,67.0335,72.2731,69.6545,77.1321,...,0.108838,0.132581,0.216023,0.089663,0.000905,0.001022,0.001506,0.001903,0.002801,0.001791


In [20]:
# Save the full inputs table (all columns) to the 02_processed folder.
df_in.to_parquet(data_path_processed / 'df_in.pqt')


In [33]:
# Keep only the identifier and the Rrs<band> columns and save them to the
# 03_cleaned folder.
# DataFrame.filter(regex=...) matches with re.search, so the pattern is anchored
# at both ends: otherwise any column whose name merely contains "id" would be
# selected as well.
df_in.filter(regex=r'^(id|Rrs[0-9]+)$', axis=1).to_parquet(data_path_cleaned / 'df_in.pqt')

### Cleaning Outputs

In [34]:
# Inspect the targets table as loaded.
df_out

Unnamed: 0,id,chl,chl_a,ad411,ad443,ad489,ad510,ap411,ap443,ap489,ap510,ap555,ap670
0,1565,38.19000,,0.78272,0.47467,0.23128,0.16657,1.52149,1.38905,0.81987,0.64517,0.24726,0.63165
1,1566,35.01000,,,,,,,,,,,
2,1567,26.91000,,0.27165,0.14016,0.05413,0.03506,0.77966,0.73214,0.42119,0.33765,0.14978,0.40562
3,1568,47.96000,,,,,,,,,,,
4,1559,23.55000,,0.76505,0.43701,0.19538,0.13529,1.62324,1.54082,0.96391,0.72949,0.18209,0.46114
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4454,6901,4.62784,,0.03151,0.02175,0.01276,0.01000,0.22345,0.20338,0.13320,0.09238,0.03786,0.06421
4455,6902,2.95997,,,,,,,,,,,
4456,6914,0.95114,,,,,,,,,,,
4457,6903,4.54228,,0.01395,0.01010,0.00636,0.00514,0.18583,0.17516,0.11804,0.08267,0.03175,0.05858


In [36]:
# Look at the two chlorophyll columns side by side.
df_out[['chl', 'chl_a']]

Unnamed: 0,chl,chl_a
0,38.19000,
1,35.01000,
2,26.91000,
3,47.96000,
4,23.55000,
...,...,...
4454,4.62784,
4455,2.95997,
4456,0.95114,
4457,4.54228,


In [35]:
# Independent copy holding only the id and the two chlorophyll columns, so
# later changes to it cannot affect df_out.
df_chl = df_out[['id', 'chl', 'chl_a']].copy()

In [37]:
def create_chl_and_flag(df):
    """
    Merge 'chl' and 'chl_a' into one 'chl' column plus a source flag.

    Row-wise rules (the first matching rule wins):
        * 'chl_a' is present              -> chl = chl_a, hplc_flag = '1'
        * only 'chl' is present           -> chl = chl,   hplc_flag = '0'
        * both 'chl' and 'chl_a' missing  -> chl = NaN,   hplc_flag = '-1'

    Args:
        df (pd.DataFrame): Input DataFrame with 'chl' and 'chl_a' columns.
            It is not modified.

    Returns:
        pd.DataFrame: A new DataFrame on the same index as ``df`` containing
        only the merged 'chl' column and the string-coded 'hplc_flag' column.
    """
    has_chl_a = df['chl_a'].notna()
    has_chl = df['chl'].notna()

    # One ordered list of masks drives both outputs, so value and flag can
    # never disagree about which rule applied to a row.
    cases = [
        has_chl_a,
        has_chl & ~has_chl_a,
        ~has_chl & ~has_chl_a,
    ]

    # The three masks cover every row, so the defaults are only a safety net.
    merged_chl = np.select(cases, [df['chl_a'], df['chl'], np.nan], default=np.nan)
    source_flag = np.select(cases, ['1', '0', '-1'], default='-1')

    # assign() works on a copy, leaving the caller's frame as it was.
    merged = df.assign(chl=merged_chl, hplc_flag=source_flag)
    return merged[['chl', 'hplc_flag']]

In [38]:
# Collapse chl / chl_a into a single 'chl' column plus 'hplc_flag'.
new_df_chl = create_chl_and_flag(df_chl)

In [40]:
# Non-null counts of the source columns, for comparison with the merged frame.
df_chl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      4459 non-null   int64  
 1   chl     3392 non-null   float64
 2   chl_a   1381 non-null   float64
dtypes: float64(2), int64(1)
memory usage: 104.6 KB


In [48]:
# Rows where neither chl nor chl_a has a value (create_chl_and_flag gives
# these hplc_flag '-1' and a NaN chl).
df_chl.loc[df_chl['chl'].isna() & df_chl['chl_a'].isna()]

Unnamed: 0,id,chl,chl_a
9,1574,,
19,1601,,
20,1604,,
37,1625,,
39,1627,,
...,...,...,...
4369,7777,,
4373,7785,,
4374,7786,,
4375,7787,,


In [50]:
# Spot-check a single row (index label 9) from the both-missing selection.
df_chl.loc[9]

id       1574.0
chl         NaN
chl_a       NaN
Name: 9, dtype: float64

In [45]:
# Display-only: the result is not assigned, so df_chl is unchanged.
# NOTE(review): axis=1 with how='all' drops *columns* that are entirely NaN,
# which removes nothing here. If the intent was to drop rows where both chl and
# chl_a are missing, that would be
# dropna(axis=0, how='all', subset=['chl', 'chl_a']) — confirm.
df_chl.dropna(axis=1, how='all')

Unnamed: 0,id,chl,chl_a
0,1565,38.19000,
1,1566,35.01000,
2,1567,26.91000,
3,1568,47.96000,
4,1559,23.55000,
...,...,...,...
4454,6901,4.62784,
4455,6902,2.95997,
4456,6914,0.95114,
4457,6903,4.54228,


In [39]:
# Check non-null counts and dtypes of the merged frame.
new_df_chl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   chl        4127 non-null   float64
 1   hplc_flag  4459 non-null   object 
dtypes: float64(1), object(1)
memory usage: 69.8+ KB


In [51]:
# Save the merged chlorophyll frame to the 03_cleaned folder.
# NOTE(review): create_chl_and_flag returns only 'chl' and 'hplc_flag', so this
# file has no 'id' column; rows can be matched to the cleaned inputs only through
# the index — confirm that is intended.
new_df_chl.to_parquet(data_path_cleaned / 'df_out_chl.pqt')