In [1]:
import pandas as pd
import numpy as np

# Load the field measurements and the Sentinel band extractions.
field_csv = '../dataset/field.csv'
satel_csv = '../dataset/sentinel.csv'
df_field = pd.read_csv(field_csv)
df_satel = pd.read_csv(satel_csv)

# Parse the Date column of both frames into datetime64 so the two tables can
# later be joined on it.
for frame in (df_field, df_satel):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [2]:
# Convert USGS units to SI units.
CFS_TO_M3S = 0.0283168  # 1 cfs = 0.0283168 m³/s
FT_TO_M = 0.3048        # 1 foot = 0.3048 m

# The conversion overwrites the columns in place, so re-running this cell
# would multiply already-converted values a second time. A marker kept in
# df_field.attrs records that the conversion has been applied; re-reading the
# CSV produces a fresh frame without the marker, so a full re-run still
# converts exactly once.
if not df_field.attrs.get('si_units_applied', False):
    # Discharge: cfs to m³/s
    df_field['Discharge'] = df_field['Discharge'] * CFS_TO_M3S
    # Height: feet to meters
    df_field['Height'] = df_field['Height'] * FT_TO_M
    df_field.attrs['si_units_applied'] = True

df_field.head()

Unnamed: 0,Date,Discharge,Height,Turbidity,pH,DO,SC,Temperature,Chl-a,Phycocyanin
0,2018-11-20,509.7024,3.788664,62.8,8.2,11.8,508,6.9,0.7,0.56
1,2018-11-21,577.66272,4.011168,70.2,8.2,11.7,486,7.5,0.7,0.55
2,2018-11-22,600.31616,4.075176,69.6,8.1,11.6,474,7.9,0.7,0.54
3,2018-11-23,600.31616,4.078224,65.0,8.1,11.4,472,8.2,0.7,0.52
4,2018-11-24,603.14784,4.08432,63.3,8.1,11.4,476,7.9,0.7,0.53


In [3]:
# Satellite columns that are divided by 10000 in the scaling step.
band_cols = [
    'B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'B11', 'B12',
    'TCI_B', 'TCI_G', 'TCI_R',
]
# Satellite columns that are divided by 1000 in the scaling step.
aot_wvp_cols = ['AOT', 'WVP']

In [4]:
# Write scaled copies of the satellite columns into '<name>_scaled'; columns
# missing from df_satel are skipped.
# Band columns: divide by 10000 and clamp negative results to 0.
# NOTE(review): the TCI_* columns receive the same 10000 divisor as the
# B* bands — confirm that this is the intended scale for them.
present_bands = [name for name in band_cols if name in df_satel.columns]
for name in present_bands:
    scaled = df_satel[name] / 10000
    df_satel[f'{name}_scaled'] = np.maximum(scaled, 0)

# AOT / WVP columns: divide by 1000, no clamping.
present_atmos = [name for name in aot_wvp_cols if name in df_satel.columns]
for name in present_atmos:
    df_satel[f'{name}_scaled'] = df_satel[name] / 1000

In [5]:
# Remove the unscaled originals; labels that are not present are ignored.
df_satel = df_satel.drop(columns=band_cols + aot_wvp_cols, errors='ignore')

In [6]:
# Give each '<name>_scaled' column its original name back.
rename_dict = {}
for base_name in band_cols + aot_wvp_cols:
    scaled_name = f'{base_name}_scaled'
    if scaled_name in df_satel.columns:
        rename_dict[scaled_name] = base_name

df_satel = df_satel.rename(columns=rename_dict)
df_satel.head()

Unnamed: 0,Date,B1,B2,B3,B4,B5,B6,B7,B8,B8A,B9,B11,B12,TCI_B,TCI_G,TCI_R,AOT,WVP
0,2018-11-21,0.027446,0.043425,0.071241,0.094409,0.097316,0.061365,0.061496,0.059066,0.046062,0.039327,0.028033,0.023053,0.004465,0.007285,0.00964,0.058,0.577148
1,2018-11-21,0.028183,0.043683,0.071198,0.094183,0.097056,0.061128,0.061226,0.058865,0.04584,0.038905,0.027705,0.022819,0.004491,0.007283,0.009619,0.058,0.575089
2,2018-12-16,0.006953,0.035253,0.063513,0.092642,0.097512,0.062078,0.061366,0.056977,0.041752,0.024241,0.023519,0.019142,0.003646,0.006509,0.009479,0.125,0.549102
3,2018-12-16,0.007793,0.035834,0.063865,0.092796,0.097513,0.062118,0.061315,0.057037,0.041757,0.024393,0.02351,0.019082,0.003708,0.006545,0.009492,0.126,0.551032
4,2018-12-21,0.009535,0.029663,0.062336,0.087721,0.090278,0.051236,0.049769,0.046751,0.033392,0.027911,0.021804,0.019632,0.0031,0.006393,0.008981,0.107986,0.446026


In [7]:
# Inner join on Date: only dates present in both frames are kept, and a field
# row is repeated for every satellite row that shares its date.
merged_df = df_field.merge(df_satel, how='inner', on='Date')

# Compute satellite band combination indices

- **MNDWI, NDWI:** Water body delineation and inundation.
- **GNDVI:** Vegetation/algae health (relevant for Chl-a).
- **SDDI:** Secchi disk depth.
- **NDTI:** Turbidity/total suspended matter.
- **BR:** Water clarity.
- **NDPI:** Pond/standing water (adaptable for river segments).
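- **NDCI:** Chlorophyll-a concentration.
- **2BDA_Chl:** Two-band difference (B5 - B4), a chlorophyll-a proxy.
- **RR:** Red-edge / red ratio (B5 / B4) for turbidity/sediment.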

In [8]:
def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, NaN where the denominator is 0.

    Plain division yields +/-inf for x/0, which would end up in the
    exported data; masking the zero denominators gives NaN instead.
    """
    return numerator / denominator.where(denominator != 0)


def _normalized_difference(band_a, band_b):
    """Return (band_a - band_b) / (band_a + band_b), NaN where the sum is 0."""
    return _safe_ratio(band_a - band_b, band_a + band_b)


# Short aliases for the bands used below.
blue = merged_df['B2']
green = merged_df['B3']
red = merged_df['B4']
red_edge1 = merged_df['B5']
nir = merged_df['B8']
swir = merged_df['B11']

# MNDWI = (Green - SWIR) / (Green + SWIR) = (B3 - B11) / (B3 + B11)
merged_df['MNDWI'] = _normalized_difference(green, swir)

# GNDVI = (NIR - Green) / (NIR + Green) = (B8 - B3) / (B8 + B3)
# (by construction this is exactly -NDWI; both are kept as output columns)
merged_df['GNDVI'] = _normalized_difference(nir, green)

# SDDI = Log(Green/Red) = Log(B3 / B4)
# The log is only defined for a positive ratio; other rows become NaN
# instead of -inf.
green_red = _safe_ratio(green, red)
merged_df['SDDI'] = np.log(green_red.where(green_red > 0))

# NDTI = (Red - Green) / (Red + Green) = (B4 - B3) / (B4 + B3)
merged_df['NDTI'] = _normalized_difference(red, green)

# BR = (Blue / Red) = B2 / B4
merged_df['BR'] = _safe_ratio(blue, red)

# NDWI = (Green - NIR) / (Green + NIR) = (B3 - B8) / (B3 + B8)
merged_df['NDWI'] = _normalized_difference(green, nir)

# NDPI = (SWIR - Green) / (SWIR + Green) = (B11 - B3) / (B11 + B3)
# (by construction this is exactly -MNDWI; both are kept as output columns)
merged_df['NDPI'] = _normalized_difference(swir, green)

# NDCI = (RedEdge1 - Red) / (RedEdge1 + Red) = (B5 - B4) / (B5 + B4)
merged_df['NDCI'] = _normalized_difference(red_edge1, red)

# 2BDA (2-Band Difference Algorithm) for Chlorophyll-a Proxy: B5 - B4
merged_df['2BDA_Chl'] = red_edge1 - red

# Red Edge / Red ratio for turbidity/sediment: B5 / B4
merged_df['RR'] = _safe_ratio(red_edge1, red)

In [9]:
# Names of the derived index columns. Matching is by exact name: a prefix
# match (e.g. on 'BR' or 'RR') would also pick up any unrelated column that
# merely starts with those letters and list it twice in the selection.
index_names = ['MNDWI', 'GNDVI', 'SDDI', 'NDTI', 'BR', 'NDWI', 'NDPI', 'NDCI', '2BDA_Chl', 'RR']

field_cols = [col for col in df_field.columns if col != 'Date']
satel_cols = [col for col in df_satel.columns if col != 'Date']

# Keep the indices in the order they appear in merged_df, and skip any name
# that already comes from one of the source frames to avoid duplicates.
source_cols = set(field_cols) | set(satel_cols)
index_cols = [
    col for col in merged_df.columns
    if col in index_names and col not in source_cols
]

# Final layout: Date, field measurements, satellite columns, derived indices.
merged_df = merged_df[['Date'] + field_cols + satel_cols + index_cols]
merged_df.head()

Unnamed: 0,Date,Discharge,Height,Turbidity,pH,DO,SC,Temperature,Chl-a,Phycocyanin,...,MNDWI,GNDVI,SDDI,NDTI,BR,NDWI,NDPI,NDCI,2BDA_Chl,RR
0,2018-11-21,577.66272,4.011168,70.2,8.2,11.7,486,7.5,0.7,0.55,...,0.435233,-0.093428,-0.281568,0.139861,0.459967,0.093428,-0.435233,0.015163,0.002907,1.030793
1,2018-11-21,577.66272,4.011168,70.2,8.2,11.7,486,7.5,0.7,0.55,...,0.439745,-0.094823,-0.279776,0.138983,0.463807,0.094823,-0.439745,0.015024,0.002873,1.030507
2,2018-12-16,543.68256,3.849624,93.3,8.0,13.3,657,2.6,0.8,0.74,...,0.459535,-0.054246,-0.377496,0.186538,0.380529,0.054246,-0.459535,0.025611,0.00487,1.052568
3,2018-12-16,543.68256,3.849624,93.3,8.0,13.3,657,2.6,0.8,0.74,...,0.461854,-0.05647,-0.373636,0.184674,0.386158,0.05647,-0.461854,0.024785,0.004717,1.050831
4,2018-12-21,478.55392,3.624072,67.7,8.0,12.9,672,3.4,0.7,0.67,...,0.481713,-0.142864,-0.341626,0.169171,0.338154,0.142864,-0.481713,0.014362,0.002556,1.029142


In [10]:
# Write the merged table to disk without the row index, then confirm.
output_csv = '../dataset/data.csv'
merged_df.to_csv(output_csv, index=False)
print("Merged data with unit conversions and indices exported.")

Merged data with unit conversions and indices exported.
