In [1]:
import pandas as pd
import numpy as np

# Load the field and satellite tables; each table's Date column is parsed
# into datetime values as part of the same expression.
df_field = pd.read_csv('../dataset/field.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
df_satel = pd.read_csv('../dataset/sentinel.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

In [2]:
# Convert USGS units to SI units.
#
# The conversion rescales df_field, so it must be applied exactly once per
# load: running this cell a second time on an already-converted frame would
# multiply the values again. A flag stored in df_field.attrs records that the
# conversion has been applied, which makes the cell safe to re-run. Reloading
# df_field from CSV yields a fresh frame without the flag, so the conversion
# is applied again after a reload.
CFS_TO_M3S = 0.0283168  # Discharge: 1 cfs = 0.0283168 m³/s
FT_TO_M = 0.3048        # Height: 1 foot = 0.3048 m

if not df_field.attrs.get('si_units_applied', False):
    df_field = df_field.assign(
        Discharge=df_field['Discharge'] * CFS_TO_M3S,
        Height=df_field['Height'] * FT_TO_M,
    )
    # Mark the converted frame so a repeated run of this cell is a no-op.
    df_field.attrs['si_units_applied'] = True

In [3]:
# Inner-join the field and satellite tables on their shared Date column, so
# only dates present in both tables are kept.
# NOTE(review): the head() preview later in the notebook shows the same date
# on consecutive rows with identical field values, so one table appears to
# hold several rows per date — confirm that this row multiplication is intended.
merged_df = df_field.merge(df_satel, how='inner', on='Date')

# Compute satellite band combination indices

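- **MNDWI, NDWI:** Water body delineation and inundation.
- **GNDVI:** Vegetation/algae health (relevant for Chl-a).
- **SDDI:** Secchi disk depth.
- **NDTI:** Turbidity/total suspended matter.
- **BR:** Blue/red band ratio, used for water clarity.
- **NDPI:** Pond/standing water (adaptable for river segments).
- **NDCI:** Chlorophyll-a concentration.
- **2BDA_Chl:** Two-band difference (B5 − B4), a chlorophyll-a proxy.
- **RR:** Red edge/red ratio (B5 / B4), for turbidity/sediment.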

In [4]:
# Derive all band-combination indices in one assign call. New columns are
# appended in the order listed below.
# Band roles used by the formulas: B2 = Blue, B3 = Green, B4 = Red,
# B5 = RedEdge1, B8 = NIR, B11 = SWIR.
merged_df = merged_df.assign(
    # MNDWI = (Green - SWIR) / (Green + SWIR)
    MNDWI=lambda bands: (bands['B3'] - bands['B11']) / (bands['B3'] + bands['B11']),
    # GNDVI = (NIR - Green) / (NIR + Green)
    GNDVI=lambda bands: (bands['B8'] - bands['B3']) / (bands['B8'] + bands['B3']),
    # SDDI = log(Green / Red); np.log is the natural logarithm
    SDDI=lambda bands: np.log(bands['B3'] / bands['B4']),
    # NDTI = (Red - Green) / (Red + Green)
    NDTI=lambda bands: (bands['B4'] - bands['B3']) / (bands['B4'] + bands['B3']),
    # BR = Blue / Red
    BR=lambda bands: bands['B2'] / bands['B4'],
    # NDWI = (Green - NIR) / (Green + NIR); by construction the negation of GNDVI
    NDWI=lambda bands: (bands['B3'] - bands['B8']) / (bands['B3'] + bands['B8']),
    # NDPI = (SWIR - Green) / (SWIR + Green); by construction the negation of MNDWI
    NDPI=lambda bands: (bands['B11'] - bands['B3']) / (bands['B11'] + bands['B3']),
    # NDCI = (RedEdge1 - Red) / (RedEdge1 + Red)
    NDCI=lambda bands: (bands['B5'] - bands['B4']) / (bands['B5'] + bands['B4']),
    # 2BDA (2-Band Difference Algorithm) chlorophyll-a proxy: RedEdge1 - Red.
    # The column name is not a valid Python keyword, hence the dict unpacking.
    **{'2BDA_Chl': lambda bands: bands['B5'] - bands['B4']},
    # RR = RedEdge1 / Red, for turbidity/sediment
    RR=lambda bands: bands['B5'] / bands['B4'],
)

In [5]:
# Arrange the output columns: Date first, then the field columns, the
# satellite columns, and finally the derived indices.
field_cols = [col for col in df_field.columns if col != 'Date']
satel_cols = [col for col in df_satel.columns if col != 'Date']

# Pick the derived indices by exact name. Prefix matching (startswith) would
# also select any field or satellite column whose name merely begins with one
# of the short index names (e.g. 'BR' or 'RR'), listing that column twice in
# the result. Names already present in the source lists are skipped for the
# same reason.
index_names = ['MNDWI', 'GNDVI', 'SDDI', 'NDTI', 'BR', 'NDWI', 'NDPI', 'NDCI', '2BDA_Chl', 'RR']
source_cols = set(field_cols) | set(satel_cols)
index_cols = [
    name for name in index_names
    if name in merged_df.columns and name not in source_cols
]

merged_df = merged_df[['Date'] + field_cols + satel_cols + index_cols]
merged_df.head()

Unnamed: 0,Date,Discharge,Height,Turbidity,pH,DO,SC,Temperature,Chl-a,Phycocyanin,...,MNDWI,GNDVI,SDDI,NDTI,BR,NDWI,NDPI,NDCI,2BDA_Chl,RR
0,2018-11-21,577.66272,4.011168,70.2,8.2,11.7,486,7.5,0.7,0.55,...,0.435233,-0.093428,-0.281568,0.139861,0.459967,0.093428,-0.435233,0.015163,29.071487,1.030793
1,2018-11-21,577.66272,4.011168,70.2,8.2,11.7,486,7.5,0.7,0.55,...,0.439745,-0.094823,-0.279776,0.138983,0.463807,0.094823,-0.439745,0.015024,28.732013,1.030507
2,2018-12-16,543.68256,3.849624,93.3,8.0,13.3,657,2.6,0.8,0.74,...,0.459535,-0.054246,-0.377496,0.186538,0.380529,0.054246,-0.459535,0.025611,48.700519,1.052568
3,2018-12-16,543.68256,3.849624,93.3,8.0,13.3,657,2.6,0.8,0.74,...,0.461854,-0.05647,-0.373636,0.184674,0.386158,0.05647,-0.461854,0.024785,47.168736,1.050831
4,2018-12-21,478.55392,3.624072,67.7,8.0,12.9,672,3.4,0.7,0.67,...,0.481713,-0.142864,-0.341626,0.169171,0.338154,0.142864,-0.481713,0.014362,25.564005,1.029142


In [6]:
# Write the merged table to CSV without the row index, then print a
# confirmation message.
merged_df.to_csv(path_or_buf='../dataset/data.csv', index=False)
print("Merged data with unit conversions and indices exported.")

Merged data with unit conversions and indices exported.
