In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the merged CSV and convert its 'Date' column to datetime in one chained step.
df = (
    pd.read_csv('../dataset/merged.csv')
      .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Date,Discharge,Height,Turbidity,pH,DO,SC,Temperature,Chl-a,Phycocyanin,...,MNDWI,GNDVI,SDDI,NDTI,BR,NDWI,NDPI,NDCI,2BDA_Chl,RR
0,2018-11-21,577.66272,4.011168,70.2,8.2,11.7,486,7.5,0.7,0.55,...,0.407433,-0.082268,-0.278961,0.138583,0.465241,0.082268,-0.407433,0.016434,0.003132,1.033418
1,2018-12-16,543.68256,3.849624,93.3,8.0,13.3,657,2.6,0.8,0.74,...,0.426877,-0.046317,-0.374309,0.185,0.386335,0.046317,-0.426877,0.025896,0.00489,1.053169
2,2018-12-21,478.55392,3.624072,67.7,8.0,12.9,672,3.4,0.7,0.67,...,0.445335,-0.129644,-0.339137,0.167962,0.343961,0.129644,-0.445335,0.013995,0.002476,1.028387
3,2019-01-05,521.02912,3.767328,107.0,8.0,13.6,611,1.9,0.8,0.8,...,0.457476,-0.022581,-0.368427,0.182158,0.371143,0.022581,-0.457476,0.032999,0.006492,1.06825
4,2019-02-24,342.63328,3.218688,541.0,8.1,13.7,576,1.1,4.2,2.18,...,0.411478,0.4069,-0.478754,0.234907,0.29206,-0.4069,-0.411478,0.090299,0.01853,1.198525


In [3]:
# List the dtype of every column to confirm 'Date' was converted to datetime.
print(df.dtypes)

Date           datetime64[ns]
Discharge             float64
Height                float64
Turbidity             float64
pH                    float64
DO                    float64
SC                      int64
Temperature           float64
Chl-a                 float64
Phycocyanin           float64
B1                    float64
B2                    float64
B3                    float64
B4                    float64
B5                    float64
B6                    float64
B7                    float64
B8                    float64
B8A                   float64
B9                    float64
B11                   float64
B12                   float64
TCI_B                 float64
TCI_G                 float64
TCI_R                 float64
AOT                   float64
WVP                   float64
MNDWI                 float64
GNDVI                 float64
SDDI                  float64
NDTI                  float64
BR                    float64
NDWI                  float64
NDPI      

In [4]:
# Mark rows whose 'Date' repeats an earlier row, and report how many there are.
repeated_dates = df['Date'].duplicated()
print("\nDuplicate dates count:", repeated_dates.sum())
if repeated_dates.sum() > 0:
    # keep=False marks every member of a duplicated group, not just the repeats.
    every_duplicate = df['Date'].duplicated(keep=False)
    print("Duplicate dates sample:")
    print(df.loc[every_duplicate, ['Date']].head())


Duplicate dates count: 0


In [5]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().transpose()

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Date,156.0,2022-01-24 12:18:27.692307712,2018-11-21 00:00:00,2020-09-28 12:00:00,2022-01-01 12:00:00,2023-07-29 06:00:00,2024-12-29 00:00:00,
Discharge,156.0,148.179636,18.179386,30.298976,69.092992,144.982016,1900.05728,221.863079
Height,156.0,2.214802,1.466088,1.624584,1.8669,2.410968,7.123176,0.869617
Turbidity,156.0,82.030769,5.1,17.425,29.5,79.325,1080.0,132.692839
pH,156.0,8.437179,7.7,8.3,8.5,8.6,8.9,0.270058
DO,156.0,10.153846,5.1,8.075,10.1,11.9,14.1,2.240777
SC,156.0,913.115385,262.0,760.75,923.5,1092.5,1400.0,243.625942
Temperature,156.0,16.319231,0.0,7.375,17.15,25.625,33.0,9.615425
Chl-a,156.0,6.455769,0.7,2.0,5.45,10.05,17.8,4.785748
Phycocyanin,156.0,1.268974,0.22,0.86,1.215,1.5525,4.02,0.572246


# Define priority columns for outlier detection
Focus on the key hydrological, water-quality, and core band columns; the derived spectral indices are currently left out of the list.

In [6]:
# Columns screened by the IQR outlier check, grouped by theme.
# The derived indices (MNDWI, GNDVI, SDDI, NDTI, BR, NDWI, NDPI, NDCI,
# 2BDA_Chl, RR) are not part of this list.
priority_cols = (
    # Hydrology
    ['Discharge', 'Height']
    # Water quality
    + ['Turbidity', 'pH', 'DO', 'SC', 'Temperature', 'Chl-a', 'Phycocyanin']
    # Core spectral bands
    + ['B2', 'B3', 'B4', 'B5', 'B8', 'B11']
)

In [7]:
# Band columns that get a range check in addition to any IQR screening.
band_cols = [
    'B1', 'B2', 'B3', 'B4', 'B5', 'B6',
    'B7', 'B8', 'B8A', 'B9', 'B11', 'B12',
]
# Only bands actually present in the frame are checked.
present_bands = [col for col in band_cols if col in df.columns]
for col in present_bands:
    band_values = df[col]
    # True where the value lies outside [0, 1]; NaN compares False on both
    # sides, so missing values are never flagged.
    df[f'{col}_outlier_bound'] = band_values.lt(0) | band_values.gt(1)

In [8]:
def detect_outliers_iqr(series, col_name=None, multiplier=1.5):
    """Return the values of ``series`` that fall outside its IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 25th and 75th percentiles of ``series`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to screen.
    col_name : str, optional
        Label of the column being screened. It is not used in the
        computation; it is accepted so existing callers that pass it keep
        working.
    multiplier : float, default 1.5
        Width of the fences in units of the IQR. The default reproduces the
        previously hard-coded factor of 1.5.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is the
        subset of ``series`` strictly below ``lower_bound`` or strictly above
        ``upper_bound``, keeping its original index.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    # Strict inequalities: values exactly on a fence are not outliers.
    outside = (series < lower_bound) | (series > upper_bound)
    outliers = series[outside]
    return outliers, lower_bound, upper_bound

In [9]:
# Build one summary record per priority column and attach a boolean
# '<column>_outlier' flag to df for each of them.
outlier_summary = []
for col in priority_cols:
    # Skip names that are not present in the loaded frame.
    if col not in df.columns:
        continue

    # Fences are computed on the non-null values only.
    flagged, lb, ub = detect_outliers_iqr(df[col].dropna(), col)
    n_flagged = len(flagged)
    # The percentage is relative to all rows of df, including rows where this column is null.
    pct_flagged = (n_flagged / len(df)) * 100

    record = {
        'Column': col,
        'Outlier_Count': n_flagged,
        'Outlier_Percentage': round(pct_flagged, 2),
        'Lower_Bound': round(lb, 4),
        'Upper_Bound': round(ub, 4)
    }
    outlier_summary.append(record)

    # Row-level flag: True where the value lies strictly outside the fences.
    df[f'{col}_outlier'] = df[col].lt(lb) | df[col].gt(ub)

In [10]:
# Turn the per-column records into a table; the bare name on the last line renders it.
summary_df = pd.DataFrame(data=outlier_summary)
summary_df

Unnamed: 0,Column,Outlier_Count,Outlier_Percentage,Lower_Bound,Upper_Bound
0,Discharge,23,14.74,-141.7256,317.0066
1,Height,15,9.62,0.445,3.5905
2,Turbidity,19,12.18,-75.425,172.175
3,pH,5,3.21,7.85,9.05
4,DO,0,0.0,2.3375,17.6375
5,SC,1,0.64,263.125,1590.125
6,Temperature,0,0.0,-20.0,53.0
7,Chl-a,0,0.0,-10.075,22.125
8,Phycocyanin,2,1.28,-0.1788,2.5912
9,B2,1,0.64,-0.0013,0.0954


In [11]:
# Candidate flag names: the IQR flags first, then the band range flags.
candidate_flags = (
    [f'{col}_outlier' for col in priority_cols]
    + [f'{col}_outlier_bound' for col in band_cols]
)
# Keep only the flags that actually exist in df, preserving that order.
outlier_flags = [flag for flag in candidate_flags if flag in df.columns]
# A row is marked when at least one of its flags is True.
df['has_outlier'] = df.loc[:, outlier_flags].any(axis='columns')

In [12]:
# Helper columns created during outlier screening; they are removed from the cleaned frame.
flag_columns = [
    name for name in df.columns
    if name == 'has_outlier' or name.endswith(('_outlier', '_outlier_bound'))
]
# Keep the rows without any flag, as an independent copy, then drop the helpers.
cleaned_df = (
    df.loc[~df['has_outlier']]
      .copy()
      .drop(columns=flag_columns)
)

In [13]:
# Write the cleaned frame to disk (without the index), then report how many
# rows the outlier screening removed.
cleaned_df.to_csv('../dataset/data.csv', index=False)

n_original = len(df)
n_cleaned = len(cleaned_df)
n_removed = n_original - n_cleaned
# Guard the percentage: with an empty input frame the division would raise
# ZeroDivisionError after the file has already been written.
removed_pct = (n_removed / n_original * 100) if n_original else 0.0

print(f"\nOriginal rows: {n_original}")
print(f"Cleaned rows: {n_cleaned}")
print(f"Rows removed: {n_removed} ({removed_pct:.2f}%)")
print("Cleaned data exported")


Original rows: 156
Cleaned rows: 123
Rows removed: 33 (21.15%)
Cleaned data exported
