# Exploratory Data Analysis

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import shapiro, kstest, kendalltau
from statsmodels.graphics.gofplots import qqplot

In [2]:
# Global figure styling: start from the seaborn "white" preset, then override
# individual rcParams group by group (plt.rc sets '<group>.<key>' entries).
plt.style.use('seaborn-v0_8-white')

# Typography
plt.rc('font', family='Arial', size=12)
plt.rc('legend', fontsize=10)
plt.rc(('xtick', 'ytick'), labelsize=10)

# Axes: label/title sizes, frame width, grid on, top/right spines off
plt.rc('axes', labelsize=12, titlesize=14, linewidth=0.8, grid=True)
plt.rc('axes.spines', top=False, right=False)

# Grid and line appearance ('0' is matplotlib's grayscale string for black)
plt.rc('grid', linewidth=0.5, color='0')
plt.rc('lines', linewidth=1.5)

# Resolution for both on-screen and saved figures
plt.rc('figure', dpi=600)
plt.rc('savefig', dpi=600)

In [3]:
# Load the dataset, parsing 'Date' as datetime so that `.dt` accessors work later.
path = "../dataset/data.csv"
df = pd.read_csv(path, parse_dates=['Date'])

# Every column except 'Date' is analysed below.
columns = df.columns.drop('Date')

df.head()

Unnamed: 0,Date,Discharge,Height,Turbidity,pH,DO,SC,Temperature,Chl-a,Phycocyanin,...,MNDWI,GNDVI,SDDI,NDTI,BR,NDWI,NDPI,NDCI,2BDA_Chl,RR
0,2019-04-15,122.045408,2.197608,68.1,8.4,10.2,1110,14.6,2.8,0.67,...,0.374551,-0.150643,-0.17072,0.085153,0.551217,0.150643,-0.374551,0.063141,0.012068,1.134792
1,2019-04-20,96.27712,2.033016,43.6,8.6,10.6,1150,17.2,6.0,1.11,...,0.343369,-0.229942,0.003395,-0.001698,0.503659,0.229942,-0.343369,0.132095,0.019033,1.304399
2,2019-05-05,180.944352,2.526792,152.0,8.2,8.6,745,18.0,3.2,0.72,...,0.262853,0.136803,-0.187451,0.093452,0.644899,-0.136803,-0.262853,0.075063,0.018629,1.162309
3,2019-09-27,212.092832,2.685288,153.0,8.2,7.8,1150,23.4,1.6,0.75,...,0.453808,0.04652,-0.216945,0.108049,0.556456,-0.04652,-0.453808,0.071456,0.016295,1.15391
4,2019-12-31,175.847328,2.508504,22.2,8.4,13.1,1030,2.7,2.1,0.89,...,0.28801,-0.226803,-0.055859,0.027922,0.638716,0.226803,-0.28801,0.004913,0.000667,1.009874


In [4]:
def test_normality(data, column, alpha=0.05):
    if column not in data.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")
    
    valid_data = data[column].dropna()
        
    if not np.issubdtype(valid_data.dtype, np.number):
        raise ValueError(f"Column '{column}' contains non-numeric data.")
    
    stat_sw, p_sw = shapiro(valid_data)
    
    print(f'Shapiro-Wilk Test for {column}:')
    print(f'Statistic: {stat_sw:.3f}, p-value: {p_sw:.3e}')
    if p_sw > alpha:
        print(f'{column} appears normally distributed (p > {alpha}).')
    else:
        print(f'{column} does not appear normally distributed (p ≤ {alpha}).')
        
    print()
    return stat_sw, p_sw

In [5]:
# One (statistic, p-value) pair per analysed column, in the same order as `columns`.
results = [test_normality(data=df, column=name) for name in columns]

Shapiro-Wilk Test for Discharge:
Statistic: 0.781, p-value: 2.955e-12
Discharge does not appear normally distributed (p ≤ 0.05).

Shapiro-Wilk Test for Height:
Statistic: 0.845, p-value: 5.130e-10
Height does not appear normally distributed (p ≤ 0.05).

Shapiro-Wilk Test for Turbidity:
Statistic: 0.739, p-value: 1.726e-13
Turbidity does not appear normally distributed (p ≤ 0.05).

Shapiro-Wilk Test for pH:
Statistic: 0.959, p-value: 9.551e-04
pH does not appear normally distributed (p ≤ 0.05).

Shapiro-Wilk Test for DO:
Statistic: 0.962, p-value: 1.439e-03
DO does not appear normally distributed (p ≤ 0.05).

Shapiro-Wilk Test for SC:
Statistic: 0.988, p-value: 3.269e-01
SC appears normally distributed (p > 0.05).

Shapiro-Wilk Test for Temperature:
Statistic: 0.934, p-value: 1.476e-05
Temperature does not appear normally distributed (p ≤ 0.05).

Shapiro-Wilk Test for Chl-a:
Statistic: 0.944, p-value: 6.192e-05
Chl-a does not appear normally distributed (p ≤ 0.05).

Shapiro-Wilk Test fo

In [6]:
# Condense the per-column Shapiro-Wilk results into a three-row summary table:
# formatted statistic, formatted p-value, and a verdict at the 0.05 level.
row_labels = ['SW', 'p_SW', 'Normality']
row_cells = [
    [f'{stat:.2f}' for stat, _ in results],
    [f'{p_value:.1e}' for _, p_value in results],
    ['Normal' if p_value > 0.05 else 'Not Normal' for _, p_value in results],
]

# Each entry is [row label, cell, cell, ...].
table_data = [[label, *cells] for label, cells in zip(row_labels, row_cells)]

table = pd.DataFrame(row_cells, index=row_labels, columns=columns)
table

Unnamed: 0,Discharge,Height,Turbidity,pH,DO,SC,Temperature,Chl-a,Phycocyanin,B1,...,MNDWI,GNDVI,SDDI,NDTI,BR,NDWI,NDPI,NDCI,2BDA_Chl,RR
SW,0.78,0.85,0.74,0.96,0.96,0.99,0.93,0.94,0.98,0.95,...,0.95,0.93,0.98,0.98,0.98,0.93,0.95,0.98,0.98,0.98
p_SW,3.0e-12,5.1e-10,1.7e-13,9.6e-04,1.4e-03,3.3e-01,1.5e-05,6.2e-05,9.8e-02,1.7e-04,...,2.3e-04,5.2e-06,5.9e-02,6.1e-02,1.3e-01,5.2e-06,2.3e-04,6.1e-02,6.9e-02,1.1e-01
Normality,Not Normal,Not Normal,Not Normal,Not Normal,Not Normal,Normal,Not Normal,Not Normal,Normal,Not Normal,...,Not Normal,Not Normal,Normal,Normal,Normal,Not Normal,Not Normal,Normal,Normal,Normal


In [7]:
# Save a two-panel normality diagnostic for every analysed column:
# left = kernel density estimate, right = Q-Q plot against the normal distribution.
output_dir = Path('../plots/eda/normality')
output_dir.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders

for col in columns:
    values = df[col].dropna()
    # Only swap underscores for spaces; str.title() would mangle names such as
    # 'pH' -> 'Ph' or 'MNDWI' -> 'Mndwi'.
    label = col.replace('_', ' ')

    fig, (ax_kde, ax_qq) = plt.subplots(1, 2, figsize=(10, 4))

    sns.kdeplot(values, color='navy', linewidth=1.5, ax=ax_kde)
    ax_kde.set_xlabel(label)
    ax_kde.set_ylabel('Density')  # a KDE curve shows density, not frequency

    # line='s' draws the standardized reference line.
    qqplot(values, line='s', ax=ax_qq, marker='.', markersize=12, alpha=0.6)
    ax_qq.set_xlabel('Theoretical Quantiles')
    ax_qq.set_ylabel('Sample Quantiles')

    fig.tight_layout()
    fig.savefig(
        output_dir / f'{col}.svg',
        format='svg',
        bbox_inches='tight',
        transparent=True,
        dpi=600,
    )
    plt.close(fig)  # release the figure; nothing is displayed inline

In [8]:
# Display the names of the analysed columns (every column except 'Date').
columns

Index(['Discharge', 'Height', 'Turbidity', 'pH', 'DO', 'SC', 'Temperature',
       'Chl-a', 'Phycocyanin', 'B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8',
       'B8A', 'B9', 'B11', 'B12', 'TCI_B', 'TCI_G', 'TCI_R', 'AOT', 'WVP',
       'MNDWI', 'GNDVI', 'SDDI', 'NDTI', 'BR', 'NDWI', 'NDPI', 'NDCI',
       '2BDA_Chl', 'RR'],
      dtype='object')

In [9]:
def correlation(data, name="dataset"):
    """Save a lower-triangle Pearson correlation heatmap of the numeric columns.

    The figure is written to ``../plots/eda/correlation/<name>.svg`` and then
    closed; nothing is returned or displayed. The output folder is created if
    it does not exist yet.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns are correlated (non-numeric columns are
        ignored via ``numeric_only=True``).
    name : str, default "dataset"
        File stem of the saved SVG.
    """
    corr = data.corr(numeric_only=True, method="pearson")
    # k=1 masks only the strictly-upper triangle, so the diagonal and the
    # lower half remain visible.
    mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

    output_dir = Path('../plots/eda/correlation')
    output_dir.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        corr, mask=mask, vmin=-1, vmax=1,
        annot=True, fmt=".2f",
        cmap='coolwarm',
        annot_kws={"size": 7, "weight": "bold"},
        cbar_kws={"shrink": .8, "ticks": np.linspace(-1, 1, 5)},
        ax=ax,
    )

    # Colourbar styling. The sizes are the ones that previously took effect:
    # the label was set at 14 pt and then immediately resized to 10 pt.
    colorbar = ax.collections[0].colorbar
    colorbar.set_label('Pearson Correlation Coefficients (r)', fontsize=10)
    colorbar.ax.tick_params(labelsize=14)

    ax.grid(False)  # the global style enables grids; they would clutter a heatmap
    fig.tight_layout()
    fig.savefig(
        output_dir / f'{name}.svg',
        format='svg',
        bbox_inches='tight',
        transparent=True,
        dpi=600,
    )
    plt.close(fig)

In [10]:
# Heatmap over the whole dataset; written to ../plots/eda/correlation/full.svg.
correlation(data=df, name="full")

In [11]:
def seasonal_correlation(data):
    """Save one lower-triangle Pearson correlation heatmap per season.

    Rows are assigned to a season by the calendar month of their ``'Date'``
    value (Spring = Mar-May, Summer = Jun-Aug, Autumn = Sep-Nov,
    Winter = Dec-Feb). Each heatmap is written to
    ``../plots/eda/correlation/seasonal-<Season>.svg`` and the figure is closed;
    nothing is returned. The output folder is created if it does not exist yet.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``'Date'`` column; only its numeric columns are
        correlated (``numeric_only=True``).
    """
    seasons = {
        'Spring': [3, 4, 5],
        'Summer': [6, 7, 8],
        'Autumn': [9, 10, 11],
        'Winter': [12, 1, 2]
    }

    output_dir = Path('../plots/eda/correlation')
    output_dir.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders

    row_month = data['Date'].dt.month  # loop-invariant, computed once

    for season, months in seasons.items():
        season_data = data[row_month.isin(months)]

        # Single correlation pass restricted to numeric columns. A second call
        # without numeric_only used to run first and was immediately
        # overwritten; it also had to process the non-numeric 'Date' column.
        corr = season_data.corr(numeric_only=True, method="pearson")
        # k=1 masks only the strictly-upper triangle (diagonal stays visible).
        mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

        fig, ax = plt.subplots(figsize=(12, 10))
        sns.heatmap(
            corr, mask=mask, vmin=-1, vmax=1,
            annot=True, fmt=".2f",
            cmap='coolwarm',
            annot_kws={"size": 7, "weight": "bold"},
            cbar_kws={"shrink": .6, "ticks": np.linspace(-1, 1, 5)},
            ax=ax,
        )

        # Colourbar styling. The sizes are the ones that previously took
        # effect after a chain of overriding calls (label 10 pt, ticks 14 pt).
        colorbar = ax.collections[0].colorbar
        colorbar.set_label('Pearson Correlation Coefficients (r)', fontsize=10)
        colorbar.ax.tick_params(labelsize=14)

        ax.grid(False)  # the global style enables grids; they would clutter a heatmap
        fig.tight_layout()
        fig.savefig(
            output_dir / f'seasonal-{season}.svg',
            format='svg',
            bbox_inches='tight',
            transparent=True,
            dpi=600,
        )
        plt.close(fig)

In [12]:
# One heatmap per season; written to ../plots/eda/correlation/seasonal-<Season>.svg.
seasonal_correlation(data=df)