# Exploratory Data Analysis
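This notebook tests selected variables for normality (Shapiro-Wilk and Kolmogorov-Smirnov), visualizes their distributions, and computes Kendall’s correlations between each predictor and EV.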

In [1]:
# Import required libraries for data manipulation, statistical tests, and visualization

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import kendalltau
from scipy.stats import shapiro, kstest

## Plot Configuration
Configure Matplotlib parameters for consistent and professional plot styling.

In [2]:
# Global Matplotlib styling: a single text size and a single stroke width are
# shared by every text / line setting so that all figures look uniform.
_text_size = 16
_stroke_width = 1.2
plt.rcParams.update({
    # Default, axis-label and tick-label font sizes
    **dict.fromkeys(
        ('font.size', 'axes.labelsize', 'xtick.labelsize', 'ytick.labelsize'),
        _text_size,
    ),
    # Axis spine and major tick line widths
    **dict.fromkeys(
        ('axes.linewidth', 'xtick.major.width', 'ytick.major.width'),
        _stroke_width,
    ),
    # Saved figures: PNG output with a tight bounding box
    'savefig.format': 'png',
    'savefig.bbox': 'tight',
    # Typeface; text is rendered by Matplotlib itself rather than LaTeX
    'font.family': 'Times New Roman',
    'text.usetex': False,
})

## Data Loading
Load the dataset and select relevant columns, dropping any missing values.

In [3]:
# Variables analysed throughout the notebook
columns = ['T', 'TSM', 'CHL', 'CDOM', 'EV']
# Load the full dataset from disk
df = pd.read_csv('../dataset/data.csv')
# Keep only the variables of interest and discard every row that has a
# missing value in any of them
data = df.loc[:, columns].dropna(axis=0, how='any')

## Normality Testing Function
Define a function to perform Shapiro-Wilk and Kolmogorov-Smirnov tests for normality.

In [4]:
def test_normality(data, column):
    # Perform Shapiro-Wilk test
    stat_sw, p_sw = shapiro(data[column])
    print(f'Shapiro-Wilk Test for {column}:')
    print(f'Statistic: {stat_sw}, p-value: {p_sw}')
    if p_sw > 0.05:
        print(f'{column} appears normally distributed (p > 0.05).')
    else:
        print(f'{column} does not appear normally distributed (p ≤ 0.05).')
    print()
    # Normalize data for KS test
    data_normalized = (data[column] - data[column].mean()) / data[column].std()
    # Perform Kolmogorov-Smirnov test
    stat_ks, p_ks = kstest(data_normalized, 'norm')
    print(f'Kolmogorov-Smirnov Test for {column}:')
    print(f'Statistic: {stat_ks}, p-value: {p_ks}')
    if p_ks > 0.05:
        print(f'{column} appears normally distributed (p > 0.05).')
    else:
        print(f'{column} does not appear normally distributed (p ≤ 0.05).')
    print()
    return stat_sw, p_sw, stat_ks, p_ks

## Perform Normality Tests and Prepare Table
Run normality tests on each column and compile results into a table.

In [5]:
# Run both normality tests for every variable. Each call prints its own
# report and returns (stat_sw, p_sw, stat_ks, p_ks).
results = [test_normality(data, col) for col in columns]


def _normality_label(p_sw, p_ks, alpha=0.05):
    """Name the test(s) that do not reject normality at level ``alpha``.

    Returns 'Both', 'SW', 'KS' or 'None'. The ``p > alpha`` criterion is the
    same one used in the printed reports.
    """
    sw_ok = p_sw > alpha
    ks_ok = p_ks > alpha
    if sw_ok and ks_ok:
        return 'Both'
    if sw_ok:
        return 'SW'
    if ks_ok:
        return 'KS'
    return 'None'


# Rows of the summary table shown next to the histograms. The header and the
# "Normality" row are derived from `columns` and the computed p-values rather
# than hard-coded, so the table cannot drift from the tests actually run.
table_data = [
    ['Metric'] + list(columns),
    ['SW'] + [f'{stat_sw:.2f}' for stat_sw, _, _, _ in results],
    ['p$_{SW}$'] + [f'{p_sw:.1e}' for _, p_sw, _, _ in results],
    ['KS'] + [f'{stat_ks:.2f}' for _, _, stat_ks, _ in results],
    ['p$_{KS}$'] + [f'{p_ks:.2f}' for _, _, _, p_ks in results],
    ['Normality'] + [_normality_label(p_sw, p_ks) for _, p_sw, _, p_ks in results],
]

Shapiro-Wilk Test for T:
Statistic: 0.9554771922784835, p-value: 2.711899106539829e-07
T does not appear normally distributed (p ≤ 0.05).

Kolmogorov-Smirnov Test for T:
Statistic: 0.09552300529920998, p-value: 0.01427703340519887
T does not appear normally distributed (p ≤ 0.05).

Shapiro-Wilk Test for TSM:
Statistic: 0.9394579841640055, p-value: 5.079866008457815e-09
TSM does not appear normally distributed (p ≤ 0.05).

Kolmogorov-Smirnov Test for TSM:
Statistic: 0.11137089328248018, p-value: 0.002435405617728651
TSM does not appear normally distributed (p ≤ 0.05).

Shapiro-Wilk Test for CHL:
Statistic: 0.9329347763238338, p-value: 1.2091153724580618e-09
CHL does not appear normally distributed (p ≤ 0.05).

Kolmogorov-Smirnov Test for CHL:
Statistic: 0.09684380252899993, p-value: 0.012448983109398104
CHL does not appear normally distributed (p ≤ 0.05).

Shapiro-Wilk Test for CDOM:
Statistic: 0.963159730307284, p-value: 2.4552722155404775e-06
CDOM does not appear normally distributed (p ≤ 0.05).

## Visualize Distributions and Table
Create histograms for each variable and display the normality test results in a table.

In [6]:
# One histogram panel per variable plus one panel for the results table,
# laid out two panels per row (a 3x2 grid for the five variables used here).
n_panels = len(columns) + 1
n_rows = (n_panels + 1) // 2
fig, axes = plt.subplots(n_rows, 2, figsize=(10, 4 * n_rows), dpi=600)
axes = axes.flatten()

# One palette colour per variable
colors = sns.color_palette("Set2", n_colors=len(columns))

# Histogram with a KDE overlay for each variable
for ax, col, color in zip(axes, columns, colors):
    sns.histplot(data[col], bins=50, kde=True, color=color, ax=ax)
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.set_title('')

# The results table goes in the first panel after the histograms
table_ax = axes[len(columns)]
table = table_ax.table(
    cellText=table_data,
    loc='center',
    cellLoc='center',
    # A wider first column for the metric names, equal widths for the rest
    colWidths=[0.2] + [0.16] * (len(table_data[0]) - 1),
)

# Customize table appearance
table.auto_set_font_size(False)
table.set_fontsize(14)
table.scale(1.2, 2.0)
for (i, j), cell in table.get_celld().items():
    if i == 0:
        # Header row
        cell.set_text_props(weight='bold')
    # Alternate row shading
    cell.set_facecolor('#F5F5F5' if i % 2 == 0 else '#FFFFFF')
    cell.set_edgecolor('none')
    cell.set_linewidth(0 if 0 < j < len(table_data[0]) else 1)

# Hide the axes frame of the table panel (and of any unused trailing panel)
for ax in axes[len(columns):]:
    ax.axis('off')

# Adjust layout and save the figure, creating the output folder if needed
fig.tight_layout()
output_path = Path("./../plots/normality.png")
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_path, dpi=600, bbox_inches='tight')
# The figure is closed after saving, so it is written to disk only
plt.close(fig)

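## Perform Kendall’s Correlations
Compute Kendall’s τ rank correlation between each predictor (T, TSM, CHL, CDOM) and EV, and report whether it is significant at the 0.05 level.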

In [7]:
# Variables whose association with EV is assessed
predictors = ['T', 'TSM', 'CHL', 'CDOM']
for pred in predictors:
    # Kendall's tau between this predictor and EV
    stat_kend, p_kend = kendalltau(data[pred], data['EV'])
    verdict = (
        'Significant correlation (p < 0.05)'
        if p_kend < 0.05
        else 'No significant correlation (p ≥ 0.05)'
    )
    # Three report lines followed by a blank separator line
    print(
        f'Correlation between {pred} and EV:',
        f'Kendall’s: τ = {stat_kend}, p-value = {p_kend}',
        f'Kendall’s: {verdict}.',
        '',
        sep='\n',
    )

Correlation between T and EV:
Kendall’s: τ = 0.7465574047478246, p-value = 7.561264124933377e-74
Kendall’s: Significant correlation (p < 0.05).

Correlation between TSM and EV:
Kendall’s: τ = 0.01917715637408127, p-value = 0.6405186091810806
Kendall’s: No significant correlation (p ≥ 0.05).

Correlation between CHL and EV:
Kendall’s: τ = -0.3459491425192194, p-value = 3.6371028655468153e-17
Kendall’s: Significant correlation (p < 0.05).

Correlation between CDOM and EV:
Kendall’s: τ = -0.19681225535749486, p-value = 1.6473081808910434e-06
Kendall’s: Significant correlation (p < 0.05).

