In [2]:
import pandas as pd
from scipy.stats import kstest, norm  # Import K-S test and Normal distribution
import numpy as np
import os

# ---- EDIT EXCEL FILE PATH ----
file_path = r"D:\climate change\Tutorial_Climate Data_2025-26.xlsx"

# Open the workbook once; `xls` is reused below to enumerate the sheets.
# On failure we print a short, readable message and then re-raise instead of
# calling exit(): inside a Jupyter kernel exit() asks the kernel to shut down
# (discarding all notebook state), and in a plain script it depends on the
# `site` module having injected the name and exits with status 0 even though
# an error occurred. Re-raising stops the cell / "Run All" and keeps the
# original traceback available.
try:
    xls = pd.ExcelFile(file_path)
except FileNotFoundError:
    print(f"‚ùå ERROR: File not found at {file_path}")
    raise
except Exception as e:
    print(f"‚ùå ERROR: Could not load Excel file. {e}")
    raise
else:
    print(f"Successfully loaded {file_path}")

# Banner: states the null hypothesis and how to read the p-values that are
# printed for every variable further down.
print(
    "Running Kolmogorov-Smirnov Test for Normality",
    "=" * 80,
    "(Null Hypothesis: The data is normally distributed)",
    "p > 0.05: Cannot reject null hypothesis (likely normal)",
    "p <= 0.05: Reject null hypothesis (likely not normal)",
    "=" * 80,
    sep="\n",
)


def _ks_normality_line(col, data, alpha=0.05, min_points=5):
    """Build the one-line K-S normality report for a single column.

    Parameters
    ----------
    col : hashable
        Column label. It is converted with ``str()`` before formatting,
        because the ``:25s`` format spec raises ``ValueError`` for non-string
        labels (e.g. integer year headers read from Excel).
    data : pandas.Series
        The column's values with missing entries already dropped.
    alpha : float, default 0.05
        Significance threshold; ``p > alpha`` is reported as "Normal".
    min_points : int, default 5
        Columns with fewer observations than this are skipped.

    Returns
    -------
    str
        The text to print: a "Skipped" notice, a "Could not run test"
        notice, or the D statistic / p-value / verdict line.
    """
    label = str(col)

    if len(data) < min_points:
        return f"  ‚ö† Skipped {label:25s} (not enough data, need at least {min_points})"

    try:
        # The reference distribution is a normal with the sample's own mean
        # and sample standard deviation (ddof=1).
        data_mean = np.mean(data)
        data_std = np.std(data, ddof=1)

        if data_std == 0:
            # A constant column has no spread to compare against a normal.
            return f"  ‚ö† Skipped {label:25s} (data is constant, std dev is 0)"

        # NOTE(review): the K-S p-value assumes the reference distribution is
        # fully specified in advance. Estimating mean/std from the same sample
        # biases p upward, so a "Normal" verdict is optimistic; a Lilliefors
        # correction would be needed for calibrated p-values.
        # No sample-size cap is applied; the whole column is tested.
        D, p = kstest(data, 'norm', args=(data_mean, data_std))

        result = "‚úÖ Normal" if p > alpha else "‚ùå Not Normal"
        return f"  Variable: {label:25s} | D = {D:.4f} | p = {p:.4f} | Result: {result}"
    except Exception as e:
        return f"  ...Could not run test on {label}. Error: {e}"


# Walk every sheet of the already-open workbook and test each numeric column.
for sheet in xls.sheet_names:
    print(f"\nüìÑ Sheet: {sheet}")
    try:
        # Parse from the open ExcelFile rather than re-opening the file from
        # disk for every sheet.
        df = xls.parse(sheet_name=sheet)
    except Exception as e:
        print(f"  ...Could not read sheet '{sheet}'. Error: {e}")
        continue

    # Select only numeric columns
    num_df = df.select_dtypes(include=[np.number])

    if num_df.empty:
        print("  ...No numeric data found in this sheet.")
        continue

    # .items() yields (label, Series) pairs, so each column stays a Series
    # even if two columns happen to share a label.
    for col, series in num_df.items():
        print(_ks_normality_line(col, series.dropna()))

# `xls` is intentionally left open in case later cells reuse it; call
# xls.close() once the workbook is no longer needed.
print("\n...Done.")



Successfully loaded D:\climate change\Tutorial_Climate Data_2025-26.xlsx
Running Kolmogorov-Smirnov Test for Normality
(Null Hypothesis: The data is normally distributed)
p > 0.05: Cannot reject null hypothesis (likely normal)
p <= 0.05: Reject null hypothesis (likely not normal)

üìÑ Sheet: Rainfall
  Variable: Stattion-1                | D = 0.3108 | p = 0.0000 | Result: ‚ùå Not Normal
  Variable: Station-2                 | D = 0.2944 | p = 0.0000 | Result: ‚ùå Not Normal
  Variable: Station-3                 | D = 0.3684 | p = 0.0000 | Result: ‚ùå Not Normal
  Variable: Station-4                 | D = 0.3809 | p = 0.0000 | Result: ‚ùå Not Normal
  Variable: Station-5                 | D = 0.3221 | p = 0.0000 | Result: ‚ùå Not Normal
  Variable: Station-6                 | D = 0.3402 | p = 0.0000 | Result: ‚ùå Not Normal
  Variable: Station-7                 | D = 0.3145 | p = 0.0000 | Result: ‚ùå Not Normal
  Variable: Station-8                 | D = 0.2825 | p = 0.0000 | Result: 