In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# Load the comprehensive dataset and convert its 'Date' column from text to
# pandas datetimes in one chained expression.
df = pd.read_csv('comprehensive_ganga_water_quality_2018_2021.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# 1. Temporal Analysis
def analyze_temporal_trends():
    """Summarise how key water quality parameters change over time.

    Reads the module-level ``df``. Builds monthly means (grouped by the
    calendar month of ``Date``) for dissolved oxygen, BOD, pH and
    temperature, and prints yearly means of dissolved oxygen, BOD and pH
    rounded to two decimals.

    Returns:
        DataFrame of monthly means with the monthly period in a ``Date``
        column followed by one column per averaged parameter.
    """
    monthly_columns = ['Dissolved_Oxygen_mg/L', 'BOD_mg/L', 'pH', 'Temperature_C']
    month_of_sample = df['Date'].dt.to_period('M')
    monthly = df.groupby(month_of_sample).agg(dict.fromkeys(monthly_columns, 'mean'))
    monthly = monthly.reset_index()

    print("\n=== Temporal Trends Analysis ===")
    print("Annual Averages:")
    # Yearly means are only printed; they are not part of the return value.
    year_of_sample = df['Date'].dt.year
    yearly = df.groupby(year_of_sample).mean(numeric_only=True)
    reported = ['Dissolved_Oxygen_mg/L', 'BOD_mg/L', 'pH']
    print(yearly[reported].round(2))

    return monthly

# 2. Spatial Analysis
def analyze_spatial_patterns():
    """Compare water quality between regions.

    Reads the module-level ``df``, groups it by ``Region`` and computes the
    mean and standard deviation of dissolved oxygen, BOD, total coliform and
    lead, rounded to two decimals. The table is printed and returned.

    Returns:
        DataFrame indexed by region with two-level columns of the form
        ``(parameter, 'mean' | 'std')``.
    """
    summarised = (
        'Dissolved_Oxygen_mg/L',
        'BOD_mg/L',
        'Total_Coliform_MPN/100ml',
        'Pb_mg/L',
    )
    # Same pair of statistics for every parameter.
    recipe = {column: ['mean', 'std'] for column in summarised}
    by_region = df.groupby('Region').agg(recipe)
    by_region = by_region.round(2)

    print("\n=== Spatial Patterns Analysis ===")
    print("Regional Statistics:")
    print(by_region)

    return by_region

# 3. Seasonal Analysis
def analyze_seasonal_patterns():
    """Compare water quality between seasons.

    Reads the module-level ``df``, groups it by ``Season`` and averages
    temperature, dissolved oxygen, BOD and total coliform, rounded to two
    decimals. The table is printed and returned.

    Returns:
        DataFrame indexed by season with one mean column per parameter.
    """
    averaged = [
        'Temperature_C',
        'Dissolved_Oxygen_mg/L',
        'BOD_mg/L',
        'Total_Coliform_MPN/100ml',
    ]
    per_season = df.groupby('Season').agg({column: 'mean' for column in averaged})
    per_season = per_season.round(2)

    print("\n=== Seasonal Patterns Analysis ===")
    print("Seasonal Averages:")
    print(per_season)

    return per_season

# 4. Water Quality Index Calculation
def calculate_wqi():
    """Calculate a weighted Water Quality Index (WQI) for every row of ``df``.

    Each weighted parameter is turned into a sub-index where 100 is best,
    and the sub-indices are combined with the fixed weights below (which sum
    to 1.0):

    * pH: 100 at pH 7.5, minus 20 points per pH unit of deviation.
    * Dissolved oxygen: proportional to the reading, reaching 100 at 9 mg/L.
    * BOD, total coliform, TDS: lower is better; each is scaled against the
      largest value of that parameter found in ``df``.

    Every sub-index is clipped to the range [0, 100]. Without the clip,
    dissolved oxygen above 9 mg/L scored more than 100 and a pH further than
    5 units from 7.5 scored below 0, letting one parameter push the WQI
    outside its intended scale.

    The mean WQI per ``Region`` is printed, rounded to two decimals.

    Returns:
        Series named ``'WQI'`` sharing ``df``'s index. Rows with a missing
        value in any weighted parameter get NaN.
    """
    # Assign weights to parameters (based on their importance)
    weights = {
        'Dissolved_Oxygen_mg/L': 0.3,
        'pH': 0.2,
        'BOD_mg/L': 0.2,
        'Total_Coliform_MPN/100ml': 0.15,
        'Total_Dissolved_Solids_mg/L': 0.15
    }

    # Work on a copy so the helper columns never leak into the shared frame.
    wqi_df = df.copy()

    for param in weights:
        values = wqi_df[param]
        if param == 'pH':
            # For pH, optimal is around 7.5
            sub_index = 100 - (values - 7.5).abs() * 20
        elif param == 'Dissolved_Oxygen_mg/L':
            # Higher is better; 9 mg/L is treated as the full-score reference.
            sub_index = values / 9 * 100
        else:
            # Lower is better, relative to the worst value in the dataset.
            max_val = values.max()
            if max_val == 0:
                # Every reading is zero: score the best value instead of
                # computing 0/0 (NaN). Missing readings stay NaN.
                sub_index = values * 0 + 100.0
            else:
                sub_index = (1 - values / max_val) * 100
        # Keep every sub-index on the documented 0-100 scale.
        wqi_df[f'{param}_index'] = sub_index.clip(lower=0, upper=100)

    # Weighted sum of the bounded sub-indices.
    wqi_df['WQI'] = sum(
        wqi_df[f'{param}_index'] * weight for param, weight in weights.items()
    )

    print("\n=== Water Quality Index Analysis ===")
    print("Average WQI by Region:")
    print(wqi_df.groupby('Region')['WQI'].mean().round(2))

    return wqi_df['WQI']

# 5. Correlation Analysis
def analyze_correlations():
    """Report pairwise correlations between selected parameters.

    Reads the module-level ``df``, takes the six parameter columns listed
    below, computes their correlation matrix with ``DataFrame.corr()`` and
    rounds it to two decimals. The matrix is printed and returned.

    Returns:
        Square DataFrame whose rows and columns are the selected parameters.
    """
    selected = [
        'Dissolved_Oxygen_mg/L',
        'BOD_mg/L',
        'Temperature_C',
        'pH',
        'Total_Dissolved_Solids_mg/L',
        'Pb_mg/L',
    ]
    subset = df[selected]
    pairwise = subset.corr()
    rounded = pairwise.round(2)

    print("\n=== Parameter Correlations ===")
    print("Correlation Matrix:")
    print(rounded)

    return rounded

# 6. Compliance Analysis
def analyze_compliance(standards=None):
    """Report the share of samples in ``df`` that meet water quality limits.

    Args:
        standards: Optional mapping of column name to a ``(minimum, maximum)``
            pair. Use ``None`` for a side that has no limit. When omitted,
            the built-in limits for pH, dissolved oxygen, BOD and total
            coliform are used.

    Returns:
        Dict mapping each parameter to the percentage of rows (0-100,
        rounded to two decimals) whose value lies within its limits. Bounds
        are inclusive. Rows with a missing value fail the comparison and
        therefore count as non-compliant.

    Raises:
        ValueError: If a parameter has neither a minimum nor a maximum.
    """
    if standards is None:
        standards = {
            'pH': (6.5, 8.5),
            'Dissolved_Oxygen_mg/L': (5.0, None),  # Minimum 5.0
            'BOD_mg/L': (None, 3.0),  # Maximum 3.0
            'Total_Coliform_MPN/100ml': (None, 5000)  # Maximum 5000
        }

    compliance_results = {}
    for param, (min_val, max_val) in standards.items():
        values = df[param]
        # Test against None explicitly: a limit of 0 is a real bound and must
        # not be mistaken for "no limit" the way a truthiness check would.
        has_min = min_val is not None
        has_max = max_val is not None
        if has_min and has_max:
            meets = (values >= min_val) & (values <= max_val)
        elif has_min:
            meets = values >= min_val
        elif has_max:
            meets = values <= max_val
        else:
            raise ValueError(
                f"No minimum or maximum limit given for {param!r}"
            )
        compliance = meets.mean() * 100
        compliance_results[param] = round(compliance, 2)

    print("\n=== Compliance Analysis ===")
    print("Percentage of Samples Meeting Standards:")
    for param, compliance in compliance_results.items():
        print(f"{param}: {compliance}%")

    return compliance_results

# Run all analyses in sequence. Each call prints its own report section (so
# the call order fixes the order of the printed output) and returns a result
# object that is stored in a module-level name.
temporal_results = analyze_temporal_trends()
spatial_results = analyze_spatial_patterns()
seasonal_results = analyze_seasonal_patterns()
wqi_results = calculate_wqi()
correlation_results = analyze_correlations()
compliance_results = analyze_compliance()


=== Temporal Trends Analysis ===
Annual Averages:
      Dissolved_Oxygen_mg/L  BOD_mg/L    pH
Date                                       
2018                   7.73      4.14  7.79
2019                   7.71      4.24  7.74
2020                   7.79      4.14  7.84
2021                   7.65      4.01  7.76

=== Spatial Patterns Analysis ===
Regional Statistics:
       Dissolved_Oxygen_mg/L       BOD_mg/L       Total_Coliform_MPN/100ml  \
                        mean   std     mean   std                     mean   
Region                                                                       
Lower                   7.85  1.50     3.93  1.60                 20140.06   
Middle                  7.65  1.28     4.30  1.69                 11905.47   
Upper                   7.65  1.44     4.16  1.51                  5106.89   

                Pb_mg/L        
            std    mean   std  
Region                         
Lower   2360.72    0.15  0.02  
Middle  1394.43    0.08  0.01  
