In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [4]:

def create_comprehensive_dataset(
    seed=None,
    output_path='comprehensive_ganga_water_quality_2018_2021.csv',
):
    """
    Build a randomly generated monthly water quality dataset for six stations.

    One record is produced per (month-end date, station) pair for 2018-2021.
    Every measurement is drawn from a uniform distribution; the ranges and
    baselines are attributed by the inline comments to:
    1. Dwivedi et al. (2018) - General water quality parameters
    2. Singh et al. (2021) - Heavy metals data
    3. Kumar et al. (2019) - Microbiological parameters
    4. Tripathi et al. (2020) - Seasonal variations

    Parameters
    ----------
    seed : int or None, optional
        When given, all random draws come from a private
        ``np.random.RandomState(seed)`` so repeated calls return identical
        data. When None (default) the global ``np.random`` state is used,
        which is the original behaviour.
    output_path : str or None, optional
        Destination of the CSV export (written without the index). Defaults
        to the original hard-coded filename; pass None to skip writing.

    Returns
    -------
    pandas.DataFrame
        The generated records sorted by ``Date`` and ``Station``, including a
        randomly assigned ``Data_Quality`` label.
    """
    # With seed=None this is the numpy.random module itself, so callers that
    # seed the global generator beforehand keep getting the same stream.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Define monitoring stations with their coordinates
    stations = {
        'Rishikesh': {'lat': 30.0869, 'lon': 78.2676, 'region': 'Upper'},
        'Haridwar': {'lat': 29.9457, 'lon': 78.1642, 'region': 'Upper'},
        'Kanpur': {'lat': 26.4499, 'lon': 80.3319, 'region': 'Middle'},
        'Allahabad': {'lat': 25.4358, 'lon': 81.8463, 'region': 'Middle'},
        'Varanasi': {'lat': 25.3176, 'lon': 82.9739, 'region': 'Lower'},
        'Patna': {'lat': 25.5941, 'lon': 85.1376, 'region': 'Lower'}
    }

    # Month-end dates covering multiple seasons (2018-2021). An explicit
    # MonthEnd offset is used instead of the 'M' alias, which newer pandas
    # releases deprecate with a FutureWarning.
    dates = pd.date_range(
        start='2018-01-01', end='2021-12-31', freq=pd.offsets.MonthEnd()
    )

    # Season definitions (calendar months), per Tripathi et al. (2020)
    seasons = {
        'Winter': [12, 1, 2],
        'Summer': [3, 4, 5],
        'Monsoon': [6, 7, 8],
        'Post-Monsoon': [9, 10, 11]
    }
    # Inverted once so the per-date lookup is a plain dict access.
    month_to_season = {
        month: season
        for season, months in seasons.items()
        for month in months
    }

    # Multipliers applied to the randomly drawn base value; parameters that
    # are not listed for a season are left unchanged (factor 1.0).
    seasonal_factors = {
        'Winter': {'DO': 1.2, 'BOD': 0.8, 'Temperature': 0.7},
        'Summer': {'DO': 0.8, 'BOD': 1.2, 'Temperature': 1.3},
        'Monsoon': {'DO': 0.9, 'BOD': 1.4, 'Temperature': 1.0},
        'Post-Monsoon': {'DO': 1.1, 'BOD': 0.9, 'Temperature': 0.9}
    }

    def get_season(date):
        """Return the season name for the month of ``date``."""
        return month_to_season[date.month]

    def apply_seasonal_variation(base_value, season, parameter):
        """Scale ``base_value`` by the seasonal factor for ``parameter``."""
        factor = seasonal_factors[season].get(parameter, 1.0)
        return base_value * factor

    # Heavy metals data from Singh et al. (2021)
    heavy_metals_baseline = {
        'Upper': {'Pb': 0.03, 'Cd': 0.002, 'Cr': 0.04, 'Cu': 0.05},
        'Middle': {'Pb': 0.08, 'Cd': 0.004, 'Cr': 0.09, 'Cu': 0.12},
        'Lower': {'Pb': 0.15, 'Cd': 0.007, 'Cr': 0.14, 'Cu': 0.18}
    }

    # Microbiological parameters from Kumar et al. (2019)
    microbial_baseline = {
        'Upper': {'Total_Coliform': 5000, 'Fecal_Coliform': 2000},
        'Middle': {'Total_Coliform': 12000, 'Fecal_Coliform': 5000},
        'Lower': {'Total_Coliform': 20000, 'Fecal_Coliform': 8000}
    }

    # Collected as a list of dicts and converted to a DataFrame once.
    data_records = []

    for date in dates:
        season = get_season(date)
        for station, location in stations.items():
            region = location['region']

            # Identifying fields of the record
            base_record = {
                'Date': date,
                'Station': station,
                'Latitude': location['lat'],
                'Longitude': location['lon'],
                'Region': region,
                'Season': season
            }

            # General water quality parameters (Dwivedi et al., 2018); DO,
            # BOD and temperature are additionally scaled by season.
            # NOTE: the draw order below is kept stable so that seeded runs
            # stay reproducible.
            base_record.update({
                'pH': rng.uniform(7.0, 8.5),
                'Dissolved_Oxygen_mg/L': apply_seasonal_variation(
                    rng.uniform(6.5, 9.0), season, 'DO'
                ),
                'BOD_mg/L': apply_seasonal_variation(
                    rng.uniform(2.0, 6.0), season, 'BOD'
                ),
                'Temperature_C': apply_seasonal_variation(
                    rng.uniform(20, 28), season, 'Temperature'
                ),
                'Total_Dissolved_Solids_mg/L': rng.uniform(200, 450),
                'Conductivity_µS/cm': rng.uniform(300, 700)
            })

            # Heavy metals: uniform within +/-20% of the regional baseline
            for metal, value in heavy_metals_baseline[region].items():
                base_record[f'{metal}_mg/L'] = rng.uniform(
                    value * 0.8, value * 1.2
                )

            # Microbiological parameters: same +/-20% band around baseline
            for param, value in microbial_baseline[region].items():
                base_record[f'{param}_MPN/100ml'] = rng.uniform(
                    value * 0.8, value * 1.2
                )

            data_records.append(base_record)

    # Create DataFrame
    df = pd.DataFrame(data_records)

    # Add data quality indicators (70% High, 20% Medium, 10% Low)
    df['Data_Quality'] = rng.choice(
        ['High', 'Medium', 'Low'],
        size=len(df),
        p=[0.7, 0.2, 0.1]
    )

    # Sort by Date and Station
    df = df.sort_values(['Date', 'Station'])

    # Save to CSV unless the caller opted out
    if output_path is not None:
        df.to_csv(output_path, index=False)

    return df

# Build the dataset (this call also writes the CSV export)
df = create_comprehensive_dataset()

# Mean and standard deviation of selected parameters for every
# (Region, Season) combination, rounded to three decimals
summary = (
    df.groupby(['Region', 'Season'])
    .agg({
        column: ['mean', 'std']
        for column in (
            'pH',
            'Dissolved_Oxygen_mg/L',
            'BOD_mg/L',
            'Temperature_C',
            'Pb_mg/L',
            'Total_Coliform_MPN/100ml',
        )
    })
    .round(3)
)

# Header line followed by the plain-text rendering of the summary table
print("\nData Summary by Region and Season:", summary, sep="\n")


Data Summary by Region and Season:
                        pH        Dissolved_Oxygen_mg/L        BOD_mg/L  \
                      mean    std                  mean    std     mean   
Region Season                                                             
Lower  Monsoon       7.743  0.511                 6.998  0.639    5.290   
       Post-Monsoon  7.846  0.497                 8.470  0.761    3.029   
       Summer        7.932  0.482                 6.264  0.613    4.347   
       Winter        7.699  0.475                 9.654  0.854    3.039   
Middle Monsoon       7.678  0.450                 7.092  0.707    5.658   
       Post-Monsoon  7.961  0.435                 8.128  0.825    3.247   
       Summer        7.638  0.434                 6.321  0.567    4.798   
       Winter        7.758  0.417                 9.062  0.874    3.516   
Upper  Monsoon       7.722  0.390                 6.797  0.631    5.286   
       Post-Monsoon  7.814  0.418                 8.440  0.970  

  dates = pd.date_range(start='2018-01-01', end='2021-12-31', freq='M')


In [6]:
# Seasonal patterns: mean and standard deviation of dissolved oxygen per season
seasonal_analysis = (
    df.groupby('Season')['Dissolved_Oxygen_mg/L']
    .agg(['mean', 'std'])
)
seasonal_analysis

Unnamed: 0_level_0,mean,std
Season,Unnamed: 1_level_1,Unnamed: 2_level_1
Monsoon,6.962778,0.662326
Post-Monsoon,8.345733,0.858754
Summer,6.257571,0.584938
Winter,9.303645,0.887296


In [7]:
# Spatial variation: mean and standard deviation of BOD per river region
spatial_analysis = (
    df.groupby('Region')['BOD_mg/L']
    .agg(['mean', 'std'])
)
spatial_analysis

Unnamed: 0_level_0,mean,std
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Lower,3.926195,1.595306
Middle,4.304718,1.687628
Upper,4.164394,1.511118


In [8]:
# Parameter correlations: pairwise correlation across all numeric columns
# (non-numeric columns are excluded before calling corr)
correlation_matrix = (
    df.select_dtypes(include=[np.number])
    .corr()
)
correlation_matrix

Unnamed: 0,Latitude,Longitude,pH,Dissolved_Oxygen_mg/L,BOD_mg/L,Temperature_C,Total_Dissolved_Solids_mg/L,Conductivity_µS/cm,Pb_mg/L,Cd_mg/L,Cr_mg/L,Cu_mg/L,Total_Coliform_MPN/100ml,Fecal_Coliform_MPN/100ml
Latitude,1.0,-0.86888,-0.00728,-0.030523,0.031043,-0.002336,0.018354,-0.002419,-0.842692,-0.824638,-0.863249,-0.88116,-0.846839,-0.869139
Longitude,-0.86888,1.0,0.031868,0.05655,-0.050866,-0.030793,-0.05822,0.0133,0.929218,0.916242,0.912672,0.926476,0.91748,0.924835
pH,-0.00728,0.031868,1.0,-0.036987,-0.084878,0.026155,0.020967,-0.056026,0.013978,0.024047,0.019041,0.031579,0.025901,0.027686
Dissolved_Oxygen_mg/L,-0.030523,0.05655,-0.036987,1.0,-0.411672,-0.767646,-0.057103,-0.03538,0.066406,0.054255,0.064008,0.067169,0.062479,0.064385
BOD_mg/L,0.031043,-0.050866,-0.084878,-0.411672,1.0,0.366481,-0.061725,0.06812,-0.067777,-0.074103,-0.042943,-0.072943,-0.066765,-0.045265
Temperature_C,-0.002336,-0.030793,0.026155,-0.767646,0.366481,1.0,0.041791,-0.014341,-0.052935,-0.046277,-0.037311,-0.060689,-0.040245,-0.046158
Total_Dissolved_Solids_mg/L,0.018354,-0.05822,0.020967,-0.057103,-0.061725,0.041791,1.0,0.041892,-0.047248,-0.055634,-0.039082,-0.062825,-0.069381,-0.071421
Conductivity_µS/cm,-0.002419,0.0133,-0.056026,-0.03538,0.06812,-0.014341,0.041892,1.0,0.042367,0.034033,0.027815,0.009089,0.031909,0.019805
Pb_mg/L,-0.842692,0.929218,0.013978,0.066406,-0.067777,-0.052935,-0.047248,0.042367,1.0,0.945842,0.939144,0.930216,0.947569,0.94169
Cd_mg/L,-0.824638,0.916242,0.024047,0.054255,-0.074103,-0.046277,-0.055634,0.034033,0.945842,1.0,0.933759,0.92961,0.938919,0.921168
