In [None]:
"""
River Discharge Data Analysis
Description: This script analyzes river discharge data to compute various hydrological statistics and metrics, 
such as mean flow, coefficient of variation, flashiness index, and 7-day low flow. The data is cleaned to 
remove non-numeric values and processed to identify significant flow events. It is designed to provide insights 
into water flow characteristics over a period.
"""


In [4]:
import pandas as pd
import numpy as np
from scipy.stats import skew

def load_data(filepath):
    df = pd.read_csv(filepath, parse_dates=['datetimeUTC'])
    df.set_index('datetimeUTC', inplace=True)
    return df

def clean_data(df):
    # Convert to numeric, setting errors='coerce' will convert non-convertibles to NaN
    df['discharge_cfs'] = pd.to_numeric(df['discharge_cfs'], errors='coerce')
    return df

def descriptive_stats(df):
    desc_stats = df.describe()
    desc_stats.loc['skew'] = df['discharge_cfs'].skew()  # Calculate skewness
    return desc_stats

def coeff_of_variation(df):
    mean = df['discharge_cfs'].mean()
    std_dev = df['discharge_cfs'].std()
    return (std_dev / mean) * 100

def flashiness_index(df):
    daily_changes = df['discharge_cfs'].diff().abs()
    total_flow = df['discharge_cfs'].sum()
    return daily_changes.sum() / total_flow if total_flow != 0 else np.nan

def seven_day_low_flow(df):
    return df['discharge_cfs'].rolling(window=7).mean().min()

def median_flow(df):
    return df['discharge_cfs'].median()

def high_flow_events(df, times=3):
    median_flow = df['discharge_cfs'].median()
    return df[df['discharge_cfs'] > times * median_flow]

if __name__ == '__main__':
    filepath = "C:/Users/mgalib/Box/Finley_UH_work/Data/TN/TN_2012/Raw_Data/Raw_Data_03431700/Raw_Discharge_03431700.csv"
    data = load_data(filepath)
    data = clean_data(data)  # Ensure data is clean

    # Check and print if there are any remaining non-numeric values
    if data['discharge_cfs'].isnull().any():
        print("Non-numeric values detected and converted to NaN.")
    
    print("Descriptive Statistics:\n", descriptive_stats(data))
    print("Coefficient of Variation (%):", coeff_of_variation(data))
    print("Richards-Baker Flashiness Index:", flashiness_index(data))
    print("7-day Low Flow (cfs):", seven_day_low_flow(data))
    print("Median Daily Flow (cfs):", median_flow(data))

    high_flows = high_flow_events(data, times=3)  # Adjust 'times' as needed
    print(f"High Flow Events (Flow > 3 times the median):\n {high_flows}")


Non-numeric values detected and converted to NaN.
Descriptive Statistics:
        discharge_cfs
count   34699.000000
mean       26.386639
std        89.957117
min         1.540000
25%         6.100000
50%        11.900000
75%        25.600000
max      4390.000000
skew       26.722130
Coefficient of Variation (%): 340.9191934680508
Richards-Baker Flashiness Index: 0.04792754451149035
7-day Low Flow (cfs): 1.54
Median Daily Flow (cfs): 11.9
High Flow Events (Flow > 3 times the median):
                            discharge_cfs qualifiers
datetimeUTC                                        
2012-01-01 06:00:00+00:00           37.8          A
2012-01-01 06:15:00+00:00           37.8          A
2012-01-01 06:30:00+00:00           37.8          A
2012-01-01 06:45:00+00:00           37.8          A
2012-01-01 07:00:00+00:00           37.8          A
...                                  ...        ...
2012-12-31 19:30:00+00:00           36.0          A
2012-12-31 19:45:00+00:00           36.0  