In [None]:
#%%
## IMPORT MODULES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, MaxNLocator
import matplotlib.dates as mdates
import seaborn as sns
from calendar import monthrange

from definitions import (locations, quality_codes, completeness_codes)
from eda_functions import (get_flow_data, get_rainfall_data,
                       resample_flow_hr, resample_rf_hr,
                       quality_count,
                       plot_sample,
                       plot_histogram,
                       plot_quality, 
                       plot_seasonal,
                       rainfall_sample,
                       offset_flow)


In [None]:
#%%
## LOAD DATA

# Adelphi Weir flow record
# (note: its 'completeness' and 'qcode' fields are empty)
adelphi = get_flow_data('Adelphi-Weir')

# rainfall records for the gauges in the Irwell catchment
rainfall_data = get_rainfall_data(locations)

# aggregate both datasets to hourly resolution
adelphi_hr = resample_flow_hr(adelphi)
rainfall_hr = resample_rf_hr(rainfall_data)

# catchment-mean hourly rainfall: one column per gauge, aligned to the
# hourly index of the first gauge, then averaged across the gauges
rf_by_gauge = pd.DataFrame(index=rainfall_hr[locations[0]].index)
for gauge, gauge_hr in rainfall_hr.items():
    rf_by_gauge[gauge] = gauge_hr['value']
rf_mean_hr = rf_by_gauge[locations].mean(axis=1).to_frame(name='rf_mean')


In [None]:
#%%
## EDA for Adelphi Weir flow data 
# basic statistics for flow

print('Total datapoints: ', len(adelphi))
print('Overall mean flow: ', adelphi['value'].mean())
# Series.max() skips missing values; the builtin max() compares element by
# element and its result depends on where any NaN sits in the data
print('Overall maximum flow: ', adelphi['value'].max())

In [None]:
# %%
# plot resampled flow data
# (mean weekly / monthly / quarterly / yearly values)

sample_interval = 'Y'  # one of 'W', 'M', 'Q', 'Y'
adelphi_plot = adelphi.loc['1976':'2024']  # time window to plot

# minmax=True also plots the minimum and maximum flows for each period;
# best used together with logscale=True
plot_sample(adelphi_plot, sample_interval, minmax=True, logscale=True)


In [None]:
#%%
# histogram of flow levels, labelled on the current axes
plot_histogram(adelphi, quantile=0.99, bins=500)
plt.gca().set_xlabel('Flow (m$^3$/s)')




In [None]:
#%%
# number of data points by quality
# NOTE(review): year=2000 presumably restricts the count to that year — confirm in eda_functions.quality_count
quality_count(adelphi, year=2000)


In [None]:
#%%
# plot histogram of data quality for each year
# (no year is passed here, and the title is explicitly set to None)
plot_quality(adelphi, title=None)


In [None]:
#%%
# plot data quality broken down for a single year (2007 here; change `year` to inspect another)
plot_quality(adelphi, year=2007)




In [None]:
#%%

# seasonal plot of flow, called here with interval='monthly' and a log scale
# NOTE(review): the original note mentioned daily/weekly averages — presumably other
# accepted values of `interval`; confirm in eda_functions.plot_seasonal

plot_seasonal(adelphi, interval='monthly', log=True, cmap=None,
     linewidth=1, linewidth_mean=4)


In [None]:
#%%
## EDA for catchment rainfall data

# one data quality histogram per rainfall dataset, titled with the gauge
# name (hyphens replaced by spaces)
for gauge_name, gauge_data in rainfall_data.items():
    plot_quality(gauge_data,
                 title=gauge_name.replace('-', ' '),
                 completeness=False)




In [None]:
#%%
# regress rainfall data against flow data displaced by a number of hours

def get_corrs(location, sample_size, offsets=None):
    '''
    Find correlations between rainfall and flow offset by different numbers of hours.

    Works for an individual location or for 'all' locations. Reads the
    module-level `rainfall_hr` (hourly rainfall) and `adelphi` (flow) objects.

    Parameters
    ----------
    location : str
        Passed to `rainfall_sample` as `loc`. 'all' selects the 'rf_mean'
        column of the combined frame; any other value selects 'value'.
    sample_size : int
        Passed to `rainfall_sample` as `sample_size`.
    offsets : iterable of int, optional
        Offsets in hours; defaults to [6, 7, 8, 9, 10].

    Returns
    -------
    df_offset : DataFrame
        Result of `offset_flow`; read here through the rainfall column and
        one 'flow_{offset}h' column per offset.
    corrs : dict
        Maps each offset to the correlation between the rainfall column and
        the matching 'flow_{offset}h' column.
    rf_col : str
        Name of the rainfall column used, returned for plotting.
    '''
    # take sample of rainfall data
    rf_sample = rainfall_sample(rainfall_hr, loc=location, sample_size=sample_size)

    # identity check: `offsets == None` would compare element-wise (and then
    # fail in the `if`) when an array-like is passed
    if offsets is None:
        offsets = [6, 7, 8, 9, 10]  # offsets to use in hours

    # combine with flow data offset by the given numbers of hours
    df_offset = offset_flow(rf_sample, adelphi, offsets)

    # name of the rainfall column for this kind of location
    rf_col = 'rf_mean' if location == 'all' else 'value'

    # correlation of rainfall with flow at each offset
    corrs = {
        offset: df_offset[rf_col].corr(df_offset[f'flow_{offset}h'])
        for offset in offsets
    }
    return df_offset, corrs, rf_col

In [None]:
#%%



In [None]:
#%%

# correlation between catchment rainfall and flow as a function of the offset
offsets = list(range(21))
df_offset, corrs, rf_col = get_corrs('all', 200000, offsets=offsets)

plt.figure(figsize=(6,4))
plt.plot(offsets, [corrs[hours] for hours in offsets], '-o', color='b')

corr_ax = plt.gca()
corr_ax.spines[:].set_visible(False)
corr_ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # whole-hour ticks only
corr_ax.set_xlabel('Offset (hours)')
corr_ax.set_ylabel('Correlation of rainfall and flow')
corr_ax.set_ylim(0, 0.6)



In [None]:
#%%
# regression plot of rainfall against offset flow
def rainfall_regplot(data, offset, col, group=True, log=False):
    '''
    Regression plot of rainfall against offset flow.

    Parameters
    ----------
    data : DataFrame
        Must contain the rainfall column `col` and the column
        'flow_{offset}h'. It is not modified.
    offset : int
        Offset in hours; selects the 'flow_{offset}h' column.
    col : str
        Name of the rainfall column.
    group : bool
        If True, rainfall is floored to bins of width 0.2 and the mean flow
        of each bin is plotted; otherwise every point is plotted.
    log : bool
        If True, use a logarithmic y axis.
    '''
    plt.figure(figsize=(7,5))
    flow_col = f'flow_{offset}h'

    if group:
        # floor rainfall to the bin width to create bins for plotting;
        # assign() returns a new frame, so the caller's DataFrame does not
        # gain an 'rf_rounded' column as a side effect
        bin_width = 0.2
        binned = data.assign(rf_rounded=(data[col] // bin_width) * bin_width)
        # regression plot of binned data
        sns.regplot(data=binned, x='rf_rounded', y=flow_col,
                    x_estimator=np.mean)
    else:
        # regression plot of data without grouping
        sns.regplot(data=data, x=col, y=flow_col,
                    scatter_kws={'s':10, 'alpha':0.5})

    # rugplot of unrounded data to view rainfall distribution
    sns.rugplot(x=col, data=data, height=0.03,
                lw=10, alpha=0.5, color=sns.color_palette()[0])
    plt.gca().spines[:].set_visible(False)
    if log:
        plt.yscale('log')
    plt.xlabel('Mean catchment rainfall (mm)')
    plt.ylabel(f'Flow after {offset} hours')

# sample catchment-mean rainfall and plot it against flow 8 hours later
df_offset, corrs, rf_col = get_corrs(location='all', sample_size=10000)
rainfall_regplot(data=df_offset, offset=8, col=rf_col, group=True, log=False)



In [None]:
#%%

def get_all_corrs(sample_size, samples=7):
    '''
    Find the 8-hour-offset correlation for each location, to compare locations.

    Draws `samples` independent samples of size `sample_size` for every entry
    of the module-level `locations` list plus the combined 'all' location.

    Returns a DataFrame with one row per (location, sample) and the columns
    'location' and 'corr_8h'.
    '''
    records = []
    for location in (locations + ['all']):
        for _ in range(samples):
            # get_corrs is called with its default offsets, which include 8
            _, corrs, _ = get_corrs(location, sample_size)
            records.append({'location': location, 'corr_8h': corrs[8]})
    # build the frame once from the collected records rather than growing it
    # row by row; `columns` keeps the expected layout even with no records
    return pd.DataFrame(records, columns=['location', 'corr_8h'])


In [None]:
#%%

# correlations with flow for every rain gauge (7 samples of 10000 each)
## takes 2-3 minutes ##

df_corrs = get_all_corrs(sample_size=10000, samples=7)
# optional post-processing, currently disabled:
#df_corrs['location'] = df_corrs['location'].str.capitalize()
#df_corrs = df_corrs.sort_values(by='location')


In [None]:
#%%
# plot correlation with flow for each rain gauge

# function to print gauge names properly
def formatter_func(x, pos):
    '''
    Format a gauge name for display as a tick label.

    Hyphens become spaces, the name is split on whitespace, and the first
    two words are joined by a newline. `pos` is accepted but not used.
    '''
    words = x.replace('-', ' ').split()
    return '\n'.join(words[:2])
formatter = FuncFormatter(formatter_func)

# plot correlation between rainfall and 8-hour-offset flow for each location
# one colour for the individual gauges and a contrasting colour for the final
# 'all' category; sized from `locations` (df_corrs is built from
# locations + ['all']) instead of assuming a fixed number of gauges
base_colours = sns.color_palette()
palette = [base_colours[0]] * len(locations) + [base_colours[2]]

sns.catplot(data=df_corrs, kind='point',
            x='location', y='corr_8h',
            errorbar="ci",
            height=4.5,
            aspect=1.8,
            palette=palette,
            formatter=formatter)
plt.ylim(0, 0.6)
plt.xlabel('Rain gauge location')
plt.ylabel('Correlation with flow (8 hour offset)')
plt.gca().spines[:].set_visible(False)


# result: using all locations shows a significantly higher correlation than each individual location



In [None]:
#%%
# example histogram of rainfall values, using the Bury gauge

bury = rainfall_data['Bury']
hist_ax = bury['value'].hist(bins=40)
hist_ax.set_yscale('log')
hist_ax.spines[:].set_visible(False)
hist_ax.grid(False)
hist_ax.set_xlabel('15 minute rainfall (mm)')
hist_ax.set_ylabel('Frequency')



In [None]:
#%%

# runoff coefficient
# (proportion of total rainfall that becomes runoff)

# hourly flow side by side with hourly catchment-mean rainfall
hourly_rc = pd.concat([adelphi_hr, rf_mean_hr], axis=1)

# keep years from 2001 onwards
hourly_rc = hourly_rc[hourly_rc.index.year > 2000]
# drop the recent months where there is no flow data
last_date = pd.to_datetime('2024-03-31').date()
hourly_rc = hourly_rc[hourly_rc.index.date < last_date]
hourly_rc = hourly_rc.rename(columns={'value': 'flow'})

# quarterly aggregation: mean flow, total rainfall, extreme quality codes
quarterly_aggs = {
    'flow': 'mean',
    'rf_mean': 'sum',
    'quality_max': 'max',
    'quality_min': 'min',
}
df_rc = hourly_rc.resample('QE',  # quarter end
                           closed='right',
                           label='right').agg(quarterly_aggs)

In [None]:
#%%
# convert data to km^3:

# function to get the number of days in the quarter
def get_days_in_quarter(date):
    '''
    Number of days in the three-month period ending with the month of `date`.

    `date` is any object with integer `year` and `month` attributes. For a
    quarter-end date this is the length of that calendar quarter. Periods
    that reach back into the previous year (dates in January or February)
    are handled by wrapping the month and year.
    '''
    quarter_days = 0
    for i in range(3):
        # zero-based month index counted back i months; divmod wraps a
        # negative index into the previous year (year_shift becomes -1)
        year_shift, month_index = divmod(date.month - 1 - i, 12)
        quarter_days += monthrange(date.year + year_shift, month_index + 1)[1]
    return quarter_days
# length of each quarter, looked up from its quarter-end timestamp
df_rc['quarter_days'] = df_rc.index.to_series().map(get_days_in_quarter)

# flow volume: mean flow (m^3/s) times the seconds in the quarter, m^3 -> km^3
seconds_per_day = 24*60*60
m3_per_km3 = 1000**3
df_rc['total_flow'] = df_rc['flow']*seconds_per_day*df_rc['quarter_days']/m3_per_km3

# rainfall volume: depth (mm) times catchment area (km^2), mm -> km
catchment_area = 559.4 # Adelphi Weir catchment area in km^2
mm_per_km = 1000000
df_rc['total_rf'] = df_rc['rf_mean']*catchment_area/mm_per_km


In [None]:
#%%
# analyse runoff coefficient

# quarterly runoff coefficient: flow volume divided by rainfall volume
quarterly_rc = df_rc['total_flow'].div(df_rc['total_rf'])
df_rc['runoff_coefficient'] = quarterly_rc
max_rc = quarterly_rc.max()
min_rc = quarterly_rc.min()

def find_mean_rc(df):
    '''
    Mean runoff coefficient over all rows of `df`: the summed 'total_flow'
    divided by the summed 'total_rf'.
    '''
    flow_total = df['total_flow'].sum()
    rainfall_total = df['total_rf'].sum()
    return flow_total/rainfall_total
mean_rc = find_mean_rc(df_rc)

# seasonal values: quarters ending in March or December count as winter,
# quarters ending in June or September as summer
quarter_end_month = df_rc.index.month
df_winter = df_rc[quarter_end_month.isin([3, 12])]
df_summer = df_rc[quarter_end_month.isin([6, 9])]
winter_rc = find_mean_rc(df_winter)
summer_rc = find_mean_rc(df_summer)

print('Runoff coefficients:\n' +
       f'\toverall: {round(mean_rc,3)}\n' +
       f'\twinter: {round(winter_rc,3)}\n' +
       f'\tsummer: {round(summer_rc,3)}\n' +
       f'\tmaximum (quarterly): {round(max_rc,3)}\n' +
       f'\tminimum (quarterly): {round(min_rc,3)}')

# higher values in winter are consistent with Beven p55
# true values may be lower, because most rain gauges are in valleys rather than
# on hills (with exceptions such as Blackstone Edge No. 2), so catchment
# rainfall is probably underestimated
# a tendency to underestimate rainfall shouldn't significantly affect model performance,
# but may be an issue when extending the model to other catchments


In [None]:
#%%
# df_rc has one row per quarter, so four rows make up a year
years = len(df_rc)/4
rf_annual_mean = df_rc['rf_mean'].sum()/years
print(f'Mean annual rainfall: {round(rf_annual_mean)} mm')
# similar to Met Office mean for Rochdale (1119 mm)
# probably higher due to more rainfall at higher elevations

In [None]:
#%%
# create a stacked bar chart showing rainfall and flow
# volume common to both totals (the smaller of flow and rainfall)
df_rc['rf_flow'] = df_rc[['total_flow', 'total_rf']].min(axis=1)
# rainfall in excess of flow, and flow in excess of rainfall (each floored at zero)
df_rc['rf_lost'] = (df_rc['total_rf']-df_rc['total_flow']).clip(lower=0)
df_rc['excess_flow'] = (df_rc['total_flow']-df_rc['total_rf']).clip(lower=0)

df_plot = df_rc[['rf_flow', 'rf_lost', 'excess_flow']]
p = sns.color_palette()
colours = [p[0], p[2], p[3]]
fig, ax = plt.subplots(figsize=(12, 6))
df_plot.plot(kind='bar',
             stacked=True,
             color=colours,
             width=0.8,
             ax=ax)

# legend entries follow the column order of df_plot
plt.legend(['Effective rainfall', 'Excess rainfall', 'Excess flow'], frameon=False)

# Format x-axis tick labels to show only one label per year, placed at the
# first bar of that year. The list of bar years is built once and scanned
# once, instead of being rebuilt and searched for every year.
unique_years = df_plot.index.year.unique()
bar_years = df_plot.index.year.tolist()
first_bar_of_year = {}
for position, year in enumerate(bar_years):
    first_bar_of_year.setdefault(year, position)
tick_positions = [first_bar_of_year[year] for year in unique_years]
ax.set_xticks(tick_positions)
ax.set_xticklabels(unique_years)

plt.xlabel('Date')
plt.ylabel('Quarterly discharge (km$^3$)')
ax.spines[:].set_visible(False)

# quarters with excess flow tend to begin just after rainfall events,
# so they start with a period of high flow whose corresponding rainfall was included in the previous month
#plt.tight_layout()