# Bellingham Stormwater Monitoring Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from IPython.display import display, HTML

# Notebook display setup.
# The stylesheet below fixes each output area at 800px tall and lets it scroll
# in both directions when the content overflows.
css_style = """
<style>
    div.output_area {
        height: 800px; /* Adjust this value based on your needs */
        overflow-y: auto; /* Vertical scroll */
        overflow-x: auto; /* Horizontal scroll */
    }
</style>
"""
# Inject the stylesheet into the notebook page
display(HTML(css_style))

# Let pandas print up to 1000 rows and 1000 columns before truncating
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 1000)

## TODO
1. Review the other site data as well as Bellingham
2. Determine treatment of duplicates
3. Add an F-chart to show variability. Review the F-chart calculations in the stats book; do not use the plain standard deviation
4. Research how E. coli data is collected
5. What is the variability with time and incubation?
6. Verify that all of these tests were taken with similar methods (R-Card, lab, ...). Comments on some measurements refer to counts, which suggests R-Card


![Bacteria](e.coli-sampling-protocol.png) 

In [None]:
# Load the monitoring database export into a DataFrame
csv_path = 'Salish Sea Stormwater Monitoring Database-20240528.csv'
data = pd.read_csv(csv_path)

# Print the column labels that came in with the file
print(data.columns)

## Clean

In [None]:
# Drop every column whose label starts with 'Unnamed', then drop rows in which
# all remaining values are missing.
unnamed_mask = data.columns.str.contains('^Unnamed')
data = data.loc[:, ~unnamed_mask]
data = data.dropna(how='all')

# Display the cleaned frame
data

In [None]:
# Rows that are exact copies of another row across every column
duplicates = data[data.duplicated(keep=False)]

# Flag every row whose (Site ID, Sample Date) pair occurs more than once.
# keep=False marks all members of a repeated group, not only the later ones.
# (The flag is computed from `data` itself; the previous `cleaned_data` name
# was never defined and raised NameError.)
data['is_duplicate'] = data.duplicated(subset=['Site ID', 'Sample Date'], keep=False)

# Display the flagged rows for review
data[data['is_duplicate']]

In [None]:
# Display the current column labels of `data`
data.columns

In [None]:
# Parse the sample dates. Values that do not match mm-dd-yy become NaT
# (errors='coerce') instead of raising.
data['Sample Date'] = pd.to_datetime(data['Sample Date'], errors='coerce', format='%m-%d-%y')

# Build one frame per indicator. Explicit copies are taken so that the 'Month'
# column added below is written to an independent frame rather than to a slice
# of `data` (which triggers SettingWithCopyWarning).
ecoli = data[['Sample Date', 'Site ID', 'E. Coli', 'Comments', 'is_duplicate']].copy()
# Month period of each sample, used as the grouping key for monthly aggregation
ecoli['Month'] = ecoli['Sample Date'].dt.to_period('M')

enterococcus = data[['Sample Date', 'Site ID', 'Enterococcus', 'Comments', 'is_duplicate']].copy()
# Month period of each sample, used as the grouping key for monthly aggregation
enterococcus['Month'] = enterococcus['Sample Date'].dt.to_period('M')

# Sort both frames chronologically so line plots connect points in date order
ecoli = ecoli.sort_values('Sample Date')
enterococcus = enterococcus.sort_values('Sample Date')

In [None]:
# NOTE(review): this sets a plain attribute named `site_id` on the DataFrame
# object. It does not filter rows or modify the 'Site ID' column, and no
# visible cell reads `ecoli.site_id`. If the intent was to select the
# Bennett Ave rows, a boolean filter on 'Site ID' is probably what was
# meant — confirm before relying on this cell.
ecoli.site_id = 'Bennett Ave'

In [None]:
# Display the Enterococcus frame (sorted by sample date)
enterococcus

In [None]:
# Raw E. coli readings over time, one line per monitoring site
fig, ax = plt.subplots(figsize=(16, 8))

raw_line_style = dict(marker='o', linestyle='-')
for site_id, group in ecoli.groupby('Site ID'):
    ax.plot(group['Sample Date'], group['E. Coli'], label=f'Site {site_id}', **raw_line_style)

# Horizontal reference line at 310 cfu/100ml
ax.axhline(y=310, color='red', linestyle='--', label='Threshold (310 cfu/100ml)')

ax.set(
    title='Raw E. coli Readings by Site Over Time',
    xlabel='Sample Date',
    ylabel='E. Coli (cfu/100ml)',
)
ax.legend(title='Site ID')

# Tilt the date labels so they do not overlap
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()



The Bennett Ave data has many outliers.

In [None]:
# Two stacked panels sharing the x-axis: raw readings on top (ax) and the
# spread of the readings per sample date underneath (ax2).
fig, (ax, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(16, 16), sharex=True)

# Top panel: raw E. coli readings, one line per site
for site_id, group in ecoli.groupby('Site ID'):
    ax.plot(group['Sample Date'], group['E. Coli'], marker='o', linestyle='-', label=f'Site {site_id}')

ax.set_title('Raw E. coli Readings by Site Over Time')
ax.set_ylabel('E. Coli (cfu/100ml)')
ax.axhline(y=310, color='red', linestyle='--', label='Threshold (310 cfu/100ml)')
ax.legend(title='Site ID')
plt.xticks(rotation=45)  # Rotate x-axis labels for better visibility

# Bottom panel: standard deviation of all readings that share a sample date,
# pooled across sites. Dates with a single reading yield NaN and show as gaps.
# NOTE: this is a plain standard deviation, not yet the F-chart from the TODO.
std_dev_by_date = ecoli.groupby('Sample Date')['E. Coli'].std()
ax2.plot(
    std_dev_by_date.index,
    std_dev_by_date,
    marker='o',
    linestyle='-',
    color='green',
    label='All sites',  # labelled so the legend below has an entry to show
)

# Alternative kept for reference: standard deviation per site and date

#for site_id, group in ecoli.groupby('Site ID'):
#    std_dev = group.groupby('Sample Date')['E. Coli'].std()
#    ax2.plot(std_dev.index, std_dev, marker='o', linestyle='-', label=f'Site {site_id}')

# The title describes what is plotted: one pooled value per sample date
ax2.set_title('Standard Deviation of E. Coli Readings by Sample Date')
ax2.set_xlabel('Sample Date')
ax2.set_ylabel('Standard Deviation (cfu/100ml)')
ax2.legend()

plt.tight_layout()
plt.show()

In [None]:
# Display only the E. coli rows whose 'Site ID' is 'Bennett Ave'
ecoli[ecoli['Site ID'] == 'Bennett Ave']

In [None]:
# Monthly aggregates for plotting: rows are months, columns are sites
ecoli_by_month_site = ecoli.groupby(['Month', 'Site ID'])['E. Coli']
ecoli_aggregated_mean = ecoli_by_month_site.mean().unstack()
ecoli_aggregated_std = ecoli_by_month_site.std().unstack()

enterococcus_by_month_site = enterococcus.groupby(['Month', 'Site ID'])['Enterococcus']
enterococcus_aggregated = enterococcus_by_month_site.mean().unstack()

In [None]:
# Display the monthly mean E. coli table (months as rows, sites as columns)
ecoli_aggregated_mean
#ecoli_aggregated[ecoli_aggregated['Site ID'] == 'Bennett Ave']

In [None]:
# Threshold values (cfu/100ml) drawn as reference lines on the charts and used
# to count exceedances.
ecoli_threshold, enterococcus_threshold = 310, 500

In [None]:
# Monthly mean (top) and monthly standard deviation (bottom) of E. coli per site
fig, ax = plt.subplots(2, 1, figsize=(16, 12), sharex=True)
mean_ax, std_ax = ax

# Cap the monthly means at 5000 so extreme months do not flatten the rest of the chart
clamped_mean = ecoli_aggregated_mean.clip(upper=5000)
clamped_mean.plot(
    ax=mean_ax,
    marker='o',
    linestyle='-',
    title='E. Coli Readings by Site Over Time with Clamped Values',
)
mean_ax.set_ylabel('E. Coli (cfu/100ml)')
mean_ax.axhline(y=ecoli_threshold, color='red', linestyle='--', label='E. Coli Threshold (cfu/100ml)')

ecoli_aggregated_std.plot(ax=std_ax, marker='o', linestyle='-', title='Std Dev of E. Coli Readings')

In [None]:
# Monthly means per site: E. coli on top, Enterococcus below, each with its threshold line
fig, ax = plt.subplots(2, 1, figsize=(16, 12), sharex=True)
ecoli_ax, enterococcus_ax = ax
monthly_line_style = dict(marker='o', linestyle='-')

# E. coli panel, with monthly means capped at 5000
ecoli_aggregated_mean.clip(upper=5000).plot(
    ax=ecoli_ax,
    title='E. Coli Readings by Site Over Time with Clamped Values',
    **monthly_line_style,
)
ecoli_ax.set_ylabel('E. Coli (cfu/100ml)')
ecoli_ax.axhline(y=ecoli_threshold, color='red', linestyle='--', label='E. Coli Threshold (cfu/100ml)')

# Enterococcus panel (values not capped)
enterococcus_aggregated.plot(
    ax=enterococcus_ax,
    title='Enterococcus Readings by Site Over Time',
    **monthly_line_style,
)
enterococcus_ax.set_ylabel('Enterococcus (cfu/100ml)')
enterococcus_ax.axhline(y=enterococcus_threshold, color='red', linestyle='--', label='Enterococcus Threshold (cfu/100ml)')

plt.tight_layout()
plt.show()

In [None]:
# Same two-panel chart as the full monthly view, restricted to January 2023 onward.
recent_start = '2023-01'

# Filter the monthly-mean tables by their month index. The E. coli subset gets
# its own name so the full `ecoli_aggregated_mean` table stays intact for other
# cells and for re-running this one. (The previous code read an undefined
# `ecoli_aggregated` and raised NameError.)
ecoli_recent_mean = ecoli_aggregated_mean[ecoli_aggregated_mean.index >= recent_start]
enterococcus_aggregated_mean = enterococcus_aggregated[enterococcus_aggregated.index >= recent_start]

fig, ax = plt.subplots(2, 1, figsize=(16, 12), sharex=True)

# E. coli panel, with monthly means capped at 5000
ecoli_recent_mean.clip(upper=5000).plot(ax=ax[0], marker='o', linestyle='-', title='E. Coli Readings by Site Over Time')
ax[0].set_ylabel('E. Coli (cfu/100ml)')
ax[0].axhline(y=ecoli_threshold, color='red', linestyle='--', label='E. Coli Threshold (cfu/100ml)')

# Enterococcus panel: plot the filtered table so both panels cover the same period
enterococcus_aggregated_mean.plot(ax=ax[1], marker='o', linestyle='-', title='Enterococcus Readings by Site Over Time')
ax[1].set_ylabel('Enterococcus (cfu/100ml)')
ax[1].axhline(y=enterococcus_threshold, color='red', linestyle='--', label='Enterococcus Threshold (cfu/100ml)')

plt.tight_layout()
plt.show()

In [None]:
# Overlaid histograms of the monthly mean E. coli values, one per site.
# Reads `ecoli_aggregated_mean` (months x sites); the previous code referenced
# an undefined `ecoli_aggregated` and raised NameError.
fig, ax = plt.subplots(figsize=(12, 8))

# One evenly spaced viridis colour per site column
colors = plt.cm.viridis(np.linspace(0, 1, len(ecoli_aggregated_mean.columns)))

for (site, values), color in zip(ecoli_aggregated_mean.items(), colors):
    # Months without data for a site are NaN after unstack; drop them before binning
    ax.hist(values.dropna(), bins=20, color=color, alpha=0.6, edgecolor='black', label=site)

# Build the label from the constant so the line and its caption cannot drift apart
ax.axvline(x=ecoli_threshold, color='red', linestyle='--', label=f'Threshold ({ecoli_threshold} cfu/100ml)')
ax.set_title('E. Coli Distribution Across Sites')
ax.set_xlabel('E. Coli (cfu/100ml)')
ax.set_ylabel('Frequency')
ax.legend(title='Site ID')
plt.show()

In [None]:
# Share of E. coli readings above the threshold, per year and site.

# Readings above the threshold. .copy() makes this an independent frame so that
# adding 'Year' does not write to a slice of `ecoli` (SettingWithCopyWarning).
ecoli_exceedances = ecoli[ecoli['E. Coli'] > ecoli_threshold].copy()
ecoli_exceedances['Year'] = ecoli_exceedances['Sample Date'].dt.year
exceedance_counts = (
    ecoli_exceedances.groupby(['Year', 'Site ID'])
    .size()
    .reset_index(name='Exceedance Count')
)

# Denominator: readings that actually have an E. coli value. count() skips
# missing values, whereas size() would also count rows with no E. coli
# measurement and understate the exceedance percentage.
sample_year = ecoli['Sample Date'].dt.year.rename('Year')
total_counts = (
    ecoli.groupby([sample_year, 'Site ID'])['E. Coli']
    .count()
    .reset_index(name='Total Readings')
)
# Drop year/site pairs with no E. coli readings to avoid dividing by zero
total_counts = total_counts[total_counts['Total Readings'] > 0]

# Left merge keeps every year/site pair that has readings; pairs without any
# exceedance have no match and get a count of 0.
exceedance_data = pd.merge(total_counts, exceedance_counts, on=['Year', 'Site ID'], how='left')
exceedance_data['Exceedance Count'] = exceedance_data['Exceedance Count'].fillna(0).astype(int)
exceedance_data['Exceedance Percentage'] = (exceedance_data['Exceedance Count'] / exceedance_data['Total Readings']) * 100

print(exceedance_data)