In [None]:
# One-off helper to regenerate requirements.txt; kept commented out, run manually when needed.
# %pip install pipreqs
# !pipreqs . --force

In [None]:
## Load and Inspect Data ##

import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from datetime import datetime

In [None]:
# Load daily weather observations and parse the ISO-formatted date column.
weather = pd.read_csv('../data/toronto_weather_daily_2007_2025.csv')
weather['date'] = pd.to_datetime(weather['date'], format='%Y-%m-%d')

# Load beach water quality samples (GeoJSON) and parse the collection dates.
beaches = gpd.read_file('../data/toronto-beaches-water-quality - 4326.geojson')
beaches['collectionDate'] = pd.to_datetime(beaches['collectionDate'])

# Initial inspection.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# has to be called on its own line: wrapping it in print() emits the report
# first and then the label followed by the literal text "None".
print("Beaches_Info")
beaches.info()
print("Beaches_Desc\n", beaches.describe())
print("Weather_Info")
weather.info()
print("Weather_Desc\n", weather.describe())


2.4.2
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 108284 entries, 0 to 108283
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   _id             108284 non-null  int32         
 1   beachId         108284 non-null  int32         
 2   beachName       108284 non-null  str           
 3   siteName        108284 non-null  str           
 4   collectionDate  108284 non-null  datetime64[ms]
 5   eColi           98955 non-null   float64       
 6   comments        108284 non-null  str           
 7   geometry        108284 non-null  geometry      
dtypes: datetime64[ms](1), float64(1), geometry(1), int32(2), str(3)
memory usage: 5.8 MB
Beaches_Info
 None
Beaches_Info
                  _id        beachId              collectionDate         eColi
count  108284.000000  108284.000000                      108284  9.895500e+04
mean    54142.500000       5.599885  2016-11-21 19:11:51.861000  1.

In [20]:
# Data Quality Assessment
# Per-column count of missing values in each dataset.
weather_nulls = weather.isnull().sum()
beach_nulls = beaches.isnull().sum()
print("Weather missing values: \n", weather_nulls)
print("\nBeach missing values: \n", beach_nulls)

# Percentage of beach samples that have no E. coli reading.
ecoli_null_pct = beaches['eColi'].isnull().sum() / len(beaches) * 100
print("\nE. coli null percentage: ", ecoli_null_pct)

# First and last date covered by each dataset.
weather_start, weather_end = weather['date'].min(), weather['date'].max()
beach_start, beach_end = beaches['collectionDate'].min(), beaches['collectionDate'].max()
print("\nWeather date range: ", weather_start, "to", weather_end)
print("Beach date range: ", beach_start, "to", beach_end)

Weather missing values: 
 date         0
temp_mean    0
precip       0
year         0
month        0
dtype: int64

Beach missing values: 
 _id                  0
beachId              0
beachName            0
siteName             0
collectionDate       0
eColi             9329
comments             0
geometry             0
dtype: int64

E. coli null percentage:  8.61530789405637

Weather date range:  2007-01-01 00:00:00 to 2025-09-30 00:00:00
Beach date range:  2007-06-03 00:00:00 to 2025-09-08 00:00:00


In [21]:
# Canadian Recreational Water Quality Guidelines (CRWQG): 200 E. coli per 100 mL of water is the threshold for safe recreational water.
# The CRWQG also provides a "low risk" threshold of 100 E. coli per 100 mL of water, which indicates a lower risk of illness for swimmers.
# NOTE(review): both values come from the notebook author's summary above; confirm them against the current guideline text.
SAFE_THRESHOLD = 200
LOW_RISK_THRESHOLD = 100

# Create binary safety indicators based on the thresholds.
# Comparisons against NaN evaluate to False in both directions, so samples
# without an E. coli reading are flagged neither safe nor unsafe on `beaches`.
beaches['is_unsafe'] = beaches['eColi'] > SAFE_THRESHOLD
beaches['is_safe'] = beaches['eColi'] <= SAFE_THRESHOLD

# Remove null E. coli values for analysis.
# The explicit copy makes `cleaned_beaches` an independent frame, so later
# cells can add columns to it without SettingWithCopyWarning on pandas < 3.
cleaned_beaches = beaches.dropna(subset=['eColi']).copy()

In [25]:
# Merge weather and beach data on date.
# Rebind via .assign() rather than writing a column in place: `cleaned_beaches`
# is derived from `beaches`, and in-place column assignment on a derived frame
# raises SettingWithCopyWarning on pandas < 3. Rebinding also keeps the cell
# safe to re-run.
cleaned_beaches = cleaned_beaches.assign(date=cleaned_beaches['collectionDate'])
merged_data = cleaned_beaches.merge(
    weather,
    on='date',
    how='left',
    # Each sample must match at most one weather row; fail loudly if the
    # weather table ever contains duplicate dates instead of silently
    # duplicating beach samples.
    validate='many_to_one',
)

# Check merged data.
# DataFrame.info() prints its own report and returns None, so it is called on
# its own line instead of being passed to print().
print("\nMerged data info:")
merged_data.info()
print("Merge success rate: ", (1-merged_data['temp_mean'].isnull().sum() / len(merged_data)) * 100, "%")

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 98955 entries, 0 to 98954
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   _id             98955 non-null  int32         
 1   beachId         98955 non-null  int32         
 2   beachName       98955 non-null  str           
 3   siteName        98955 non-null  str           
 4   collectionDate  98955 non-null  datetime64[ms]
 5   eColi           98955 non-null  float64       
 6   comments        98955 non-null  str           
 7   geometry        98955 non-null  geometry      
 8   is_unsafe       98955 non-null  bool          
 9   is_safe         98955 non-null  bool          
 10  date            98955 non-null  datetime64[ms]
 11  temp_mean       98955 non-null  float64       
 12  precip          98955 non-null  float64       
 13  year            98955 non-null  int64         
 14  month           98955 non-null  int64         

In [None]:
# Create weather-based features
# Lagged weather features (bacteria can persist/grow after rain)
def add_lagged_weather(df, weather_df, lag_days=(1, 2, 3, 7)):
    """Attach weather readings from earlier days to each row of ``df``.

    For every ``lag`` in ``lag_days`` the weather table's ``date`` is shifted
    forward by ``lag`` days and left-joined onto ``df`` by ``date``, so a row
    dated D receives the weather recorded on D - lag as ``temp_lag{lag}`` and
    ``precip_lag{lag}``.

    Parameters
    ----------
    df : DataFrame
        Observations with a datetime ``date`` column. Not modified.
    weather_df : DataFrame
        Weather table with ``date``, ``temp_mean`` and ``precip`` columns.
        Not modified.
    lag_days : iterable of int, default (1, 2, 3, 7)
        Lags, in days, to add.

    Returns
    -------
    DataFrame
        Copy of ``df`` with two added columns per lag. Rows with no weather
        record for the lagged date get NaN in those columns.
    """
    result = df.copy()
    for lag in lag_days:
        temp_col = f'temp_lag{lag}'
        precip_col = f'precip_lag{lag}'
        lagged = (
            weather_df[['date', 'temp_mean', 'precip']]
            .assign(date=weather_df['date'] + pd.Timedelta(days=lag))
            .rename(columns={'temp_mean': temp_col, 'precip': precip_col})
        )
        # Drop any copy left by a previous run; merging on top of existing lag
        # columns would otherwise produce `_x` / `_y` suffixed duplicates.
        result = result.drop(columns=[temp_col, precip_col], errors='ignore')
        result = result.merge(lagged, on='date', how='left')
    return result

# Lags 1-6 are required by the 7-day rainfall window below; lag 7 is kept
# because the function's default lag set already provided it.
LAG_DAYS = list(range(1, 8))

# Strip lag columns left by an earlier run of this cell so that re-running it
# does not merge on top of them and create `_x` / `_y` suffixed duplicates.
stale_lag_cols = [
    col for col in merged_data.columns
    if col.startswith(('temp_lag', 'precip_lag'))
]
merged_data = add_lagged_weather(
    merged_data.drop(columns=stale_lag_cols), weather, lag_days=LAG_DAYS
)

# Cumulative precipitation (2-day, 3-day, 7-day); each window includes the
# sampling day itself.
merged_data['precip_2day'] = merged_data['precip'] + merged_data['precip_lag1']
# 3-day total = 2-day total + rain from two days before (not the 2-day total twice).
merged_data['precip_3day'] = merged_data['precip_2day'] + merged_data['precip_lag2']
# skipna=False keeps this consistent with the `+` based windows above: an
# incomplete window yields NaN rather than an undercounted total.
merged_data['precip_7day'] = merged_data[
    ['precip'] + [f'precip_lag{i}' for i in range(1, 7)]
].sum(axis=1, skipna=False)

# Temporal features, derived from the sample's collection date.
merged_data['month'] = merged_data['collectionDate'].dt.month
merged_data['day_of_year'] = merged_data['collectionDate'].dt.dayofyear
merged_data['year'] = merged_data['collectionDate'].dt.year
merged_data['is_summer'] = merged_data['month'].isin([6, 7, 8])
