**Models**:
- Binary Classifier (BC)
- Random Forest (RF)
- Neural Network (NN)

# Load Packages

In [None]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, losses

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from functools import reduce
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Load Datasets and Basic Data Cleaning

## BC & RF: features dataset

In [21]:
# Load the per-site summary features, then discard rows in which every column
# from the third one onward is missing (the first two columns are left out of the check).
# NOTE(review): '~data/...' is not the usual '~/data/...' spelling - confirm the path is intended.
df_feature = (
    pd.read_csv('~data/feature_extraction.csv')
    .pipe(lambda raw: raw.dropna(subset=raw.columns[2:], how='all'))
)
df_feature

Unnamed: 0,time,site,TUR_1x1_median,SPM_1x1_median,CHL_1x1_median,TUR_1x1_mean,SPM_1x1_mean,CHL_1x1_mean,TUR_1x1_q1,SPM_1x1_q1,...,CHL_11x11_median,TUR_11x11_mean,SPM_11x11_mean,CHL_11x11_mean,TUR_11x11_q1,SPM_11x11_q1,CHL_11x11_q1,TUR_11x11_q3,SPM_11x11_q3,CHL_11x11_q3
0,2022-03-04,Anderby,,,,,,,,,...,9.174551,163.135918,152.993287,9.480745,126.656470,115.165835,8.739565,184.262455,175.823985,10.048879
1,2022-03-04,Bexhill,,,,,,,,,...,9.088710,68.446416,59.178691,9.029485,62.619209,52.394663,7.873103,72.854860,63.530318,10.056020
2,2022-03-04,Birling Gap,,,,,,,,,...,10.805366,93.880434,87.483492,11.032810,70.399693,59.039690,9.145264,91.852595,100.279610,12.380630
3,2022-03-04,"Botany Bay, Broadstairs",,,,,,,,,...,6.761849,83.942460,70.130099,8.039195,68.796310,53.251103,5.826829,101.135487,94.071441,8.775965
4,2022-03-04,Brightlingsea,62.57270,42.77478,10.132153,62.57270,42.77478,10.132153,62.57270,42.77478,...,8.110302,68.449360,52.265659,8.602104,62.192470,44.807648,7.444274,76.858376,60.001347,9.831872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28475,2022-10-31,Whitby,96.45255,67.53166,6.800380,96.45255,67.53166,6.800380,96.45255,67.53166,...,5.693440,33.249046,30.364642,5.453727,10.124196,6.075947,5.170083,17.858250,11.138832,6.066630
28476,2022-10-31,Whitley Bay,,,,,,,,,...,4.309310,44.649534,46.954763,5.177130,7.078911,4.246786,3.856401,10.785609,6.658894,6.085422
28477,2022-10-31,Widemouth Sand,,,,,,,,,...,2.953294,9.439017,6.185049,3.555955,1.677360,0.976173,2.053952,9.585481,5.794549,4.352078
28478,2022-10-31,Wilsthorpe,,,,,,,,,...,4.383949,140.258110,127.778108,4.487975,58.708913,42.690909,3.901447,175.019072,162.879985,4.956142


## NN: sites_data_11x11.csv

In [16]:
# Read the satellite values: one row per time, coordinate (lat/lon) and site.
sites_data = pd.read_csv(filepath_or_buffer="~data/sites_data_11x11.csv")
sites_data

Unnamed: 0,time,lat,lon,TUR,SPM,CHL,site
0,2022-03-01,55.189352,-1.519043,,,,Newbiggin North
1,2022-03-01,55.189352,-1.517296,,,,Newbiggin North
2,2022-03-01,55.189352,-1.515549,,,,Newbiggin North
3,2022-03-01,55.189352,-1.513802,,,,Newbiggin North
4,2022-03-01,55.189352,-1.512055,,,,Newbiggin North
...,...,...,...,...,...,...,...
12331105,2022-10-31,50.277315,-3.893256,2.006134,1.186131,1.828091,Bigbury-on-Sea South
12331106,2022-10-31,50.277315,-3.891509,0.708182,0.406647,1.046769,Bigbury-on-Sea South
12331107,2022-10-31,50.277315,-3.889762,1.258156,0.729469,1.327342,Bigbury-on-Sea South
12331108,2022-10-31,50.277315,-3.888015,1.037095,0.598882,1.254974,Bigbury-on-Sea South


In [17]:
# Count the missing entries in each column.
sites_data.isna().sum()

time           0
lat            0
lon            0
TUR     10838900
SPM     10838900
CHL     10838900
site           0
dtype: int64

In [18]:
# Fill null values with the sentinel -10 (not 0), as we cannot have missing values
# in the tensors for the neural network.
# Later we will remove time, site pairs where all values equal this sentinel.

sites_data.fillna(value=-10, inplace=True)

## All: Pollution Data

In [19]:
# Read the pollution risk forecasts, which provide the riskLevelLabel target.
riskforecasting = pd.read_csv(filepath_or_buffer='~data/pollution_risk_forecasting.csv')
riskforecasting

Unnamed: 0,site,time,warning,riskLevelLabel
0,Ainsdale,2022-04-28,Pollution RIsk Forecasts will start soon,normal
1,Ainsdale,2022-04-29,Pollution RIsk Forecasts will start soon,normal
2,Ainsdale,2022-04-30,Pollution RIsk Forecasts will start soon,normal
3,Ainsdale,2022-05-04,No warnings in place,normal
4,Ainsdale,2022-05-05,No warnings in place,normal
...,...,...,...,...
66558,Yaverland,2023-04-29,Pollution RIsk Forecasts will start soon,normal
66559,Yaverland,2023-04-30,Pollution RIsk Forecasts will start soon,normal
66560,Yaverland,2023-05-01,No pollution incidents reported,normal
66561,Yaverland,2023-05-02,No pollution incidents reported,normal


# Data Manipulation and Further Data Cleaning

## BC:

## RF:

In [20]:
# Attach the risk label to each feature row with an inner join on (site, time);
# on the forecast side the site name is read from its 'Site Name in Files' column.
# NOTE(review): the riskforecasting preview shown above lists no 'Site Name in Files'
# column - confirm it exists when the notebook is run from a fresh kernel.
df_merged = df_feature.merge(
    riskforecasting[['Site Name in Files', 'time', 'riskLevelLabel']],
    how='inner',
    left_on=['site', 'time'],
    right_on=['Site Name in Files', 'time'],
)
df_merged

Unnamed: 0,time,site,TUR_1x1_median,SPM_1x1_median,CHL_1x1_median,TUR_1x1_mean,SPM_1x1_mean,CHL_1x1_mean,TUR_1x1_q1,SPM_1x1_q1,...,SPM_11x11_mean,CHL_11x11_mean,TUR_11x11_q1,SPM_11x11_q1,CHL_11x11_q1,TUR_11x11_q3,SPM_11x11_q3,CHL_11x11_q3,Site Name in Files,riskLevelLabel
0,2022-03-01,Saltburn,,,,,,,,,...,,,,,,,,,Saltburn,increased
1,2022-03-11,Gwithian_Towans,,,,,,,,,...,,,,,,,,,Gwithian_Towans,increased
2,2022-03-12,Gwithian_Towans,,,,,,,,,...,,,,,,,,,Gwithian_Towans,normal
3,2022-03-14,Salcombe_North_Sands,,,,,,,,,...,,,,,,,,,Salcombe_North_Sands,normal
4,2022-03-14,Salcombe_South_Sands,,,,,,,,,...,,,,,,,,,Salcombe_South_Sands,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62776,2022-09-30,Woolacombe_Village,,,,,,,,,...,,,,,,,,,Woolacombe_Village,increased
62777,2022-09-30,Worthing,,,,,,,,,...,,,,,,,,,Worthing,normal
62778,2022-09-30,Yaverland,,,,,,,,,...,,,,,,,,,Yaverland,normal
62779,2022-10-13,Blackpool_Central,,,,,,,,,...,,,,,,,,,Blackpool_Central,normal


In [21]:
# TODO: decide whether to remove rows in which all values are null.

## NN: Combine Datasets to Create Input Dataset
For every site (430) and time (237) pair, create an 11 x 11 x 3 tensor (one 11 x 11 grid each for TUR, SPM and CHL), each tensor corresponding to one risk level label.

In [16]:
%%time
def chl_to_array(chl_values, grid_size=11):
    '''
    Arrange the flat values of one (time, site) group into a square grid.

    input:
        - chl_values (array-like):
            - grid_size * grid_size values belonging to one time and site
        - grid_size (int):
            - side length of the square grid (default 11)

    output:
        - np.ndarray of shape (grid_size, grid_size)

    raises:
        - ValueError if the number of values is not grid_size * grid_size
    '''
    # NOTE(review): the grid is filled row by row in the order the values arrive, so
    # whether rows/columns line up with lat/lon depends on how sites_data is sorted.
    # This is consistent across calls as long as every group uses the same ordering.
    values = np.array(chl_values)
    expected = grid_size * grid_size
    if values.size != expected:
        raise ValueError(
            f"expected {expected} values for a {grid_size}x{grid_size} grid, got {values.size}"
        )
    return values.reshape(grid_size, grid_size)

def get_features_data(sites_data, features_list):
    '''
    Collapse the per-coordinate rows into one grid per feature for every time and site.

    input:
        - sites_data (pd.DataFrame):
            - one row of feature values per time, site and coordinate
        - features_list (list):
            - names (strings) of the feature columns to convert

    output:
        - features data (pd.DataFrame)
            - row: one per (time, site) pair
            - column: one per requested feature
            - entries: np.array produced by chl_to_array (11x11 by default)
    '''
    def _join_on_keys(left, right):
        # Outer join so a pair present for only some features is still kept.
        return pd.merge(left, right, on=['time', 'site'], how='outer')

    # One single-column frame per feature, each indexed by (time, site).
    per_feature_frames = [
        sites_data.groupby(['time', 'site'])[name].apply(chl_to_array).to_frame()
        for name in features_list
    ]

    return reduce(_join_on_keys, per_feature_frames)

# Build one row per (time, site) pair holding the TUR, SPM and CHL grids.
features_df = get_features_data(
    sites_data=sites_data,
    features_list=['TUR', 'SPM', 'CHL'],
)
features_df

CPU times: user 5.88 s, sys: 818 ms, total: 6.7 s
Wall time: 7.28 s


Unnamed: 0_level_0,Unnamed: 1_level_0,TUR,SPM,CHL
time,site,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-03-01,Ainsdale,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2022-03-01,Allonby,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2022-03-01,Allonby_South,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2022-03-01,Amble_Links,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2022-03-01,Anderby,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
...,...,...,...,...
2022-10-31,Withernsea,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2022-10-31,Wolvercote_Mill_Stream,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."
2022-10-31,Woolacombe_Village,"[[5.9864016, 25.982555, -10.0, -10.0, -10.0, -...","[[3.564524, 16.43204, -10.0, -10.0, -10.0, -10...","[[6.564089, 6.803925, -10.0, -10.0, -10.0, -10..."
2022-10-31,Worthing,"[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1...","[[-10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -1..."


In [27]:
# Join the feature grids onto the forecast table. The right join keeps every
# forecast row: forecasts without matching grids get nulls, and grids without a
# risk level label are left out.
input_data = pd.merge(
    features_df,
    riskforecasting,
    how='right',
    left_on=['time', 'site'],
    right_on=['time', 'Site Name in Files'],
)

In [28]:
# Missing data check: number of rows without a CHL grid, per date.
# Data missing for 07-23 and 07-25 for all sites - no satellite data.

input_data.loc[input_data['CHL'].isna(), 'time'].value_counts()

2022-07-23    421
2022-07-25    421
2022-02-08      2
2022-12-02      2
2022-02-25      2
2022-02-06      2
2022-11-07      1
2022-11-08      1
2022-11-03      1
2022-11-04      1
2022-01-09      1
2022-12-07      1
2022-12-08      1
2023-01-17      1
2022-12-19      1
2022-04-06      1
2022-02-09      1
2022-02-10      1
2022-02-19      1
2022-02-23      1
2022-01-17      1
Name: time, dtype: int64

In [29]:
# Remaining gaps to look into: rows with a risk level label but no satellite data
# on dates other than the two known outages. Possibly a site-naming mismatch in the merge?

input_data[
    input_data['CHL'].isna()
    & input_data['time'].ne('2022-07-23')
    & input_data['time'].ne('2022-07-25')
]

Unnamed: 0.1,time,TUR,SPM,CHL,Unnamed: 0,site,riskLevelLabel,Site Name in Files
13905,2022-12-19,,,,13905,Ingoldmells South,normal,Ingoldmells_South
21307,2022-02-06,,,,21307,Pevensey Bay,increased,Pevensey_Bay
21308,2022-02-08,,,,21308,Pevensey Bay,normal,Pevensey_Bay
21460,2022-02-06,,,,21460,Eastbourne,increased,Eastbourne
21461,2022-02-08,,,,21461,Eastbourne,normal,Eastbourne
27204,2022-11-07,,,,27204,Gurnard,increased,Gurnard
27205,2022-11-08,,,,27205,Gurnard,normal,Gurnard
27804,2022-11-03,,,,27804,Seagrove,increased,Seagrove
27805,2022-11-04,,,,27805,Seagrove,normal,Seagrove
43071,2022-01-09,,,,43071,Maenporth,increased,Maenporth


In [30]:
# Drop rows containing any missing value for now, as there are not that many of them.
# The result is rebound rather than using inplace=True, so the cell does not mutate
# the merged frame in place.
input_data = input_data.dropna()
input_data.shape

(62781, 8)

In [31]:
# Remove rows where the grid holds only the missing-value sentinel (-10), i.e. no real data
def has_nonzero(arr, fill_value=-10):
    '''
    Report whether a grid contains at least one real (non-sentinel) value.

    Despite the name, the check is against the missing-value sentinel rather than
    against zero: a grid of zeros counts as real data under the default fill_value.

    input:
        - arr (array-like): values to inspect
        - fill_value (number): sentinel that marks a missing entry (default -10)

    output:
        - True if any entry differs from fill_value, otherwise False
    '''
    # Coerce first so plain lists are compared element-wise instead of as one object.
    return np.any(np.asarray(arr) != fill_value)

# Keep only the rows whose CHL grid contains real data (only CHL is inspected here).
input_data = input_data.loc[input_data['CHL'].map(has_nonzero)]
input_data

Unnamed: 0.1,time,TUR,SPM,CHL,Unnamed: 0,site,riskLevelLabel,Site Name in Files
1,2022-04-29,"[[-10.0, -10.0, -10.0, -10.0, 6.66657, 17.5728...","[[-10.0, -10.0, -10.0, -10.0, 3.965849, 10.934...","[[-10.0, -10.0, -10.0, -10.0, 3.342492, 8.1076...",1,Seaton Carew North,normal,Seaton_Carew_North
3,2022-05-04,"[[-10.0, -10.0, -10.0, 6.540016, 2.0652199, 5....","[[-10.0, -10.0, -10.0, 3.9651499, 1.2428378, 3...","[[-10.0, -10.0, -10.0, 4.6985893, 3.716854, 11...",3,Seaton Carew North,normal,Seaton_Carew_North
13,2022-05-14,"[[-10.0, -10.0, -10.0, 7.645644, 3.5351295, 5....","[[-10.0, -10.0, -10.0, 4.581397, 2.1685658, 3....","[[-10.0, -10.0, -10.0, 5.416618, 4.1310344, 14...",13,Seaton Carew North,normal,Seaton_Carew_North
15,2022-05-16,"[[-10.0, -10.0, -10.0, 7.8827524, 4.909162, 7....","[[-10.0, -10.0, -10.0, 4.6897936, 3.0189908, 4...","[[-10.0, -10.0, -10.0, 5.1706023, 4.0242176, 1...",15,Seaton Carew North,increased,Seaton_Carew_North
18,2022-05-19,"[[-10.0, -10.0, -10.0, 7.920317, 8.67042, 11.0...","[[-10.0, -10.0, -10.0, 4.654839, 5.33716, 7.09...","[[-10.0, -10.0, -10.0, 4.5101485, 3.6868532, 1...",18,Seaton Carew North,normal,Seaton_Carew_North
...,...,...,...,...,...,...,...,...
63634,2022-09-19,"[[7.055306, 11.538847, 21.925156, 38.97817, 67...","[[4.1807566, 6.9168806, 13.601273, 25.838427, ...","[[9.039883, 9.1092415, 8.58365, 8.242571, 1.83...",63634,Westward Ho!,normal,Westward_Ho!
63636,2022-09-21,"[[7.0546794, 11.46094, 21.594828, 37.921295, 6...","[[4.1956024, 6.904963, 13.490634, 25.326557, 4...","[[9.529826, 9.603931, 8.9767, 8.631461, 1.8302...",63636,Westward Ho!,normal,Westward_Ho!
63639,2022-09-24,"[[8.833501, 13.690014, 24.412632, 37.04687, 77...","[[5.320448, 8.367022, 15.584268, 24.799337, 58...","[[11.152357, 11.245967, 10.390406, 9.975812, 1...",63639,Westward Ho!,normal,Westward_Ho!
63641,2022-09-26,"[[12.430533, 17.521309, 27.909481, 32.5058, 90...","[[7.6267166, 10.912169, 18.30988, 21.522917, 6...","[[11.114327, 11.215534, 10.675369, 10.119238, ...",63641,Westward Ho!,normal,Westward_Ho!


# Train-Test Split
- Pick time-site pairs to use as train data and test data
- Potential for implementing cross validation

In [38]:
# Every remaining (time, site) pair is a candidate sample for the split.
time_site_pairs = input_data.loc[:, ['time', 'site']]

# 80/20 split: hold out 20% of the pairs for testing; the fixed seed makes the draw repeatable.
time_site_pairs_test = time_site_pairs.sample(frac=0.2, random_state=42)
time_site_pairs_test

Unnamed: 0,time,site
30146,2022-07-05,Sandgate
23187,2022-07-02,Shoreham Beach
44377,2022-08-05,Porthoustock
46583,2022-05-29,Harlyn Bay
3387,2022-07-03,Druridge Bay South
...,...,...
48917,2022-08-02,Wherry Town
23074,2022-08-09,Southwick
9803,2022-09-11,"Danes Dyke, Flamborough"
51359,2022-08-25,Par Sands


In [41]:
# The training pairs are all pairs that were not drawn for the test set. Dropping the
# test rows by index label states this directly; masking the matched cells to NaN and
# calling dropna() would also discard any legitimate row that happens to contain a NaN.
time_site_pairs_train = time_site_pairs.drop(index=time_site_pairs_test.index)

# Sanity check: train and test together account for every pair exactly once.
assert len(time_site_pairs_train) + len(time_site_pairs_test) == len(time_site_pairs)
time_site_pairs_train

Unnamed: 0,time,site
3,2022-05-04,Seaton Carew North
13,2022-05-14,Seaton Carew North
18,2022-05-19,Seaton Carew North
23,2022-05-24,Seaton Carew North
28,2022-05-29,Seaton Carew North
...,...,...
63626,2022-09-11,Westward Ho!
63629,2022-09-14,Westward Ho!
63631,2022-09-16,Westward Ho!
63639,2022-09-24,Westward Ho!


# Train & Test Models 
- Train on training time-site pairs
- Test on testing time-site pairs

## BC

## RF

## NN

# Final Results
Placeholder dataframe showing the intended layout only (sub-models and statistics are still to be confirmed).

In [4]:
# One row per (Model, Sub-Model) combination that will be reported.
tuples = [('Baseline Random Guess', 'N/A'),
          ('BC', 'No oversampling, all features'),
          ('BC', 'No oversampling, top 10 features'),
          ('BC', 'Oversampling, all features'),
          ('BC', 'Oversampling, top 10 features'),
          ('RF', 'No oversampling'),
          ('RF', 'Oversampling'),
          ('NN', 'No oversampling'),
          ('NN', 'Oversampling')]

index = pd.MultiIndex.from_tuples(tuples, names=["Model", "Sub-Model"])

# Metric columns start empty (NaN) until real results are filled in.
# Column labels are passed as a list: a dict given to `columns=` contributes only
# its keys, so any placeholder values stored in it would be silently ignored.
df = pd.DataFrame(index=index, columns=['F1', 'Precision', 'Recall', 'AUC', 'Acc'])

df

Unnamed: 0_level_0,Unnamed: 1_level_0,F1,Precision,Recall,AUC,Acc
Model,Sub-Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Baseline Random Guess,,,,,,
BC,"No oversampling, all features",,,,,
BC,"No oversampling, top 10 features",,,,,
BC,"Oversampling, all features",,,,,
BC,"Oversampling, top 10 features",,,,,
RF,No oversampling,,,,,
RF,Oversampling,,,,,
NN,No oversampling,,,,,
NN,Oversampling,,,,,
