In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import datasets, layers, models, losses
import pandas as pd
import numpy as np
from functools import reduce
from imblearn.over_sampling import RandomOverSampler

# Load Satellite Data

In [2]:
# Load the per-pixel satellite table: one row per (time, lat, lon, site) with TUR / SPM / CHL readings.
# NOTE(review): the path starts with "~data" rather than "~/data" or "data" — confirm it resolves
# to the intended directory on a fresh machine.
sites_data = pd.read_csv("~data/sites_data_11x11.csv")
sites_data

Unnamed: 0,time,lat,lon,TUR,SPM,CHL,site
0,2022-03-01,55.189352,-1.519043,,,,Newbiggin_North
1,2022-03-01,55.189352,-1.517296,,,,Newbiggin_North
2,2022-03-01,55.189352,-1.515549,,,,Newbiggin_North
3,2022-03-01,55.189352,-1.513802,,,,Newbiggin_North
4,2022-03-01,55.189352,-1.512055,,,,Newbiggin_North
...,...,...,...,...,...,...,...
12331105,2022-10-31,50.277315,-3.893256,2.006134,1.186131,1.828091,Bigbury-on-Sea_South
12331106,2022-10-31,50.277315,-3.891509,0.708182,0.406647,1.046769,Bigbury-on-Sea_South
12331107,2022-10-31,50.277315,-3.889762,1.258156,0.729469,1.327342,Bigbury-on-Sea_South
12331108,2022-10-31,50.277315,-3.888015,1.037095,0.598882,1.254974,Bigbury-on-Sea_South


In [3]:
# Count missing values per column.
sites_data.isnull().sum()

time           0
lat            0
lon            0
TUR     10838900
SPM     10838900
CHL     10838900
site           0
dtype: int64

In [4]:
# Replace every missing value with 0.
# NOTE: after this step a 0 can mean either "no data" or a genuine zero reading.
sites_data = sites_data.fillna(0)

# Load Pollution Data

In [5]:
# Load the daily pollution-risk labels: one row per (site, time) with a warning text and a riskLevelLabel.
# NOTE(review): same "~data" path prefix as the satellite file — confirm it resolves as intended.
riskforecasting = pd.read_csv('~data/pollution_risk_forecasting.csv')
riskforecasting

Unnamed: 0.1,Unnamed: 0,site,time,warning,riskLevelLabel
0,0,Seaton Carew North,2022-04-28,Pollution RIsk Forecasts will start soon,normal
1,1,Seaton Carew North,2022-04-29,Pollution RIsk Forecasts will start soon,normal
2,2,Seaton Carew North,2022-04-30,Pollution RIsk Forecasts will start soon,normal
3,3,Seaton Carew North,2022-05-04,No warnings in place,normal
4,4,Seaton Carew North,2022-05-05,No warnings in place,normal
...,...,...,...,...,...
63641,63641,Westward Ho!,2022-09-26,No pollution incidents reported,normal
63642,63642,Westward Ho!,2022-09-27,No pollution incidents reported,normal
63643,63643,Westward Ho!,2022-09-28,No pollution incidents reported,normal
63644,63644,Westward Ho!,2022-09-29,No pollution incidents reported,normal


# Combine Datasets to Create Input Dataset
For every site (430) and time (237), create an 11 x 11 x 3 tensor (one 11 x 11 grid per feature); each tensor is paired with one risk level label.

In [6]:
%%time
def chl_to_array(chl_values):
    '''
    Arrange the 121 values of one (time, site) group into an 11x11 grid.

    The values are laid out row by row in the order they are received; no
    lat/lon information is consulted here, so the grid is only geographically
    consistent if every group arrives in the same coordinate order.
    '''
    flat_values = np.array(chl_values)
    return np.reshape(flat_values, (11, 11))

def get_features_data(sites_data, features_list):
    '''
    Build one 11x11 grid per feature for every (time, site) pair.

    input:
        - sites_data (pd.DataFrame):
            - one row per time, site and coordinate, with a column for each
              requested feature
        - features_list (list):
            - names (strings) of the feature columns to use

    output:
        - features data (pd.DataFrame)
            - row: one per (time, site) pair
            - column: one per feature
            - entries: np.array of shape 11x11
    '''
    def merge_on_time_site(left, right):
        # Outer join so a (time, site) pair present for any feature is kept.
        return pd.merge(left, right, on=['time', 'site'], how='outer')

    per_feature_frames = [
        pd.DataFrame(sites_data.groupby(['time', 'site'])[feature].apply(chl_to_array))
        for feature in features_list
    ]

    return reduce(merge_on_time_site, per_feature_frames)

# Build the (time, site)-indexed table of 11x11 grids for the three satellite features.
features_df = get_features_data(sites_data, ['TUR', 'SPM', 'CHL'])
features_df

CPU times: user 5.83 s, sys: 925 ms, total: 6.75 s
Wall time: 7.29 s


Unnamed: 0_level_0,Unnamed: 1_level_0,TUR,SPM,CHL
time,site,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-03-01,Ainsdale,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2022-03-01,Allonby,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2022-03-01,Allonby_South,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2022-03-01,Amble_Links,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2022-03-01,Anderby,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...,...
2022-10-31,Withernsea,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2022-10-31,Wolvercote_Mill_Stream,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2022-10-31,Woolacombe_Village,"[[5.9864016, 25.982555, 0.0, 0.0, 0.0, 0.0, 0....","[[3.564524, 16.43204, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[6.564089, 6.803925, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2022-10-31,Worthing,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [7]:
# Naming inconsistencies: site.csv maps the forecast table's site `label` to the
# `Site Name in Files` spelling used by the satellite data.
# NOTE: this cell rebinds `riskforecasting`; run it once per kernel session
# (a second run fails because `warning` has already been dropped).
site_df = pd.read_csv("~data/site.csv")[['label', 'Site Name in Files']]
riskforecasting = pd.merge(
    riskforecasting, site_df, left_on='site', right_on='label', how='left'
).drop(columns=['label', 'warning'])
riskforecasting

Unnamed: 0.1,Unnamed: 0,site,time,riskLevelLabel,Site Name in Files
0,0,Seaton Carew North,2022-04-28,normal,Seaton_Carew_North
1,1,Seaton Carew North,2022-04-29,normal,Seaton_Carew_North
2,2,Seaton Carew North,2022-04-30,normal,Seaton_Carew_North
3,3,Seaton Carew North,2022-05-04,normal,Seaton_Carew_North
4,4,Seaton Carew North,2022-05-05,normal,Seaton_Carew_North
...,...,...,...,...,...
63641,63641,Westward Ho!,2022-09-26,normal,Westward_Ho!
63642,63642,Westward Ho!,2022-09-27,normal,Westward_Ho!
63643,63643,Westward Ho!,2022-09-28,normal,Westward_Ho!
63644,63644,Westward Ho!,2022-09-29,normal,Westward_Ho!


In [8]:
# Merging datasets. Right-join onto riskforecasting so every labelled (time, site) row is kept;
# the TUR / SPM / CHL grids are attached where a matching satellite row exists and are NaN otherwise.
input_data = features_df.merge(riskforecasting, how='right', left_on=['time', 'site'], right_on=['time', 'Site Name in Files'])

In [9]:
# Dates on which labelled rows have no satellite grid after the merge
# (2022-07-23 and 2022-07-25 account for almost all of them).
input_data.loc[input_data['CHL'].isnull(), 'time'].value_counts()

2022-07-23    421
2022-07-25    421
2022-02-08      2
2022-12-02      2
2022-02-25      2
2022-02-06      2
2022-11-07      1
2022-11-08      1
2022-11-03      1
2022-11-04      1
2022-01-09      1
2022-12-07      1
2022-12-08      1
2023-01-17      1
2022-12-19      1
2022-04-06      1
2022-02-09      1
2022-02-10      1
2022-02-19      1
2022-02-23      1
2022-01-17      1
Name: time, dtype: int64

In [10]:
# Other missing values to look into: labelled rows without satellite data on dates other than
# the two known gaps (2022-07-23, 2022-07-25). Possibly a site-naming mismatch in the merge?
missing_vals = input_data[
    input_data['CHL'].isnull() & ~input_data['time'].isin(['2022-07-23', '2022-07-25'])
]
missing_vals

Unnamed: 0.1,time,TUR,SPM,CHL,Unnamed: 0,site,riskLevelLabel,Site Name in Files
13905,2022-12-19,,,,13905,Ingoldmells South,normal,Ingoldmells_South
21307,2022-02-06,,,,21307,Pevensey Bay,increased,Pevensey_Bay
21308,2022-02-08,,,,21308,Pevensey Bay,normal,Pevensey_Bay
21460,2022-02-06,,,,21460,Eastbourne,increased,Eastbourne
21461,2022-02-08,,,,21461,Eastbourne,normal,Eastbourne
27204,2022-11-07,,,,27204,Gurnard,increased,Gurnard
27205,2022-11-08,,,,27205,Gurnard,normal,Gurnard
27804,2022-11-03,,,,27804,Seagrove,increased,Seagrove
27805,2022-11-04,,,,27805,Seagrove,normal,Seagrove
43071,2022-01-09,,,,43071,Maenporth,increased,Maenporth


In [11]:
# Show only the (time, site) pairs of the unexplained gaps.
missing_vals[['time', 'site']]

Unnamed: 0,time,site
13905,2022-12-19,Ingoldmells South
21307,2022-02-06,Pevensey Bay
21308,2022-02-08,Pevensey Bay
21460,2022-02-06,Eastbourne
21461,2022-02-08,Eastbourne
27204,2022-11-07,Gurnard
27205,2022-11-08,Gurnard
27804,2022-11-03,Seagrove
27805,2022-11-04,Seagrove
43071,2022-01-09,Maenporth


In [None]:
# For now, discard every row that contains a missing value in any column.
input_data = input_data.dropna()
input_data.shape

In [None]:
# Remove rows where everything is 0 (i.e. all missing values)
def has_nonzero(arr):
    '''Return True when at least one element of `arr` differs from 0.'''
    nonzero_mask = arr != 0
    return np.any(nonzero_mask)

# Keep only rows whose CHL grid has at least one non-zero pixel.
# NOTE(review): only CHL is inspected; TUR and SPM are assumed to be empty on the same rows — confirm.
input_data = input_data[input_data['CHL'].apply(has_nonzero)]
input_data

# From Dataset to Tensor

In [None]:
def get_train_test_val(input_data, desired_pos_ratio=0.5, train_test_ratio=0.8, train_val_ratio=0.8):
    '''
    Turn the merged table into padded image tensors split into train / test / validation sets.

    Rows are shuffled with a fixed seed and split first. Random oversampling of the
    positive class is then applied to the training portion only, so a duplicated
    sample can never appear in both the training set and the validation / test sets,
    and the validation / test sets keep the original class distribution.

    input:
        - input_data (pd.DataFrame):
            - dataframe of shape (m, n)
            - number of datapoints = m
            - features to consider = n-1, each entry an 11x11 np.array
            - one of the columns = 'riskLevelLabel'

        - desired_pos_ratio (float):
            - desired fraction of positive ('increased') samples in the training
              set after random oversampling; must be strictly between 0 and 1

        - train_test_ratio (float):
            - fraction of all datapoints used for training + validation

        - train_val_ratio (float):
            - fraction of the training + validation datapoints used for training

    output:
        - X_train (tensor), shape (num_train, 32, 32, n-1)
        - X_test (tensor)
        - X_val (tensor)
        - y_train (np.array), 1 for 'increased' and 0 otherwise
        - y_test (np.array)
        - y_val (np.array)

    raises:
        - ValueError if input_data is empty or desired_pos_ratio is not in (0, 1)
    '''
    if not 0 < desired_pos_ratio < 1:
        raise ValueError("desired_pos_ratio must be strictly between 0 and 1")
    if len(input_data) == 0:
        raise ValueError("input_data contains no rows")

    # Getting X and y
    features_column_names = [col for col in input_data.columns if col != 'riskLevelLabel']
    y = np.array([1 if x == 'increased' else 0 for x in input_data['riskLevelLabel']])
    # Stack every feature's grids into one channel-last array of shape (m, 11, 11, n_features).
    # This covers the single-feature case as well (one channel).
    X = np.stack([np.stack(input_data[col].values) for col in features_column_names], axis=-1)

    # Shuffle before splitting so the splits do not depend on the row order of input_data
    rng = np.random.default_rng(42)
    order = rng.permutation(len(y))
    n_train_val = int(len(y) * train_test_ratio)
    n_train = int(len(y) * train_test_ratio * train_val_ratio)
    train_idx = order[:n_train]
    val_idx = order[n_train:n_train_val]
    test_idx = order[n_train_val:]
    y_train, y_val, y_test = y[train_idx], y[val_idx], y[test_idx]

    # Counting the number of positives the training set should end up with:
    # P / (P + N) = desired_pos_ratio  =>  P = desired_pos_ratio * N / (1 - desired_pos_ratio)
    num_positives = int(y_train.sum())
    num_negatives = len(y_train) - num_positives
    target_positives = int(round(desired_pos_ratio * num_negatives / (1 - desired_pos_ratio)))

    # Oversampling (training rows only). imblearn's dict strategy expects the TOTAL number
    # of samples wanted for the class, not the number to add, and rejects a target below
    # the current count or a class absent from y — hence the guard.
    if 0 < num_positives < target_positives:
        ros = RandomOverSampler(sampling_strategy={1: target_positives}, random_state=42)
        # Resample row indices instead of the grids themselves: cheap, and avoids
        # handing object-typed array cells to the sampler.
        train_idx, y_train = ros.fit_resample(train_idx.reshape(-1, 1), y_train)
        train_idx = train_idx.ravel()
        # The sampler appends the duplicates at the end; mix them back in.
        mix = rng.permutation(len(y_train))
        train_idx, y_train = train_idx[mix], y_train[mix]

    def to_padded_tensor(rows):
        # Zero-pad each 11x11 grid (11 before, 10 after) to the 32x32 model input size.
        return tf.pad(tf.convert_to_tensor(X[rows]), [[0, 0], [11, 10], [11, 10], [0, 0]])

    X_train = to_padded_tensor(train_idx)
    X_val = to_padded_tensor(val_idx)
    X_test = to_padded_tensor(test_idx)

    return X_train, X_test, X_val, y_train, y_test, y_val

In [None]:
# Scratch check: name of the only column when just CHL is selected.
# NOTE: this rebinds the global name X; the commented-out lines are leftover experiments.
X = input_data[['CHL']]
X.columns[0]
# X = np.array([i for i in X])
# X = tf.convert_to_tensor(X)
# X = tf.expand_dims(X, axis=3, name=None)

In [None]:
# Scratch check: shape of the CHL grids once stacked into a single array.
X = input_data[['CHL']]
np.array([i for i in X['CHL']]).shape

# Model Architecture

## Baseline (No Convolution)

In [None]:
# Architecture preview only; the model is rebuilt from the real training tensors in the
# result sections further down. A literal input shape is used because X_train does not
# exist yet at this point of a top-to-bottom run: get_train_test_val pads each 11x11 grid
# to 32x32, and a single feature (CHL) gives one channel.
model_b = models.Sequential()
model_b.add(layers.AveragePooling2D(pool_size=2, strides=2, input_shape=(32, 32, 1)))
model_b.add(layers.Flatten())
model_b.add(layers.Dense(120, activation='relu'))
model_b.add(layers.Dense(84, activation='relu'))
model_b.add(layers.Dense(10, activation='relu'))
model_b.add(layers.Dense(1, activation='sigmoid'))

model_b.summary()

In [None]:
# `opt` is never defined in this notebook, so the optimizer is named directly. The string
# form also gives every model its own optimizer instance instead of sharing one.
# NOTE(review): Adam with Keras defaults is assumed — adjust if another optimizer or
# learning rate was intended.
model_b.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])

## Convolution

In [None]:
# Architecture preview only; the model is rebuilt from the real training tensors in the
# result sections further down. A literal input shape is used because X_train does not
# exist yet at this point of a top-to-bottom run: get_train_test_val pads each 11x11 grid
# to 32x32, and a single feature (CHL) gives one channel.
model = models.Sequential()
model.add(layers.Conv2D(filters=6, kernel_size=5, activation='relu', padding='same', input_shape=(32, 32, 1)))
model.add(layers.AveragePooling2D(pool_size=2, strides=2))
model.add(layers.Conv2D(filters=16, kernel_size=5, activation='relu'))
model.add(layers.AveragePooling2D(pool_size=2, strides=2))
model.add(layers.Flatten())
model.add(layers.Dense(120, activation='relu'))
model.add(layers.Dense(84, activation='relu'))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

In [None]:
# `opt` is never defined in this notebook, so the optimizer is named directly. The string
# form also gives every model its own optimizer instance instead of sharing one.
# NOTE(review): Adam with Keras defaults is assumed — adjust if another optimizer or
# learning rate was intended.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])

# Training and Results

## Plotting Function

In [None]:
def plot_train_val_loss_acc(his):
    '''
    Draw training vs validation curves from a Keras History object.

    input: history (its .history dict must hold 'loss', 'val_loss', 'acc', 'val_acc')
    output: 2 stacked graphs (loss on top, accuracy below)
    '''
    fig, axs = plt.subplots(2, 1, figsize=(10,10))
    panels = (
        (axs[0], 'loss', 'Training Loss vs Validation Loss', "Loss"),
        (axs[1], 'acc', 'Training Acc vs Validation Acc', "Accuracy"),
    )
    for ax, metric, title, y_label in panels:
        ax.plot(his.history[metric])
        ax.plot(his.history['val_' + metric])
        ax.set_title(title)
        ax.set_xlabel("Epochs")
        ax.set_ylabel(y_label)
        ax.legend(['Training', 'Validation'])

## Baseline, 1 feature (CHL)

### Data

In [None]:
# Select the single CHL grid column plus the label.
input_data_ = input_data[['CHL', 'riskLevelLabel']]

In [None]:
# Build the padded train / test / validation tensors and label arrays for the 1-feature runs.
X_train, X_test, X_val, y_train, y_test, y_val = get_train_test_val(input_data_)

### Model

In [None]:
# Baseline without convolution: average-pool the padded input, then a small dense classifier.
model_b = models.Sequential([
    layers.AveragePooling2D(pool_size=2, strides=2, input_shape=X_train.shape[1:]),
    layers.Flatten(),
    layers.Dense(120, activation='relu'),
    layers.Dense(84, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model_b.summary()

In [None]:
# `opt` is never defined in this notebook, so the optimizer is named directly. The string
# form also gives every model its own optimizer instance instead of sharing one.
# NOTE(review): Adam with Keras defaults is assumed — adjust if another optimizer or
# learning rate was intended.
model_b.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])

In [None]:
# Train the baseline for 40 epochs (batch size 64), tracking loss / accuracy on the validation split.
history_b = model_b.fit(X_train, y_train, batch_size=64, epochs=40, validation_data=(X_val, y_val))

In [None]:
# Training vs validation curves for the 1-feature baseline.
plot_train_val_loss_acc(history_b)

In [None]:
# Evaluate the 1-feature baseline on the test split; the result holds the loss and the "acc" metric.
result_b_1 = model_b.evaluate(X_test, y_test)
result_b_1

## Convolution, 1 feature (CHL)

### Model

In [None]:
# Convolutional model: two conv + average-pool stages followed by the same dense classifier
# head as the baseline.
model = models.Sequential([
    layers.Conv2D(filters=6, kernel_size=5, activation='relu', padding='same', input_shape=X_train.shape[1:]),
    layers.AveragePooling2D(pool_size=2, strides=2),
    layers.Conv2D(filters=16, kernel_size=5, activation='relu'),
    layers.AveragePooling2D(pool_size=2, strides=2),
    layers.Flatten(),
    layers.Dense(120, activation='relu'),
    layers.Dense(84, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# `opt` is never defined in this notebook, so the optimizer is named directly. The string
# form also gives every model its own optimizer instance instead of sharing one.
# NOTE(review): Adam with Keras defaults is assumed — adjust if another optimizer or
# learning rate was intended.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])

In [None]:
# Train the convolutional model for 40 epochs (batch size 64), tracking loss / accuracy on the validation split.
history = model.fit(X_train, y_train, batch_size=64, epochs=40, validation_data=(X_val, y_val))

In [None]:
# Training vs validation curves for the 1-feature convolutional model.
plot_train_val_loss_acc(history)

In [None]:
# Evaluate the 1-feature convolutional model on the test split; the result holds the loss and the "acc" metric.
result_c_1 = model.evaluate(X_test, y_test)
result_c_1

## Baseline, 3 features

### Data

In [None]:
# Select all three grid columns (TUR, SPM, CHL) plus the label; this rebinds input_data_.
input_data_ = input_data[['TUR', 'SPM', 'CHL', 'riskLevelLabel']]

In [None]:
# Rebuild the padded train / test / validation tensors for the 3-feature runs (overwrites the 1-feature splits).
X_train, X_test, X_val, y_train, y_test, y_val = get_train_test_val(input_data_)

### Model

In [None]:
# Baseline without convolution: average-pool the padded input, then a small dense classifier.
model_b = models.Sequential([
    layers.AveragePooling2D(pool_size=2, strides=2, input_shape=X_train.shape[1:]),
    layers.Flatten(),
    layers.Dense(120, activation='relu'),
    layers.Dense(84, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model_b.summary()

In [None]:
# `opt` is never defined in this notebook, so the optimizer is named directly. The string
# form also gives every model its own optimizer instance instead of sharing one.
# NOTE(review): Adam with Keras defaults is assumed — adjust if another optimizer or
# learning rate was intended.
model_b.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])

In [None]:
# Train the baseline for 40 epochs (batch size 64), tracking loss / accuracy on the validation split.
history_b = model_b.fit(X_train, y_train, batch_size=64, epochs=40, validation_data=(X_val, y_val))

In [None]:
# Training vs validation curves for the 3-feature baseline.
plot_train_val_loss_acc(history_b)

In [None]:
# Evaluate the 3-feature baseline on the test split; the result holds the loss and the "acc" metric.
result_b_3 = model_b.evaluate(X_test, y_test)
result_b_3

## Convolution, 3 features

### Model

In [None]:
# Convolutional model: two conv + average-pool stages followed by the same dense classifier
# head as the baseline.
model = models.Sequential([
    layers.Conv2D(filters=6, kernel_size=5, activation='relu', padding='same', input_shape=X_train.shape[1:]),
    layers.AveragePooling2D(pool_size=2, strides=2),
    layers.Conv2D(filters=16, kernel_size=5, activation='relu'),
    layers.AveragePooling2D(pool_size=2, strides=2),
    layers.Flatten(),
    layers.Dense(120, activation='relu'),
    layers.Dense(84, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# `opt` is never defined in this notebook, so the optimizer is named directly. The string
# form also gives every model its own optimizer instance instead of sharing one.
# NOTE(review): Adam with Keras defaults is assumed — adjust if another optimizer or
# learning rate was intended.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])

In [None]:
# Train the convolutional model for 40 epochs (batch size 64), tracking loss / accuracy on the validation split.
history = model.fit(X_train, y_train, batch_size=64, epochs=40, validation_data=(X_val, y_val))

In [None]:
# Training vs validation curves for the 3-feature convolutional model.
plot_train_val_loss_acc(history)

In [None]:
# Evaluate the 3-feature convolutional model on the test split; the result holds the loss and the "acc" metric.
result_c_3 = model.evaluate(X_test, y_test)
result_c_3

### Result Summary

In [None]:
# Side-by-side test results of the four runs, each as returned by model.evaluate.
print('Baseline 1 Feature (CHL):', result_b_1)
print('Convolution 1 Feature (CHL):', result_c_1)
print('Baseline 3 Features:', result_b_3)
print('Convolution 3 Features:', result_c_3)