# Train Spectrogram Classifier

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from datetime import date
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from tensorflow import keras
from tensorflow.keras import layers
from tqdm.notebook import tqdm

from scripts.get_s2_data_ee import band_descriptions
from scripts.viz_tools import stretch_histogram, normalize

from scripts import dl_utils
from scripts.dl_utils import rect_from_point
from scripts.nn_predict import predict_spectrogram, make_predictions, visualize_predictions

# Seed NumPy's global random number generator so NumPy-based randomness in this
# notebook is repeatable between runs.
# NOTE(review): this presumably does not seed Keras/TensorFlow weight
# initialization -- confirm if fully reproducible training is required.
np.random.seed(1)

## Create a Training Dataset
Outputs will be: `x_train`, `y_train`, `x_test`, `y_test`, and optionally, `x_holdout`, `y_holdout`. Holdout data is only positive.

In [None]:
# Directory that holds the pickled pixel arrays and their label files.
train_data_dir = '../data/training_data/pixel_arrays_1mo-mosaics_6mo-gap/'

# Mosaic settings passed to the download/pairing helpers further down.
# NOTE(review): presumably expressed in months, matching the directory name -- confirm.
MOSAIC_PERIOD, MOSAIC_GAP = 1, 6

# Pixel-array pickles to pool into the training set. Comment an entry in or out
# to change which sources are used.
data_files = [
    'tpa_polygons_2019-01-01_2021-01-01_pixel_arrays.pkl',
    'java_v1.0_positive_polygons_2019-01-01_2021-01-01_pixel_arrays.pkl',
    'v_1.1.5_negatives_2019-01-01_2021-01-01_pixel_arrays.pkl',
    'city_points_30_negatives_2019-01-01_2021-01-01_pixel_arrays.pkl',
    'bali_bare_earth_negatives_2019-01-01_2021-01-01_pixel_arrays.pkl',
    'bali_bootstrap_negatives_2019-01-01_2021-01-01_pixel_arrays.pkl',
    # 'lombok_v1.1.7_negative_polygons_2019-01-01_2021-01-01_pixel_arrays.pkl',
]

# Each label file name is its data file name with the trailing "s.pkl" replaced
# by "_labels.pkl" (i.e. "..._pixel_arrays.pkl" -> "..._pixel_array_labels.pkl").
label_files = [name.partition('s.pkl')[0] + '_labels.pkl' for name in data_files]

# Path (relative to train_data_dir) of a holdout set of positive-class-only
# pixel arrays. Leave as an empty string to skip the holdout evaluation.
holdout_file = ''

In [None]:
# Read every data/label pickle pair and pool the contents into two flat lists.
# NOTE: pickle can execute arbitrary code while loading; only point this at
# files from a trusted source.
pixel_arrays, labels = [], []
file_pairs = zip(data_files, label_files)
for data_name, label_name in tqdm(file_pairs, total=len(data_files)):
    data_path = os.path.join(train_data_dir, data_name)
    with open(data_path, 'rb') as data_handle:
        pixel_arrays.extend(pickle.load(data_handle))
    label_path = os.path.join(train_data_dir, label_name)
    with open(label_path, 'rb') as label_handle:
        labels.extend(pickle.load(label_handle))

# Convert to arrays so the samples can be selected with boolean label masks.
pixel_arrays = np.array(pixel_arrays)
labels = np.array(labels)
is_positive = labels == 1
is_negative = labels == 0
positive_arrays = pixel_arrays[is_positive]
negative_arrays = pixel_arrays[is_negative]

print(f"Loaded {len(positive_arrays):,} positive pixel arrays and {len(negative_arrays):,} negative pixel arrays")

In [None]:
# Load the optional holdout pixel arrays; skipped when holdout_file is empty.
if holdout_file:
    holdout_path = os.path.join(train_data_dir, holdout_file)
    with open(holdout_path, 'rb') as holdout_handle:
        holdout_pixel_arrays = np.array(pickle.load(holdout_handle))

### Filter positive samples such that NDVI is within a range
This is useful since the positive patches can include surrounding vegetation

In [None]:
def compute_ndvi(pixel_arrays):
    """Return the normalized difference of index 7 and index 3 along axis 1.

    Computes ``(a - b) / (a + b)`` where ``a = pixel_arrays[:, 7]`` plays the
    near-infrared role and ``b = pixel_arrays[:, 3]`` plays the red role in the
    NDVI formula.

    NOTE(review): presumably axis 1 is the band axis and indices 7 and 3 are the
    Sentinel-2 NIR and red bands -- confirm against the band ordering of the
    training data.
    """
    nir = pixel_arrays[:, 7]
    red = pixel_arrays[:, 3]
    return (nir - red) / (nir + red)

def filter_ndvi(data, lower_bound=0, upper_bound=0.4):
    """Keep the samples whose NDVI is strictly inside a range at every position.

    Args:
        data: NumPy array of samples in the layout `compute_ndvi` expects
            (samples along axis 0).
        lower_bound: Exclusive lower NDVI limit.
        upper_bound: Exclusive upper NDVI limit.

    Returns:
        The rows of `data` for which every NDVI value along axis 1 of the NDVI
        array lies strictly between the two bounds. Samples whose NDVI is NaN
        (zero denominator) fail the comparison and are dropped.
    """
    ndvi = compute_ndvi(data)
    in_range = np.logical_and(ndvi > lower_bound, ndvi < upper_bound)
    filtered_data = data[in_range.all(axis=1)]
    # An empty input (e.g. nothing was loaded) would otherwise raise
    # ZeroDivisionError here; report 0.0% instead.
    fraction_kept = len(filtered_data) / len(data) if len(data) else 0.0
    print(f"{fraction_kept:.1%} of samples within NDVI range")
    return filtered_data

def filter_bright(data, brightness_threshold=2500):
    """Drop samples whose mean value is at or above a brightness threshold.

    Args:
        data: NumPy array with samples along axis 0 and at least two further
            axes; the mean is taken over axes 1 and 2 of each sample.
        brightness_threshold: Exclusive upper limit on a sample's mean value.

    Returns:
        The rows of `data` whose mean over axes (1, 2) is strictly below
        `brightness_threshold`.
    """
    mean_brightness = np.mean(data, axis=(1, 2))
    filtered_data = data[mean_brightness < brightness_threshold]
    # An empty input (e.g. an earlier filter removed every sample) would
    # otherwise raise ZeroDivisionError here; report 0.0% instead.
    fraction_kept = len(filtered_data) / len(data) if len(data) else 0.0
    print(f"{fraction_kept:.1%} of data below brightness limit")
    return filtered_data

In [None]:
# Apply both positive-sample filters in sequence: NDVI range first, then brightness.
filtered_positive_arrays = filter_bright(filter_ndvi(positive_arrays))

### Combine data and create train test split
Also append a trailing dimension of size 1 to each sample, which serves as the channel axis expected by the `Conv2D` layers.

In [None]:
# Stack the filtered positives ahead of the negatives, with matching 1/0 labels.
x = np.concatenate((filtered_positive_arrays, negative_arrays))
y = np.concatenate((np.ones(len(filtered_positive_arrays)), np.zeros(len(negative_arrays))))

x, y = shuffle(x, y, random_state=42)
# NOTE(review): normalize() runs on the full dataset before the split. If it
# derives statistics from its input, test data influences the training inputs --
# confirm against scripts.viz_tools.normalize.
x = normalize(x)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
# Append a trailing axis of length 1 to every sample.
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)

print("Num Train Samples:\t\t", len(x_train))
print("Num Test Samples:\t\t", len(x_test))
# np.mean over the boolean mask gives the negative fraction without iterating
# the array element by element in Python.
print(f"Percent Negative Train:\t {np.mean(y_train == 0.0):.1%}")
print(f"Percent Negative Test:\t {np.mean(y_test == 0.0):.1%}")
print(f"Input data shape: {x_train.shape}")

# Note: I am accustomed to assigning two classes for binary classification.
# This habit comes from an issue in theano a long time ago, but I'm too superstitious to change it.
# One-hot encode the labels: column 0 marks negatives, column 1 marks positives.
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [None]:
# Run the holdout arrays through the same filtering, normalization, and
# trailing-axis expansion as the training data. Every holdout sample is labeled
# positive (class 1).
if holdout_file:
    filtered_holdout_arrays = filter_bright(filter_ndvi(holdout_pixel_arrays))
    x_holdout = np.expand_dims(normalize(filtered_holdout_arrays), -1)
    holdout_labels = np.ones(len(x_holdout))
    y_holdout = keras.utils.to_categorical(holdout_labels, num_classes)
    print("Num Holdout Samples:\t\t", len(x_holdout))

## Create and Train a Model

In [None]:
# The shape of a single training sample is the network's input shape.
input_shape = x_train[0].shape

# Two convolutions over the sample, flattened for the dense layers.
conv_stack = [
    layers.Conv2D(16, kernel_size=(3, 2), activation="relu"),
    layers.Conv2D(32, kernel_size=(3, 1), activation="relu"),
    layers.Flatten(),
]

# Three 32-unit dense layers with light dropout, ending in a softmax over the
# two classes.
dense_head = [
    layers.Dense(32, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dropout(0.1),
    layers.Dense(32, activation="relu"),
    layers.Dense(num_classes, activation="softmax"),
]

model = keras.Sequential([keras.Input(shape=input_shape), *conv_stack, *dense_head])
model.summary()

### Optional Class Weighting
In experimental testing, I found that weighting classes seemed to degrade performance. This could use further investigation. To try it, convert the class-weighting cell from Raw NBConvert back to Code and run it.

In [None]:
# Compile model. Note that many of these metrics are extraneous,
# but they can be useful to track during training.
# Each metric's display name matches the metric it computes, so the training
# logs report precision and recall under the correct labels.
model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=[keras.metrics.Precision(thresholds=0.7, name='precision'),
                       keras.metrics.Recall(thresholds=0.7, name='recall'),
                       keras.metrics.AUC(curve='PR', name='auc'),
                       "accuracy"],
              #loss_weights = sum(y_train) / len(y_train),
              #weighted_metrics = ['accuracy']
             )

# Accuracy histories; these lists are extended after each training run.
train_accuracy = []
test_accuracy = []

### Train the Model

In [None]:
# Training hyperparameters.
batch_size, epochs = 128, 15

# Fit on the training split, using the test split as validation data.
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
    # validation_split=0.1,
    # class_weight={0: negative_weight, 1: positive_weight}
)

In [None]:
# Append the most recent fit's per-epoch accuracies to the running histories.
# NOTE: re-running this cell without fitting again appends the same history twice.
train_accuracy += model.history.history['accuracy']
test_accuracy += model.history.history['val_accuracy']
plt.figure(figsize=(8,5), dpi=100, facecolor=(1,1,1))
plt.plot(train_accuracy, label='Train Acc')
plt.plot(test_accuracy, c='r', label='Val Acc')
# y_train is one-hot encoded, so column 1 is zero exactly for negative samples.
percent_negative = np.mean(y_train[:, 1] == 0.0)
# The histories accumulate across repeated fits, so span every plotted epoch
# rather than only the epoch count of the latest fit.
last_epoch = max(len(train_accuracy) - 1, 0)
plt.plot([0, last_epoch], [percent_negative, percent_negative], '--', c='gray', label='Baseline')
plt.grid()
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Network Train and Val Accuracy')
plt.show()

In [None]:
# Positive-class probability above which a sample is predicted as TPA.
threshold = 0.8
# Pass the label set explicitly so the report always has both rows. Without it,
# classification_report raises ValueError when only one class occurs in the
# labels and predictions, which can happen on the positive-only holdout set.
class_labels = [0, 1]
class_names = ['No TPA', 'TPA']

print("Test Set Metrics:")
test_predictions = model.predict(x_test)[:, 1] > threshold
print(classification_report(y_test[:, 1], test_predictions,
                            labels=class_labels, target_names=class_names))

if holdout_file:
    print("\nHoldout Positive Set Metrics:")
    holdout_predictions = model.predict(x_holdout)[:, 1] > threshold
    print(classification_report(y_holdout[:, 1], holdout_predictions,
                                labels=class_labels, target_names=class_names))

## Save Model

In [None]:
version_number = '0.0.4'

# Name the model after its version and today's date.
current_date = date.today()
model_name = f"spectrogram_v{version_number}_{current_date.isoformat()}"
model_path = '../models/' + model_name + '.h5'
# Refuse to overwrite an existing model. An explicit raise is used because
# assert statements are skipped when Python runs with -O.
if os.path.exists(model_path):
    raise FileExistsError(f"Model of name {model_name} already exists")

# Write a plain-text record of the inputs, settings, and test-set performance
# next to the model file.
with open('../models/' + model_name + '_config.txt', 'w') as f:
    f.write('Input Data:\n')
    f.writelines(file + '\n' for file in data_files)
    f.write(f"\nBatch Size: {batch_size}")
    # train_accuracy holds one entry per trained epoch across all fits.
    f.write(f"\nTraining Epochs: {len(train_accuracy)}")
    f.write('\n\nClassification Report\n')
    f.write(classification_report(y_test[:,1], model.predict(x_test)[:,1] > threshold,
                                  target_names=['No TPA', 'TPA']))
model.save(model_path)

## Visualize Network Predictions

In [None]:
# Load a previously saved model for the visualizations below. The path is
# relative, like the other model paths in this notebook, so it does not depend
# on one user's home directory.
# NOTE(review): assumes the notebook runs from a directory that is a sibling of
# models/ -- confirm.
model = keras.models.load_model('../models/spectrogram_v0.0.4_2021-05-19.h5')

In [None]:
# TPA Kebon Kongok on Lombok
coords, rect_width = [116.0908, -8.6451], 0.02
start_date, mosaic_end_date = '2020-05-01', '2021-05-01'

# Download mosaics for a rectangle around the site, pair them up, and run the
# model on every pair.
kebon_rect = rect_from_point(coords, rect_width)
mosaics = dl_utils.download_mosaics(kebon_rect, start_date, mosaic_end_date, MOSAIC_PERIOD)
kebon = dl_utils.pair(mosaics, MOSAIC_GAP)
print(f'{len(kebon)} pairs returned')

preds_kebon = [predict_spectrogram(mosaic_pair, model) for mosaic_pair in kebon]

# for comparison with pre-spectrogram baselines only
end_date = '2020-06-01'
kebon_patches = dl_utils.download_patch(rect_from_point(coords, rect_width), start_date, end_date)

In [None]:
# TPA Alak, West Timor
coords, rect_width = [123.546910, -10.200799], 0.02
start_date, mosaic_end_date = '2020-05-01', '2021-05-01'

# Download mosaics for a rectangle around the site, pair them up, and run the
# model on every pair.
alak_rect = rect_from_point(coords, rect_width)
mosaics = dl_utils.download_mosaics(alak_rect, start_date, mosaic_end_date, MOSAIC_PERIOD)
alak = dl_utils.pair(mosaics, MOSAIC_GAP)
print(f'{len(alak)} pairs returned')

preds_alak = [predict_spectrogram(mosaic_pair, model) for mosaic_pair in alak]

In [None]:
# TPA Lelowai, West Timor, large box
coords, rect_width = [124.892853, -9.196869], 0.04
start_date, mosaic_end_date = '2020-05-01', '2021-05-01'

# Download mosaics for a rectangle around the site, pair them up, and run the
# model on every pair.
lelowai_rect = rect_from_point(coords, rect_width)
mosaics = dl_utils.download_mosaics(lelowai_rect, start_date, mosaic_end_date, MOSAIC_PERIOD)
lelowai = dl_utils.pair(mosaics, MOSAIC_GAP)
print(f'{len(lelowai)} pairs returned')

preds_lelowai = [predict_spectrogram(mosaic_pair, model) for mosaic_pair in lelowai]

In [None]:
# Single pair predictions: show the Kebon Kongok pair at index 2 together with
# its own prediction, wrapped in a one-element list.
# NOTE(review): threshold is presumably the score above which a pixel is
# highlighted -- confirm in scripts.nn_predict.visualize_predictions.
visualize_predictions(kebon[2], [preds_kebon[2]], threshold=0.8)

In [None]:
# Averaging over pairs' predictions: pass element [1] of every Kebon Kongok
# pair along with the full list of per-pair predictions.
visualize_predictions([p[1] for p in kebon], preds_kebon, threshold=0.8)

In [None]:
# Compare to a baseline model: load an earlier saved model, run it on the
# Kebon Kongok patches, and visualize its predictions with the same threshold.
baseline_model = keras.models.load_model('../models/65_mo_tpa_bootstrap_toa-12-20-2020.h5')
pred_stack_baseline = make_predictions(kebon_patches, baseline_model)
visualize_predictions(kebon_patches, pred_stack_baseline, threshold=0.8)

### Show timeseries predictions

In [None]:
def time_series(pairs, preds, threshold=0.8):
    """Plot one RGB panel per pair with above-threshold predictions highlighted.

    Args:
        pairs: Sequence of pairs; element [1] of each pair is indexed as a
            3-D array whose last axis is sliced with 3:0:-1 to form the image.
            NOTE(review): presumably (rows, cols, bands) with indices 3, 2, 1
            being red, green, blue -- confirm against the mosaic band order.
        preds: Sequence of prediction arrays, one per pair, usable as a
            boolean mask over the image once compared against `threshold`.
        threshold: Predictions strictly above this value are painted red.
    """
    # plt.subplots cannot create a grid with zero columns.
    if len(preds) == 0:
        return
    # squeeze=False always yields a 2-D array of Axes, so a single prediction
    # works too (by default a 1x1 grid returns a bare Axes that cannot be zipped).
    _, axes = plt.subplots(1, len(preds), figsize=(30, 30), squeeze=False)
    for ax, pair, pred in zip(axes[0], pairs, preds):
        rgb = normalize(pair[1][:, :, 3:0:-1])
        # Overwrite pixels predicted above the threshold with a solid red.
        rgb[pred > threshold] = [0.9, 0, 0.1]
        ax.imshow(np.clip(rgb, 0, 1))
        ax.axis('off')

In [None]:
# Per-pair prediction overlays for TPA Kebon Kongok.
time_series(kebon, preds_kebon)

In [None]:
# Per-pair prediction overlays for TPA Alak.
time_series(alak, preds_alak)

In [None]:
# Per-pair prediction overlays for TPA Lelowai.
time_series(lelowai, preds_lelowai)