In [None]:
from functools import partial
import json
import os
import pickle

from datetime import datetime
import ee
import geemap
from multiprocessing.dummy import Pool as ThreadPool
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import sys
sys.path.append('../')
from scripts.get_s2_data_ee import get_history, get_history_polygon, get_pixel_vectors

%load_ext autoreload
%autoreload 2

In [None]:
# Configuration:
# DATA_DIR   - directory where the training site json files are located; the pickled
#              outputs produced later in this notebook are written here as well.
# RECT_WIDTH - rect width for all patches that are not TPA sites (the TPA patches are
#              requested with 4 * RECT_WIDTH).
#              NOTE(review): units are not visible here; presumably degrees, like the
#              lon/lat offsets used for adjacent sampling -- confirm against get_history.
DATA_DIR = '../data/training_sites'
RECT_WIDTH = 0.002
# Initialise the Earth Engine client before any GEE request is made.
# NOTE(review): presumably needs credentials from a prior Earth Engine authentication -- confirm.
ee.Initialize()

In [None]:
def load_points(file_name):
    """Load point features saved as a GeoJSON file and return them as a site table.

    Parameters
    ----------
    file_name : str
        Name of a GeoJSON file located in ``DATA_DIR``. The text before the first
        underscore becomes the site-name prefix
        (e.g. ``'city_points.json'`` -> ``'city'``).

    Returns
    -------
    pandas.DataFrame
        One row per feature, with the columns ``name`` (``<prefix>_<index>``),
        ``lon``, ``lat`` and ``coords`` (the first two coordinate values of the
        feature geometry).
    """
    # The context manager closes the file on exit; no explicit close() is needed.
    with open(os.path.join(DATA_DIR, file_name)) as f:
        features = json.load(f)['features']

    prefix = file_name.split('_')[0]
    coordinates = [feature['geometry']['coordinates'] for feature in features]

    return pd.DataFrame({
        'name': [f"{prefix}_{index}" for index in range(len(features))],
        'lon': [point[0] for point in coordinates],
        'lat': [point[1] for point in coordinates],
        # Keep only the first two values so a trailing third coordinate is dropped.
        'coords': [point[0:2] for point in coordinates],
    })

In [None]:
def sample_adjacent(tpa_sites, offset, direction='east'):
    """
    Build a data frame of sampling locations shifted by a fixed distance and
    direction from each TPA site.

    This can be used for adjacent site sampling, or to create "random" negative sites if the
    offset distance is set further away from the TPA location.

    Parameters
    ----------
    tpa_sites : pandas.DataFrame
        Table with at least the columns ``name``, ``lon`` and ``lat``.
    offset : float
        Distance to shift each site by, in the same units as ``lon``/``lat``.
    direction : str
        Case-insensitive text containing 'east', 'west', 'north' or 'south'.
        If more than one keyword is present, the last match in that order is used.

    Returns
    -------
    pandas.DataFrame
        One row per input site with the columns ``name``
        (``<site name>_<direction>_<offset>``), ``lon``, ``lat`` and ``coords``
        (``[lon, lat]`` of the shifted location).

    Raises
    ------
    ValueError
        If ``direction`` contains none of the four keywords.
    """
    direction_key = direction.lower()

    # (keyword, lon shift, lat shift); later entries override earlier ones.
    shifts = (
        ('east', offset, 0),
        ('west', -offset, 0),
        ('north', 0, offset),
        ('south', 0, -offset),
    )
    matched = [(dlon, dlat) for keyword, dlon, dlat in shifts if keyword in direction_key]
    if not matched:
        raise ValueError(
            f"direction must contain 'east', 'west', 'north' or 'south', got {direction!r}"
        )
    dlon, dlat = matched[-1]

    lons = [lon + dlon for lon in tpa_sites['lon']]
    lats = [lat + dlat for lat in tpa_sites['lat']]

    return pd.DataFrame({
        'name': [f"{name}_{direction_key}_{offset}" for name in tpa_sites['name']],
        'lon': lons,
        'lat': lats,
        # Derive coords from the shifted lon/lat so the three columns always agree.
        'coords': [[lon, lat] for lon, lat in zip(lons, lats)],
    })

In [None]:
# Load TPA dataset
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(os.path.join(DATA_DIR, 'tpa_points.json')) as f:
    tpa_points = json.load(f)

# One row per TPA point feature: name, location, surface area and daily volume.
tpa_features = tpa_points['features']
tpa_sites = pd.DataFrame({
    'name': [site['properties']['Name'] for site in tpa_features],
    'lon': [site['geometry']['coordinates'][0] for site in tpa_features],
    'lat': [site['geometry']['coordinates'][1] for site in tpa_features],
    'area': [site['properties']['Surface_Ha'] for site in tpa_features],
    'daily_volume': [site['properties']['TOT_Kg/Day'] for site in tpa_features],
    'coords': [site['geometry']['coordinates'] for site in tpa_features]
})


# Add TPA Polygons to TPA dataframe
with open(os.path.join(DATA_DIR, 'tpa_polygons', 'tpa_polygons.json'), 'r') as f:
    json_tpa = json.load(f)

# Each polygon feature is wrapped in its own single-element ee.FeatureCollection.
# NOTE(review): polygons are attached to the sites purely by position, so this assumes
# tpa_polygons.json lists its features in the same order as tpa_points.json -- confirm.
tpa_polygons = [ee.FeatureCollection([element]) for element in list(json_tpa['features'])]
tpa_sites['polygons'] = tpa_polygons
display(tpa_sites.head())


In [None]:
# Data frames of negative (non-TPA) sampling sites.
# The first two are hand-picked point sets loaded from disk; the remaining ones are
# generated by shifting every TPA site by the listed (offset, direction) pair.
negative_site_list = [
    load_points('city_points.json'),
    load_points('bare_earth_points.json'),
    *(
        sample_adjacent(tpa_sites, offset, direction)
        for offset, direction in (
            (0.008, 'north'),
            (0.01, 'south'),
            (0.01, 'east'),
            (0.01, 'west'),
            (0.05, 'east'),
            (0.1, 'north'),
            (0.2, 'west'),
            (0.1, 'east'),
            (0.5, 'east'),
        )
    ),
]

In [None]:
# Build one patch history per negative site table, in the order of negative_site_list.
# Each patch history is a dictionary with the format:
# patch_history[date][site_name][band][band_img]
# Extracting the data from GEE is slow, so this cell takes a while to run.
patch_histories = [
    get_history(site['coords'], site['name'], RECT_WIDTH)
    for site in negative_site_list
]

In [None]:
# Decompose patch history into vectors
# Output is site_type, month, pixel, band_value
negative_pixel_vectors = []
for patch_type in tqdm(patch_histories):
    vectors = []
    for month in patch_type.keys():
        # get_pixel_vectors also returns a width and height, which are not needed here.
        pixel_vectors, _width, _height = get_pixel_vectors(patch_type, month)
        vectors.append(pixel_vectors)
    negative_pixel_vectors.append(vectors)

# flatten all pixel_vectors into a flat set of vectors
# num_vectors, num_bands
negative_data = [
    pixel
    for site_type in negative_pixel_vectors
    for month in site_type
    for pixel in month
]

# len() counts the vectors directly; np.shape() would first copy the whole list into an array.
print("Number of Negative Train Samples:", len(negative_data))

In [None]:
# Patch histories for the TPA sites, requested with each site's polygon.
# The rect width passed here is four times the one used for the negative sites.
tpa_patch_histories = get_history_polygon(
    tpa_sites['coords'],
    tpa_sites['name'],
    tpa_sites['polygons'],
    4 * RECT_WIDTH,
)

In [None]:
# holdout_months refers to a strategy of holding out the last n months of data for validation
# Set this value to the number of months you want to separate from the training data

holdout_months = 2


def _flatten_pixel_vectors(history, months):
    """Return one flat list (num_vectors, num_bands) of the pixel vectors of `months`."""
    flat_vectors = []
    for month in months:
        # get_pixel_vectors also returns a width and height, which are not needed here.
        pixel_vectors, _width, _height = get_pixel_vectors(history, month)
        flat_vectors.extend(pixel_vectors)
    return flat_vectors


if holdout_months < 0:
    raise ValueError(f"holdout_months must be >= 0, got {holdout_months}")

# Split on an explicit index rather than on negative slices: with holdout_months = 0,
# [:-0] is empty and [-0:] is everything, which would put all months in the test set.
tpa_months = list(tpa_patch_histories.keys())
split_index = max(len(tpa_months) - holdout_months, 0)

# Training set: every month except the held-out ones.
positive_train = _flatten_pixel_vectors(tpa_patch_histories, tpa_months[:split_index])
print("Number of Positive Train Samples:", len(positive_train))

# Test set: the last `holdout_months` months.
positive_test = _flatten_pixel_vectors(tpa_patch_histories, tpa_months[split_index:])
print("Number of positive test samples:", len(positive_test))

In [None]:
# Save patch histories and pixel vectors
# Each object is pickled to DATA_DIR under the file name given as its key.
pickle_outputs = {
    "negative_patch_histories.pkl": patch_histories,
    "tpa_patch_histories.pkl": tpa_patch_histories,
    "negative_data.pkl": negative_data,
    "positive_data.pkl": positive_train,
    "positive_data_test.pkl": positive_test,
}

for pickle_name, payload in pickle_outputs.items():
    # `with` guarantees the file is closed even if pickling fails part-way.
    with open(os.path.join(DATA_DIR, pickle_name), "wb") as f:
        pickle.dump(payload, f)

### Debug and Visualization

In [None]:
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib.lines import Line2D

In [None]:
def animate_patch_history(data, name):
    """
    Used for visualization and debugging. Takes a history dictionary and saves one video
    showing every timestep of every site in the history, site after site.

    Parameters
    ----------
    data : dict
        Patch history indexed as ``data[date][site_name][band]``. The bands 'B4', 'B3'
        and 'B2' are stacked (in that order) as the RGB channels of each frame. The site
        names are taken from the first date and looked up in every date.
    name : str
        Used as the figure title and as the video file name
        (``figures/videos/<name>.mp4``).
    """
    fig, ax = plt.subplots(dpi=100, facecolor=(1,1,1))
    ax.set_axis_off()
    # The title is the same for every frame, so set it once.
    ax.set_title(name)

    dates = list(data.keys())
    site_names = list(data[dates[0]])

    images = []
    for site_name in site_names:
        for date in dates:
            hyperpatch = data[date][site_name]
            rgb = np.stack((hyperpatch['B4'], hyperpatch['B3'], hyperpatch['B2']), axis=-1)
            # Skip empty patches; they would produce no frame.
            if len(rgb) > 0:
                # NOTE(review): 2000 presumably scales raw band values into a displayable
                # 0-1 range -- confirm against the band value range.
                im = ax.imshow(rgb / 2000, animated=True)
                images.append([im])
    fig.tight_layout()

    # Debug output: name parts (after the first underscore) of the last site processed.
    if site_names:
        print(site_names[-1].split('_')[1:])

    ani = animation.ArtistAnimation(fig, images, interval=60, blit=True, repeat_delay=500)
    # Create the output folder if needed so saving does not fail on a fresh checkout.
    video_dir = os.path.join('figures', 'videos')
    os.makedirs(video_dir, exist_ok=True)
    ani.save(os.path.join(video_dir, name + '.mp4'))

In [None]:
# Render one video for the TPA sites and one for the first negative site table
# (the points loaded from 'city_points.json').
for label, history in (('TPA', tpa_patch_histories), ('city', patch_histories[0])):
    animate_patch_history(history, label)

In [None]:
# Plot pixel spectral profiles to observe any anomalies
num_samples = 2000


def plot_spectral_profiles(positive, negative, positive_label, num_samples):
    """
    Overlay randomly sampled pixel vectors from a positive and a negative set.

    Parameters
    ----------
    positive, negative : sequence of pixel vectors
        Each vector is drawn as one faint line (red for positive, black for negative).
    positive_label : str
        Legend entry for the positive set.
    num_samples : int
        Number of vectors drawn (with replacement) from each set.
    """
    # Draw the indices up front and actually use them, so the plot is a random sample
    # and does not index past the end of a set holding fewer than num_samples vectors.
    pos_indices = np.random.randint(len(positive), size=num_samples)
    neg_indices = np.random.randint(len(negative), size=num_samples)

    fig, ax = plt.subplots(figsize=(8,5), dpi=150, facecolor=(1,1,1))
    for pos_index, neg_index in zip(pos_indices, neg_indices):
        ax.plot(positive[pos_index], c='r', alpha=0.01)
        ax.plot(negative[neg_index], c='k', alpha=0.01)

    # The plotted lines are nearly transparent, so the legend uses solid proxy lines.
    custom_lines = [Line2D([0], [0], color='r', lw=2),
                    Line2D([0], [0], color='k', lw=2)]
    ax.legend(custom_lines, [positive_label, 'Negative'], loc='upper left')
    ax.set_title('Spectral Profiles of Positive and Negative Samples')
    plt.show()


plot_spectral_profiles(positive_test, negative_data, 'TPA Test', num_samples)
plot_spectral_profiles(positive_train, negative_data, 'TPA Train', num_samples)