# Create Datasets
This notebook is the source for downloading Sentinel data for the spectral classifier

### Inputs
Sites are generated from geojson inputs. The positive TPA sites are defined by polygons, and the negative sites are defined by points. 
New negative sites can be added by following the example of the bootstrap dataset

### Raw Data:
The output is a dictionary with a structure `[date][site_name][band][band_img]`

### Pixel Vectors:
The output is a list of vectors. `[num_vectors][bands]`

In [1]:
import json
import os
import pickle

import ee
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import sys
sys.path.append('../')
from scripts.get_s2_data_ee import get_history, get_history_polygon, get_pixel_vectors

%load_ext autoreload
%autoreload 2

In [3]:
# Sentinel 2 band descriptions
band_descriptions = {
    'B1': 'Aerosols, 442nm',
    'B2': 'Blue, 492nm',
    'B3': 'Green, 559nm',
    'B4': 'Red, 665nm',
    'B5': 'Red Edge 1, 704nm',
    'B6': 'Red Edge 2, 739nm',
    'B7': 'Red Edge 3, 779nm',
    'B8': 'NIR, 833nm',
    'B8A': 'Red Edge 4, 864nm',
    'B9': 'Water Vapor, 943nm',
    'B11': 'SWIR 1, 1610nm',
    'B12': 'SWIR 2, 2186nm'
}

In [73]:
def load_geojson(file_name):
    """Load points saved as a GeoJSON and return a dictionary"""
    with open(os.path.join(DATA_DIR, file_name)) as f:
        sites = json.load(f)
    f.close()

    sampling_df = pd.DataFrame({
        'name': [file_name.split('_')[0] + '_' + str(index) for index in range(len(sites['features']))],
        'lon': [site['geometry']['coordinates'][0] for site in sites['features']],
        'lat': [site['geometry']['coordinates'][1] for site in sites['features']],
        'coords': [site['geometry']['coordinates'][0:2] for site in sites['features']],
    })
    
    return sampling_df

def load_csv(file_name):
    sampling_df = pd.read_csv(os.path.join(DATA_DIR, file_name), converters={'coords': eval})
    
    return sampling_df

def sample_adjacent(tpa_sites, offset, direction='east'):
    """
    Outputs a data frame of sampling locations based on a distance
    and direction from each TPA site.
    This can be used for adjacent site sampling, or to create "random" negative sites if the
    offset distance is set further away from the TPA location.
    Returns a data frame
    """
    if  'east' in direction.lower():
        adjacent_sites = pd.DataFrame({
            'name': [f"{name}_{direction.lower()}_{offset}" for name in tpa_sites['name']],
            'lon': [lon + offset for lon in tpa_sites['lon']],
            'lat': [lat for lat in tpa_sites['lat']],
            'coords': [[lon + offset, lat] for lon, lat in zip(tpa_sites['lon'], tpa_sites['lat'])]
        })
        
    if  'west' in direction.lower():
        adjacent_sites = pd.DataFrame({
            'name': [f"{name}_{direction.lower()}_{offset}" for name in tpa_sites['name']],
            'lon': [lon - offset for lon in tpa_sites['lon']],
            'lat': [lat for lat in tpa_sites['lat']],
            'coords': [[lon + offset, lat] for lon, lat in zip(tpa_sites['lon'], tpa_sites['lat'])]
        })
    
    if  'north' in direction.lower():
        adjacent_sites = pd.DataFrame({
            'name': [f"{name}_{direction.lower()}_{offset}" for name in tpa_sites['name']],
            'lon': [lon for lon in tpa_sites['lon']],
            'lat': [lat + offset for lat in tpa_sites['lat']],
            'coords': [[lon + offset, lat] for lon, lat in zip(tpa_sites['lon'], tpa_sites['lat'])]
        })
    
    if  'south' in direction.lower():
        adjacent_sites = pd.DataFrame({
            'name': [f"{name}_{direction.lower()}_{offset}" for name in tpa_sites['name']],
            'lon': [lon for lon in tpa_sites['lon']],
            'lat': [lat - offset for lat in tpa_sites['lat']],
            'coords': [[lon + offset, lat] for lon, lat in zip(tpa_sites['lon'], tpa_sites['lat'])]
        })
    
    return adjacent_sites

def save_patch_history(data, name, label_class):
    file_name = f"{name}_raw_{num_months}_months_{start_date}"
    with open(os.path.join(OUTPUT_DIR, 'patch_histories', f"{file_name}_patch_history.pkl"),"wb") as f:
        pickle.dump(data, f)
        
    with open(os.path.join(OUTPUT_DIR, 'patch_histories', f"{file_name}_patch_history_labels.pkl"),"wb") as f:
        pickle.dump([label_class] * len(data), f)
        
        
def create_pixel_vectors(patch_history, num_months, holdout=False):
    # Decompose patch history into vectors
    # Output is month, pixel, band_value
    pixel_data = []
    if not holdout:
        for month in list(patch_history.keys())[:num_months]:
            pixel_vectors, width, height = get_pixel_vectors(patch_history, month)
            pixel_data.append(pixel_vectors)

    else:
        for month in list(patch_history.keys())[num_months:]:
            pixel_vectors, width, height = get_pixel_vectors(patch_history, month)
            pixel_data.append(pixel_vectors)
    # flatten all pixel_vectors into a flat set of vectors
    # num_vectors, num_bands
    pixel_vectors = []
    for month in pixel_data:
        for pixel in month:
            pixel_vectors.append(pixel)

    print(np.shape(pixel_vectors)[0], "pixel vectors")
    
    return pixel_vectors

def save_pixel_vectors(data, name, label_class):
    file_name = f"{name}_raw_{num_months}_months_{start_date}"
    with open(os.path.join(OUTPUT_DIR, 'pixel_vectors', f"{file_name}_pixel_vectors.pkl"),"wb") as f:
        pickle.dump(data, f)
        
    with open(os.path.join(OUTPUT_DIR, 'pixel_vectors', f"{file_name}_pixel_vector_labels.pkl"),"wb") as f:
        pickle.dump([label_class] * len(data), f)

# Load Sampling Locations

In [2]:
# Configuration:
# Set directory where training site json files are located and files are saved
# Set rect width for all patches that are not TPA sites
DATA_DIR = '../data/sampling_locations'
OUTPUT_DIR = '../data/training_data'

if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

### Load TPA Polygon Sites from GeoJSON

In [62]:
# Load TPA dataset
with open(os.path.join(DATA_DIR, 'tpa_points.geojson')) as f:
    tpa_points = json.load(f)

tpa_sites = pd.DataFrame({
    'name': [site['properties']['Name'] for site in tpa_points['features']],
    'lon': [site['geometry']['coordinates'][0] for site in tpa_points['features']],
    'lat': [site['geometry']['coordinates'][1] for site in tpa_points['features']],
    'area': [site['properties']['Surface_Ha'] for site in tpa_points['features']],
    'daily_volume': [site['properties']['TOT_Kg/Day'] for site in tpa_points['features']],
    'coords': [site['geometry']['coordinates'] for site in tpa_points['features']]
})


# Add earth engine TPA Polygons to TPA dataframe
with open(os.path.join(DATA_DIR, 'tpa_polygons.geojson'), 'r') as f:
    json_tpa = json.load(f)
f.close()
tpa_polygons = [ee.FeatureCollection([element]) for element in list(json_tpa['features'])]

tpa_sites['polygons'] = tpa_polygons
display(tpa_sites.head())

Unnamed: 0,name,lon,lat,area,daily_volume,coords,polygons
0,TPA Jungut Batu,115.459414,-8.670958,1.2,,"[115.45941439485306, -8.670958330781342]","ee.FeatureCollection({\n ""functionInvocationV..."
1,TPA Biaung,115.498017,-8.67993,1.85,9433.0,"[115.49801683267276, -8.679930042100876]","ee.FeatureCollection({\n ""functionInvocationV..."
2,TPA Sente,115.45446,-8.530372,1.0,43219.0,"[115.45446033358267, -8.530371792768301]","ee.FeatureCollection({\n ""functionInvocationV..."
3,TPA Regional Bangli,115.367927,-8.353542,0.99,47350.0,"[115.3679270185395, -8.353541681392851]","ee.FeatureCollection({\n ""functionInvocationV..."
4,TPA Peh,114.583295,-8.327938,2.0,38130.0,"[114.58329467897306, -8.327937523143966]","ee.FeatureCollection({\n ""functionInvocationV..."


### Load Sampling Sites from GeoJSON

In [45]:
sampling_df = load_geojson('city_points_30.geojson')
sampling_df.head()

Unnamed: 0,name,lon,lat,coords
0,city_0,114.619837,-8.361932,"[114.6198374623975, -8.361931821454325]"
1,city_1,115.218992,-8.682543,"[115.2189915773064, -8.682542635447703]"
2,city_2,115.152099,-8.803352,"[115.1520991337562, -8.803351890677076]"
3,city_3,115.448223,-8.676354,"[115.4482234100242, -8.676354239123828]"
4,city_4,115.552125,-8.674258,"[115.5521252514949, -8.674258048038155]"


### Sample Sites Adjacent to another List

In [18]:
adjacent_df = sample_adjacent(tpa_sites, 0.01, 'north')
adjacent_df.head()

Unnamed: 0,name,lon,lat,coords
0,TPA Jungut Batu_north_0.01,115.459414,-8.660958,"[115.46941439485306, -8.670958330781342]"
1,TPA Biaung_north_0.01,115.498017,-8.66993,"[115.50801683267277, -8.679930042100876]"
2,TPA Sente_north_0.01,115.45446,-8.520372,"[115.46446033358268, -8.530371792768301]"
3,TPA Regional Bangli_north_0.01,115.367927,-8.343542,"[115.37792701853951, -8.353541681392851]"
4,TPA Peh_north_0.01,114.583295,-8.317938,"[114.59329467897307, -8.327937523143966]"


### Load Sampling Sites from CSV

In [61]:
sampling_df = load_csv('negative_sites.csv')
sampling_df.head()

Unnamed: 0,name,lon,lat,coords
0,city_0,114.619837,-8.361932,"[114.6198374623975, -8.361931821454325]"
1,city_1,115.218992,-8.682543,"[115.2189915773064, -8.682542635447703]"
2,city_2,115.152099,-8.803352,"[115.1520991337562, -8.803351890677076]"
3,city_3,115.448223,-8.676354,"[115.4482234100242, -8.676354239123828]"
4,city_4,115.552125,-8.674258,"[115.5521252514949, -8.674258048038155]"


### Write Sampling Sites to CSV

In [34]:
sampling_df.to_csv(os.path.join(DATA_DIR, 'negative_sites_test.csv'), index=False)

# Download Data

In [35]:
RECT_WIDTH = 0.002

### Download Patch History

In [54]:
# Create a list of patch histories
# Each patch history is a dictionary with the format:
# patch_history[date][site_name][band][band_img]
# This function takes a while to run as it is extracting data from GEE

site_list = sampling_df
num_months = 1
start_date = '2019-01-01'

patch_history = get_history(site_list['coords'], 
                              site_list['name'], 
                              RECT_WIDTH,
                              num_months = num_months,
                              start_date = start_date,
                              cloud_mask = True)

save_patch_history(patch_history, 'city_points_30', 0)

  0%|          | 0/1 [00:00<?, ?it/s]

Downloading city_0
Downloading city_1
Downloading city_2
Downloading city_3
Downloading city_4
Downloading city_5
Downloading city_6
Downloading city_7
Downloading city_8
Downloading city_9
Downloading city_10
Downloading city_11
Downloading city_12
Downloading city_13
Downloading city_14
Downloading city_15
Downloading city_16
Downloading city_17
Downloading city_18
Downloading city_19
Downloading city_20
Downloading city_21
Downloading city_22
Downloading city_23
Downloading city_24
Downloading city_25
Downloading city_26
Downloading city_27
Downloading city_28
Downloading city_29
Downloading city_30


100%|██████████| 1/1 [01:49<00:00, 109.43s/it]


### Download TPA Polygon History

In [68]:
# Get patch histories for TPA sites
num_months = 2
start_date = '2020-01-01'
tpa_patch_history = get_history_polygon(tpa_sites['coords'], 
                                        tpa_sites['name'], 
                                        tpa_sites['polygons'], 
                                        4 * RECT_WIDTH,
                                        start_date = start_date,
                                        num_months = num_months,
                                       )
save_patch_history(tpa_patch_history, 'tpa_sites', 1)

  0%|          | 0/2 [00:00<?, ?it/s]

Processing TPA Jungut Batu
Processing TPA Biaung
Processing TPA Sente
Processing TPA Regional Bangli
Processing TPA Peh
Processing TPA Temesi
Processing TPA Bengkala
Processing TPA Bebandem
Processing TPA Mandung
Processing TPA Regional Suwung


 50%|█████     | 1/2 [01:19<01:19, 79.11s/it]

Processing TPA Jungut Batu
Processing TPA Biaung
Processing TPA Sente
Processing TPA Regional Bangli
Processing TPA Peh
Processing TPA Temesi
Processing TPA Bengkala
Processing TPA Bebandem
Processing TPA Mandung
Processing TPA Regional Suwung


100%|██████████| 2/2 [02:06<00:00, 63.24s/it]


# Create Pixel Vectors

In [75]:
pixel_vectors = create_pixel_vectors(patch_histories, len(patch_histories))
save_pixel_vectors(pixel_vectors, 'city_points_30', 0)

10764 pixel vectors


### Pixel Vectors with a Holdout Dataset

In [76]:
# holdout_months refers to a strategy of holding out the last n months of data for validation
# Set this value to the number of months you want to separate from the training data

holdout_months = 3

pixel_vectors = create_pixel_vectors(tpa_patch_history, len(tpa_patch_history) - holdout_months)
save_pixel_vectors(pixel_vectors, 'tpa_train', 1)

holdout_pixel_vectors = create_pixel_vectors(tpa_patch_history, -holdout_months, holdout=True)
save_pixel_vectors(holdout_pixel_vectors, 'tpa_holdout', 1)

2547 pixel vectors
4269 pixel vectors


## Create Spatial Patches

In [None]:
def create_img_stack(patch_history):
    img_stack = []
    for date in patch_history:
        for site in patch_history[date]:
            spectral_stack = []
            band_shapes = [np.shape(patch_history[date][site][band]) for band in band_descriptions]
            if np.array(band_shapes).all() > 0:
                for band in band_descriptions:
                    spectral_stack.append(patch_history[date][site][band])
                if np.min(spectral_stack) > 0:
                    img_stack.append(np.rollaxis(np.array(spectral_stack), 0, 3))
    return img_stack

In [None]:
positive_patches = create_img_stack(tpa_patch_histories)
print(len(positive_patches), 'positive images extracted')

negative_patches = create_img_stack(negative_patch_histories)
print(len(negative_patches), 'negative images extracted')

In [None]:
# Save positive patch history
with open(os.path.join(OUTPUT_DIR, f"positive_patches_toa_{num_positive_months}_{positive_start_date}.pkl"),"wb") as f:
    pickle.dump(positive_patches, f)
    
# Save negative patch history
with open(os.path.join(OUTPUT_DIR, f"negative_patches_toa_{num_positive_months}_{positive_start_date}.pkl"),"wb") as f:
    pickle.dump(negative_patches, f)