# Create Pixel Dataset
This notebook is the source for downloading Sentinel data to produce inputs to the spectral classifier.

## Inputs
The notebook loads a set of coordinates from either a GeoJSON or a CSV file. For each location in the list, it downloads a patch of width `RECT_WIDTH` across a specified period of time.

Note: the sampling of TPA sites is different. It constructs a bounding box around the polygon geometry of a TPA site. The process is convoluted. Will need to do better sampling at some point.

## Outputs
### Raw Data (Patch Histories):
This is a minimally-processed form of the data. It is a dictionary of arrays with the structure `[date][site_name][band][band_img]`. These dictionaries can then be processed into pixel vectors, or could also be converted to 2D stacks of patches.

### Pixel Vectors:
The output is a list of vectors with the structure `[num_vectors][bands]`.

In [None]:
import json
import os
import pickle

import ee
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import sys
sys.path.append('../')
from scripts.get_s2_data_ee import get_history, get_history_polygon, get_pixel_vectors
from scripts.viz_tools import visualize_history

%load_ext autoreload
%autoreload 2

In [None]:
# Sentinel 2 band descriptions
# Maps each band ID to a human-readable label of the form "<name>, <wavelength>nm".
# Note that there is no entry for B10 in this table.
band_descriptions = {
    'B1': 'Aerosols, 442nm',
    'B2': 'Blue, 492nm',
    'B3': 'Green, 559nm',
    'B4': 'Red, 665nm',
    'B5': 'Red Edge 1, 704nm',
    'B6': 'Red Edge 2, 739nm',
    'B7': 'Red Edge 3, 779nm',
    'B8': 'NIR, 833nm',
    'B8A': 'Red Edge 4, 864nm',
    'B9': 'Water Vapor, 943nm',
    'B11': 'SWIR 1, 1610nm',
    'B12': 'SWIR 2, 2186nm'
}

In [None]:
def load_geojson(file_name):
    """Load point features from a GeoJSON file into a sampling data frame.

    Args:
        file_name: Name of a GeoJSON file located in ``DATA_DIR``. The text
            before the first underscore is used as the prefix of every
            generated site name.

    Returns:
        A ``pandas.DataFrame`` with one row per feature and the columns
        ``name`` (``<prefix>_<index>``), ``lon`` (first coordinate),
        ``lat`` (second coordinate) and ``coords`` (the first two
        coordinate values as a list).
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(os.path.join(DATA_DIR, file_name)) as f:
        sites = json.load(f)

    prefix = file_name.split('_')[0]
    points = [site['geometry']['coordinates'] for site in sites['features']]

    sampling_df = pd.DataFrame({
        'name': [f"{prefix}_{index}" for index in range(len(points))],
        'lon': [point[0] for point in points],
        'lat': [point[1] for point in points],
        # Slice so that an optional third (elevation) value is dropped
        'coords': [point[0:2] for point in points],
    })

    return sampling_df

def load_csv(file_name):
    """Load sampling locations from a CSV file located in ``DATA_DIR``.

    The ``coords`` column is stored as text in the CSV and is parsed back
    into a Python object (e.g. a ``[lon, lat]`` list) while reading.

    Args:
        file_name: Name of the CSV file inside ``DATA_DIR``.

    Returns:
        A ``pandas.DataFrame`` with the CSV contents and a parsed ``coords``
        column.
    """
    # Function-scope import so this block does not depend on the import cell.
    import ast

    # SECURITY: this previously used ``eval`` as the converter, which would
    # execute arbitrary expressions found in the file. ``literal_eval`` only
    # accepts Python literals (lists, tuples, numbers, strings, ...), which
    # is what a coordinate column written by ``DataFrame.to_csv`` contains.
    sampling_df = pd.read_csv(
        os.path.join(DATA_DIR, file_name),
        converters={'coords': ast.literal_eval},
    )

    return sampling_df

def sample_adjacent(tpa_sites, offset, direction='east'):
    """Build a data frame of sampling locations shifted away from each site.

    This can be used for adjacent site sampling, or to create "random"
    negative sites if the offset distance is set further away from the
    TPA location.

    Args:
        tpa_sites: Data frame (or mapping of columns) providing ``name``,
            ``lon`` and ``lat`` for every source site.
        offset: Amount added to / subtracted from the coordinate, in the
            same units as ``lon`` and ``lat``.
        direction: Text containing one of ``east``, ``west``, ``north`` or
            ``south`` (case-insensitive). If several keywords are present,
            the last one in that order wins, as in the original
            implementation.

    Returns:
        A ``pandas.DataFrame`` with the columns ``name``
        (``<site>_<direction>_<offset>``), ``lon``, ``lat`` and ``coords``
        (``[lon, lat]`` of the shifted location).

    Raises:
        ValueError: If ``direction`` contains none of the four keywords.
    """
    label = direction.lower()

    # (keyword, longitude shift, latitude shift), checked in the same order
    # as the original chain of ``if`` statements.
    shifts = (
        ('east', offset, 0),
        ('west', -offset, 0),
        ('north', 0, offset),
        ('south', 0, -offset),
    )

    shift = None
    for keyword, lon_shift, lat_shift in shifts:
        if keyword in label:
            shift = (lon_shift, lat_shift)

    if shift is None:
        raise ValueError(
            f"direction must contain east, west, north or south, got {direction!r}"
        )

    lon_shift, lat_shift = shift
    lons = [lon + lon_shift for lon in tpa_sites['lon']]
    lats = [lat + lat_shift for lat in tpa_sites['lat']]

    adjacent_sites = pd.DataFrame({
        'name': [f"{name}_{label}_{offset}" for name in tpa_sites['name']],
        'lon': lons,
        'lat': lats,
        # Derive coords from the shifted values so they always agree with the
        # lon/lat columns (previously every direction used lon + offset).
        'coords': [[lon, lat] for lon, lat in zip(lons, lats)],
    })

    return adjacent_sites

def save_patch_history(data, name, label_class):
    """Save a patch history, a preview image of it and its labels to disk.

    Three files are written to ``OUTPUT_DIR/patch_histories``: a PNG produced
    by ``visualize_history``, a pickle of ``data`` and a pickle holding a
    list of ``label_class`` values.

    The file names are built from ``name`` and from the notebook-level
    ``num_months`` and ``start_date`` variables, so those must describe the
    history being saved at the time of the call.

    Args:
        data: Patch history dictionary indexed as
            ``data[date][site_name][band]``; must contain a ``'B2'`` band.
        name: Prefix used for the output file names.
        label_class: Label value repeated in the labels file.
    """
    # Inspect the history that was passed in. The previous version read the
    # notebook-global ``patch_history`` here, which reported the wrong patch
    # size (or failed) when saving any other history.
    first_date = list(data.keys())[0]
    first_site = list(data[first_date].keys())[0]
    num_pixels = np.shape(data[first_date][first_site]['B2'])[0]
    file_name = f"{name}_raw_{num_months}_months_{start_date}"

    # Make sure the destination exists before anything is written to it.
    output_dir = os.path.join(OUTPUT_DIR, 'patch_histories')
    os.makedirs(output_dir, exist_ok=True)
    base_path = os.path.join(output_dir, f"{file_name}_{num_pixels}px_patch_history")

    visualize_history(data, file_path=f"{base_path}.png")
    with open(f"{base_path}.pkl", "wb") as f:
        pickle.dump(data, f)

    # NOTE(review): len(data) counts the top-level keys of the history
    # (dates, per the notebook introduction), not sites - confirm that one
    # label per date is what downstream code expects.
    with open(f"{base_path}_labels.pkl", "wb") as f:
        pickle.dump([label_class] * len(data), f)
        
def create_pixel_vectors(patch_history, num_months, holdout=False):
    """Decompose a patch history into one flat list of pixel vectors.

    Args:
        patch_history: Patch history dictionary whose top-level keys are the
            months (dates) to iterate over.
        num_months: Slice position in the list of months. Without holdout the
            months ``[:num_months]`` are used; with holdout the months
            ``[num_months:]`` are used (a negative value therefore selects
            the last months).
        holdout: Select the months after the slice position instead of the
            months before it.

    Returns:
        A list of pixel vectors (``[num_vectors][bands]``) gathered from all
        selected months, in month order.
    """
    months = list(patch_history.keys())
    selected_months = months[num_months:] if holdout else months[:num_months]

    # Flatten month -> pixel -> band values into pixel -> band values.
    pixel_vectors = []
    for month in selected_months:
        # get_pixel_vectors also returns the patch width and height, which
        # are not needed here.
        month_vectors, _, _ = get_pixel_vectors(patch_history, month)
        pixel_vectors.extend(month_vectors)

    # len() avoids converting the whole list to an array just to count rows.
    print(len(pixel_vectors), "pixel vectors")

    return pixel_vectors

def save_pixel_vectors(data, name, label_class):
    """Pickle a list of pixel vectors and a matching list of labels.

    Two files are written to ``OUTPUT_DIR/pixel_vectors``. Their names are
    built from ``name`` and from the notebook-level ``num_months`` and
    ``start_date`` variables.

    Args:
        data: Sequence of pixel vectors to save.
        name: Prefix used for the output file names.
        label_class: Label value stored once per element of ``data``.
    """
    file_name = f"{name}_raw_{num_months}_months_{start_date}"

    # Make sure the destination exists; open() does not create directories.
    output_dir = os.path.join(OUTPUT_DIR, 'pixel_vectors')
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(output_dir, f"{file_name}_pixel_vectors.pkl"), "wb") as f:
        pickle.dump(data, f)

    with open(os.path.join(output_dir, f"{file_name}_pixel_vector_labels.pkl"), "wb") as f:
        pickle.dump([label_class] * len(data), f)

# Load Sampling Locations

In [None]:
# Configuration:
# DATA_DIR: directory where the sampling location files (GeoJSON / CSV) are located
# OUTPUT_DIR: directory where the downloaded training data is saved
# (The rect width for patches that are not TPA sites, RECT_WIDTH, is set in the
# "Download Data" section.)
DATA_DIR = '../data/sampling_locations'
OUTPUT_DIR = '../data/training_data'

# makedirs also creates missing parent directories and does nothing if the
# directory already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

### Load TPA Polygon Sites from GeoJSON

In [None]:
# Load TPA dataset: one point feature per TPA site
with open(os.path.join(DATA_DIR, 'tpa_points.geojson')) as f:
    tpa_points = json.load(f)

tpa_features = tpa_points['features']
tpa_sites = pd.DataFrame({
    'name': [site['properties']['Name'] for site in tpa_features],
    'lon': [site['geometry']['coordinates'][0] for site in tpa_features],
    'lat': [site['geometry']['coordinates'][1] for site in tpa_features],
    'area': [site['properties']['Surface_Ha'] for site in tpa_features],
    'daily_volume': [site['properties']['TOT_Kg/Day'] for site in tpa_features],
    'coords': [site['geometry']['coordinates'] for site in tpa_features]
})


# Add earth engine TPA Polygons to TPA dataframe
# (the context manager closes the file; no explicit close() is needed)
with open(os.path.join(DATA_DIR, 'tpa_polygons.geojson'), 'r') as f:
    json_tpa = json.load(f)

# Each polygon feature is wrapped in its own single-element FeatureCollection.
# The polygons are attached by position, so both files must list the sites in
# the same order.
tpa_polygons = [ee.FeatureCollection([element]) for element in json_tpa['features']]

tpa_sites['polygons'] = tpa_polygons
display(tpa_sites.head())

### Load Sampling Sites from GeoJSON

In [None]:
# Read the city sampling points from DATA_DIR into a data frame
sampling_df = load_geojson('city_points_30.geojson')
# Preview the first rows
sampling_df.head()

### Sample Sites Adjacent to another List

In [None]:
# Create one sampling location per TPA site, shifted by 0.01 to the north.
# The offset is in the same units as the lon/lat columns
# (presumably degrees - TODO confirm).
adjacent_df = sample_adjacent(tpa_sites, 0.01, 'north')
adjacent_df.head()

### Load Sampling Sites from CSV

In [None]:
# Read sampling locations from a CSV file in DATA_DIR.
# NOTE: this rebinds `sampling_df`, replacing the sites loaded from GeoJSON
# if that cell was run earlier.
sampling_df = load_csv('negative_sites.csv')
sampling_df.head()

### Write Sampling Sites to CSV

In [None]:
# Write the current sampling_df to DATA_DIR, leaving out the index column
sampling_df.to_csv(os.path.join(DATA_DIR, 'negative_sites_test.csv'), index=False)

# Download Data

In [None]:
# Patch width passed to get_history; the TPA polygon download passes 4 * RECT_WIDTH.
# Presumably expressed in degrees of lon/lat - TODO confirm against get_s2_data_ee.
RECT_WIDTH = 0.002

### Download Patch History

In [None]:
# Create a patch history for the sampling sites
# The patch history is a dictionary with the format:
# patch_history[date][site_name][band][band_img]
# This function takes a while to run as it is extracting data from GEE

# NOTE(review): site_list is whatever `sampling_df` was bound to most recently
# (GeoJSON or CSV cell), while the output below is always saved under the name
# 'city_points_30' with label 0 - make sure the two match before running.
site_list = sampling_df
# num_months and start_date are also read as globals by save_patch_history
# and save_pixel_vectors to build the output file names.
num_months = 2
start_date = '2019-01-01'

patch_history = get_history(site_list['coords'], 
                            site_list['name'], 
                            RECT_WIDTH,
                            num_months = num_months,
                            start_date = start_date,
                            cloud_mask = True)

save_patch_history(patch_history, 'city_points_30', 0)

### Download TPA Polygon History

In [None]:
# Get patch histories for TPA sites
# NOTE: this rebinds the global num_months and start_date, which the save_*
# helpers read when building output file names.
num_months = 2
start_date = '2020-01-01'
# The width argument is four times the RECT_WIDTH used for the other sites.
tpa_patch_history = get_history_polygon(tpa_sites['coords'], 
                                        tpa_sites['name'], 
                                        tpa_sites['polygons'], 
                                        4 * RECT_WIDTH,
                                        start_date = start_date,
                                        num_months = num_months,
                                       )
# Saved with label class 1
save_patch_history(tpa_patch_history, 'tpa_sites', 1)

# Create Pixel Vectors

In [None]:
# Flatten the patch history downloaded above into pixel vectors.
# Passing len(patch_history) as num_months selects every month in the history.
# (The variable is `patch_history`; `patch_histories` is never defined.)
pixel_vectors = create_pixel_vectors(patch_history, len(patch_history))
save_pixel_vectors(pixel_vectors, 'city_points_30', 0)

### Export Pixel Vectors with a Holdout Dataset

In [None]:
# holdout_months refers to a strategy of holding out the last n months of data for validation
# Set this value to the number of months you want to separate from the training data
# NOTE(review): holdout_months must be smaller than len(tpa_patch_history).
# Otherwise the training slice [:len - holdout_months] gets a zero or negative
# end index and can overlap the holdout slice [-holdout_months:]. The download
# cell above requests num_months = 2 - confirm the history is long enough.

holdout_months = 3

# Training set: every month except the last `holdout_months`
pixel_vectors = create_pixel_vectors(tpa_patch_history, len(tpa_patch_history) - holdout_months)
save_pixel_vectors(pixel_vectors, 'tpa_train', 1)

# Holdout set: with holdout=True the months [-holdout_months:] are selected
holdout_pixel_vectors = create_pixel_vectors(tpa_patch_history, -holdout_months, holdout=True)
save_pixel_vectors(holdout_pixel_vectors, 'tpa_holdout', 1)