# Create Dataset
This notebook downloads Sentinel data from Descartes Labs to produce composited patches. Note: This requires access to Descartes Labs.

## Inputs
The notebook operates by loading a set of sampling sites from a geojson. If the geojson contains `Point` features, a bounding rect is constructed. If the geojson contains `Polygon` or `MultiPolygon` features, only pixels within the polygon will be extracted.

The `download_mosaic` function attempts to mask clouds. However, cloudy pixels and patches can still come through.

Pixels that fall outside of a polygon are also masked using a numpy masked array.

## Outputs

### Patch Arrays:
The output list of patch arrays is saved as a pickle. The arrays are not normalized. The dimensionality of each array is  `[num_samples][width][height][channels]`.
A labels file is also written that corresponds to the label class defined in the notebook.

### Image Plot:
To log the data in a dataset, a grid of input images is exported along with the dataset.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

import json
import numpy as np
import pickle
from tqdm.notebook import tqdm

# Put the parent of the current working directory at the front of sys.path
# (once) so that the `scripts` package imported below can be resolved --
# presumably the notebook is run from a sub-folder of the repository.
parent_dir = os.path.split(os.getcwd())[0]
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

from scripts import dl_utils
from scripts.viz_tools import plot_image_grid

In [None]:
def save_patch_arrays(data, basepath, label_class):
    """Pickle the patches and a matching list of labels side by side.

    Two files are written:

    * ``<basepath>_patch_arrays.pkl`` holds ``data`` exactly as passed in.
    * ``<basepath>_patch_array_labels.pkl`` holds a list made of
      ``label_class`` repeated ``len(data)`` times.

    Args:
        data: Sized collection of patches to pickle.
        basepath: Path prefix (directory plus file stem) for both files.
        label_class: Label stored once per element of ``data``.
    """
    def _dump(payload, suffix):
        # Each output shares the same prefix and differs only by suffix.
        with open(basepath + suffix, "wb") as handle:
            pickle.dump(payload, handle)

    _dump(data, '_patch_arrays.pkl')
    _dump([label_class] * len(data), '_patch_array_labels.pkl')

### Define Parameters for data extraction
### Attention: make sure to set appropriate label class!
Negative sites = 0, Positive sites = 1

In [None]:
# Stem of the sampling-location GeoJSON (the '.geojson' extension is appended
# when the file is opened) and the directory it is read from.
sampling_file = 'v2.6_amazon_negatives_v2'
data_dir = '../data/sampling_locations/'
# Label stored for every patch extracted from this file
# (0 = negative site, 1 = positive site).
label_class = 0

# Date range and compositing settings used by the download loop.
START_DATE = '2020-01-01'
END_DATE = '2021-02-01'
METHOD = 'median'
MOSAIC_PERIOD = 4  # the period over which to mosaic image data in months
MAX_CLOUD = 0.25  # maximum cloud cover for a tile to be included in the dataset

In [None]:
# Create or extract polygons from a sampling location.
# GeoJSON is UTF-8 by specification (RFC 7946), so the encoding is given
# explicitly instead of relying on the platform default.
with open(os.path.join(data_dir, sampling_file + '.geojson'), 'r', encoding='utf-8') as f:
    data = json.load(f)['features']

# Set rect width in pixels. Only required for point samples.
# Generally select a larger rect than intended patch size. Better to go with slightly bigger patches that can then be cropped.
num_pixels = 48
# Convert pixels to degrees. Heuristic, not geographically sound:
# num_pixels / 100 is the width in km if one pixel covers 10 m (assumption --
# confirm against the imagery resolution), and 111.32 is presumably the number
# of km per degree at the equator.
rect_width = np.round((num_pixels / 100) / 111.32, 4)

polygons = []
skipped_types = []
for feature in data:
    geometry = feature['geometry']
    # RFC 7946 allows a null geometry for unlocated features.
    geometry_type = geometry['type'] if geometry is not None else 'null'
    if geometry_type == 'Point':
        # Points carry no extent, so build a rect of rect_width around them.
        polygons.append(dl_utils.rect_from_point(geometry['coordinates'], rect_width))
    elif geometry_type in ('Polygon', 'MultiPolygon'):
        polygons.append(geometry)
    else:
        # Anything else cannot be sampled; remember it so the drop is visible.
        skipped_types.append(geometry_type)
print(f'{len(polygons)} polygons loaded.')
if skipped_types:
    print(f'{len(skipped_types)} features skipped (unsupported geometry types: {sorted(set(skipped_types))}).')

In [None]:
# Output directory for the patches; its name records the sampling rect size.
PATCH_OUTPUT_DIR = f'../data/training_data/patch_composites_{num_pixels}px'
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, without a separate check-then-create step that can race.
os.makedirs(PATCH_OUTPUT_DIR, exist_ok=True)

### Download Sentinel Data
Depending on the size of the dataset, this process can take a fair bit of time. It is faster than it used to be, but can still take ~20 sec per patch.

In [None]:
# Download and composite imagery for every polygon, keeping only the
# composites whose cloud fraction is below MAX_CLOUD.
# NOTE: `data` rebinds the name that held the GeoJSON feature list when the
# sampling locations were loaded.
patches = []
for polygon in tqdm(polygons):
    try:
        data = dl_utils.SentinelData(polygon, START_DATE, END_DATE, MOSAIC_PERIOD, method=METHOD)
        data.search_scenes()
        data.download_scenes()
        data.create_composites()
        # Grab the composites before the cloud fraction is computed, then pair
        # each composite with its cloud fraction.
        composites = data.composites
        data.compute_cloud_fraction()
        # Build the filtered list first so a failure part-way through adds
        # nothing for this polygon.
        patches.extend([
            composite
            for composite, cloud in zip(composites, data.cloud_fraction)
            if cloud < MAX_CLOUD
        ])
    except KeyboardInterrupt:
        # Allow a manual stop while keeping the patches collected so far.
        print("Keyboard Interrupt!")
        break
    except Exception as e:
        # Best effort: report the failing polygon and carry on with the rest.
        print('Failure', polygon)
        print(e)


In [None]:
# View all images: stack the patches into one array and hand them to
# plot_image_grid together with the output path (file_path presumably tells it
# where to save the figure -- confirm in scripts.viz_tools).
figure_name = f"{sampling_file}_patches-Class_{label_class}-{START_DATE}-{END_DATE}-{METHOD}"
figure_file_path = os.path.join(PATCH_OUTPUT_DIR, figure_name)
patch_stack = np.array(patches)
plot_image_grid(patch_stack, file_path=figure_file_path)

In [None]:
# Persist the patches and their labels. The file stem records the sampling
# file, the date range and the compositing settings.
patch_basepath = os.path.join(
    PATCH_OUTPUT_DIR,
    "{}_{}_{}_period_{}_method_{}".format(
        sampling_file, START_DATE, END_DATE, MOSAIC_PERIOD, METHOD
    ),
)
save_patch_arrays(patches, patch_basepath, label_class)