# Create Patch Dataset
This notebook downloads the Sentinel data used by the 2D patch classifier.

### Inputs
The notebook loads a set of coordinates from either a GeoJSON or a CSV file. For each location in the list, it downloads a patch of width `rect_width` (derived from `patch_width`) across a specified period of time.

### Outputs
Multispectral patches with the structure `[num_patches, height, width, bands]`, saved as a pickle file together with a second pickle file holding one class label per patch.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import numpy as np
import os
import pandas as pd
import pickle
from tqdm.notebook import tqdm

from scripts.dl_utils import download_patch, rect_from_point, pad_patch
from scripts.viz_tools import normalize

In [None]:
def load_sites(file_name, rect_width):
    """Load site locations from a GeoJSON file and return one rect per site.

    Reads ``<DATA_DIR>/<file_name>.geojson`` and reduces every feature to a
    single ``[lon, lat]`` coordinate:

    * ``Point`` features use their coordinates unchanged.
    * ``Polygon`` and ``MultiPolygon`` features use the mean of their
      vertices. The coordinates must squeeze to a single ``(N, 2+)`` ring;
      geometries with holes or several parts are not supported.

    Features with any other geometry type are skipped, and the number of
    skipped features is printed so they do not disappear silently.

    Args:
        file_name: Base name of the GeoJSON file inside ``DATA_DIR``, without
            the ``.geojson`` extension.
        rect_width: Width handed to ``rect_from_point`` for every site.

    Returns:
        A list holding ``rect_from_point(point, rect_width)`` for each
        loaded site, in file order.
    """
    # GeoJSON is UTF-8 by specification, so don't depend on the platform's
    # default text encoding.
    path = os.path.join(DATA_DIR, file_name + '.geojson')
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)['features']

    coords = []
    skipped = 0
    for site in data:
        geometry = site['geometry']
        geometry_type = geometry['type']
        if geometry_type in ('Polygon', 'MultiPolygon'):
            # Squeeze away the ring/part nesting once, leaving rows of vertices
            vertices = np.squeeze(geometry['coordinates'])
            coords.append([np.mean(vertices[:, 0]), np.mean(vertices[:, 1])])
        elif geometry_type == 'Point':
            coords.append(geometry['coordinates'])
        else:
            skipped += 1
    print(len(coords), ' sites loaded')
    if skipped:
        print(skipped, 'features skipped (unsupported geometry type)')
    polygons = [rect_from_point(point, rect_width) for point in coords]
    return polygons
        
def save_patches(data, name, label_class):
    """Pickle a set of patches and a matching list of class labels.

    Two files are written to ``<OUTPUT_DIR>/patches``, which is created if it
    does not exist yet:

    * ``{name}_patches_{start_date}_{end_date}_{num_pixels}px_patches.pkl``
      holds ``data`` as passed in.
    * ``{name}_patches_{start_date}_{end_date}_{num_pixels}px_patch_labels.pkl``
      holds a list with ``label_class`` repeated ``len(data)`` times.

    ``OUTPUT_DIR``, ``start_date`` and ``end_date`` are read from the
    notebook's global scope, so they must be defined before calling this.

    Args:
        data: Sequence of patches. ``num_pixels`` is taken from the second
            dimension of ``np.shape(data)``, so ``data`` needs at least two
            dimensions; an empty list fails with ``IndexError``.
        name: Prefix for the output file names.
        label_class: Label value stored once per patch.
    """
    num_pixels = np.shape(data)[1]
    file_name = f"{name}_patches_{start_date}_{end_date}"

    # open() does not create missing directories; make sure the target exists
    # so a finished download is not lost to a FileNotFoundError.
    patch_dir = os.path.join(OUTPUT_DIR, 'patches')
    os.makedirs(patch_dir, exist_ok=True)

    with open(os.path.join(patch_dir, f"{file_name}_{num_pixels}px_patches.pkl"), "wb") as f:
        pickle.dump(data, f)

    with open(os.path.join(patch_dir, f"{file_name}_{num_pixels}px_patch_labels.pkl"), "wb") as f:
        pickle.dump([label_class] * len(data), f)

## Load Sampling Locations

In [None]:
# Configuration:
# Set directory where training site json files are located and files are saved
DATA_DIR = '../data/sampling_locations'
OUTPUT_DIR = '../data/training_data'
# makedirs also creates missing parent directories and does nothing when the
# directory already exists, unlike an exists() check followed by mkdir().
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Patch side length in pixels; used as the pad_patch target size and in the
# "<n>px" part of output file names.
patch_width = 45
# Improperly convert patch size to degrees (fix soon!)
# NOTE(review): presumably patch_width / 100 converts pixels to km at 10 m per
# pixel and 111.1 is km per degree. A single factor ignores that a degree of
# longitude shrinks with latitude - confirm before relying on the rect size.
rect_width = np.round((patch_width / 100) / 111.1, 4)

In [None]:
# Create site polygons from a geojson. Preferred method
# file_name is the GeoJSON's base name inside DATA_DIR (load_sites appends
# '.geojson'); it is reused later as the prefix of the saved output files.
file_name = 'v_1.1.5_negatives'
# Class label stored for every patch downloaded from these sites
label_class = 0
polygons = load_sites(file_name, rect_width)

In [None]:
# Create site polygons from candidate csv
# Note: I'm trying to move away from using csv files for data
# This cell assigns the same names (file_name, label_class, polygons) as the
# GeoJSON cell, so whichever of the two ran last decides what gets downloaded.
file_name = 'w_nusa_tenggara_v1.1_positives'
label_class = 1
# NOTE(security): the converter calls eval() on every value in the 'coords'
# column, which executes whatever code the CSV contains. Only load trusted
# files. The parsed 'coords' column is not used below (rects are built from
# 'lon' and 'lat'); ast.literal_eval would presumably be a safer parser if the
# column only holds plain literals - confirm.
coords = pd.read_csv(os.path.join(DATA_DIR, file_name + '.csv'), converters={'coords': eval})
polygons = [rect_from_point([lon, lat], rect_width) for lon, lat in zip(coords['lon'], coords['lat'])]
print(len(polygons), 'sites loaded')

## Download Data

In [None]:
# Query period for the download, also embedded in the output file names
start_date = '2020-01-01'
end_date = '2021-01-01'
# Largest fraction of masked values a patch may have and still be kept
# (presumably the mask marks cloudy pixels - confirm against download_patch)
cloud_threshold = 0.1

patches = []
for polygon in tqdm(polygons):
    # One stack of images per site, covering the requested period
    img_stack = download_patch(polygon, start_date, end_date)
    for patch in img_stack:
        masked_fraction = np.sum(patch.mask) / patch.size
        # Written as the exact negation of the keep condition so that a NaN
        # fraction is rejected as well
        if not masked_fraction < cloud_threshold:
            continue
        patches.append(pad_patch(patch, patch_width))

# Persist the kept patches and their labels before reporting the count
save_patches(patches, file_name, label_class)
print(len(patches), 'images extracted')

In [None]:
# Build the path of a PNG next to the pickled patches and hand it to
# plot_image_grid together with the downloaded patches.
# NOTE(review): plot_image_grid is not imported or defined in any cell visible
# here, so this cell raises NameError as written. Presumably it lives in
# scripts.viz_tools - confirm and add it to the imports.
img_file_path = os.path.join(OUTPUT_DIR, 'patches', f"{file_name}_{patch_width}px_{start_date}_{end_date}_patches.png")
plot_image_grid(patches, file_path = img_file_path)