# Create Pixel Dataset
This notebook is the source for downloading Sentinel data to produce inputs to the spectral classifier.

## Inputs
The notebook operates by loading a set of sampling sites from a geojson. If the geojson contains `Point` features, a bounding rect is constructed. If the geojson contains `Polygon` or `MultiPolygon` features, only pixels within the polygon will be extracted.

The `download_patch` script attempts to mask clouds. However, cloudy pixels and patches can still come through.

Pixels that fall outside of a polygon are also masked using a numpy masked array. These vectors are not stored in the pixel vector list.

## Outputs

### Pixel Vectors:
The output is a list of vectors saved as a pickle. The vectors are not normalized. The dimensionality is `[num_pixel_vectors][bands]`. A list containing one class label per pixel vector is saved alongside it as a second pickle.
### Image Plot:
To log the data in a pixel vector dataset, a grid of input images is exported along with the dataset.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import descarteslabs as dl
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import pandas as pd
import seaborn as sns
import sys
from tqdm.notebook import tqdm

sys.path.append('../')
from scripts.viz_tools import normalize, band_descriptions, plot_image_grid
from scripts.dl_utils import download_patch, rect_from_point

In [None]:
def save_pixel_vectors(data, name, label_class):
    """Pickle the pixel vectors and a matching list of class labels.

    Two files are written into ``OUTPUT_DIR``. Both file names start with
    ``{name}_{START_DATE}_{END_DATE}``:

    * ``..._pixel_vectors.pkl`` holds ``data`` as given.
    * ``..._pixel_vector_labels.pkl`` holds a list with ``label_class``
      repeated ``len(data)`` times.

    Args:
        data: Sized collection of pixel vectors to store.
        name: Prefix identifying the dataset in the output file names.
        label_class: Label assigned to every vector in ``data``.
    """
    prefix = f"{name}_{START_DATE}_{END_DATE}"
    vectors_path = os.path.join(OUTPUT_DIR, f"{prefix}_pixel_vectors.pkl")
    labels_path = os.path.join(OUTPUT_DIR, f"{prefix}_pixel_vector_labels.pkl")

    with open(vectors_path, "wb") as vectors_file:
        pickle.dump(data, vectors_file)

    with open(labels_path, "wb") as labels_file:
        # One label per stored vector, so both pickles line up by index
        labels = [label_class] * len(data)
        pickle.dump(labels, labels_file)

In [None]:
# Define Parameters for data extraction
# Date range handed to download_patch and embedded in the output file names
START_DATE = '2019-01-01'
END_DATE = '2019-02-01'

# Directory receiving the pickled pixel vectors and the image grid figure
OUTPUT_DIR = '../data/training_data/pixel_vectors'
# makedirs also creates missing parent directories, and exist_ok makes
# re-running this cell safe when the directory is already present
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Create or extract polygons from a sampling location
sampling_file = 'tpa_polygons'
data_dir = '../data/sampling_locations/'
geojson_path = os.path.join(data_dir, sampling_file + '.geojson')
with open(geojson_path, 'r') as f:
    data = json.load(f)['features']

# Rect width in pixels. Only used when a feature is a point sample
num_pixels = 48
# Pixel -> degree conversion. This is a heuristic and not geographically sound;
# slightly oversized patches are preferable because they can be cropped later
rect_width = np.round((num_pixels / 100) / 111.32, 4)

# Points become bounding rects; Polygon/MultiPolygon geometries are used as-is.
# Features with any other geometry type are skipped.
polygons = []
for feature in data:
    geometry = feature['geometry']
    geometry_type = geometry['type']
    if geometry_type == 'Point':
        polygons.append(rect_from_point(geometry['coordinates'], rect_width))
    elif geometry_type in ('MultiPolygon', 'Polygon'):
        polygons.append(geometry)

In [None]:
# Download Sentinel Data
# Every polygon can yield several patches; collect them all in one flat list
img_stack = []
for polygon in tqdm(polygons):
    img_stack.extend(download_patch(polygon, START_DATE, END_DATE))
print(len(img_stack), "cloud masked patches extracted")

In [None]:
# Class label stored with every pixel vector of this dataset
label_class = 0
# Output files are named after the sampling file
name = sampling_file

# Export a grid of the downloaded patches next to the pixel vector files
figure_name = f"{name} - Class {label_class}"
figure_file_path = os.path.join(OUTPUT_DIR, figure_name)
plot_image_grid(img_stack, file_path=figure_file_path)

## Make sure to set the appropriate label class!
Negative sites = 0, Positive sites = 1

In [None]:
# Create pixel vectors
# Flatten each patch into one spectral vector per pixel and keep only the
# vectors whose mean over the bands is positive.
# NOTE(review): presumably this drops masked / empty pixels - confirm against download_patch
pixel_vectors = []
for img in tqdm(img_stack):
    height, width, channels = img.shape
    vectors = img.reshape(height * width, channels)
    # extend() with a generator instead of a list comprehension used only for
    # its side effect, which would build a throwaway list of None values
    pixel_vectors.extend(vector for vector in vectors if np.mean(vector) > 0)
# len() counts the vectors directly; np.shape() would first convert the whole
# list into an array
print(f"{len(pixel_vectors):,} pixel vectors extracted")

# Save pixel vectors
save_pixel_vectors(data=pixel_vectors, name=name, label_class=label_class)

In [None]:
# Plot the mean pixel spectra of the extracted dataset.
# Optional process that can take time with many samples

# Long format: one row per (band, value) pair, as expected by sns.lineplot
data = (
    pd.DataFrame(pixel_vectors, columns=band_descriptions.keys())
    .melt(var_name='band', value_name='value')
)

plt.figure(figsize=(6, 4), dpi=150, facecolor=(1, 1, 1))
# ci="sd" draws the standard deviation band around the per-band mean
sns.lineplot(data=data, x='band', y='value', ci="sd")
plt.title('Mean Value +/- SD')
plt.show()