# Data Collection
This notebook connects to the Sentinel Hub Python API to search the Sentinel-2 collection and retrieve the corresponding satellite imagery.

! Important: This notebook was tested only in the Colab environment. Before certain cells can be executed, other tasks have to be completed beforehand, including mounting Google Drive.

In [1]:
!pip install sentinelhub

Collecting sentinelhub
  Downloading sentinelhub-3.9.1-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.4/244.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aenum>=2.1.4 (from sentinelhub)
  Downloading aenum-3.1.15-py3-none-any.whl (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.6/137.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json (from sentinelhub)
  Downloading dataclasses_json-0.5.14-py3-none-any.whl (26 kB)
Collecting tomli-w (from sentinelhub)
  Downloading tomli_w-1.0.0-py3-none-any.whl (6.0 kB)
Collecting utm (from sentinelhub)
  Downloading utm-0.7.0.tar.gz (8.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json->sentinelhub)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:

In [14]:
from typing import Any, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import datetime
import os
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import tarfile
import json
import cv2
from PIL import Image as im
import pandas as pd

from sentinelhub import (
    CRS,
    BBox,
    DataCollection,
    DownloadRequest,
    MimeType,
    SHConfig,
    MosaickingOrder,
    SentinelHubDownloadClient,
    SentinelHubRequest,
    bbox_to_dimensions,
    SentinelHubCatalog,
    filter_times
)

In [6]:
def plot_image(image, factor, clip_range = None, **kwargs: Any) -> None:
    """Draw an image on a single 15x15 inch axes with all tick marks removed.

    Args:
        image: Array-like image data; it is multiplied by ``factor`` before display.
        factor: Scalar multiplier applied to every pixel value.
        clip_range: Optional ``(low, high)`` pair. When given, the scaled values
            are clipped to this range via ``np.clip`` before being shown.
        **kwargs: Forwarded unchanged to ``Axes.imshow``.
    """
    _, axes = plt.subplots(nrows=1, ncols=1, figsize=(15, 15))

    pixels = image * factor
    if clip_range is not None:
        pixels = np.clip(pixels, *clip_range)
    axes.imshow(pixels, **kwargs)

    # Hide the tick marks on both axes.
    for hide_ticks in (axes.set_xticks, axes.set_yticks):
        hide_ticks([])

In order to access the data, a new free user profile has to be created on the Sentinel Hub platform and a free trial account has to be activated. Where to find the credentials, and which ones have to be obtained, is detailed in their documentation page: https://sentinelhub-py.readthedocs.io/en/latest/configure.html#sentinel-hub-configuration

In [10]:
# Credentials: prefer the SH_CLIENT_ID / SH_CLIENT_SECRET / SH_INSTANCE_ID environment
# variables so that secrets are never saved with the notebook. If they are not set,
# replace the placeholders below with your own values.
CLIENT_ID = os.environ.get("SH_CLIENT_ID", "<---sh_client_id---->")
CLIENT_SECRET = os.environ.get("SH_CLIENT_SECRET", "<---sh_client_secret---->")
INSTANCE_ID = os.environ.get("SH_INSTANCE_ID", "<---instance_id---->")


def _credential_provided(value):
    """Return True when `value` is non-empty and is not an unedited `<---...---->` placeholder."""
    return bool(value) and not value.startswith("<---")


config = SHConfig()
# Only override the default configuration with real values; an untouched placeholder
# is a non-empty string, so a plain truthiness check would write it into the config.
if _credential_provided(CLIENT_ID) and _credential_provided(CLIENT_SECRET):
    config.sh_client_id = CLIENT_ID
    config.sh_client_secret = CLIENT_SECRET
    if _credential_provided(INSTANCE_ID):
        config.instance_id = INSTANCE_ID

In [11]:
# Search period and area of interest (AOI)
time_interval = ("2015-01-01", "2022-12-31")

# AOI: Lyme Regis, South England, given in WGS84 coordinates
# (presumably min_lon, min_lat, max_lon, max_lat - confirm against the BBox docs)
landslide_bbox = BBox((-2.930574, 50.727037, -2.904997, 50.737625), crs=CRS.WGS84)

# Pixel dimensions of the AOI at the requested resolution
resolution = 1
landslide_size = bbox_to_dimensions(landslide_bbox, resolution=resolution)

In [13]:
# Search the Sentinel-2 catalog and collect every acquisition over the AOI that meets the criteria.
# This request searches for Sentinel-2 L1C products with cloud coverage below MAX_CLOUD_COVER percent.
# To search for Sentinel-2 L2A instead, replace the data collection with DataCollection.SENTINEL2_L2A.
#
# Each result holds the product's ID, capture date and time, and cloud coverage %.

MAX_CLOUD_COVER = 10  # percent; upper bound used in the catalog filter
RESULTS_PREVIEW = 10  # number of catalog entries shown below the summary line

catalog = SentinelHubCatalog(config=config)
# Not used by the cells visible here; kept in case a later cell reads it.
collections = catalog.get_collections()

search_iterator = catalog.search(
    DataCollection.SENTINEL2_L1C,
    bbox=landslide_bbox,
    time=time_interval,
    filter=f"eo:cloud_cover < {MAX_CLOUD_COVER}",
    fields={"include": ["id", "properties.datetime", "properties.eo:cloud_cover"], "exclude": []},
)

results = list(search_iterator)
print("Total number of results:", len(results))

# Show only a preview: the full list has hundreds of entries and floods the cell output.
print(f"Catalog results (first {min(RESULTS_PREVIEW, len(results))}): ")
results[:RESULTS_PREVIEW]

Total number of results: 232
Catalog results: 


[{'id': 'S2B_MSIL1C_20221220T112409_N0509_R037_T30UVB_20221220T120224',
  'properties': {'datetime': '2022-12-20T11:27:06Z', 'eo:cloud_cover': 6.74}},
 {'id': 'S2B_MSIL1C_20221220T112409_N0509_R037_T30UWB_20221220T120224',
  'properties': {'datetime': '2022-12-20T11:27:02Z', 'eo:cloud_cover': 0.3}},
 {'id': 'S2A_MSIL1C_20221215T112501_N0509_R037_T30UVB_20221215T132021',
  'properties': {'datetime': '2022-12-15T11:27:07Z', 'eo:cloud_cover': 0.01}},
 {'id': 'S2A_MSIL1C_20221215T112501_N0509_R037_T30UWB_20221215T132021',
  'properties': {'datetime': '2022-12-15T11:27:02Z', 'eo:cloud_cover': 0.0}},
 {'id': 'S2B_MSIL1C_20221207T111339_N0509_R137_T30UVB_20221207T115404',
  'properties': {'datetime': '2022-12-07T11:17:07Z', 'eo:cloud_cover': 1.67}},
 {'id': 'S2B_MSIL1C_20221207T111339_N0509_R137_T30UWB_20221207T115404',
  'properties': {'datetime': '2022-12-07T11:17:03Z', 'eo:cloud_cover': 0.3}},
 {'id': 'S2A_MSIL1C_20221125T112411_N0400_R037_T30UVB_20221125T132446',
  'properties': {'datetim

In [15]:
# The AOI is covered by two geographical tiles (30UVB and 30UWB), so the catalog lists
# the same overpass more than once. Reduce the timestamps to the unique acquisitions;
# `time_difference` is the tolerance handed to filter_times for that purpose.
all_timestamps = search_iterator.get_timestamps()

time_difference = dt.timedelta(hours=1)
unique_acquisitions = filter_times(all_timestamps, time_difference)

print("Number of unique dates: ", len(unique_acquisitions))
print("Unique dates: ", unique_acquisitions)

Number of unique dates:  153
Unique dates:  [datetime.datetime(2015, 7, 25, 11, 25, 40, tzinfo=tzlocal()), datetime.datetime(2015, 9, 10, 11, 16, 33, tzinfo=tzlocal()), datetime.datetime(2015, 9, 30, 11, 11, 2, tzinfo=tzlocal()), datetime.datetime(2015, 10, 20, 11, 11, 3, tzinfo=tzlocal()), datetime.datetime(2016, 7, 19, 11, 21, 17, tzinfo=tzlocal()), datetime.datetime(2016, 8, 15, 11, 8, 3, tzinfo=tzlocal()), datetime.datetime(2016, 11, 6, 11, 23, 7, tzinfo=tzlocal()), datetime.datetime(2017, 1, 2, 11, 14, 41, tzinfo=tzlocal()), datetime.datetime(2017, 1, 5, 11, 24, 39, tzinfo=tzlocal()), datetime.datetime(2017, 1, 25, 11, 23, 33, tzinfo=tzlocal()), datetime.datetime(2017, 3, 13, 11, 12, 12, tzinfo=tzlocal()), datetime.datetime(2017, 4, 2, 11, 6, 47, tzinfo=tzlocal()), datetime.datetime(2017, 5, 25, 11, 24, 34, tzinfo=tzlocal()), datetime.datetime(2017, 6, 1, 11, 12, 25, tzinfo=tzlocal()), datetime.datetime(2017, 6, 21, 11, 12, 22, tzinfo=tzlocal()), datetime.datetime(2017, 7, 1, 11, 

In [16]:
## Define the evalscript (JavaScript, passed as a string) that is sent with every request
# Each request will return:
# - six bands per pixel, in this order: Blue (B02), Green (B03), Red (B04), NIR (B08),
#   SWIR (B11) and CLM (cloud mask raster), read in "DN" units and written as INT16
# - Sentinel-2 metadata in `userData`: the normalization factor ("norm_factor") and the
#   JSON-serialized list of scenes ("scenes")
# NOTE: the string below is sent verbatim to the service - edit it as code, not as a comment.

evalscript_all_bands = """
    //VERSION=3
    function setup() {
        return {
            input: [{
                bands: ["B02", "B03", "B04", "B08", "B11", "CLM"],
                units: "DN"}],
            output: {
                bands: 6,
                sampleType: "INT16"
            },
            mosaicking: Mosaicking.TILE
        };
    }

    function updateOutputMetadata(scenes, inputMetadata, outputMetadata) {
        outputMetadata.userData = { "norm_factor":  inputMetadata.normalizationFactor,
                                    "scenes":  JSON.stringify(scenes)}
    }

    function evaluatePixel(sample) {
        return [sample[0].B02,
                sample[0].B03,
                sample[0].B04,
                sample[0].B08,
                sample[0].B11,
                sample[0].CLM];
    }
"""

In [18]:
# Sentinel Hub request script: build one process request per unique acquisition.
# Key parameters to change:
# - DATA_FOLDER (where the files will be saved),
# - DATA_COLLECTION (to return S2 L1C - DataCollection.SENTINEL2_L1C, S2 L2A - DataCollection.SENTINEL2_L2A)
DATA_FOLDER = "/content/drive/MyDrive/FINAL_project/full_lyme_regis_S2_L1C"
DATA_COLLECTION = DataCollection.SENTINEL2_L1C

process_requests = []

for timestamp in unique_acquisitions:
    request = SentinelHubRequest(
        data_folder=DATA_FOLDER,
        evalscript=evalscript_all_bands,
        input_data=[
            SentinelHubRequest.input_data(
                data_collection=DATA_COLLECTION,
                # Window of +/- time_difference around the acquisition timestamp.
                time_interval=(timestamp - time_difference, timestamp + time_difference),
                mosaicking_order=MosaickingOrder.LEAST_CC,
            )
        ],
        # Two outputs per request: the raster ("default") and the evalscript's userData ("userdata").
        responses=[
            SentinelHubRequest.output_response("default", MimeType.TIFF),
            SentinelHubRequest.output_response("userdata", MimeType.JSON),
        ],
        bbox=landslide_bbox,
        size=landslide_size,
        config=config,
    )

    process_requests.append(request)

In [20]:
%%time
# Submit the request and download the tiles
downloaded_data = [request.get_data(save_data=True) for request in process_requests]

In [21]:
# Load the first tile and visualize it
first_response = process_requests[0].get_data()[0]
image = first_response['default.tif']
print(image.shape)

# The evalscript returns the bands as B02 (blue), B03 (green), B04 (red), ...,
# so channel indices [2, 1, 0] reorder them to red, green, blue for display.
plot_image(image[:, :, [2, 1, 0]], factor=3.5 / 1e4, clip_range=(0, 1))

In [None]:
## Extract the data and metadata from the tar archives in the download folder
input_lyme_regis_path = "/content/drive/MyDrive/FINAL_project/full_lyme_regis_S2_L1C"


def extract_response_archives(base_path):
    """Unpack every ``response*`` tar archive found below ``base_path``.

    Each archive is extracted into a sibling directory named
    ``response_<folder>``, where ``<folder>`` is the name of the directory
    that contains the archive.

    Args:
        base_path: Directory to walk recursively. A missing directory simply
            yields no archives.

    Returns:
        List with the containing-folder name of every archive that was
        extracted, in walk order.
    """
    extracted_ids = []

    for root, _dirs, files in os.walk(base_path):
        for file_name in files:
            if not file_name.startswith("response"):
                continue

            archive_path = os.path.join(root, file_name)
            # Skip files that merely share the prefix but are not tar archives,
            # instead of aborting the whole run with a ReadError.
            if not tarfile.is_tarfile(archive_path):
                continue

            # basename/normpath instead of splitting on '/': works with any
            # path separator and with a root that contains no separator.
            folder_id = os.path.basename(os.path.normpath(root))
            print(folder_id)
            output_path = os.path.join(root, f"response_{folder_id}")

            # The context manager closes the archive even if extraction fails.
            with tarfile.open(archive_path) as archive:
                if hasattr(tarfile, "data_filter"):
                    # Refuse members that would escape output_path (absolute
                    # paths, '..', unsafe links) where the interpreter supports it.
                    archive.extractall(output_path, filter="data")
                else:
                    archive.extractall(output_path)

            extracted_ids.append(folder_id)

    return extracted_ids


folder_ids = extract_response_archives(input_lyme_regis_path)

print("Check the number of extracted folders: ", len(folder_ids))