# STAC EuroSAT

In this demo we generate STAC metadata for the [EuroSAT](https://github.com/phelber/EuroSAT) dataset.

In [1]:
# data download

import requests
import os 
from pathlib import Path
import zipfile

url = 'https://madm.dfki.de/files/sentinel/EuroSATallBands.zip'
path = Path('data')

if not os.path.exists(path / 'ds'):
	r = requests.get(url, allow_redirects=True)
	open('data/EuroSATallBands.zip', 'wb').write(r.content)
	with zipfile.ZipFile(path / 'EuroSATallBands.zip', 'r') as zip_ref:
		zip_ref.extractall(path)

The EuroSAT dataset consists of 2700 Sentinel 2 images with one label per image for scene classification. There are 10 different categories in total.

In [2]:
import pandas 
from glob import glob 

images = glob(str(path) + '/ds/**/*.tif', recursive=True)
labels = [x.split('/')[-1].split('_')[0] for x in images]
cats = os.listdir(path / 'ds/images/remote_sensing/otherDatasets/sentinel_2/tif')
ixs = [cats.index(x) for x in labels]

df = pandas.DataFrame({'image': images, 'label': labels, 'ix': ixs})
df

Unnamed: 0,image,label,ix
0,data/ds/images/remote_sensing/otherDatasets/se...,Industrial,0
1,data/ds/images/remote_sensing/otherDatasets/se...,Industrial,0
2,data/ds/images/remote_sensing/otherDatasets/se...,Industrial,0
3,data/ds/images/remote_sensing/otherDatasets/se...,Industrial,0
4,data/ds/images/remote_sensing/otherDatasets/se...,Industrial,0
...,...,...,...
26995,data/ds/images/remote_sensing/otherDatasets/se...,Pasture,9
26996,data/ds/images/remote_sensing/otherDatasets/se...,Pasture,9
26997,data/ds/images/remote_sensing/otherDatasets/se...,Pasture,9
26998,data/ds/images/remote_sensing/otherDatasets/se...,Pasture,9


In [3]:
df.ix.unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

We start by generating STAC metadata following the core STAC specification. 

- We generate a STAC item for every image in the datasets
- a STAC collection to represent the images collection
- a STAC catalog to represent the final dataset (which will include also the annotations).

https://pystac.readthedocs.io/en/stable/

In [4]:
import pystac
from datetime import datetime
import rasterio as rio
import uuid
from shapely.geometry import GeometryCollection, Polygon, box, shape, mapping
from tqdm import tqdm

In [5]:
# create empty catalog

eurosat = pystac.Catalog(id="eurosat", description="EuroSAT dataset")
eurosat

0
ID: eurosat
Description: EuroSAT dataset

0
Rel: root
Target:
Media Type: application/json


In [6]:
# create collection

# # spatial extent (should compute from images)
sp_extent = pystac.SpatialExtent([None,None,None,None])

# temporal extentn (should compute from images or given by authors)
from_date = datetime.strptime('2015-10-22', '%Y-%m-%d') # unknown
to_date = datetime.strptime('2019-10-22', '%Y-%m-%d') # unknown
tmp_extent = pystac.TemporalExtent([(from_date, to_date)])

extent = pystac.Extent(sp_extent, tmp_extent)

sentinel = pystac.Collection(id='sentinel2', description = 'EuroSAT Sentinel 2 dataset', extent = extent)
eurosat.add_child(sentinel)

eurosat

0
ID: eurosat
Description: EuroSAT dataset

0
ID: sentinel2
Description: EuroSAT Sentinel 2 dataset

0
Rel: root
Target:
Media Type: application/json

0
Rel: parent
Target:
Media Type: application/json

0
Rel: root
Target:
Media Type: application/json

0
Rel: child
Target:
Media Type: application/json


In [7]:
# creating items

dst_path = path / 'eurosat'
def create_item(image):
    params = {}
    params['id'] = image.split('/')[-1].split('.')[0] # use original name
    params['datetime'] = from_date # unknown
    params['properties'] = {}
    with rio.open(image) as src:
        params['bbox'] = list(src.bounds)
        params['geometry'] = mapping(box(*params['bbox']))
        i = pystac.Item(**params)
        image_dst_path = dst_path / f"{params['id']}.tif"
        for band in src.indexes:
            image_dst_path = dst_path / f"{params['id']}_B{band}.tif"
            out_meta = src.meta.copy()
            out_meta.update({"count": 1})
            with rio.open(image_dst_path, "w", **out_meta) as dest:
                dest.write(src.read(band), 1)
            i.add_asset(key=f'B{band}', asset=pystac.Asset(href=str(image_dst_path), title='Geotiff', media_type=pystac.MediaType.GEOTIFF))
    return i

In [8]:
import multiprocessing
from concurrent.futures import ProcessPoolExecutor

num_cores = multiprocessing.cpu_count()
with ProcessPoolExecutor(max_workers=num_cores) as pool:
    with tqdm(total=len(images)) as progress:
        futures = []
        for image in df.image:
            future = pool.submit(create_item, image) 
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)
        items = []
        for future in futures:
            result = future.result()
            items.append(result)
            
for item in tqdm(items):
  sentinel.add_item(item)

100%|██████████| 27000/27000 [01:26<00:00, 313.81it/s]
100%|██████████| 27000/27000 [00:17<00:00, 1528.85it/s]


In [9]:
# reset spatial extent

bounds = [list(GeometryCollection([shape(s.geometry) for s in eurosat.get_all_items()]).bounds)]
sentinel.extent.spatial = pystac.SpatialExtent(bounds)

In [10]:
eurosat.normalize_hrefs('eurosat-stac')

In [11]:
eurosat.validate_all() # this takes too long...

In [12]:
eurosat.save(catalog_type=pystac.CatalogType.SELF_CONTAINED)

We have created a STAC Catalog for our dataset !