<a href="https://colab.research.google.com/github/daveluo/opencitiesaichallenge-stac/blob/master/challengestac_browser_modify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install pystac

In [0]:
from pystac import (Catalog, CatalogType, Item, Asset, LabelItem, Collection)

In [0]:
# overwriting STAC_IO read method to handle http/s as per https://pystac.readthedocs.io/en/latest/concepts.html#using-stac-io

from urllib.parse import urlparse
import requests
from pystac import STAC_IO

def my_read_method(uri):
    """Read the text content found at `uri` on behalf of PySTAC.

    URIs whose scheme starts with "http" (http/https) are fetched with
    `requests`; anything else is delegated to PySTAC's default reader.

    Args:
        uri: Location of the document to read (URL or local path).

    Returns:
        The document body as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    parsed = urlparse(uri)
    if parsed.scheme.startswith('http'):
        # Bound the wait and fail loudly on error statuses; otherwise the
        # body of an error response would be returned as if it were the
        # requested document.
        response = requests.get(uri, timeout=60)
        response.raise_for_status()
        return response.text
    else:
        return STAC_IO.default_read_text_method(uri)

# Install the reader so PySTAC uses it for every text read from here on.
STAC_IO.read_text_method = my_read_method

In [0]:
# Read the two official training catalogs (tier 1 and tier 2) from the
# public competition bucket.
s3_path = 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com'
train1_cat, train2_cat = [
    Catalog.from_file(f'{s3_path}/{tier_dir}/catalog.json')
    for tier_dir in ('train_tier_1', 'train_tier_2')
]

In [5]:
# Sanity check: display the ids of the two catalogs that were just loaded.
train1_cat.id, train2_cat.id

('train_tier_1', 'train_tier_2')

In [0]:
# Build the empty top-level catalog that will hold the renamed collections.
catalog_description = 'Training data STACs modified for stac-browser use. Join the Challenge at: https://www.drivendata.org/competitions/60/building-segmentation-disaster-resilience/'
cat = Catalog(
    id='open_cities_ai_challenge',
    title='Open Cities AI Challenge',
    description=catalog_description,
)

In [0]:
# Re-read every collection of both training catalogs from the bucket, prefix
# its id with the tier name (e.g. "acc" -> "train_tier_1_acc") and attach it
# to the new catalog.
for orig_cat in (train1_cat, train2_cat):
    tier = orig_cat.id
    # Materialize the child ids first, before anything is added to `cat`.
    child_ids = [child.id for child in orig_cat.get_children()]
    for child_id in child_ids:
        col = Collection.from_file(f'{s3_path}/{tier}/{child_id}/collection.json')
        col.id = f'{tier}_{col.id}'
        cat.add_child(col)

In [8]:
 # Materialize the new catalog's children so the cell displays them.
 cols = list(cat.get_children())
 cols

[<Collection id=train_tier_1_acc>,
 <Collection id=train_tier_1_mon>,
 <Collection id=train_tier_1_ptn>,
 <Collection id=train_tier_1_kam>,
 <Collection id=train_tier_1_dar>,
 <Collection id=train_tier_1_znz>,
 <Collection id=train_tier_1_nia>,
 <Collection id=train_tier_2_dar>,
 <Collection id=train_tier_2_gao>,
 <Collection id=train_tier_2_mah>,
 <Collection id=train_tier_2_kin>,
 <Collection id=train_tier_2_nia>]

In [9]:
# Record (parent collection id, item id) for every item whose id does not
# contain 'labels'; these non-label items are redundant for browsing.
# Collecting and removing are kept as two separate passes because doing both
# in one loop raised an IndexError (cause still to be figured out).
remove_ids = [
    (item.get_parent().id, item.id)
    for item in cat.get_all_items()
    if 'labels' not in item.id
]
for parent_and_item in remove_ids:
    print(parent_and_item)

('train_tier_1_acc', '665946')
('train_tier_1_acc', 'a42435')
('train_tier_1_acc', 'ca041a')
('train_tier_1_acc', 'd41d81')
('train_tier_1_mon', '401175')
('train_tier_1_mon', '493701')
('train_tier_1_mon', '207cc7')
('train_tier_1_mon', 'f15272')
('train_tier_1_ptn', 'abe1a3')
('train_tier_1_ptn', 'f49f31')
('train_tier_1_kam', '4e7c7f')
('train_tier_1_dar', 'a017f9')
('train_tier_1_dar', 'b15fce')
('train_tier_1_dar', '353093')
('train_tier_1_dar', 'f883a0')
('train_tier_1_dar', '42f235')
('train_tier_1_dar', '0a4c40')
('train_tier_1_znz', '33cae6')
('train_tier_1_znz', '3b20d4')
('train_tier_1_znz', '076995')
('train_tier_1_znz', '75cdfa')
('train_tier_1_znz', '9b8638')
('train_tier_1_znz', '06f252')
('train_tier_1_znz', 'c7415c')
('train_tier_1_znz', 'aee7fd')
('train_tier_1_znz', '3f8360')
('train_tier_1_znz', '425403')
('train_tier_1_znz', 'bd5c14')
('train_tier_1_znz', 'e52478')
('train_tier_1_znz', 'bc32f1')
('train_tier_1_nia', '825a50')
('train_tier_2_dar', '8737a8')
('train_

In [10]:
# Second pass: remove each recorded item from its parent collection.
# The loop names deliberately avoid `id`, which would shadow the builtin
# `id()` for the rest of the kernel session.
for parent_id, item_id in remove_ids:
  print(parent_id, item_id)
  cat.get_child(parent_id).remove_item(item_id)

train_tier_1_acc 665946
train_tier_1_acc a42435
train_tier_1_acc ca041a
train_tier_1_acc d41d81
train_tier_1_mon 401175
train_tier_1_mon 493701
train_tier_1_mon 207cc7
train_tier_1_mon f15272
train_tier_1_ptn abe1a3
train_tier_1_ptn f49f31
train_tier_1_kam 4e7c7f
train_tier_1_dar a017f9
train_tier_1_dar b15fce
train_tier_1_dar 353093
train_tier_1_dar f883a0
train_tier_1_dar 42f235
train_tier_1_dar 0a4c40
train_tier_1_znz 33cae6
train_tier_1_znz 3b20d4
train_tier_1_znz 076995
train_tier_1_znz 75cdfa
train_tier_1_znz 9b8638
train_tier_1_znz 06f252
train_tier_1_znz c7415c
train_tier_1_znz aee7fd
train_tier_1_znz 3f8360
train_tier_1_znz 425403
train_tier_1_znz bd5c14
train_tier_1_znz e52478
train_tier_1_znz bc32f1
train_tier_1_nia 825a50
train_tier_2_dar 8737a8
train_tier_2_dar e14d1d
train_tier_2_dar 8d7dd4
train_tier_2_dar 9870ba
train_tier_2_dar 3b3e53
train_tier_2_dar 0ccd08
train_tier_2_dar ab32c9
train_tier_2_dar 94a004
train_tier_2_dar fb4c1a
train_tier_2_dar ca3445
train_tier_2_dar

In [11]:
# Print the catalog tree to check which items remain after the removal pass.
cat.describe()

* <Catalog id=open_cities_ai_challenge>
    * <Collection id=train_tier_1_acc>
      * <LabelItem id=665946-labels>
      * <LabelItem id=a42435-labels>
      * <LabelItem id=ca041a-labels>
      * <LabelItem id=d41d81-labels>
    * <Collection id=train_tier_1_mon>
      * <LabelItem id=401175-labels>
      * <LabelItem id=493701-labels>
      * <LabelItem id=207cc7-labels>
      * <LabelItem id=f15272-labels>
    * <Collection id=train_tier_1_ptn>
      * <LabelItem id=abe1a3-labels>
      * <LabelItem id=f49f31-labels>
    * <Collection id=train_tier_1_kam>
      * <LabelItem id=4e7c7f-labels>
    * <Collection id=train_tier_1_dar>
      * <LabelItem id=a017f9-labels>
      * <LabelItem id=b15fce-labels>
      * <LabelItem id=353093-labels>
      * <LabelItem id=f883a0-labels>
      * <LabelItem id=42f235-labels>
      * <LabelItem id=0a4c40-labels>
    * <Collection id=train_tier_1_znz>
      * <LabelItem id=33cae6-labels>
      * <LabelItem id=3b20d4-labels>
      * <LabelItem id=0

In [0]:
# Base URL used below when building image asset hrefs; it is the same bucket
# root as s3_path.
assets_path = s3_path

In [0]:
# Turn every asset href in the catalog into an absolute URL so the assets
# keep pointing at the S3 bucket.
cat.make_all_asset_hrefs_absolute()

In [20]:
# List the ids of all items left in the catalog. The loop variable `i` stays
# bound to the last item afterwards; the next cell inspects it.
for i in cat.get_all_items():
  print(i.id)

665946-labels
a42435-labels
ca041a-labels
d41d81-labels
401175-labels
493701-labels
207cc7-labels
f15272-labels
abe1a3-labels
f49f31-labels
4e7c7f-labels
a017f9-labels
b15fce-labels
353093-labels
f883a0-labels
42f235-labels
0a4c40-labels
33cae6-labels
3b20d4-labels
076995-labels
75cdfa-labels
9b8638-labels
06f252-labels
c7415c-labels
aee7fd-labels
3f8360-labels
425403-labels
bd5c14-labels
e52478-labels
bc32f1-labels
825a50-labels
8737a8-labels
e14d1d-labels
8d7dd4-labels
9870ba-labels
3b3e53-labels
0ccd08-labels
ab32c9-labels
94a004-labels
fb4c1a-labels
ca3445-labels
82a1f3-labels
cf83de-labels
97ce35-labels
24a7d8-labels
b8faa3-labels
63c3f9-labels
c533fa-labels
5fe6fb-labels
385a0e-labels
759e34-labels
ef8f27-labels
1d8af6-labels
d2f2f4-labels
eadfa3-labels
cbf72d-labels
5fadcd-labels
56e713-labels
219237-labels
109874-labels
f9d120-labels
541eff-labels
4f38e1-labels
048ffb-labels
71e6c2-labels
bd129c-labels
240168-labels
5fe2d3-labels
10d200-labels
255028-labels
982a1f-labels
b80615

In [21]:
# Inspect the last item from the loop above (`i`) to see its current assets.
i.to_dict()

{'assets': {'labels': {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_2/nia/b80615-labels/b80615.geojson',
   'type': 'application/geo+json'}},
 'bbox': [2.1333151101084424,
  13.461573543437861,
  2.1504421214572758,
  13.478692840960852],
 'geometry': {'coordinates': [[[2.1450795538629968, 13.462585681371642],
    [2.1450502285064967, 13.462607657702868],
    [2.1450348553964735, 13.46267340918685],
    [2.1449378706610887, 13.462752910646255],
    [2.144951977178319, 13.462850043383819],
    [2.1448575653828255, 13.462910441120258],
    [2.1447498699154384, 13.463010688826913],
    [2.1447075548759513, 13.463028776318522],
    [2.14464619936299, 13.463103270788855],
    [2.144612721500606, 13.46317247163292],
    [2.144423450822351, 13.463306796207672],
    [2.1443864699054136, 13.463348636678672],
    [2.1443626791631747, 13.463351185074725],
    [2.1442718367318343, 13.463441366425421],
    [2.144255540825557, 13.463487668393142

In [0]:
# Attach the matching image asset to every label item, and declare
# label:classes so the stac-browser preview works correctly.
# The loop variable must stay `i`: the following cell inspects the last item.
for i in cat.get_all_items():
    if 'labels' not in i.id:
        continue
    # Collection ids look like "<tier>_<area>" with a 3-letter area code, so
    # dropping the last 4 characters yields the tier directory name.
    tier_dir = i.get_parent().id[:-4]
    scene_id = i.id.replace('-labels','')
    image_href = f"{assets_path}/{tier_dir}/{i.properties['area']}/{scene_id}/{scene_id}.tif"
    image_asset = Asset(
        href=image_href,
        media_type="image/tiff; application=geotiff; profile=cloud-optimized",
    )
    i.add_asset(key='image', asset=image_asset)
    i.properties['label:classes'] = [{"name":"building", "classes": ["yes"]}]

In [23]:
# Check that the image asset was added with the correct href; `i` is the last
# item visited by the loop in the previous cell.
i.to_dict()

{'assets': {'image': {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_2/nia/b80615/b80615.tif',
   'type': 'image/tiff; application=geotiff; profile=cloud-optimized'},
  'labels': {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_2/nia/b80615-labels/b80615.geojson',
   'type': 'application/geo+json'}},
 'bbox': [2.1333151101084424,
  13.461573543437861,
  2.1504421214572758,
  13.478692840960852],
 'geometry': {'coordinates': [[[2.1450795538629968, 13.462585681371642],
    [2.1450502285064967, 13.462607657702868],
    [2.1450348553964735, 13.46267340918685],
    [2.1449378706610887, 13.462752910646255],
    [2.144951977178319, 13.462850043383819],
    [2.1448575653828255, 13.462910441120258],
    [2.1447498699154384, 13.463010688826913],
    [2.1447075548759513, 13.463028776318522],
    [2.14464619936299, 13.463103270788855],
    [2.144612721500606, 13.46317247163292],
    [2.14442345

In [0]:
# Lookup from 3-letter area code to a human-readable place name.
aoi_dict = dict(
    ptn='Pointe-Noire, Congo',
    mon='Monrovia, Liberia',
    kin='Kinshasa, DR Congo',
    nia='Niamey, Niger',
    acc='Accra, Ghana',
    dar='Dar es Salaam, Tanzania',
    gao='Ngaoundere, Cameroon',
    znz='Zanzibar (Unguja Island), Tanzania',
    mah='Mahe Island, Seychelles',
    kam='Kampala, Uganda',
)

In [0]:
from pystac import Provider

# Provider records attached to every collection: (name, single role, url).
providers = [
    Provider(name=provider_name, roles=[provider_role], url=provider_url)
    for provider_name, provider_role, provider_url in (
        ("OpenStreetMap", "producer", "https://www.openstreetmap.org"),
        ("GFDRR Labs", "processor", "https://www.gfdrr.org/en/gfdrr-labs"),
        ("Azavea", "processor", "https://www.azavea.com/"),
        ("DrivenData", "host", "https://www.drivendata.org"),
    )
]

In [0]:
# Give every collection a readable title and the shared provider list.
# The loop variable must stay `col`: the following cell inspects the last one.
for col in cat.get_children():
    # Collection ids look like "<tier>_<area>" with a 3-letter area code.
    tier_name, area_code = col.id[:-4], col.id[-3:]
    col.title = f'{tier_name} collection for {aoi_dict[area_code]}'
    col.providers = providers

In [27]:
# Inspect the last collection from the loop above (`col`) to check the
# metadata that was just added.
col.to_dict()

{'description': 'Tier 2 training data from nia',
 'extent': {'spatial': {'bbox': [[2.1333151101084424,
     13.461573543437861,
     2.1504421214572758,
     13.478692840960852]]},
  'temporal': {'interval': [['2019-10-29T00:00:00Z', None]]}},
 'id': 'train_tier_2_nia',
 'license': 'various',
 'links': [{'href': './982a1f-labels/982a1f-labels.json',
   'rel': 'item',
   'type': 'application/json'},
  {'href': './b80615-labels/b80615-labels.json',
   'rel': 'item',
   'type': 'application/json'},
  {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_2/nia/collection.json',
   'rel': 'self',
   'type': 'application/json'},
  {'href': None, 'rel': 'root', 'type': 'application/json'},
  {'href': None, 'rel': 'parent', 'type': 'application/json'}],
 'providers': [{'name': 'OpenStreetMap',
   'roles': ['producer'],
   'url': 'https://www.openstreetmap.org'},
  {'name': 'GFDRR Labs',
   'roles': ['processor'],
   'url': 'https://www.gfdrr.org/e

In [28]:
# Inspect the top-level catalog's serialized form, including its child links.
cat.to_dict()

{'description': 'Training data STACs modified for stac-browser use. Join the Challenge at: https://www.drivendata.org/competitions/60/building-segmentation-disaster-resilience/',
 'id': 'open_cities_ai_challenge',
 'links': [{'href': None, 'rel': 'root', 'type': 'application/json'},
  {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/acc/collection.json',
   'rel': 'child',
   'type': 'application/json'},
  {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/mon/collection.json',
   'rel': 'child',
   'type': 'application/json'},
  {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/ptn/collection.json',
   'rel': 'child',
   'type': 'application/json'},
  {'href': 'https://drivendata-competition-building-segmentation.s3-us-west-1.amazonaws.com/train_tier_1/kam/collection.json',
   'rel': 'child',
   'type': 'application/json'},
  {'

In [0]:
# Normalize the catalog's hrefs against the 'challenge-stac' root and write
# it out as a RELATIVE_PUBLISHED catalog.
cat.normalize_and_save('challenge-stac', catalog_type=CatalogType.RELATIVE_PUBLISHED)

In [31]:
!tar -cvf challenge-stac.tar.gz challenge-stac

challenge-stac/
challenge-stac/train_tier_1_mon/
challenge-stac/train_tier_1_mon/493701-labels/
challenge-stac/train_tier_1_mon/493701-labels/493701-labels.json
challenge-stac/train_tier_1_mon/401175-labels/
challenge-stac/train_tier_1_mon/401175-labels/401175-labels.json
challenge-stac/train_tier_1_mon/f15272-labels/
challenge-stac/train_tier_1_mon/f15272-labels/f15272-labels.json
challenge-stac/train_tier_1_mon/collection.json
challenge-stac/train_tier_1_mon/207cc7-labels/
challenge-stac/train_tier_1_mon/207cc7-labels/207cc7-labels.json
challenge-stac/train_tier_1_acc/
challenge-stac/train_tier_1_acc/ca041a-labels/
challenge-stac/train_tier_1_acc/ca041a-labels/ca041a-labels.json
challenge-stac/train_tier_1_acc/665946-labels/
challenge-stac/train_tier_1_acc/665946-labels/665946-labels.json
challenge-stac/train_tier_1_acc/d41d81-labels/
challenge-stac/train_tier_1_acc/d41d81-labels/d41d81-labels.json
challenge-stac/train_tier_1_acc/a42435-labels/
challenge-stac/train_tier_1_acc/a42435-