In [1]:
%load_ext autoreload
%autoreload 2

import os
# Set the EOTDL API endpoint to a server listening on localhost:8000.
# NOTE(review): presumably eotdl reads EOTDL_API_URL when it is imported, so this
# assignment should run before any `from eotdl ...` import - confirm against the library.
os.environ["EOTDL_API_URL"] = "http://localhost:8000/"


New way to ingest datasets:

1. In order to ingest a dataset to EOTDL we require:
	- `catalog.parquet`: A parquet file representing the STAC catalog/collection as a list of STAC items.
	- `README.md`: A markdown file with the metadata of the dataset.
2. The parquet file is autogenerated for all these cases:
	- Ingest all files in a folder (without STAC metadata)
	- Provide a list of links to files (virtual datasets)
	- Ingest an existing STAC catalog

Only local assets will be ingested into EOTDL (not URLs).

# Ingesting a dataset from a folder

If the user wants to ingest a dataset from a folder without STAC metadata, we first read all files in the folder recursively and create a parquet file.

In [2]:
from glob import glob

# Folder holding the dataset that the following cells ingest.
path = 'data/EuroSAT-RGB-small'
# Alternative folder (left disabled):
# path = 'data/EuroSAT-RGB'

# Optional manual inspection (left disabled): list every file under `path`
# recursively, then preview the file count and the first three entries.
# files = glob(path + '/**/*', recursive=True)

# len(files), files[:3]

In order to ingest any dataset into EOTDL, we require a README.md file with some mandatory metadata.

In [3]:
# Create the README.md that accompanies the dataset in `path`.
# The YAML front matter (between the `---` markers) carries the metadata fields
# name, authors, license and source; the markdown below it is the description.

text = """---
name: EuroSAT-RGB-small
authors: 
  - Juan B. Pedro
license: free
source: https://github.com/earthpulse/eotdl/blob/develop/tutorials/workshops/philab24/02_prototype_ingesting.ipynb
---

# EuroSAT-RGB-small

This is a prototype of the EuroSAT dataset.
"""

# Explicit UTF-8 so the file is encoded the same way on every platform instead
# of depending on the locale's default encoding (matters as soon as the
# metadata contains non-ASCII characters, e.g. in author names).
with open(f"{path}/README.md", "w", encoding="utf-8") as outfile:
    outfile.write(text)

In [4]:
from eotdl.datasets import ingest_dataset

# Ingest the folder at `path`. In the recorded run this reported progress for
# the files it processed and returned the path of the generated `catalog.parquet`.
ingest_dataset(path)

  np.nanmin(b[:, 0]),  # minx
  np.nanmin(b[:, 1]),  # miny
  np.nanmax(b[:, 2]),  # maxx
  np.nanmax(b[:, 3]),  # maxy


Ingesting directory:  data/EuroSAT-RGB-small


Ingesting files: 100%|██████████| 102/102 [00:01<00:00, 51.08it/s]


PosixPath('data/EuroSAT-RGB-small/catalog.parquet')

In [5]:
import geopandas as gpd

# Load the catalog.parquet found under `path` and display it as the cell output.
gpd.read_parquet(f"{path}/catalog.parquet")

Unnamed: 0,stac_extensions,id,bbox,geometry,assets,links,collection,abc,123
0,[],catalog.parquet,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'catalog.parquet'}},[],source,[],"{'asfhjk': [1, 2, 3]}"
1,[],README.md,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'README.md'}},[],source,[],"{'asfhjk': [1, 2, 3]}"
2,[],Industrial/Industrial_1743.jpg,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'Industrial/Industrial_1743...,[],source,[],"{'asfhjk': [1, 2, 3]}"
3,[],Industrial/Industrial_1273.jpg,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'Industrial/Industrial_1273...,[],source,[],"{'asfhjk': [1, 2, 3]}"
4,[],Industrial/Industrial_1117.jpg,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'Industrial/Industrial_1117...,[],source,[],"{'asfhjk': [1, 2, 3]}"
...,...,...,...,...,...,...,...,...,...
97,[],Pasture/Pasture_650.jpg,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'Pasture/Pasture_650.jpg'}},[],source,[],"{'asfhjk': [1, 2, 3]}"
98,[],Pasture/Pasture_370.jpg,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'Pasture/Pasture_370.jpg'}},[],source,[],"{'asfhjk': [1, 2, 3]}"
99,[],Pasture/Pasture_1976.jpg,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'Pasture/Pasture_1976.jpg'}},[],source,[],"{'asfhjk': [1, 2, 3]}"
100,[],Pasture/Pasture_839.jpg,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'Pasture/Pasture_839.jpg'}},[],source,[],"{'asfhjk': [1, 2, 3]}"


In [7]:
# import pyarrow.parquet as pq
# import stac_geoparquet

# table = pq.read_table(path + "/catalog.parquet")

# for item in stac_geoparquet.arrow.stac_table_to_items(table):
# 	print(item)
# 	break

`ingest_dataset` will get all files in the folder recursively, create a simple catalog.json and ingest it into EOTDL.

# Ingesting a dataset from a list of links

We can ingest a new dataset from a list of links (huggingface, s3, etc).


In [7]:
# Three placeholder URLs used as the assets of the example dataset.
links = [f'https://link{index}.com' for index in range(1, 4)]

# Dataset metadata: name, authors, license, source and a markdown description.
metadata = dict(
    name='Test-links',
    authors=['Juan B. Pedro'],
    license='free',
    source='https://github.com/earthpulse/eotdl/blob/develop/tutorials/workshops/philab24/02_prototype_ingesting.ipynb',
    description=(
        "# Test links\n"
        "\n"
        "Testing the ingestion of a dataset from a list of links.\n"
    ),
)


In [8]:
from eotdl.datasets import ingest_virutal_dataset

# NOTE(review): `path` is assigned here but is not passed to the call below;
# confirm whether the library relies on it or whether it can be dropped.
path = 'data/test-links'

# Ingest a dataset described only by `metadata` and the list of `links`.
# In the recorded run this returned 'Dataset ingested successfully'.
ingest_virutal_dataset(metadata, links)

  np.nanmin(b[:, 0]),  # minx
  np.nanmin(b[:, 1]),  # miny
  np.nanmax(b[:, 2]),  # maxx
  np.nanmax(b[:, 3]),  # maxy


'Dataset ingested successfully'

`ingest_virutal_dataset` will create a simple catalog.json with links as items and ingest it into EOTDL. We can choose whether or not to replicate the assets in EOTDL (i.e. use the direct sources).

In [9]:
import shutil

# Remove the data/test-links folder. Pure Python instead of `!rm -rf` so the
# cell also works where no POSIX shell is available; ignore_errors=True keeps
# it a no-op when the folder does not exist.
shutil.rmtree('data/test-links', ignore_errors=True)

# Ingesting a dataset from a catalog


If a STAC catalog already exists, we can ingest it into EOTDL. In this case, create a README.md and place it in the root of the catalog.

In [8]:
path = 'data/EuroSAT-RGB-small-STAC'

files = os.listdir(path)
assert 'catalog.json' in files, "catalog.json not found"

!cat data/EuroSAT-RGB-small-STAC/catalog.json

{
  "type": "Catalog",
  "id": "EuroSAT-RGB-Q1",
  "stac_version": "1.0.0",
  "description": "EuroSAT-RGB dataset",
  "links": [
    {
      "rel": "root",
      "href": "./catalog.json",
      "type": "application/json"
    },
    {
      "rel": "child",
      "href": "./source/collection.json",
      "type": "application/json"
    },
    {
      "rel": "child",
      "href": "./labels/collection.json",
      "type": "application/json"
    }
  ]
}

In [9]:
# Create the README.md in the root of the STAC catalog at `path`.
# The YAML front matter (between the `---` markers) carries the metadata fields
# name, authors, license and source; the markdown below it is the description.

text = """---
name: EuroSAT-RGB-small-STAC
authors: 
  - Juan B. Pedro
license: free
source: https://github.com/earthpulse/eotdl/blob/develop/tutorials/workshops/philab24/02_prototype_ingesting.ipynb
---

# EuroSAT-RGB-small-STAC

This is a prototype of the EuroSAT dataset.
"""

# Explicit UTF-8 so the file is encoded the same way on every platform instead
# of depending on the locale's default encoding (matters as soon as the
# metadata contains non-ASCII characters, e.g. in author names).
with open(f"{path}/README.md", "w", encoding="utf-8") as outfile:
    outfile.write(text)

In [1]:
from eotdl.datasets import ingest_dataset

path = 'data/EuroSAT-RGB-small-STAC'

# Ingest the existing STAC catalog rooted at `path`.
# NOTE(review): in the recorded run this iterated the `source` and `labels`
# collections and then failed with
# AttributeError: 'list' object has no attribute 'items'.
ingest_dataset(path)

Ingesting items from collection source: 100it [00:00, 1157.57it/s]
Ingesting items from collection labels: 100it [00:00, 3389.09it/s]


AttributeError: 'list' object has no attribute 'items'

It seems that stac-geoparquet expects the assets as a list of dicts, but pystac stores them as a dictionary of dicts.

In [49]:
import pystac

# Parse the root catalog.json under `path` with pystac and display the result.
catalog = pystac.Catalog.from_file(f"{path}/catalog.json")
catalog

In [50]:
# Peek at the catalog: print the id of the first collection and, if that
# collection has any items, the id of its first item. Nothing else is visited.
collection = next(iter(catalog.get_collections()), None)
if collection is not None:
    print(collection.id)
    item = next(iter(collection.get_items()), None)
    if item is not None:
        print(item.id)


source
Industrial_1743


In [2]:
import geopandas as gpd

# Read the catalog.parquet under `path` into `gdf` and display it.
gdf = gpd.read_parquet(f"{path}/catalog.parquet")
gdf

Unnamed: 0,assets,bbox,collection,geometry,id,links,stac_extensions,stac_version,type,datetime,label:classes,label:description,label:methods,label:properties,label:tasks,label:type
0,"{'AnnualCrop_1033': None, 'AnnualCrop_1101': N...","{'xmin': 0, 'ymin': 0, 'xmax': 0, 'ymax': 0}",source,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Industrial_1743,[{'href': '/home/juan/Desktop/eotdl/upgrade/da...,[],1.0.0,Feature,2000-01-01 00:00:00+00:00,,,,,,
1,"{'AnnualCrop_1033': None, 'AnnualCrop_1101': N...","{'xmin': 0, 'ymin': 0, 'xmax': 0, 'ymax': 0}",source,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Industrial_1273,[{'href': '/home/juan/Desktop/eotdl/upgrade/da...,[],1.0.0,Feature,2000-01-01 00:00:00+00:00,,,,,,
2,"{'AnnualCrop_1033': None, 'AnnualCrop_1101': N...","{'xmin': 0, 'ymin': 0, 'xmax': 0, 'ymax': 0}",source,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Industrial_1117,[{'href': '/home/juan/Desktop/eotdl/upgrade/da...,[],1.0.0,Feature,2000-01-01 00:00:00+00:00,,,,,,
3,"{'AnnualCrop_1033': None, 'AnnualCrop_1101': N...","{'xmin': 0, 'ymin': 0, 'xmax': 0, 'ymax': 0}",source,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Industrial_1121,[{'href': '/home/juan/Desktop/eotdl/upgrade/da...,[],1.0.0,Feature,2000-01-01 00:00:00+00:00,,,,,,
4,"{'AnnualCrop_1033': None, 'AnnualCrop_1101': N...","{'xmin': 0, 'ymin': 0, 'xmax': 0, 'ymax': 0}",source,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Industrial_1641,[{'href': '/home/juan/Desktop/eotdl/upgrade/da...,[],1.0.0,Feature,2000-01-01 00:00:00+00:00,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,"{'AnnualCrop_1033': None, 'AnnualCrop_1101': N...","{'xmin': 0, 'ymin': 0, 'xmax': 0, 'ymax': 0}",labels,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Pasture_650,[{'href': '../../source/Pasture_650/Pasture_65...,[https://stac-extensions.github.io/label/v1.0....,1.0.0,Feature,2000-01-01 00:00:00+00:00,"[{'classes': ['Industrial', 'Forest', 'Herbace...",Item label,[manual],[label],[classification],vector
196,"{'AnnualCrop_1033': None, 'AnnualCrop_1101': N...","{'xmin': 0, 'ymin': 0, 'xmax': 0, 'ymax': 0}",labels,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Pasture_370,[{'href': '../../source/Pasture_370/Pasture_37...,[https://stac-extensions.github.io/label/v1.0....,1.0.0,Feature,2000-01-01 00:00:00+00:00,"[{'classes': ['Industrial', 'Forest', 'Herbace...",Item label,[manual],[label],[classification],vector
197,"{'AnnualCrop_1033': None, 'AnnualCrop_1101': N...","{'xmin': 0, 'ymin': 0, 'xmax': 0, 'ymax': 0}",labels,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Pasture_1976,[{'href': '../../source/Pasture_1976/Pasture_1...,[https://stac-extensions.github.io/label/v1.0....,1.0.0,Feature,2000-01-01 00:00:00+00:00,"[{'classes': ['Industrial', 'Forest', 'Herbace...",Item label,[manual],[label],[classification],vector
198,"{'AnnualCrop_1033': None, 'AnnualCrop_1101': N...","{'xmin': 0, 'ymin': 0, 'xmax': 0, 'ymax': 0}",labels,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Pasture_839,[{'href': '../../source/Pasture_839/Pasture_83...,[https://stac-extensions.github.io/label/v1.0....,1.0.0,Feature,2000-01-01 00:00:00+00:00,"[{'classes': ['Industrial', 'Forest', 'Herbace...",Item label,[manual],[label],[classification],vector


In [46]:
import json
from glob import glob
from tqdm import tqdm


# Rewrite every STAC item (JSON with "type" == "Feature") found under `path`:
# its "assets" mapping is replaced by the list of its values and its "links"
# are cleared.
# WARNING: the item files are overwritten in place; keep a copy of the catalog
# if the original structure is needed again.
# NOTE(review): the STAC spec defines `assets` as a mapping, so list-shaped
# assets may themselves trigger errors such as
# "'list' object has no attribute 'items'" in tools that expect a dict -
# confirm this conversion is really wanted before re-running it.
files = glob(path + "/**/*.json", recursive=True)

for file in tqdm(files):
    # Read and close the file first, so it is never open for reading and
    # writing at the same time (and the handle name is not shadowed).
    with open(file, "r", encoding="utf-8") as src:
        data = json.load(src)

    # Skip anything that is not a STAC item (catalogs, collections, other JSON)
    # instead of raising on a missing "type" key or a non-object document.
    if not isinstance(data, dict) or data.get("type") != "Feature":
        continue

    # Only dict-shaped assets need converting; items without assets or with
    # already-converted assets are left untouched.
    assets = data.get("assets")
    if not isinstance(assets, dict):
        continue

    data["assets"] = list(assets.values())
    data["links"] = []
    with open(file, "w", encoding="utf-8") as dst:
        json.dump(data, dst)


100%|██████████| 203/203 [00:00<00:00, 5564.63it/s]
