In [1]:
# Reload imported modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

import os
# Point this session at a local API server.
# NOTE(review): presumably read by the eotdl client, so it likely has to be set before eotdl is imported — confirm.
os.environ["EOTDL_API_URL"] = "http://localhost:8000/"


New way to ingest datasets:

1. In order to ingest a dataset to EOTDL we require:
	- `catalog.parquet`: A parquet file representing the STAC catalog/collection as a list of STAC items.
	- `README.md`: A markdown file with the metadata of the dataset.
2. The parquet file is autogenerated for all these cases:
	- Ingest all files in a folder (without STAC metadata)
	- Provide a list of links to files (virtual datasets)
	- Ingest an existing STAC catalog

Only local assets will be ingested to the EOTDL (not URLs).

# Ingesting a dataset from a folder

If the user wants to ingest a dataset from a folder without STAC metadata, we first read all files in the folder recursively and create a parquet file.

In [2]:
# `glob` is only needed by the commented-out file listing below.
from glob import glob

# Folder with the dataset files; the following cells in this section read and write inside it.
path = 'data/EuroSAT-RGB-small'
# path = 'data/EuroSAT-RGB'

# # retrieve all files in the folder recursively
# files = glob(path + '/**/*', recursive=True)

# len(files), files[:3]

In [3]:
# Start from a clean state: remove the metadata files left behind by a previous
# run. The folder comes from `path` (instead of a hard-coded copy of it) so the
# clean-up follows whichever dataset was selected above, and it is done in pure
# Python so it does not depend on a POSIX shell.
for stale_name in ("README.md", "catalog.parquet"):
    try:
        os.remove(os.path.join(path, stale_name))
    except FileNotFoundError:
        pass  # nothing to clean up, e.g. on the first run

In order to ingest any dataset to EOTDL, we require a README.md file with some mandatory metadata.

In [4]:
# create README.md

# The block between the `---` markers is the metadata header (name, authors,
# license, source); the markdown after it is the dataset description.
text = """---
name: EuroSAT-RGB-small
authors: 
  - Juan B. Pedro
license: free
source: https://github.com/earthpulse/eotdl/blob/develop/tutorials/workshops/philab24/02_prototype_ingesting.ipynb
---

# EuroSAT-RGB-small

This is a prototype of the EuroSAT dataset.
"""

# Write with an explicit encoding so the file content does not depend on the
# platform's default locale encoding.
with open(f"{path}/README.md", "w", encoding="utf-8") as outfile:
    outfile.write(text)

In [6]:
from eotdl.datasets import ingest_dataset

# Ingest the folder at `path`. The saved output below shows the call returning
# the location of the generated catalog.parquet.
ingest_dataset(path)

  np.nanmin(b[:, 0]),  # minx
  np.nanmin(b[:, 1]),  # miny
  np.nanmax(b[:, 2]),  # maxx
  np.nanmax(b[:, 3]),  # maxy


Ingesting directory: data/EuroSAT-RGB-small
current version:  1


Ingesting files: 100%|██████████| 102/102 [00:00<00:00, 213.42it/s]

A new version was created, your dataset has changed.
Num changes: 2



  np.nanmin(b[:, 0]),  # minx
  np.nanmin(b[:, 1]),  # miny
  np.nanmax(b[:, 2]),  # maxx
  np.nanmax(b[:, 3]),  # maxy


PosixPath('data/EuroSAT-RGB-small/catalog.parquet')

In [15]:
import geopandas as gpd

# Load the catalog parquet that sits inside the dataset folder and preview its first rows.
gdf = gpd.read_parquet(f"{path}/catalog.parquet")
gdf.head()

Unnamed: 0,type,stac_version,stac_extensions,datetime,id,bbox,geometry,assets,links,repository
0,Feature,1.0.0,[],2025-02-06 12:33:57.165516,catalog.parquet,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'http://localhost:8000/data...,[],eotdl
1,Feature,1.0.0,[],2025-02-06 12:33:57.165606,README.md,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'http://localhost:8000/data...,[],eotdl
2,Feature,1.0.0,[],2025-02-06 12:33:57.165671,Industrial/Industrial_1743.jpg,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'http://localhost:8000/data...,[],eotdl
3,Feature,1.0.0,[],2025-02-06 12:33:57.165704,Industrial/Industrial_1273.jpg,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'http://localhost:8000/data...,[],eotdl
4,Feature,1.0.0,[],2025-02-06 12:33:57.165735,Industrial/Industrial_1117.jpg,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'http://localhost:8000/data...,[],eotdl


In [16]:
# Show the 'href' stored under the 'asset' key for the row with index 0.
gdf.assets[0]['asset']['href']

'http://localhost:8000/datasets/67a37f8d21a28cbb5f16d9bc/stage/catalog.parquet'

We can generate a STAC catalog from the parquet file.

In [None]:
# import pyarrow.parquet as pq
# import stac_geoparquet
# import json
# from tqdm import tqdm
# import pystac

# table = pq.read_table(path + "/catalog.parquet")

# # path = "data/stac"
# os.makedirs(path, exist_ok=True)

# for item in tqdm(stac_geoparquet.arrow.stac_table_to_items(table)):
# 	item = pystac.Item.from_dict(item)
# 	# path = "data/stac/" + item["id"] + ".json"
# 	# os.makedirs(os.path.dirname(path), exist_ok=True)
# 	# with open(path, "w") as f:
# 	# 	json.dump(item, f)
# 	item.validate()
# 	# save item
# 	os.makedirs(path, exist_ok=True)
# 	_path = path + '/' + item.id + ".json"
# 	os.makedirs(os.path.dirname(_path), exist_ok=True)
# 	with open(_path, "w") as f:
# 		json.dump(item.to_dict(), f)

0it [00:00, ?it/s]

102it [00:00, 365.37it/s]


Optionally, we could use `pystac` to create the STAC catalog, collections and links between items and collections.

`ingest_dataset(path)` will get all files in the folder recursively, create a simple catalog.json and ingest it into EOTDL.

# Ingesting a dataset from a list of links

We can ingest a new dataset from a list of links (huggingface, s3, etc).


In [2]:
# Placeholder URLs standing in for the remote files that make up the dataset.
links = [
    "https://link1.com",
    "https://link2.com",
    "https://link3.com",
]

# Dataset metadata: the same fields as the README header used for folder
# datasets, plus the markdown description.
metadata = dict(
    name="Test-links",
    authors=["Juan B. Pedro"],
    license="free",
    source="https://github.com/earthpulse/eotdl/blob/develop/tutorials/workshops/philab24/02_prototype_ingesting.ipynb",
    description="""# Test links

Testing the ingestion of a dataset from a list of links.
""",
)


In [3]:
# NOTE(review): the function name is spelled `virutal` here; the saved output
# shows the call succeeded, so this appears to be the exported name — confirm
# against the library before renaming.
from eotdl.datasets import ingest_virutal_dataset

# Local folder for this dataset; the next cell reads catalog.parquet from it.
path = 'data/test-links'

ingest_virutal_dataset(path, links, metadata)

  np.nanmin(b[:, 0]),  # minx
  np.nanmin(b[:, 1]),  # miny
  np.nanmax(b[:, 2]),  # maxx
  np.nanmax(b[:, 3]),  # maxy


'Dataset ingested successfully'

`ingest_virutal_dataset` will create a simple catalog.json with links as items and ingest it into EOTDL. We can choose if we want to replicate the assets in EOTDL or not (use direct sources).

In [4]:
import geopandas as gpd

# Display the catalog generated for the list of links.
gpd.read_parquet(f"{path}/catalog.parquet")

Unnamed: 0,type,stac_version,stac_extensions,datetime,id,bbox,geometry,assets,links,collection,repository
0,Feature,1.0.0,[],2025-02-06 10:49:41.003037,https://link1.com,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'https://link1.com'}},[],Test-links,eotdl
1,Feature,1.0.0,[],2025-02-06 10:49:41.003122,https://link2.com,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'https://link2.com'}},[],Test-links,eotdl
2,Feature,1.0.0,[],2025-02-06 10:49:41.003133,https://link3.com,"{'xmax': 0.0, 'xmin': 0.0, 'ymax': 0.0, 'ymin'...",POLYGON EMPTY,{'asset': {'href': 'https://link3.com'}},[],Test-links,eotdl


In [9]:
# Clean up: delete the local data/test-links folder used by the links example above.
!rm -rf data/test-links

# Ingesting a dataset from a catalog


If a STAC catalog already exists, we can ingest it into EOTDL. In this case, create a README.md and place it in the root of the catalog.

In [9]:
path = 'data/EuroSAT-RGB-small-STAC'

# This section expects a catalog.json at the root of the folder; fail early if it is missing.
files = os.listdir(path)
assert 'catalog.json' in files, "catalog.json not found"

# Print the root catalog. The file is located through `path` (not a second
# hard-coded copy of the folder name) and read in Python, so the cell keeps
# working if `path` changes and does not need a POSIX shell.
with open(os.path.join(path, 'catalog.json'), encoding='utf-8') as catalog_file:
    print(catalog_file.read())

{
  "type": "Catalog",
  "id": "EuroSAT-RGB-Q1",
  "stac_version": "1.0.0",
  "description": "EuroSAT-RGB dataset",
  "links": [
    {
      "rel": "root",
      "href": "./catalog.json",
      "type": "application/json"
    },
    {
      "rel": "child",
      "href": "./source/collection.json",
      "type": "application/json"
    },
    {
      "rel": "child",
      "href": "./labels/collection.json",
      "type": "application/json"
    }
  ]
}

In [10]:
# create README.md

# The block between the `---` markers is the metadata header (name, authors,
# license, source); the markdown after it is the dataset description.
text = """---
name: EuroSAT-RGB-small-STAC
authors: 
  - Juan B. Pedro
license: free
source: https://github.com/earthpulse/eotdl/blob/develop/tutorials/workshops/philab24/02_prototype_ingesting.ipynb
---

# EuroSAT-RGB-small-STAC

This is a prototype of the EuroSAT dataset.
"""

# Write with an explicit encoding so the file content does not depend on the
# platform's default locale encoding.
with open(f"{path}/README.md", "w", encoding="utf-8") as outfile:
    outfile.write(text)

In [13]:
from eotdl.datasets import ingest_dataset

# Re-assigned here with the same value used by the previous cells of this section.
path = 'data/EuroSAT-RGB-small-STAC'

# Ingest the STAC catalog folder; the saved output below shows items from the
# `source` and `labels` collections being processed.
ingest_dataset(path)

Ingesting items from collection source: 100it [00:00, 225621.52it/s]
Ingesting items from collection labels: 100it [00:00, 226108.03it/s]
Ingesting files: 100%|██████████| 200/200 [00:03<00:00, 59.23it/s]


PosixPath('data/EuroSAT-RGB-small-STAC/catalog.parquet')

NOTE: It seems that stac-geoparquet expects the assets as a list of dicts, but pystac stores them as a dictionary of dicts.

In [14]:
import geopandas as gpd

# Load the catalog generated from the STAC folder and preview its first rows.
gdf = gpd.read_parquet(f"{path}/catalog.parquet")
gdf.head()

Unnamed: 0,assets,bbox,collection,geometry,id,links,stac_extensions,stac_version,type,datetime,label:classes,label:description,label:methods,label:properties,label:tasks,label:type
0,{'asset': {'href': 'http://localhost:8000/data...,"{'xmax': 0, 'xmin': 0, 'ymax': 0, 'ymin': 0}",source,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Industrial_1743,[{'href': '/home/juan/Desktop/eotdl/upgrade/da...,[],1.0.0,Feature,2000-01-01 00:00:00+00:00,,,,,,
1,{'asset': {'href': 'http://localhost:8000/data...,"{'xmax': 0, 'xmin': 0, 'ymax': 0, 'ymin': 0}",source,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Industrial_1273,[{'href': '/home/juan/Desktop/eotdl/upgrade/da...,[],1.0.0,Feature,2000-01-01 00:00:00+00:00,,,,,,
2,{'asset': {'href': 'http://localhost:8000/data...,"{'xmax': 0, 'xmin': 0, 'ymax': 0, 'ymin': 0}",source,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Industrial_1117,[{'href': '/home/juan/Desktop/eotdl/upgrade/da...,[],1.0.0,Feature,2000-01-01 00:00:00+00:00,,,,,,
3,{'asset': {'href': 'http://localhost:8000/data...,"{'xmax': 0, 'xmin': 0, 'ymax': 0, 'ymin': 0}",source,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Industrial_1121,[{'href': '/home/juan/Desktop/eotdl/upgrade/da...,[],1.0.0,Feature,2000-01-01 00:00:00+00:00,,,,,,
4,{'asset': {'href': 'http://localhost:8000/data...,"{'xmax': 0, 'xmin': 0, 'ymax': 0, 'ymin': 0}",source,"POLYGON ((0.00000 0.00000, 0.00000 0.00000, 0....",Industrial_1641,[{'href': '/home/juan/Desktop/eotdl/upgrade/da...,[],1.0.0,Feature,2000-01-01 00:00:00+00:00,,,,,,


In [104]:
# import json
# from glob import glob
# from tqdm import tqdm


# files = glob(path + "/**/*.json", recursive=True)

# for file in tqdm(files):
# 	with open(file, "r") as f:
# 		data = json.load(f)
# 		if data["type"] == "Feature":
# 			assets = data["assets"]
# 			if isinstance(assets, dict):
# 				# use same key for all assets (otherwise cannot get correct schema)
# 				data["assets"] = {'asset': v for k, v in data['assets'].items() if isinstance(v, dict)}
# 				# print(file)
# 				# print(data)
# 				# new_file = file.replace(".json", "_assets.json")
# 				# with open(new_file, "w") as f:
# 				with open(file, "w") as f:
# 					json.dump(data, f)


100%|██████████| 203/203 [00:00<00:00, 5730.58it/s]
