In [1]:
import os
import re

import pystac
from pystac.extensions.projection import ProjectionExtension
import pystac.item
from pystac.provider import ProviderRole

from pyproj import Transformer
from shapely.geometry import GeometryCollection, box, shape, mapping
from datetime import datetime, UTC

import rasterio as rio
import rasterio.warp
import rasterio.crs

# xml.etree.cElementTree was removed in Python 3.9; ElementTree offers the same API.
import xml.etree.ElementTree as ET
from xml.dom import minidom

In [43]:
# CLC GeoTIFF used as the worked example in the cells below.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
img_path = 'X:/EO/u2018_clc2012_v2020_20u1_raster100m/DATA/U2018_CLC2012_V2020_20u1.tif'
# os.path.split(img_path)

In [33]:
def deconstruct_clc_name(filename: str):
    """Split a CLC product file name into its naming-convention components.

    The directory part and everything from the first ``.`` onwards are
    discarded before matching, so full paths, file names and bare product
    names are all accepted.

    Args:
        filename: Path or name such as ``U2018_CLC2012_V2020_20u1_FR_GLP.tif``.

    Returns:
        dict with the keys ``update_campaign``, ``theme``, ``reference_year``,
        ``release_year``, ``release_number``, ``country_code`` and
        ``DOM_code``. The last two are empty when the name carries no such
        suffix.

    Raises:
        ValueError: If the name does not follow the CLC naming pattern.
    """
    clc_id = os.path.basename(filename).split('.')[0]
    pattern = re.compile(("U(?P<update_campaign>[0-9]{4})_"
                          "(?P<theme>CLC|CHA)(?P<reference_year>[0-9]{4})_"
                          "V(?P<release_year>[0-9]{4})_(?P<release_number>[0-9a-z]*)"
                          "_?(?P<country_code>[A-Z]*)?"
                          "_?(?P<DOM_code>[A-Z]*)?"))
    match = pattern.search(clc_id)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.
        raise ValueError(f'{clc_id!r} does not follow the CLC naming convention')

    return match.groupdict()

# Maps the DOM code parsed from a CLC product name to a human-readable region label.
# The empty key covers product names that carry no DOM code.
DOM_DICT = {
    'GLP': 'Guadeloupe',
    'GUF': 'French Guiana',
    'MTQ': 'Martinique',
    'MYT': 'Mayotte',
    'REU': 'Réunion',
    '': 'Europe',
}


In [41]:
# label = DOM_DICT.get('')

def create_asset(filename: str, label: str):
    """Define per-asset-kind lookup tables for media type, title and roles.

    NOTE(review): this function looks unfinished. ``filename`` is never read,
    none of the three tables is used after being built, and nothing is
    returned (callers receive ``None``).

    Args:
        filename: Currently unused.
        label: Text appended to every asset title.
    """

    # Media type per asset kind; keys mirror file suffixes such as 'tif_xml' or 'vat_dbf'.
    MEDIA_TYPE_DICT = {
        'tif': pystac.MediaType.COG,
        'tif_xml': pystac.MediaType.XML,
        'tif_ovr': 'image/tiff; application=geotiff; profile=pyramid',
        'vat_cpg': pystac.MediaType.TEXT,
        'vat_dbf': 'application/dbf',
        'txt': pystac.MediaType.TEXT,
        'lyr': 'image/tiff; application=geotiff; profile=layer',
    }


    # Human-readable title per asset kind, suffixed with the region label.
    # NOTE(review): 'LEgend Layer' looks like a typo for 'Legend Layer'.
    TITLE_DICT = {
        'tif': f'Single Band Land Classification {label}',
        'tif_xml': f'TIFF Metadata {label}',
        'tif_ovr': f'Pyramid {label}',
        'vat_cpg': f'Encoding {label}',
        'vat_dbf': f'Database {label}',
        'txt': f'Legends {label}',
        'lyr': f'LEgend Layer {label}',
    }

    # Role list per asset kind; only the raster itself carries 'data'.
    ROLES_DICT = {
        'tif': ['data', 'visual'],
        'tif_xml': ['metadata'],
        'tif_ovr': ['metadata'],
        'vat_cpg': ['metadata'],
        'vat_dbf': ['metadata'],
        'txt': ['metadata'],
        'lyr': ['metadata'],
    }

'Single Band Land Classification Europe'

In [4]:
# Parse the example file name into its naming components.
file_metadata = deconstruct_clc_name(img_path)

# Reference year of the product, kept as a string (converted with int() where needed).
year = file_metadata.get('reference_year')
year

'2012'

In [29]:
def get_tif_files(path: str):
    """Collect the ``.tif`` files stored in the CLC data directories below ``path``.

    Only directories named exactly ``DATA`` or ``French_DOMs`` are searched;
    a directory whose name merely ends with one of those strings (for
    example ``OLD_DATA``) is ignored. Directories and files are visited in
    sorted order so the result does not depend on the filesystem.

    Args:
        path: Root directory to walk.

    Returns:
        list of file paths (walk root joined with the file name).
    """
    data_dirs = ('DATA', 'French_DOMs')
    tif_files = []
    for root, dirs, files in os.walk(path):
        # Sorting in place makes os.walk descend in a reproducible order.
        dirs.sort()
        # Compare the directory NAME, not the tail of the whole path string.
        if os.path.basename(os.path.normpath(root)) not in data_dirs:
            continue
        tif_files.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith('.tif')
        )

    return tif_files


def extract_clc_name(path: str):
    """Return the CLC product name: the file name of ``path`` up to its first dot."""
    file_name = os.path.basename(path)
    clc_name, _, _ = file_name.partition('.')
    return clc_name


In [97]:
# Root folder of the CLC raster package that is scanned for GeoTIFFs.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
root = 'X:/EO/u2018_clc2012_v2020_20u1_raster100m'

# Every GeoTIFF found in the data directories, and the product name derived from each.
tif_files = get_tif_files(path=root)
clc_names = [extract_clc_name(f) for f in tif_files]
clc_names

['U2018_CLC2012_V2020_20u1',
 'U2018_CLC2012_V2020_20u1_FR_GLP',
 'U2018_CLC2012_V2020_20u1_FR_GUF',
 'U2018_CLC2012_V2020_20u1_FR_MTQ',
 'U2018_CLC2012_V2020_20u1_FR_MYT',
 'U2018_CLC2012_V2020_20u1_FR_REU']

In [108]:
def get_asset_files(path, clc_name):
    """List the files below ``path`` that belong to the CLC product ``clc_name``.

    The walk only descends into a fixed set of sub-directories: ``DATA`` and
    ``Metadata`` always, plus ``French_DOMs`` when the product name carries a
    DOM code and ``Legend`` otherwise. Within the visited directories a file
    is kept when its name starts with ``clc_name`` followed by a dot, or when
    it ends with ``.lyr`` or ``QGIS.txt``.

    Args:
        path: Root directory of the CLC package.
        clc_name: Product name, e.g. ``U2018_CLC2012_V2020_20u1``.

    Returns:
        list of file paths (walk root joined with the file name), in sorted
        walk order.
    """
    clc_name_elements = deconstruct_clc_name(clc_name)

    if clc_name_elements['DOM_code']:
        allowed_dirs = ['DATA', 'French_DOMs', 'Metadata']
    else:
        allowed_dirs = ['DATA', 'Legend', 'Metadata']

    legend_suffixes = ('.lyr', 'QGIS.txt')
    asset_files = []

    for root, dirs, files in os.walk(path, topdown=True):
        # With topdown=True, replacing the contents of `dirs` in place stops
        # os.walk from descending into the directories that were dropped.
        dirs[:] = sorted(d for d in dirs if d in allowed_dirs)
        for file in sorted(files):
            if file.startswith(clc_name + '.') or file.endswith(legend_suffixes):
                asset_files.append(os.path.join(root, file))

    return asset_files

In [109]:
# Example: list every file that belongs to the first product found under `root`.
get_asset_files(root, clc_name=clc_names[0])

['DATA', 'Legend', 'Metadata']


['X:/EO/u2018_clc2012_v2020_20u1_raster100m\\DATA\\U2018_CLC2012_V2020_20u1.tfw',
 'X:/EO/u2018_clc2012_v2020_20u1_raster100m\\DATA\\U2018_CLC2012_V2020_20u1.tif',
 'X:/EO/u2018_clc2012_v2020_20u1_raster100m\\DATA\\U2018_CLC2012_V2020_20u1.tif.aux.xml',
 'X:/EO/u2018_clc2012_v2020_20u1_raster100m\\DATA\\U2018_CLC2012_V2020_20u1.tif.ovr',
 'X:/EO/u2018_clc2012_v2020_20u1_raster100m\\DATA\\U2018_CLC2012_V2020_20u1.tif.vat.cpg',
 'X:/EO/u2018_clc2012_v2020_20u1_raster100m\\DATA\\U2018_CLC2012_V2020_20u1.tif.vat.dbf',
 'X:/EO/u2018_clc2012_v2020_20u1_raster100m\\DATA\\U2018_CLC2012_V2020_20u1.tif.xml',
 'X:/EO/u2018_clc2012_v2020_20u1_raster100m\\Legend\\CLC2018_CLC2012_V2018_20.tif.lyr',
 'X:/EO/u2018_clc2012_v2020_20u1_raster100m\\Legend\\CLC2018_CLC2012_V2018_20_QGIS.txt',
 'X:/EO/u2018_clc2012_v2020_20u1_raster100m\\Metadata\\U2018_CLC2012_V2020_20u1.xml']

In [5]:


# Provider record for the Copernicus Land Monitoring Service, declared with
# the licensor and host roles.
CLC_PROVIDER = pystac.Provider(
    name='Copernicus Land Monitoring Service',
    description=('The Copernicus Land Monitoring Service provides '
                 'geographical information on land cover and its '
                 'changes, land use, ground motions, vegetation state, '
                 'water cycle and Earth\'s surface energy variables to '
                 'a broad range of users in Europe and across the World '
                 'in the field of environmental terrestrial applications.'),
    roles=[ProviderRole.LICENSOR, ProviderRole.HOST],
    url= 'https://land.copernicus.eu'
)

# Item properties: a description parameterised by the reference year, plus provider info.
# NOTE(review): 'created' is left as None; STAC expects a timestamp string when the
# field is present - confirm whether it should be filled in or dropped.
props = {'description': (f'Corine Land Cover {year} (CLC{year}) is one of the Corine Land Cover (CLC) '
                         f'datasets produced within the frame the Copernicus Land Monitoring Service '
                         f'referring to land cover / land use status of year {year}. '
                         f'CLC service has a long-time heritage (formerly known as \"CORINE Land Cover Programme\"), '
                         f'coordinated by the European Environment Agency (EEA). It provides consistent '
                         f'and thematically detailed information on land cover and land cover changes across Europe. '
                         f'CLC datasets are based on the classification of satellite images produced by the national '
                         f'teams of the participating countries - the EEA members and cooperating countries (EEA39). '
                         f'National CLC inventories are then further integrated into a seamless land cover map of Europe. '
                         f'The resulting European database relies on standard methodology and nomenclature with following '
                         f'base parameters: 44 classes in the hierarchical 3-level CLC nomenclature; '
                         f'minimum mapping unit (MMU) for status layers is 25 hectares; '
                         f'minimum width of linear elements is 100 metres. '
                         f'Change layers have higher resolution, i.e. minimum mapping unit (MMU) is 5 hectares '
                         f'for Land Cover Changes (LCC), and the minimum width of linear elements is 100 metres. '
                         f'The CLC service delivers important data sets supporting the implementation of key priority '
                         f'areas of the Environment Action Programmes of the European Union as e.g. protecting ecosystems, '
                         f'halting the loss of biological diversity, tracking the impacts of climate change, '
                         f'monitoring urban land take, assessing developments in agriculture or dealing with '
                         f'water resources directives. CLC belongs to the Pan-European component of the '
                         f'Copernicus Land Monitoring Service (https://land.copernicus.eu/), part of the '
                         f'European Copernicus Programme coordinated by the European Environment Agency, '
                         f'providing environmental information from a combination of air- and space-based observation '
                         f'systems and in-situ monitoring. Additional information about CLC product description including '
                         f'mapping guides can be found at https://land.copernicus.eu/user-corner/technical-library/. '
                         f'CLC class descriptions can be found at '
                         f'https://land.copernicus.eu/user-corner/technical-library/corine-land-cover-nomenclature-guidelines/html/.'),
         'created': None,
         # The properties dict is written to JSON as-is, so store the provider as a
         # list of plain dicts rather than a pystac.Provider object.
         'providers': [CLC_PROVIDER.to_dict()]
}



# Item id: the raster file name up to its first dot.
# A bare `id` at module scope would be Python's builtin function, not a string.
item_id = os.path.basename(img_path).split('.')[0]

with rio.open(img_path) as img:
    # Raster bounds reprojected from the dataset CRS to EPSG:4326.
    bbox = rio.warp.transform_bounds(img.crs, rio.crs.CRS.from_epsg(4326), *img.bounds)

# The item carries no single datetime; it is described by a start/end interval
# for the reference year instead.
# NOTE(review): end_datetime is midnight at the start of 31 December - confirm
# whether the end of that day was intended.
params = {
    'id': item_id,
    'bbox': list(bbox),
    'geometry': mapping(box(*bbox)),
    'datetime': None,
    'start_datetime': datetime(int(year), 1, 1, tzinfo=UTC),
    'end_datetime': datetime(int(year), 12, 31, tzinfo=UTC),
    'properties': props,
}

item = pystac.Item(**params)

item.add_asset(
    key='image',
    asset=pystac.Asset(href=img_path, title='Geotiff', media_type=pystac.MediaType.GEOTIFF),
)


In [42]:
from pystac.extensions.projection import ProjectionExtension
import pystac.link

# Attach projection metadata of the source raster to the image asset.
proj_ext = ProjectionExtension.ext(item.assets['image'], add_if_missing=True)

proj_ext.apply(epsg=rio.crs.CRS(img.crs).to_epsg(),
               bbox=list(img.bounds),
               shape=list(img.shape),
               # An Affine iterates as the full 3x3 matrix (9 values, row-major),
               # so no extra [0, 0, 1] row is appended: proj:transform takes 6 or 9 values.
               transform=list(img.transform),
               )

# STAC link relation types are lowercase ('license').
license = pystac.link.Link(rel='license', target="https://land.copernicus.eu/en/data-policy")
item.add_link(license)


In [43]:
# Write the item as JSON to a file in the current working directory.
item.save_object(dest_href='testY.json')

In [28]:
# Scratch check: 1 January of the reference year, built without tzinfo (naive datetime).
datetime(int(year), 1, 1, microsecond=0)

datetime.datetime(2012, 1, 1, 0, 0)

In [17]:
# Taken from https://stackoverflow.com/questions/2148119/how-to-convert-an-xml-string-to-a-dictionary
# xml.etree.cElementTree was removed in Python 3.9; ElementTree offers the same API.
from xml.etree import ElementTree


class XmlListConfig(list):
    """List built from the children of an XML element.

    Each child contributes at most one entry:

    * a child with sub-elements becomes an ``XmlDictConfig`` when it has a
      single sub-element or its first two sub-elements have different tags,
      and a nested ``XmlListConfig`` when its first two sub-elements share
      a tag;
    * a child without sub-elements contributes its stripped text, if any.

    Children with neither sub-elements nor non-blank text are skipped.
    """

    def __init__(self, aList):
        for element in aList:
            # len() counts sub-elements. Truth-testing an Element directly is
            # deprecated, so the check is spelled out explicitly.
            if len(element):
                if len(element) == 1 or element[0].tag != element[1].tag:
                    self.append(XmlDictConfig(element))
                else:
                    self.append(XmlListConfig(element))
            elif element.text:
                text = element.text.strip()
                if text:
                    self.append(text)


class XmlDictConfig(dict):
    """Dictionary built from an XML element.

    The element's own attributes become keys. Each child is then stored
    under its tag:

    * a child with sub-elements becomes a nested ``XmlDictConfig`` (single
      sub-element, or first two sub-elements with different tags) or a dict
      mapping the repeated tag to an ``XmlListConfig``; the child's own
      attributes are merged into that mapping;
    * a child without sub-elements but with attributes maps to its
      attribute dict (its text is not kept);
    * any other child maps to its text, which may be ``None``.

    Children that share a tag overwrite each other; the last one wins.
    """

    def __init__(self, parent_element):
        if parent_element.items():
            self.update(dict(parent_element.items()))
        for element in parent_element:
            # len() counts sub-elements. Truth-testing an Element directly is
            # deprecated, so the check is spelled out explicitly.
            if len(element):
                if len(element) == 1 or element[0].tag != element[1].tag:
                    aDict = XmlDictConfig(element)
                else:
                    aDict = {element[0].tag: XmlListConfig(element)}
                if element.items():
                    aDict.update(dict(element.items()))
                self[element.tag] = aDict
            elif element.items():
                self[element.tag] = dict(element.items())
            else:
                self[element.tag] = element.text

# Shared reader used to fetch the XML text.
stac_io = pystac.StacIO.default()

def get_metadata(xml: str):
    """Read the XML document at ``xml`` and return it as a nested dictionary.

    The location the document was read from is stored under the
    ``"ORIGINAL_URL"`` key so it can be used later.
    """
    xml_text = stac_io.read_text(xml)
    metadata = XmlDictConfig(ElementTree.XML(xml_text))
    metadata["ORIGINAL_URL"] = xml
    return metadata

In [18]:
# Sample metadata XML, addressed relative to the notebook's working directory.
xml_path = '../CLC_samples/U2018_CLC2018_V2020_20u1.xml'

# Show the parsed metadata dictionary.
get_metadata(xml_path)


{'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.isotc211.org/2005/gmd http://schemas.opengis.net/csw/2.0.2/profiles/apiso/1.0.0/apiso.xsd',
 '{http://www.isotc211.org/2005/gmd}fileIdentifier': {'{http://www.isotc211.org/2005/gco}CharacterString': '7e162b2d-5196-41b2-b6dd-e889651e2f1f'},
 '{http://www.isotc211.org/2005/gmd}language': {'{http://www.isotc211.org/2005/gmd}LanguageCode': {'codeList': 'http://www.loc.gov/standards/iso639-2/',
   'codeListValue': 'eng'}},
 '{http://www.isotc211.org/2005/gmd}characterSet': {'{http://www.isotc211.org/2005/gmd}MD_CharacterSetCode': {'codeList': 'http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_CharacterSetCode',
   'codeListValue': 'utf8'}},
 '{http://www.isotc211.org/2005/gmd}hierarchyLevel': {'{http://www.isotc211.org/2005/gmd}MD_ScopeCode': {'codeList': 'http://standards.iso.org/iso/19139/resources/gmxCodelists.xml#MD_ScopeCode',
   'codeListValue': 'dataset'}},
 '{http://www.isotc211.org/2005/gmd}con