In [None]:
import requests
import json
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, DC

# === CONFIGURATION ===
API_BASE = "https://www.bexis.uni-jena.de/api"  # adjust if self-hosted or versioned
TOKEN = ""  # replace with your actual token

HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/json"
}

def get_metadata(pub_id: int) -> dict:
    url = f"{API_BASE}/Metadata/{pub_id}/"
    #url = f"{API_BASE}/publication/{pub_id}/metadata"
    params = {
        #"format": "External",
        "simplifiedJson": 2  # This enables the simplified output
    }
    resp = requests.get(url, headers=HEADERS, params=params)
    resp.raise_for_status()
    return resp.json()

if __name__ == "__main__":
    pub_id = 31873   # example publication ID
    ds_id = [31067, 31098, 31099]  # example dataset ID

    ds_metadata = []
    for dataset in ds_id:
        try:
            meta = get_metadata(dataset)
            ds_metadata.append(meta)
            with open(f"dataset_{dataset}.json", "w", encoding="utf-8") as f:
                json.dump(meta, f, indent=2, ensure_ascii=False)
        except requests.HTTPError as e:
            print(f"Failed to fetch dataset {dataset}: {e}")

    # Save publication metadata too
    try:
        pub_meta = get_metadata(pub_id)
        with open(f"publication_{pub_id}.json", "w", encoding="utf-8") as f:
            json.dump(pub_meta, f, indent=2, ensure_ascii=False)
    except requests.HTTPError as e:
        print(f"Failed to fetch publication {pub_id}: {e}")


In [115]:
# === Step 2: Aggregate into one file ===
all_metadata = {
    "datasets": ds_metadata,
    "publication": pub_meta
}

with open("all_metadata.json", "w", encoding="utf-8") as f:
    json.dump(all_metadata, f, indent=2, ensure_ascii=False)

# === Step 3: Convert metadata to RDF ===
g = Graph()
EX = Namespace("http://example.org/bexis/")

def add_metadata_to_graph(meta: dict, resource_type="Dataset"):
    if "id" not in meta:
        return
    uri = URIRef(f"http://bexis.uni-jena.de/resource/{resource_type.lower()}/{meta['id']}")
    g.add((uri, RDF.type, EX[resource_type]))
    for key, value in meta.items():
        pred = EX[key]
        if isinstance(value, list):
            for v in value:
                g.add((uri, pred, Literal(v)))
        elif isinstance(value, dict):
            # optionally flatten nested values
            for subk, subv in value.items():
                g.add((uri, EX[f"{key}_{subk}"], Literal(subv)))
        else:
            g.add((uri, pred, Literal(value)))

for ds in ds_metadata:
    add_metadata_to_graph(ds, "Dataset")

if pub_meta:
    add_metadata_to_graph(pub_meta, "Publication")

# === Step 4: Serialize RDF to Turtle ===
g.serialize(destination="all_metadata.ttl", format="turtle")
print("‚úÖ Metadata saved as RDF to all_metadata.ttl")

‚úÖ Metadata saved as RDF to all_metadata.ttl


In [116]:
print(pub_meta)

{'@id': '3', 'general': {'title': {'#text': 'IBP Score (Index of Biodiversity Potential) of all forest plots'}, 'abstract': {'#text': 'To identify management effects on biodiversity, an estimation of biodiversity using forest structural attributes may be a reasonable approach. Forest structure can - compared to conventional species-based monitoring - easily be captured during forest inventories and does not require specific taxonomic expertise. The IBP (Index of Biodiversity Potential) developed by Larrieu and Gonin (2008) is a composite index aiming to provide practitioners with an efficient tool for estimating biodiversity at the local level. The IBP protocol consists of 10 factors considered among the most common structural drivers of forest biodiversity. They are easy and quick to record in the field, and do not require additional equipment or special taxonomic expertise to get the IBP score. The IBP was assessed in August 2020 on 147 EPs in all three biodiversity exploratories (th

In [117]:
def walk_metadata_tree(data, prefix=""):
    if isinstance(data, dict):
        for key, value in data.items():
            print(f"{prefix}{key}")
            walk_metadata_tree(value, prefix=prefix + "  ")
    elif isinstance(data, list):
        for idx, item in enumerate(data):
            print(f"{prefix}[{idx}]")
            walk_metadata_tree(item, prefix=prefix + "  ")
    else:
        print(f"{prefix}= {data}")  # Leaf node (e.g., string, int)

# Example usage:
walk_metadata_tree(pub_meta)

@id
  = 3
general
  title
    #text
      = IBP Score (Index of Biodiversity Potential) of all forest plots
  abstract
    #text
      = To identify management effects on biodiversity, an estimation of biodiversity using forest structural attributes may be a reasonable approach. Forest structure can - compared to conventional species-based monitoring - easily be captured during forest inventories and does not require specific taxonomic expertise. The IBP (Index of Biodiversity Potential) developed by Larrieu and Gonin (2008) is a composite index aiming to provide practitioners with an efficient tool for estimating biodiversity at the local level. The IBP protocol consists of 10 factors considered among the most common structural drivers of forest biodiversity. They are easy and quick to record in the field, and do not require additional equipment or special taxonomic expertise to get the IBP score. The IBP was assessed in August 2020 on 147 EPs in all three biodiversity exploratories (

In [118]:
# Initialize RDF Graph
g = Graph()
EX = Namespace("http://example.org/publication/")
g.bind("ex", EX)

# Helper to create hierarchical URIs
def build_uri(path):
    safe_path = "/".join(str(p).replace(" ", "_") for p in path)
    return URIRef(EX + safe_path)

# Recursive triple builder
def process_node(subject_uri, obj, path):
    if isinstance(obj, dict):
        for k, v in obj.items():
            pred_uri = URIRef(EX + k)
            child_path = path + [k]
            if isinstance(v, (dict, list)):
                child_uri = build_uri(child_path)
                g.add((subject_uri, pred_uri, child_uri))
                process_node(child_uri, v, child_path)
            else:
                g.add((subject_uri, pred_uri, Literal(v)))
    elif isinstance(obj, list):
        for i, item in enumerate(obj):
            item_path = path + [str(i)]
            item_uri = build_uri(item_path)
            g.add((subject_uri, URIRef(EX + path[-1]), item_uri))
            process_node(item_uri, item, item_path)

# Start from root publication node
publication_uri = EX["31873"]
g.add((publication_uri, RDF.type, EX.Publication))
process_node(publication_uri, pub_meta, ["31873"])

# Save TTL
ttl_path = "../data/publication_31873_structured.ttl"
g.serialize(destination=ttl_path, format="turtle")

<Graph identifier=Nccbf9a8108e14cba8208a631c52ea1b7 (<class 'rdflib.graph.Graph'>)>

In [132]:
import requests
import json
import time

# === CONFIGURATION ===
API_BASE = "https://www.bexis.uni-jena.de/api"  # adjust if self-hosted or versioned
TOKEN = "3kK6gCqpQvQvVyR69dbhLj4ysf5abJ6nAVcKDUNndLjGQCUCtbzUbiE6Dy2KCJAQ"  # replace with your actual token
HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/json"
}

def get_metadata(entry_id: int) -> dict:
    """Fetch metadata using simplifiedJson=2 format."""
    url = f"{API_BASE}/Metadata/{entry_id}"
    params = {
        "simplifiedJson": 2
    }
    response = requests.get(url, headers=HEADERS, params=params)
    response.raise_for_status()
    return response.json()

# List of known publication/dataset IDs
entry_ids = ["31097", "31026"]  # publication + 3 datasets

# Fetch and save each
all_metadata = {}
for eid in entry_ids:
    try:
        print(f"Fetching metadata for ID: {eid}")
        meta = get_metadata(eid)
        all_metadata[eid] = meta
        time.sleep(10)
    except Exception as e:
        print(f"‚ùå Failed for ID {eid}: {e}")

# Optionally save to file
with open("all_simplified_metadata.json", "w", encoding="utf-8") as f:
    json.dump(all_metadata, f, indent=2, ensure_ascii=False)

print("‚úÖ Done fetching all metadata.")


Fetching metadata for ID: 31097
Fetching metadata for ID: 31026
‚úÖ Done fetching all metadata.


In [135]:
from rdflib import Graph, Literal, URIRef, Namespace
from rdflib.namespace import RDF
import json
import os

# Load the metadata
#with open("all_simplified_metadata.json", "r", encoding="utf-8") as f:
#    all_metadata = json.load(f)

# Prepare output directory
os.makedirs("../data/ttl_exports", exist_ok=True)

# Initialize RDF Graph
EX = Namespace("http://example.org/")

# Helper to create hierarchical URIs
def build_uri(path):
    safe_path = "/".join(str(p).replace(" ", "_") for p in path)
    return URIRef(EX + safe_path)

# Recursive triple builder
def process_node(g, subject_uri, obj, path):
    if isinstance(obj, dict):
        for k, v in obj.items():
            pred_uri = URIRef(EX + k)
            child_path = path + [k]
            if isinstance(v, (dict, list)):
                child_uri = build_uri(child_path)
                g.add((subject_uri, pred_uri, child_uri))
                process_node(g, child_uri, v, child_path)
            else:
                g.add((subject_uri, pred_uri, Literal(v)))
    elif isinstance(obj, list):
        for i, item in enumerate(obj):
            item_path = path + [str(i)]
            item_uri = build_uri(item_path)
            g.add((subject_uri, URIRef(EX + path[-1]), item_uri))
            process_node(g, item_uri, item, item_path)

# Loop through all entries
for entry_id, pub_meta in all_metadata.items():
    g = Graph()
    g.bind("ex", EX)
    
    # Define root subject URI
    publication_uri = EX[str(entry_id)]
    g.add((publication_uri, RDF.type, EX.Publication))
    
    # Recursively add triples
    process_node(g, publication_uri, pub_meta, [str(entry_id)])
    
    # Save TTL
    ttl_path = f"../data/ttl_exports/publication_{entry_id}_structured.ttl"
    g.serialize(destination=ttl_path, format="turtle")
    print(f"‚úÖ Written: {ttl_path}")

# Create graph with linkage triples
link_graph = Graph()
link_graph.bind("ex", EX)

# Define URIs
pub_uri = EX["31097"]
dataset_ids = ["31026"]

# Add links
for ds_id in dataset_ids:
    ds_uri = EX[ds_id]
    link_graph.add((ds_uri, URIRef(EX + "isPartOf"), pub_uri))

# Save linkage triples to a separate TTL file
link_ttl_path = "../data/ttl_exports/dataset_publication_links.ttl"
link_graph.serialize(destination=link_ttl_path, format="turtle")
print(f"üîó Link triples written to: {link_ttl_path}")


‚úÖ Written: ../data/ttl_exports/publication_31097_structured.ttl
‚úÖ Written: ../data/ttl_exports/publication_31026_structured.ttl
üîó Link triples written to: ../data/ttl_exports/dataset_publication_links.ttl


In [None]:
#-----------------------------------------------
# starting 22nd of july - testing api and which metadata schemes we need for publications and datasets, as there might be multiple per type
import requests
import json
import time

# === CONFIGURATION ===
API_BASE = "https://www.bexis.uni-jena.de/api"  # adjust if self-hosted or versioned
TOKEN = ""  # replace with your actual token
HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/json"
}

def get_metadata(entry_id: int) -> dict:
    """Fetch metadata using simplifiedJson=2 format."""
    url = f"{API_BASE}/Metadata/{entry_id}"
    params = {
        "format": 1
        #"simplifiedJson": 2
    }
    response = requests.get(url, headers=HEADERS, params=params)
    response.raise_for_status()
    return response.json()

# List of known publication/dataset IDs
entry_ids = ["30894"]  # publication + 3 datasets

# Fetch and save each
all_metadata = {}
for eid in entry_ids:
    try:
        print(f"Fetching metadata for ID: {eid}")
        meta = get_metadata(eid)
        all_metadata[eid] = meta
        time.sleep(10)
    except Exception as e:
        print(f"‚ùå Failed for ID {eid}: {e}")

# Optionally save to file
with open("30894Metadata.json", "w", encoding="utf-8") as f:
    json.dump(all_metadata, f, indent=2, ensure_ascii=False)

print("‚úÖ Done fetching all metadata.")


Fetching metadata for ID: 30894
‚úÖ Done fetching all metadata.


In [None]:
#get all publications 
import requests
import json
import time

# === CONFIGURATION ===
API_BASE = "https://www.bexis.uni-jena.de/api"  # adjust if self-hosted or versioned
TOKEN = ""  # replace with your actual token
HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/json"
}

def get_metadata(entry_id: int) -> dict:
    """Fetch metadata using simplifiedJson=2 format."""
    url = f"{API_BASE}/MetadataBySchema/6"
    params = {
        "format": 1
        #"simplifiedJson": 2
    }
    response = requests.get(url, headers=HEADERS, params=params)
    response.raise_for_status()
    return response.json()

# List of known publication/dataset IDs
entry_ids = ["30894"]  # publication + 3 datasets

# Fetch and save each
all_metadata = {}
for eid in entry_ids:
    try:
        print(f"Fetching metadata for ID: {eid}")
        meta = get_metadata(eid)
        all_metadata[eid] = meta
        time.sleep(10)
    except Exception as e:
        print(f"‚ùå Failed for ID {eid}: {e}")

# Optionally save to file
with open("30894Metadata.json", "w", encoding="utf-8") as f:
    json.dump(all_metadata, f, indent=2, ensure_ascii=False)

print("‚úÖ Done fetching all metadata.")

In [None]:
import requests
import json

TOKEN = ""
HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/xml"
}

def get_metadata() -> dict:
    """Fetch metadata by schema using simplifiedJson format."""
    url = "https://www.bexis.uni-jena.de/api/MetadataBySchema/BE-PublicationSchema"

    #params = {
    #    "format": 1  # same as simplifiedJson=2
    #}
    response = requests.get(url, headers=HEADERS)
    response.raise_for_status()
    return response.json()

if __name__ == "__main__":
    try:
        print("Fetching metadata...")
        data = get_metadata()

        # Save to JSON file
        with open("metadata_output.json", "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print("Metadata saved to metadata_output.json")

    except requests.HTTPError as e:
        print(f"HTTP error occurred: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")


Fetching metadata...


KeyboardInterrupt: 

In [None]:
# GET all Publication metadata
#!pip install xmltodict
import requests
import xmltodict
import json

HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/xml"
}
URL = "https://www.bexis.uni-jena.de/api/MetadataBySchema/BE-PublicationSchema"

def fetch_and_save_xml(path="publication_metadata_output.xml"):
    response = requests.get(URL, headers=HEADERS)
    response.raise_for_status()
    with open(path, "wb") as f:
        f.write(response.content)
    print(f"‚úÖ XML saved to {path}")
    return path

def parse_xml_to_dict(path):
    with open(path, "rb") as f:
        doc = xmltodict.parse(f, xml_attribs=True)
    return doc

if __name__ == "__main__":
    xml_file = fetch_and_save_xml()
    data_dict = parse_xml_to_dict(xml_file)

    # Print the ‚Äúhead‚Äù ‚Äì top-level keys and first few items
    print("\nüéØ Preview of parsed dictionary structure:\n")
    root_keys = list(data_dict.keys())
    print("Root-level keys:", root_keys)

    # Show the first entry if iterable
    first_key = root_keys[0]
    first_entry = data_dict[first_key]
    print(f"\nFirst item under '{first_key}':")
    if isinstance(first_entry, list):
        preview = first_entry[0]
    else:
        preview = first_entry
    print(json.dumps(preview, indent=2, ensure_ascii=False)[:1000], "...\n")


‚úÖ XML saved to publication_metadata_output.xml

üéØ Preview of parsed dictionary structure:

Root-level keys: ['root']

First item under 'root':
{
  "Dataset": [
    {
      "@id": "30000",
      "Metadata": {
        "@id": "6",
        "publicationDetails": {
          "@type": "MetadataPackageUsage",
          "@name": "publicationDetails",
          "@id": "48",
          "publicationDetailsType": {
            "@type": "MetadataPackage",
            "@name": "publicationDetailsType",
            "@roleId": "48",
            "@id": "48",
            "@number": "1",
            "title": {
              "@type": "MetadataAttributeUsage",
              "@name": "title",
              "@id": "192",
              "titleType": {
                "@type": "MetadataAttribute",
                "@name": "titleType",
                "@roleId": "192",
                "@id": "483",
                "@number": "1",
                "#text": "Local- and landscape-scale forest attributes differ in

In [15]:
import xmltodict
import pandas as pd

XML_PATH = "metadata_output.xml"
CSV_PATH = "metadata_flat.csv"

# 1. Parse XML to a dict
with open(XML_PATH, "rb") as f:
    doc = xmltodict.parse(f, xml_attribs=True)

# 2. Navigate to repeated section (list of datasets)
datasets = doc.get("root", {}).get("Dataset", [])
if not isinstance(datasets, list):
    datasets = [datasets]

# 3. Auto-flatten
df = pd.json_normalize(datasets, sep=".")

# 4. Clean column names (strip leading '@')
df.columns = [col.lstrip("@") for col in df.columns]

# 5. Save to CSV and preview
df.to_csv(CSV_PATH, index=False)
print(f"‚úÖ Saved flattened data to {CSV_PATH}\n")
print("Preview:")
print(df.head().to_string(index=False))


‚úÖ Saved flattened data to metadata_flat.csv

Preview:
   id Metadata.@id Metadata.publicationDetails.@type Metadata.publicationDetails.@name Metadata.publicationDetails.@id Metadata.publicationDetails.publicationDetailsType.@type Metadata.publicationDetails.publicationDetailsType.@name Metadata.publicationDetails.publicationDetailsType.@roleId Metadata.publicationDetails.publicationDetailsType.@id Metadata.publicationDetails.publicationDetailsType.@number Metadata.publicationDetails.publicationDetailsType.title.@type Metadata.publicationDetails.publicationDetailsType.title.@name Metadata.publicationDetails.publicationDetailsType.title.@id Metadata.publicationDetails.publicationDetailsType.title.titleType.@type Metadata.publicationDetails.publicationDetailsType.title.titleType.@name Metadata.publicationDetails.publicationDetailsType.title.titleType.@roleId Metadata.publicationDetails.publicationDetailsType.title.titleType.@id Metadata.publicationDetails.publicationDetailsType.title.ti

In [20]:
import xmltodict
import pandas as pd
import re

XML_PATH = "metadata_output.xml"
CSV_PATH = "metadata_cleaned.csv"

# 1. Parse and flatten
with open(XML_PATH, "rb") as f:
    doc = xmltodict.parse(f, xml_attribs=True)
datasets = doc.get("root", {}).get("Dataset", [])
if not isinstance(datasets, list):
    datasets = [datasets]

# Use underscore as separator to avoid confusing dots
df = pd.json_normalize(datasets, sep="_")
df.rename(columns=lambda c: c.lstrip("@"), inplace=True)

# 2. Filter for id, Metadata_id, and *_#text fields
text_cols = [c for c in df.columns if c.endswith("_#text")]
keep = [c for c in ["id", "Metadata_id"] + text_cols if c in df.columns]
df = df[keep]

# 3. Clean and simplify column names
def simplify_col(c):
    # Keep id and Metadata_id as is
    if c in ("id", "Metadata_id"):
        return {"id": "Id", "Metadata_id": "Metadata Id"}[c]
    # Remove trailing '_#text'
    name = c[:-6]  # drop '_#text'
    # Title-case and replace underscores with spaces
    name = name.replace("_", " ").title()
    return name

df.columns = [simplify_col(c) for c in df.columns]

# 4. Save and preview
df.to_csv(CSV_PATH, index=False)
print(f"‚úÖ Saved cleaned CSV as {CSV_PATH}\n")
print(df.head().to_string(index=False))
print("\nFinal columns:", df.columns.tolist())


‚úÖ Saved cleaned CSV as metadata_cleaned.csv

   Id                                                                     Metadata Publicationdetails Publicationdetailstype Title Titletype                                                                                                                                                                                                                                                                                                                                                                          Metadata Publicationdetails Publicationdetailstype Citation Citationtype Metadata Publicationdetails Publicationdetailstype Firstauthor Firstauthortype Metadata Publicationdetails Publicationdetailstype Correspondingemail Correspondingemailtype Metadata Publicationdetails Publicationdetailstype Year Yeartype                                                                                                                                              

In [None]:
#New-BE-MetadataSchema
# GET all dataset metadata
import requests
import xmltodict
import json

TOKEN = ""
HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/xml"
}
URL = "https://www.bexis.uni-jena.de/api/MetadataBySchema/New-BE-MetadataSchema"

def fetch_and_save_xml(path="dataset_metadata_output.xml"):
    response = requests.get(URL, headers=HEADERS)
    response.raise_for_status()
    with open(path, "wb") as f:
        f.write(response.content)
    print(f"‚úÖ XML saved to {path}")
    return path

def parse_xml_to_dict(path):
    with open(path, "rb") as f:
        doc = xmltodict.parse(f, xml_attribs=True)
    return doc

if __name__ == "__main__":
    xml_file = fetch_and_save_xml()
    data_dict = parse_xml_to_dict(xml_file)

    # Print the ‚Äúhead‚Äù ‚Äì top-level keys and first few items
    print("\nüéØ Preview of parsed dictionary structure:\n")
    root_keys = list(data_dict.keys())
    print("Root-level keys:", root_keys)

    # Show the first entry if iterable
    first_key = root_keys[0]
    first_entry = data_dict[first_key]
    print(f"\nFirst item under '{first_key}':")
    if isinstance(first_entry, list):
        preview = first_entry[0]
    else:
        preview = first_entry
    print(json.dumps(preview, indent=2, ensure_ascii=False)[:1000], "...\n")


‚úÖ XML saved to metadata_output.xml

üéØ Preview of parsed dictionary structure:

Root-level keys: ['root']

First item under 'root':
{
  "Dataset": [
    {
      "@id": "1000",
      "Metadata": {
        "@id": "3",
        "general": {
          "@type": "MetadataPackageUsage",
          "@name": "general",
          "@id": "20",
          "generalType": {
            "@type": "MetadataPackage",
            "@name": "generalType",
            "@roleId": "20",
            "@id": "20",
            "@number": "1",
            "title": {
              "@type": "MetadataAttributeUsage",
              "@name": "title",
              "@id": "63",
              "titleType": {
                "@type": "MetadataAttribute",
                "@name": "titleType",
                "@roleId": "63",
                "@id": "120",
                "@number": "1",
                "#text": "Basic information and coordinates of field plots of the Biodiversity Exploratories project"
              }
     

In [26]:
import xmltodict
import pandas as pd

XML_PATH = "dataset_metadata_output.xml"
CSV_PATH = "dataset_metadata_flat.csv"

# 1. Parse XML to a dict
with open(XML_PATH, "rb") as f:
    doc = xmltodict.parse(f, xml_attribs=True)

# 2. Navigate to repeated section (list of datasets)
datasets = doc.get("root", {}).get("Dataset", [])
if not isinstance(datasets, list):
    datasets = [datasets]

# 3. Auto-flatten
df = pd.json_normalize(datasets, sep=".")

# 4. Clean column names (strip leading '@')
df.columns = [col.lstrip("@") for col in df.columns]

# 5. Save to CSV and preview
df.to_csv(CSV_PATH, index=False)
print(f"‚úÖ Saved flattened data to {CSV_PATH}\n")
print("Preview:")
print(df.head().to_string(index=False))


‚úÖ Saved flattened data to dataset_metadata_flat.csv

Preview:
  id Metadata.@id Metadata.general.@type Metadata.general.@name Metadata.general.@id Metadata.general.generalType.@type Metadata.general.generalType.@name Metadata.general.generalType.@roleId Metadata.general.generalType.@id Metadata.general.generalType.@number Metadata.general.generalType.title.@type Metadata.general.generalType.title.@name Metadata.general.generalType.title.@id Metadata.general.generalType.title.titleType.@type Metadata.general.generalType.title.titleType.@name Metadata.general.generalType.title.titleType.@roleId Metadata.general.generalType.title.titleType.@id Metadata.general.generalType.title.titleType.@number                                         Metadata.general.generalType.title.titleType.#text Metadata.general.generalType.abstract.@type Metadata.general.generalType.abstract.@name Metadata.general.generalType.abstract.@id Metadata.general.generalType.abstract.abstractType.@type Metadata.general.g

In [30]:
import xmltodict
import pandas as pd
import re

XML_PATH = "dataset_metadata_output.xml"
CSV_PATH = "dataset_metadata_cleaned.csv"

# 1. Parse and flatten
with open(XML_PATH, "rb") as f:
    doc = xmltodict.parse(f, xml_attribs=True)
datasets = doc.get("root", {}).get("Dataset", [])
if not isinstance(datasets, list):
    datasets = [datasets]

# Use underscore as separator to avoid confusing dots
df = pd.json_normalize(datasets, sep="_")
df.rename(columns=lambda c: c.lstrip("@"), inplace=True)

# 2. Filter for id, Metadata_id, and *_#text fields
text_cols = [c for c in df.columns if c.endswith("_#text")]
keep = [c for c in ["id", "Metadata_id"] + text_cols if c in df.columns]
df = df[keep]

# 3. Clean and simplify column names
def simplify_col(raw):
    """Turn monstrous headers into 1‚Äì4 clean words."""

    # ------------------------------------------------------------------
    # special‚Äëcases
    if raw == "id":
        return "Id"
    if raw == "Metadata_id":
        return "Metadata Id"

    # ------------------------------------------------------------------
    # strip trailing "_#text"
    core = raw[:-6] if raw.endswith("_#text") else raw

    # split on "_" and normalise
    parts = core.split("_")

    # blacklist generic hierarchy words
    blacklist = {
        "metadata", "general", "generaltype", "contacts", "contactstype",
        "coverage", "coveragetype", "geographiccoverage", "geographiccoveragetype",
        "temporalcoverage", "temporalcoveragetype", "taxonomiccoverage",
        "methodstype", "methods", "keywordstype", "keywords",
        "datatypeandstatus", "datatypeandstatustype", "repository",
        "repositorytype", "coordinates", "coordinates-wgs84", "boundingbox",
        "boundingbox-wgs84", "acronyms", "acronymstype", "acronympair",
        "acronympairtype", "measurementstype", "measurements", "processesandservices"
    }
    cleaned = []
    for p in parts:
        p_low = p.lower()
        if p_low in blacklist:
            continue
        # remove trailing 'type' or 'status'
        p = re.sub(r"(type|status)$", "", p, flags=re.IGNORECASE)
        if p:                       # avoid empty strings
            cleaned.append(p)

    # collapse duplicate words (case‚Äëinsensitive)
    deduped, seen = [], set()
    for p in cleaned:
        if p.lower() not in seen:
            deduped.append(p)
            seen.add(p.lower())

    # if still very long keep only the last ‚â§‚ÄØ4 tokens
    if len(deduped) > 4:
        deduped = deduped[-4:]

    # join, de‚Äëdash, split CamelCase, title‚Äëcase
    header = " ".join(deduped)
    header = header.replace("-", " ")
    header = re.sub(r"(?<!^)(?=[A-Z])", " ", header)       # split CamelCase
    header = re.sub(r"\s+", " ", header).strip().title()

    return header


# apply
df.columns = [simplify_col(c) for c in df.columns]

# If you now have two identical "Id" headers, rename the second to "Dataset Id"
cols = pd.Series(df.columns)
if (cols == "Id").sum() > 1:
    first = True
    new = []
    for col in cols:
        if col == "Id" and first:
            new.append(col); first = False
        elif col == "Id":
            new.append("Dataset Id")
        else:
            new.append(col)
    df.columns = new


# 4. Save and preview
df.to_csv(CSV_PATH, index=False)
print(f"‚úÖ Saved cleaned CSV as {CSV_PATH}\n")
print(df.head().to_string(index=False))
print("\nFinal columns:", df.columns.tolist())


‚úÖ Saved cleaned CSV as dataset_metadata_cleaned.csv

  Id                                                                                      Title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          Abstract  Project Name     Consortium Metadata Creation Date Metadata Last Modification Date Data Last Modification Date Dataset Id Version                                Metadata Sche

In [39]:
#!/usr/bin/env python3
import os
import xmltodict
import pandas as pd

# ‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî
class AlwaysList:
    def __contains__(self, key): return True
    def __iter__(self): return iter(())
# ‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî‚Äî

def parse_xml_datasets(xml_path):
    with open(xml_path, "rb") as f:
        doc = xmltodict.parse(f, xml_attribs=True, force_list=AlwaysList())
    root_key = next(iter(doc))
    container = doc[root_key]
    if not isinstance(container, list):
        container = [container]
    datasets = []
    for block in container:
        ds = block.get("Dataset", [])
        if isinstance(ds, list):
            datasets.extend(ds)
        else:
            datasets.append(ds)
    return datasets

def extract_records(datasets):
    records = []
    for ds in datasets:
        row = {}
        if "@id" in ds:
            row["Id"] = ds["@id"]
        def recurse(node):
            if isinstance(node, dict):
                if "@name" in node and "#text" in node:
                    k, v = node["@name"], node["#text"] or ""
                    row.setdefault(k, []).append(v)
                for child in node.values():
                    recurse(child)
            elif isinstance(node, list):
                for item in node:
                    recurse(item)
        recurse(ds)
        records.append(row)
    return records

def build_dataframe(records):
    df = pd.DataFrame(records)
    def flatten(x):
        flat = []
        if isinstance(x, list):
            for item in x:
                flat.extend(flatten(item))
        else:
            flat.append(x)
        return flat
    def join_vals(vs):
        flat_list = flatten(vs) if isinstance(vs, list) else [vs]
        clean = [
            item for item in flat_list
            if item is not None
            and not (isinstance(item, float) and pd.isna(item))
            and item != ""
        ]
        return "; ".join(str(item) for item in clean)
    for col in df.columns:
        if df[col].apply(lambda v: isinstance(v, list) or (not isinstance(v, str) and pd.isna(v))).any():
            df[col] = df[col].apply(join_vals)
    return df

def main():
    jobs = [
        ("publication_metadata_output.xml", "publication_all_metadata_flat.csv"),
        ("dataset_metadata_output.xml",    "dataset_all_metadata_flat.csv"),
    ]

    for xml_in, csv_out in jobs:
        if not os.path.exists(xml_in):
            print(f"‚ö†Ô∏è  Skipping, file not found: {xml_in}")
            continue

        print(f"üîç  Processing {xml_in} ‚Ä¶")
        datasets = parse_xml_datasets(xml_in)
        records  = extract_records(datasets)
        df       = build_dataframe(records)

        # ‚Üê strip off any trailing "Type" in column names:
        df.rename(columns=lambda c: c[:-4] if c.endswith("Type") else c,
                  inplace=True)

        df.to_csv(csv_out, index=False)
        print(f"‚úÖ  Wrote {csv_out} ({len(df)} rows √ó {len(df.columns)} cols)")

if __name__ == "__main__":
    main()


üîç  Processing publication_metadata_output.xml ‚Ä¶
‚úÖ  Wrote publication_all_metadata_flat.csv (1246 rows √ó 35 cols)
üîç  Processing dataset_metadata_output.xml ‚Ä¶
‚úÖ  Wrote dataset_all_metadata_flat.csv (2133 rows √ó 72 cols)


In [3]:
from rdflib import Graph
g = Graph()
try:
    g.parse("../r2rml/Dataset/outputs/dataset.trig", format="turtle")
except Exception as e:
    print(e)


acronym: disturbed_area; meaning: soil%20texture%20was%20visible does not look like a valid URI, trying to serialize this will break.
acronym: NIRS; meaning: Near-Infrared%20Spectrometry does not look like a valid URI, trying to serialize this will break.
acronym: OSFI-ALB-2013%2C%20stumps; meaning: observational%20systematic%20forest%20inventory%20Alb does not look like a valid URI, trying to serialize this will break.
acronym: BERC; meaning: Biodiversity%20Exploratory%20research%20consortium%2C does not look like a valid URI, trying to serialize this will break.
acronym: NA; meaning: Missing%20data%2C%20because%20already%20mown%20or%20not%20accessible%20because%20of%20grazing%20animals does not look like a valid URI, trying to serialize this will break.
acronym: -; meaning: - does not look like a valid URI, trying to serialize this will break.
acronym: AraDiv; meaning: Araneae%20Diversity does not look like a valid URI, trying to serialize this will break.
acronym: NA; meaning: NA do

In [5]:
# Jupyter cell: extract link info from JSON column into new CSV columns

import json
from pathlib import Path
import pandas as pd

# ---------- Config ----------
INPUT_PATH = Path("./linkAPI.csv")
OUTPUT_PATH = INPUT_PATH.with_name(INPUT_PATH.stem + ".with_links" + INPUT_PATH.suffix)
JSON_COL = "dataset_links_api_response"
DEDUP = False           # set to True if you want unique values in columns 2‚Äì4
IN_SEP = ","            # change to "\t" for TSV input
OUT_SEP = ","           # change to "\t" for TSV output
ENCODING = "utf-8"
# ----------------------------

def unique_preserve(seq):
    """Deduplicate while preserving order."""
    seen = set()
    out = []
    for x in seq:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out

def extract_links(json_text, dedup=False):
    """
    Parse one JSON cell and return a dict with:
      - links_dir_refids         e.g., "from [4806]; from [5024]; to [4839]; to [4840]"
      - links_target_ids         e.g., "1000; 20907; 20826"
      - links_source_ids         e.g., "31013; 31051; 31152; 1000"
      - links_reference_types    e.g., "IsSupplementTo; IsDerivedFrom"
    """
    empty = {
        "links_dir_refids": "",
        "links_target_ids": "",
        "links_source_ids": "",
        "links_reference_types": "",
    }

    if json_text is None:
        return empty

    # Cope with already-parsed dicts (rare) or strings (common)
    if isinstance(json_text, (dict, list)):
        obj = json_text
    else:
        s = str(json_text).strip()
        if not s:
            return empty
        try:
            obj = json.loads(s)
        except Exception:
            return empty

    links = obj.get("links")
    if not isinstance(links, dict):
        return empty

    dir_refids, target_ids, source_ids, ref_types = [], [], [], []

    for direction in ("from", "to"):
        arr = links.get(direction)
        if not isinstance(arr, list):
            continue
        for item in arr:
            if not isinstance(item, dict):
                continue

            ref_id = item.get("refId")
            if ref_id is not None:
                dir_refids.append(f"{direction} [{ref_id}]")

            tgt = item.get("target") or {}
            tgt_id = tgt.get("id")
            if tgt_id is not None:
                target_ids.append(str(tgt_id))

            src = item.get("source") or {}
            src_id = src.get("id")
            if src_id is not None:
                source_ids.append(str(src_id))

            rtype = item.get("referenceType")
            if rtype:
                ref_types.append(str(rtype))

    if dedup:
        target_ids = unique_preserve(target_ids)
        source_ids = unique_preserve(source_ids)
        ref_types = unique_preserve(ref_types)
        # Typically keep direction+refId ordered (and not deduped).
        # If you want that too, uncomment:
        # dir_refids = unique_preserve(dir_refids)

    return {
        "links_dir_refids": "; ".join(dir_refids),
        "links_target_ids": "; ".join(target_ids),
        "links_source_ids": "; ".join(source_ids),
        "links_reference_types": "; ".join(ref_types),
    }

# ---- Load, transform, save ----
df = pd.read_csv(INPUT_PATH, sep=IN_SEP, encoding=ENCODING, dtype=str, keep_default_na=False)
if JSON_COL not in df.columns:
    raise KeyError(f"Column '{JSON_COL}' not found. Available columns: {list(df.columns)}")

results = df[JSON_COL].apply(lambda cell: extract_links(cell, dedup=DEDUP))
res_df = pd.DataFrame(results.tolist())

out_df = pd.concat([df, res_df], axis=1)
out_df.to_csv(OUTPUT_PATH, sep=OUT_SEP, index=False, encoding=ENCODING)

print(f"Processed {len(df):,} rows.")
print(f"Wrote: {OUTPUT_PATH}")
display(out_df[[
    "links_dir_refids",
    "links_target_ids",
    "links_source_ids",
    "links_reference_types"
]].head(10))


Processed 1,761 rows.
Wrote: linkAPI.with_links.csv


Unnamed: 0,links_dir_refids,links_target_ids,links_source_ids,links_reference_types
0,from [4806]; from [5024]; from [5027]; to [483...,1000; 1000; 1000; 20907; 20826,31013; 31051; 31152; 1000; 1000,IsSupplementTo; IsSupplementTo; IsSupplementTo...
1,from [3603]; from [4720],1580; 1580,30649; 30959,Is supplement to; IsSupplementTo
2,,,,
3,,,,
4,,,,
5,,,,
6,,,,
7,,,,
8,,,,
9,to [2484],15146,2480,link


In [7]:
# Jupyter cell: extract link info from JSON column into new CSV columns
# Now also adds: links_target_types, links_source_types (values: "dataset"/"publication")

import json
from pathlib import Path
import pandas as pd

# ---------- Config ----------
INPUT_PATH = Path("./linkAPI.csv")
OUTPUT_PATH = INPUT_PATH.with_name(INPUT_PATH.stem + ".with_links" + INPUT_PATH.suffix)
JSON_COL = "dataset_links_api_response"
DEDUP = False           # set to True if you want unique values in columns (ids/types separately)
IN_SEP = ","            # change to "\t" for TSV input
OUT_SEP = ","           # change to "\t" for TSV output
ENCODING = "utf-8"
# ----------------------------

# Optional: fallback mapping if 'type' is missing
TYPEID_MAP = {
    1: "dataset",
    10: "publication",
}

def unique_preserve(seq):
    """Deduplicate while preserving order."""
    seen = set()
    out = []
    for x in seq:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out

def normalize_type(obj_dict):
    """
    Return lower-cased type string ('dataset'/'publication') if present.
    Fallback to TYPEID_MAP via 'typeId' if 'type' is missing.
    """
    if not isinstance(obj_dict, dict):
        return None
    t = obj_dict.get("type")
    if isinstance(t, str) and t.strip():
        return t.strip().lower()
    tid = obj_dict.get("typeId")
    if isinstance(tid, int) and tid in TYPEID_MAP:
        return TYPEID_MAP[tid]
    # also handle numeric strings like "1" or "10"
    try:
        tid_int = int(tid)
        return TYPEID_MAP.get(tid_int)
    except Exception:
        return None

def extract_links(json_text, dedup=False):
    """
    Parse one JSON cell and return a dict with:
      - links_dir_refids          e.g., "from [4806]; from [5024]; to [4839]; to [4840]"
      - links_target_ids          e.g., "1000; 20907; 20826"
      - links_source_ids          e.g., "31013; 31051; 31152; 1000"
      - links_reference_types     e.g., "IsSupplementTo; IsDerivedFrom"
      - links_target_types        e.g., "dataset; dataset; dataset"
      - links_source_types        e.g., "publication; publication; publication"
    """
    empty = {
        "links_dir_refids": "",
        "links_target_ids": "",
        "links_source_ids": "",
        "links_reference_types": "",
        "links_target_types": "",
        "links_source_types": "",
    }

    if json_text is None:
        return empty

    # Allow already-parsed objects or strings
    if isinstance(json_text, (dict, list)):
        obj = json_text
    else:
        s = str(json_text).strip()
        if not s:
            return empty
        try:
            obj = json.loads(s)
        except Exception:
            return empty

    links = obj.get("links")
    if not isinstance(links, dict):
        return empty

    dir_refids, target_ids, source_ids, ref_types = [], [], [], []
    target_types, source_types = [], []

    for direction in ("from", "to"):
        arr = links.get(direction)
        if not isinstance(arr, list):
            continue
        for item in arr:
            if not isinstance(item, dict):
                continue

            # direction + refId
            ref_id = item.get("refId")
            if ref_id is not None:
                dir_refids.append(f"{direction} [{ref_id}]")

            # target
            tgt = item.get("target") or {}
            tgt_id = tgt.get("id")
            if tgt_id is not None:
                target_ids.append(str(tgt_id))
            tt = normalize_type(tgt)
            if tt:
                target_types.append(tt)

            # source
            src = item.get("source") or {}
            src_id = src.get("id")
            if src_id is not None:
                source_ids.append(str(src_id))
            st = normalize_type(src)
            if st:
                source_types.append(st)

            # referenceType
            rtype = item.get("referenceType")
            if rtype:
                ref_types.append(str(rtype))

    if dedup:
        target_ids   = unique_preserve(target_ids)
        source_ids   = unique_preserve(source_ids)
        ref_types    = unique_preserve(ref_types)
        target_types = unique_preserve(target_types)
        source_types = unique_preserve(source_types)
        # Usually keep dir_refids as-is (direction matters)

    return {
        "links_dir_refids":      "; ".join(dir_refids),
        "links_target_ids":      "; ".join(target_ids),
        "links_source_ids":      "; ".join(source_ids),
        "links_reference_types": "; ".join(ref_types),
        "links_target_types":    "; ".join(target_types),
        "links_source_types":    "; ".join(source_types),
    }

# ---- Load, transform, save ----
df = pd.read_csv(INPUT_PATH, sep=IN_SEP, encoding=ENCODING, dtype=str, keep_default_na=False)
if JSON_COL not in df.columns:
    raise KeyError(f"Column '{JSON_COL}' not found. Available columns: {list(df.columns)}")

results = df[JSON_COL].apply(lambda cell: extract_links(cell, dedup=DEDUP))
res_df = pd.DataFrame(results.tolist())

out_df = pd.concat([df, res_df], axis=1)
out_df.to_csv(OUTPUT_PATH, sep=OUT_SEP, index=False, encoding=ENCODING)

print(f"Processed {len(df):,} rows.")
print(f"Wrote: {OUTPUT_PATH}")
display(out_df[[
    "links_dir_refids",
    "links_target_ids",
    "links_target_types",
    "links_source_ids",
    "links_source_types",
    "links_reference_types",
]].head(10))


Processed 2,133 rows.
Wrote: linkAPI.with_links.csv


Unnamed: 0,links_dir_refids,links_target_ids,links_target_types,links_source_ids,links_source_types,links_reference_types
0,from [4806]; from [5024]; from [5027]; to [483...,1000; 1000; 1000; 20907; 20826,dataset; dataset; dataset; dataset; dataset,31013; 31051; 31152; 1000; 1000,publication; publication; publication; dataset...,IsSupplementTo; IsSupplementTo; IsSupplementTo...
1,from [3603]; from [4720],1580; 1580,dataset; dataset,30649; 30959,publication; publication,Is supplement to; IsSupplementTo
2,,,,,,
3,,,,,,
4,,,,,,
5,,,,,,
6,,,,,,
7,,,,,,
8,,,,,,
9,to [2484],15146,dataset,2480,dataset,link
