# CKAN Metadata to Graph JSON Converter

This code will convert CKAN metadata into a graph json structure. The output is two files:

- `conformed_graph.json`: this file is a raw graph which is generally compatible with LPG viewers (a basic 2D viewer has been set up at the top of the lpg directory for this purpose)
- `vis_graph.json`: this file is configured to work with the Graph3d visualizer

## There are four main sections to the code:

1. **Get the metadata** - uses a CSV file in the project directory called `report_list.csv`, which can be generated manually or with the CKAN API Request Builder: [GSQ Labs](https://geological-survey-of-queensland.github.io)

2. **Convert each metadata into the graph schema**

3. **Conform the metadata into one graph object**  
   \> output: `conformed_graph.json`
   This can be loaded into the 2D_lpg_viewer at the top of the lpg directory in the [gsq-knowledge-graph-tools](https://github.com/geological-survey-of-queensland/gsq-knowledge-graph-tools) repo

4. **Convert the `conformed_graph.json` into a Graph3d file**  
   \> output: `vis_graph.json`
   This can be loaded into the 3D Graph Viewer: [GSQ Labs](https://geological-survey-of-queensland.github.io)

In [None]:
# Step 1: fetch the metadata for every package in the CSV and combine it into one dict

In [None]:
import pandas as pd
import requests
import json

# Combined metadata: maps each package name from the CSV to the parsed JSON
# body returned by the CKAN package_show call for that name
metadata_combined = {}

# Path to the csv file; it must provide a 'name' column
csv_file = 'report_list.csv'

# CKAN package_show endpoint. The package name is sent as the `id` query
# parameter so that requests URL-encodes it instead of it being spliced
# into the URL verbatim.
api_endpoint = 'https://geoscience.data.qld.gov.au/api/3/action/package_show'

# Seconds to wait for the server before giving up on a single package.
# Without a timeout one stalled connection would hang the whole run.
request_timeout = 30

# Read the csv file using pandas
try:
    df = pd.read_csv(csv_file)

    # Counter for successful package retrievals
    successful_packages = 0

    # Loop through the values in the 'name' column of the dataframe
    for name in df['name']:
        try:
            response = requests.get(
                api_endpoint,
                params={'id': name},
                timeout=request_timeout,
            )

            # If the response is successful, add the JSON object to the combined metadata
            if response.status_code == 200:
                package_data = response.json()
                metadata_combined[name] = package_data
                successful_packages += 1
            else:
                # Print an error message if the response is unsuccessful
                print(f'Error with package {name}: Received status code {response.status_code}')

        except json.JSONDecodeError:
            # Checked before RequestException: recent requests releases raise a
            # JSON error that subclasses both, which would otherwise make this
            # branch unreachable.
            print(f'Error with package {name}: Invalid JSON response')
        except requests.RequestException as e:
            # Connection errors, timeouts and other request failures
            print(f'Error with package {name}: {e}')

    # Print the total number of packages added
    print(f'A total of {successful_packages} packages have been added to metadata_combined.')

except FileNotFoundError:
    print(f'The file {csv_file} does not exist.')
except (pd.errors.EmptyDataError, KeyError):
    # EmptyDataError: nothing to parse; KeyError: df['name'] found no such column
    print(f'The file {csv_file} is empty or does not contain the \'name\' column.')
except Exception as e:
    # General exception handling for debugging purposes
    print(f'An unexpected error occurred: {e}')

# Debugging print to show the structure of metadata_combined, this can be removed/commented out
# print(json.dumps(metadata_combined, indent=4))

In [None]:
#Step 2:  convert the ckan package objects to graph format

In [None]:
import json
import uuid

# Step 2 accumulators: a running count of packages converted without error,
# and the list that collects one graph dict per converted package
successful_objects = 0
metagraph_objects = []

# Serialise dict values to a JSON string; hand every other type back untouched
def stringify_if_needed(value):
    """Return ``json.dumps(value)`` for a dict, otherwise ``value`` itself."""
    if not isinstance(value, dict):
        return value
    return json.dumps(value)

# Build an edge dict with a fresh UUID id linking source to target
def _make_edge(source, target, label):
    """Return an edge dict with a new UUID4 string as its ``id``."""
    return {
        'id': str(uuid.uuid4()),
        'source': source,
        'target': target,
        'label': label,
    }


# Function to create a labelled property graph from a metadata object
def create_lpg(metadata):
    """Convert one package metadata dict into a labelled property graph.

    The graph has one ``package`` node plus:

    * a ``resource_authority_permit`` node (id ``'permit ' + value``) linked
      by a ``has_permit`` edge, when that key holds a string;
    * one ``resource`` node per dict in ``resources`` that carries an ``id``,
      linked by ``has_resource`` edges;
    * one node per item of every other list-of-strings value, labelled with
      the key and linked by ``has_<key>`` edges.

    String and bool values of the remaining keys become properties of the
    package node. Values of any other type are ignored.

    Args:
        metadata: dict describing a single package; must contain ``'id'``.

    Returns:
        A dict ``{'nodes': [...], 'edges': [...]}``. The package node is the
        last entry in ``nodes``.

    Raises:
        KeyError: if ``metadata`` has no ``'id'`` key.
    """
    # Resolve the package id before walking the keys. Reading it only when the
    # loop reaches 'id' left edges with source=None whenever 'resources' or
    # 'resource_authority_permit' appeared earlier in the dict.
    package_id = metadata['id']

    graph = {
        'nodes': [],
        'edges': []
    }
    package_node = {'label': 'package', 'properties': {}, 'id': package_id}

    for key, value in metadata.items():
        if key == 'id':
            # Already recorded on the package node above
            continue

        if key == 'resource_authority_permit':
            # A non-string permit (e.g. null) cannot form a node id; skip it
            # rather than fail the whole package on the concatenation below.
            if not isinstance(value, str):
                continue
            # Prepending "permit " keeps the id distinct from other node ids
            permit_id = 'permit ' + value
            graph['nodes'].append({
                'id': permit_id,
                'label': 'resource_authority_permit',
                'properties': {key: value},
            })
            graph['edges'].append(_make_edge(package_id, permit_id, 'has_permit'))

        elif key == 'resources':
            # A null resources value is treated as "no resources"
            for resource in value or []:
                if not (isinstance(resource, dict) and 'id' in resource):
                    continue
                resource_node = {
                    'id': resource['id'],
                    'label': 'resource',
                    'properties': {
                        prop_key: stringify_if_needed(prop_value)
                        for prop_key, prop_value in resource.items()
                    },
                }
                graph['nodes'].append(resource_node)
                graph['edges'].append(
                    _make_edge(package_id, resource_node['id'], 'has_resource')
                )

        elif isinstance(value, list) and all(isinstance(item, str) for item in value):
            # Each string in the list becomes its own node, linked to the package
            for item in value:
                graph['nodes'].append({'id': item, 'label': key, 'properties': {}})
                graph['edges'].append(_make_edge(package_id, item, f'has_{key}'))

        elif isinstance(value, (bool, str)):
            # Scalar string/bool values are stored directly on the package node
            package_node['properties'][key] = value

    graph['nodes'].append(package_node)
    return graph

# Build one graph per fetched package from the content of its 'result' key.
# Any exception raised for a package is reported and that package is skipped,
# so one bad record does not stop the run.
for package_id, metadata in metadata_combined.items():
    try:
        if 'result' not in metadata:
            # Nothing to convert for this package
            print(f'Error: Package {package_id} does not have a "result" key.')
            continue
        graph = create_lpg(metadata['result'])
        metagraph_objects.append(graph)
        successful_objects += 1
    except Exception as e:
        print(f'Error processing package {package_id}: {e}')

print(f'Number of objects processed successfully: {successful_objects}')

In [None]:
#Step 3: conform all these separate graph objects into one graph object, removing dupes etc.

In [None]:
from collections import defaultdict

# The single merged graph: starts empty and is filled from metagraph_objects
conformed_graph = dict(nodes=[], edges=[])

# Linear scan for a node carrying the given id
def node_exists(nodes_list, node_id):
    """Return True if any node in ``nodes_list`` has ``node['id'] == node_id``."""
    for candidate in nodes_list:
        if candidate['id'] == node_id:
            return True
    return False

# Index of node id -> node, giving constant-time duplicate and reference checks
node_id_to_node = {}

# First pass: collect nodes from every graph; the first node seen for an id wins
for graph in metagraph_objects:
    for node in graph['nodes']:
        if node['id'] in node_id_to_node:
            continue
        node_id_to_node[node['id']] = node
        conformed_graph['nodes'].append(node)

# Second pass: collect edges. This runs after all nodes are indexed so that an
# edge may reference a node contributed by a different graph.
for graph in metagraph_objects:
    for edge in graph['edges']:
        if edge['source'] not in node_id_to_node or edge['target'] not in node_id_to_node:
            # Dangling edge: report it and leave it out of the merged graph
            print(f"Warning: Edge with source {edge['source']} and target {edge['target']} was omitted due to missing node reference.")
            continue
        conformed_graph['edges'].append(edge)

# Report the size of the merged graph
print(f"Total number of nodes: {len(conformed_graph['nodes'])}")
print(f"Total number of edges: {len(conformed_graph['edges'])}")

In [None]:
# Step 4: convert the conformed graph into the Graph3d format and save both graphs

In [None]:
import os
import json

# Template for a Graph3d node; each vis node starts as a copy of this and then
# overrides id, description, group, color and properties
default_node_structure = dict(
    id=None,
    count=1,
    description=None,
    properties={},
    size=20,
    group=None,
    visible=True,
    overrideColor=False,
    color=None,
    opacity=0.5,
)

# Template for a Graph3d link; each vis link starts as a copy of this and then
# overrides source, target and description
default_edge_structure = dict(
    source=None,
    target=None,
    description=None,
    count=1,
    properties={},
    thickness=3,
    value=1,
    visible=True,
    overrideColor=False,
    color="#33FFFF",
    particleColor="#FFFF33",
    linkLabelVisible=True,
)

# Map a node group (its label) to a display colour
def assign_color(group):
    """Return the hex colour for ``group``, or light grey for unknown groups."""
    # Add further group -> colour pairs here as new node labels appear
    palette = {
        "package": "#ff00ff",
        "resource": "#0c64e8",
        "resource_authority_permit": "#0000cd",
    }
    if group in palette:
        return palette[group]
    # Fallback for any group without a predefined colour
    return "#d3d3d3"

# Container for the Graph3d output; note it uses "links" where the conformed
# graph uses "edges"
vis_graph = dict(nodes=[], links=[])

# Process the conformed graph into vis_graph format
existing_node_ids = set()  # Keep track of existing node IDs to avoid duplicates
for node in conformed_graph["nodes"]:
    if node["id"] not in existing_node_ids:
        # .copy() is shallow, but every nested value of the template that
        # matters ("properties") is replaced in the update below
        vis_node = default_node_structure.copy()
        # Use 'title' if it exists, if not 'name', and if neither, fall back to `node['id']`
        description = node["properties"].get("title", node["properties"].get("name", node["id"]))
        vis_node.update({
            "id": node["id"],
            "description": description,
            "group": node["label"],
            "color": assign_color(node["label"]),
            "properties": node["properties"]  # Passed through unchanged as a dict
        })
        vis_graph["nodes"].append(vis_node)
        existing_node_ids.add(node["id"])

for edge in conformed_graph["edges"]:
    vis_edge = default_edge_structure.copy()
    vis_edge.update({
        "source": edge["source"],
        "target": edge["target"],
        "description": edge["label"],
        # The template copy is shallow, so without a fresh dict every link
        # would share (and could mutate) the same "properties" object
        "properties": {}
    })
    vis_graph["links"].append(vis_edge)

# Directory (relative to the working directory) that receives the JSON files;
# it is created if missing and left alone if it already exists
output_directory = "output"
os.makedirs(name=output_directory, exist_ok=True)

# Write a JSON-serialisable object into the output directory
def save_json_content(filename, content):
    """Dump ``content`` as 4-space-indented JSON to ``output_directory/filename``.

    Returns the path of the file that was written.
    """
    destination = os.path.join(output_directory, filename)
    with open(destination, 'w') as handle:
        json.dump(content, handle, indent=4)
    return destination

# Write the conformed graph first, then the Graph3d graph. A failure at any
# point is reported once and stops the remaining writes.
try:
    conformed_graph_path = save_json_content("conformed_graph.json", conformed_graph)
    print(f"{conformed_graph_path} has been saved successfully.")

    vis_graph_path = save_json_content("vis_graph.json", vis_graph)
    print(f"{vis_graph_path} has been saved successfully.")
except Exception as error:
    print("An error occurred while saving the JSON files:", error)