In [60]:
import requests
import json
import jsonref
import pandas as pd

In [61]:
# Purpose: Loads JSON data from either a local file path or a remote URL.
# How It Works: This function checks if the provided URL starts with http: or https:, 
# indicating that it's a remote URL. If so, it uses the requests library to fetch the content. 
# If not, it assumes the URL is a local file path and opens the file to load JSON data. 
# It returns the loaded JSON object.

def load_json(source):
    """
    Load JSON data from a given source which can be a URL or a local file path.
    Raises appropriate exceptions for HTTP errors or file reading errors.
    """
    if source.startswith(('http:', 'https:')):
        try:
            response = requests.get(source)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            raise RuntimeError(f"Failed to fetch data from {source}: {str(e)}")
    else:
        try:
            with open(source, 'r') as file:
                return json.load(file)
        except FileNotFoundError:
            raise FileNotFoundError(f"No file found at {source}")
        except json.JSONDecodeError as e:
            raise json.JSONDecodeError(f"Error decoding JSON from {source}: {str(e)}")


# resolve_fragment
# Purpose: Resolves a JSON Pointer fragment within a JSON document.
# How It Works: Given a document and a fragment (a path like `/properties/name`), 
# it splits the fragment and iteratively accesses nested properties within the JSON document 
# to return the specific part of the document referred to by the fragment.

def resolve_fragment(document, fragment):
    parts = fragment.strip('/').split('/')
    sub_document = document
    for part in parts:
        try:
            sub_document = sub_document[part]
        except KeyError:
            raise KeyError(f"Fragment '{fragment}' not found in the document.")
    return sub_document

In [62]:

# General JSON reference loader that handles HTTP, HTTPS, or local file URIs.
# Returns the JSON object loaded from the URI.

def custom_loader_general(uri, base_uri=None, **kwargs):
    if uri.startswith(("http://", "https://")):
        return load_json(uri)
    elif uri.startswith("file://"):
        try:
            with open(uri.replace("file://", ""), "r") as file:
                return json.load(file)
        except Exception as e:
            raise ValueError(f"Error reading file at {uri}: {str(e)}")
    else:
        raise ValueError(f"Unhandled URI: {uri}")

# Custom loader for JSON $ref tags that specifically handle component definitions.
# Fetches 'components.json' from a predefined URL and resolves any specific fragments.

def custom_loader_for_components(uri, base_uri=None, **kwargs):
    if uri.startswith("urn:components"):
        components_url = 'https://raw.githubusercontent.com/openownership/data-standard/main/schema/components.json'
        components = load_json(components_url)
        fragment = uri.partition('#')[2] if '#' in uri else ''
        return resolve_fragment(components, fragment) if fragment else components
    else:
        return custom_loader_general(uri, base_uri)

# Custom loader for handling statement-specific URIs and delegating component URIs
# to the custom_loader_for_components. Returns a dictionary representing the reference
# for placeholders or the resolved JSON for component URIs.

def custom_loader_for_components(uri, base_uri=None, **kwargs):
    if uri.startswith("urn:components"):
        components_url = 'https://raw.githubusercontent.com/openownership/data-standard/main/schema/components.json'
        components = load_json(components_url)
        fragment = uri.partition('#')[2] if '#' in uri else ''
        return resolve_fragment(components, fragment) if fragment else components
    else:
        return custom_loader_general(uri, base_uri)



In [63]:

# process_schema
# Purpose: Loads and processes a JSON schema using a specified loader function 
# to handle `$ref` references.
# How It Works: This function utilizes jsonref.JsonRef.replace_refs() to process 
# the JSON schema, replacing `$ref` references using the provided custom loader.

def process_schema(file_url, loader):
    base_uri = "file://" + file_url if not file_url.startswith(("http:", "https:")) else file_url
    schema = jsonref.JsonRef.replace_refs(load_json(file_url), loader=loader, base_uri=base_uri)
    return schema

# find_and_remove_unused_defs
# Purpose: Cleans the schema by removing unused definitions within the `$defs` section 
# of a JSON schema.
# How It Works: Iterates through definitions in `$defs` and checks if they are referenced 
# anywhere in the schema. If a definition is not referenced, it is removed.

def find_and_remove_unused_defs(schema):
    schema_str = json.dumps(schema, default=str)
    definitions = list(schema.get('$defs', {}).keys())
    for def_key in definitions:
        ref_string = f'"$ref": "#/$defs/{def_key}"'
        if ref_string not in schema_str:
            del schema['$defs'][def_key]
    if not schema['$defs']:
        del schema['$defs']
    return schema


In [64]:
# flatten_properties
# Purpose: Recursively extracts properties from the JSON schema and flattens them into a list of dictionaries, suitable for conversion into a tabular format.
# How It Works: Handles nested properties and arrays by recursively calling itself, collecting properties' details such as path, title, type, description, and whether they are required.

def flatten_properties(properties, required_fields=None, path='', parent_required=None):
    items = []
    if required_fields is None:
        required_fields = parent_required if parent_required else []

    for key, value in properties.items():
        full_path = f"{path}/{key}" if path else key
        if isinstance(value, dict):
            is_required = 'Yes' if key in required_fields else 'No'
            property_details = {
                'Property Path': full_path,
                'Title': value.get('title', ''),
                'Description': value.get('description', ''),
                'Type': value.get('type', ''),
                'Required': is_required
            }
            if 'properties' in value:
                items.extend(flatten_properties(value['properties'], value.get('required', []), full_path))
            if 'items' in value and isinstance(value['items'], dict):
                items.extend(flatten_properties({'item': value['items']}, [], f"{full_path}/item", value.get('required', [])))
            if 'enum' in value:
                property_details['Enum'] = ', '.join(str(e) for e in value['enum'])
            items.append(property_details)
        elif isinstance(value, list):
            # Handle cases where value is a list of properties
            for item in value:
                if isinstance(item, dict):
                    items.extend(flatten_properties(item, required_fields, full_path, parent_required))
        else:
            print(f"Skipping {key}: expected a dict but got {type(value).__name__}")

    return items

# schema_to_dataframe
# Purpose: Converts the list of dictionaries (flattened schema properties) into a pandas DataFrame.
# How It Works: This function takes the output from flatten_properties and uses it to create a DataFrame. This DataFrame then provides a structured and easy-to-analyze view of the schema properties.

def schema_to_dataframe(schema):
    if 'properties' in schema:
        properties = schema['properties']
        required = schema.get('required', [])
    elif 'items' in schema and isinstance(schema['items'], dict):
        # New handling for schemas where properties are defined under 'items'
        if '$ref' in schema['items']:
            ref_path = schema['items']['$ref'].strip('#/').split('/')
            properties = schema
            for part in ref_path:
                properties = properties.get(part, {})
            properties = properties.get('properties', {})
            required = properties.get('required', [])
        else:
            properties = schema['items'].get('properties', {})
            required = schema['items'].get('required', [])
    else:
        properties = {}
        required = []

    if properties:
        flattened_data = flatten_properties(properties, required)
    else:
        flattened_data = []
        print("No properties to flatten.")
    return pd.DataFrame(flattened_data)


In [69]:
# Load, process, and display a JSON schema from a specified file URL.
# Takes the file path or URL of the JSON schema and returns a dataframe with the flattened schema

def display_schema_from_url(file_url, loader_function):
    try:
        schema = process_schema(file_url, loader_function)
        df = schema_to_dataframe(schema)
        return df
    except Exception as e:
        print(f"Failed to process schema: {str(e)}")
        return pd.DataFrame()

In [73]:
# Link to the JSON schema and specify the custom loader for components
schema_url = 'https://raw.githubusercontent.com/openownership/data-standard/main/schema/relationship-record.json'
custom_loader = custom_loader_for_components

# Use the display_schema_from_url function to process and display the schema
df_relationship = display_schema_from_url(schema_url, custom_loader)
df_relationship

Unnamed: 0,Property Path,Title,Description,Type,Required,Enum
0,isComponent,Is component,Whether this relationship is a component of a ...,boolean,Yes,
1,componentRecords/item/item,,,string,No,
2,componentRecords,Component Record IDs,The `recordId` values of all component records...,array,No,
3,subject,Subject,The `recordId` for the subject of the relation...,,Yes,
4,interestedParty,Interested Party,The `recordId` for the interested party in the...,,Yes,
5,interests/item/item/type,Type of Interest,"The nature of the interest, using the interest...",string,No,"shareholding, votingRights, appointmentOfBoard..."
6,interests/item/item/directOrIndirect,Direct or Indirect,How directly the interest is exercised by the ...,string,No,"direct, indirect, unknown"
7,interests/item/item/beneficialOwnershipOrControl,Beneficial Ownership or Control,Whether this interest (alone or with others) m...,boolean,No,
8,interests/item/item/details,Details,"The local name given to this kind of interest,...",string,No,
9,interests/item/item/share/exact,Exact percentage,The exact share of this interest held (if avai...,number,No,


In [71]:
# Link to the JSON schema and specify the custom loader for components
schema_url = 'https://raw.githubusercontent.com/openownership/data-standard/main/schema/entity-record.json'
custom_loader = custom_loader_for_components

# Use the display_schema_from_url function to process and display the schema
df_entity = display_schema_from_url(schema_url, custom_loader)
df_entity

Unnamed: 0,Property Path,Title,Description,Type,Required,Enum
0,isComponent,Is component,Whether this entity is a component in an indir...,boolean,Yes,
1,entityType/type,Type,"The general form of the entity, using the enti...",string,Yes,"registeredEntity, legalEntity, arrangement, an..."
2,entityType/subtype,Subtype,"The particular form of the entity, where relev...",string,No,"governmentDepartment, stateAgency, other, trus..."
3,entityType/details,Details,This may be used to provide a local name for t...,string,No,
4,entityType,Entity Type,The form of the entity described in the Statem...,object,Yes,
5,unspecifiedEntityDetails/reason,Reason,The reason that a person or entity cannot be s...,string,Yes,"noBeneficialOwners, subjectUnableToConfirmOrId..."
6,unspecifiedEntityDetails/description,Description,Additional information about the absence of de...,string,No,
7,unspecifiedEntityDetails,Unspecified or unknown person or entity,A `reason` MUST be supplied.,object,No,
8,name,Entity Name,The declared name of this entity.,string,No,
9,alternateNames/item/item,Name,A name this entity is known by.,string,No,


In [72]:
# Link to the JSON schema and specify the custom loader for components
schema_url = 'https://raw.githubusercontent.com/openownership/data-standard/main/schema/statement.json'
custom_loader = custom_loader_statement

# Use the display_schema_from_url function to process and display the schema
df_statement = display_schema_from_url(schema_url, custom_loader)
df_statement

Unnamed: 0,Property Path,Title,Description,Type,Required,Enum
0,statementId,Statement Identifier,A persistent globally unique identifier for th...,string,Yes,
1,statementDate,Statement Date,The date on which this statement was declared ...,string,Yes,
2,annotations/item/item/statementPointerTarget,Statement Fragment Pointer,An RFC6901 JSON Pointer (https://tools.ietf.or...,string,No,
3,annotations/item/item/creationDate,Creation Date,"The date on which this Annotation was created,...",string,No,
4,annotations/item/item/createdBy/name,Name,"The name of the person, organisation or agent ...",string,No,
5,annotations/item/item/createdBy/uri,URI,"An optional URI to identify the person, organi...",string,No,
6,annotations/item/item/createdBy,Created By,"The person, organisation or agent that created...",object,No,
7,annotations/item/item/motivation,Motivation,"The reason for this Annotation, using the anno...",string,No,"commenting, correcting, identifying, linking, ..."
8,annotations/item/item/description,Description,A free text description providing extra inform...,string,No,
9,annotations/item/item/transformedContent,Transformed Content,A representation of the Annotation target afte...,string,No,


In [74]:
df_statement['Source'] = 'Statement'
df_entity['Source'] = 'Entity'
df_relationship['Source'] = 'Relationship'

df_final = pd.concat([df_statement, df_entity, df_relationship], ignore_index=True)

display(df_final)

Unnamed: 0,Property Path,Title,Description,Type,Required,Enum,Source
0,statementId,Statement Identifier,A persistent globally unique identifier for th...,string,Yes,,Statement
1,statementDate,Statement Date,The date on which this statement was declared ...,string,Yes,,Statement
2,annotations/item/item/statementPointerTarget,Statement Fragment Pointer,An RFC6901 JSON Pointer (https://tools.ietf.or...,string,No,,Statement
3,annotations/item/item/creationDate,Creation Date,"The date on which this Annotation was created,...",string,No,,Statement
4,annotations/item/item/createdBy/name,Name,"The name of the person, organisation or agent ...",string,No,,Statement
...,...,...,...,...,...,...,...
98,interests/item/item/share,Percentage Share,The proportion of this type of interest held b...,object,No,,Relationship
99,interests/item/item/startDate,Start Date,The date from which this interest was active. ...,string,No,,Relationship
100,interests/item/item/endDate,End Date,The date from which this interest ceased to ex...,string,No,,Relationship
101,interests/item/item,Interest,A description of an interest held by an intere...,object,No,,Relationship


In [76]:
df_final.to_csv('mapping.csv',index=False)