In [44]:
import requests
import pandas as pd

# Loading JSON schema file from Github

def load_json_from_url(url):
    response = requests.get(url)
    return response.json()

# Resolves JSON schema $refs based on definitions

def resolve_ref(ref, definitions):
    ref_path = ref.strip('#/').split('/')
    data = definitions
    for step in ref_path:
        data = data.get(step, {})
    return data



In [29]:
def extract_schema_data_statement(schema, definitions, path='', required_fields=[]):
    rows = []
    if 'properties' in schema:
        properties = schema['properties']
        for prop_name, prop_details in properties.items():
            full_path = f"{path}.{prop_name}" if path else prop_name
            is_required = prop_name in required_fields
            if '$ref' in prop_details:
                # Resolve the $ref and continue extraction with the resolved schema
                resolved_details = resolve_ref(prop_details['$ref'], definitions)
                rows += extract_schema_data_statement(resolved_details, definitions, full_path, required_fields)
            elif 'properties' in prop_details:
                # Recursively extract nested properties
                rows += extract_schema_data_statement(prop_details, definitions, full_path, required_fields)
            elif 'items' in prop_details and isinstance(prop_details['items'], dict):
                # Handle arrays of items which are objects
                items_details = prop_details['items']
                if '$ref' in items_details:
                    resolved_items = resolve_ref(items_details['$ref'], definitions)
                    rows += extract_schema_data_statement(resolved_items, definitions, f"{full_path}[]", required_fields)
                else:
                    rows += extract_schema_data_statement(items_details, definitions, f"{full_path}[]", required_fields)
            else:
                # Handle simple properties and properties with enum values
                prop_type = prop_details.get('type', 'N/A')
                description = prop_details.get('description', 'No description available')
                enums = prop_details.get('enum', [])
                enum_values = ', '.join(map(str, enums)) if enums else 'N/A'
                rows.append({
                    'Field': full_path,
                    'Type': prop_type,
                    'Description': description,
                    'Required': 'Yes' if is_required else 'No',
                    'Allowed Values': enum_values
                })
    return rows

In [30]:
# Statement Schema Processing
url = "https://raw.githubusercontent.com/openownership/data-standard/main/schema/statement.json"
json_schema = load_json_from_url(url)
definitions = json_schema.get('$defs', {})
statement_schema = definitions.get('Statement', {})
required_fields = statement_schema.get('required', [])
data = extract_schema_data_statement(statement_schema, definitions, required_fields=required_fields)
df_statement = pd.DataFrame(data)
df_statement

Unnamed: 0,Field,Type,Description,Required,Allowed Values
0,statementId,string,A persistent globally unique identifier for th...,Yes,
1,statementDate,string,The date on which this statement was declared ...,Yes,
2,publicationDetails.publicationDate,string,The date on which this statement was published...,No,
3,publicationDetails.bodsVersion,string,The version of the Beneficial Ownership Data S...,No,
4,publicationDetails.license,string,A link to the license that applies to this Sta...,No,
5,publicationDetails.publisher.name,string,The name of the publisher.,No,
6,publicationDetails.publisher.url,string,"The URL where details of the full dataset, or ...",No,
7,declaration,string,An identifier or reference for a declaration w...,No,
8,declarationSubject,string,A `recordId` value for the subject of a benefi...,Yes,
9,recordId,string,A unique identifier for the record (within the...,Yes,


In [45]:
def extract_schema_data_entity(schema, definitions, path='', required_fields=[]):
    rows = []
    if 'properties' in schema:
        properties = schema['properties']
        for prop_name, prop_details in properties.items():
            full_path = f"{path}.{prop_name}" if path else prop_name
            is_required = prop_name in required_fields
            if '$ref' in prop_details:
                # Resolve the $ref and continue extraction with the resolved schema
                resolved_details = resolve_ref(prop_details['$ref'], definitions)
                rows += extract_schema_data_entity(resolved_details, definitions, full_path, required_fields)
            elif 'properties' in prop_details or 'items' in prop_details:
                # Recursively handle nested properties or arrays of items
                rows += extract_schema_data_entity(prop_details, definitions, full_path, required_fields)
            else:
                # Extract basic property information
                prop_type = prop_details.get('type', 'N/A')
                description = prop_details.get('description', 'No description available')
                enums = prop_details.get('enum', [])
                enum_values = ', '.join(map(str, enums)) if enums else 'N/A'
                rows.append({
                    'Field': full_path,
                    'Type': prop_type,
                    'Description': description,
                    'Required': 'Yes' if is_required else 'No',
                    'Allowed Values': enum_values
                })
    return rows

In [46]:
# Entity schema processing
url = "https://raw.githubusercontent.com/openownership/data-standard/main/schema/entity-record.json"
json_schema = load_json_from_url(url)
definitions = json_schema.get('$defs', {})
required_fields = json_schema.get('required', [])
data = extract_schema_data_entity(json_schema, definitions, required_fields=required_fields)
df_entity = pd.DataFrame(data)
display(df_entity)


Unnamed: 0,Field,Type,Description,Required,Allowed Values
0,isComponent,boolean,Whether this entity is a component in an indir...,Yes,
1,entityType.type,string,"The general form of the entity, using the enti...",No,"registeredEntity, legalEntity, arrangement, an..."
2,entityType.subtype,string,"The particular form of the entity, where relev...",No,"governmentDepartment, stateAgency, other, trus..."
3,entityType.details,string,This may be used to provide a local name for t...,No,
4,name,string,The declared name of this entity.,No,
5,foundingDate,string,"The date on which this entity was founded, cre...",No,
6,dissolutionDate,string,The date on which this entity was dissolved or...,No,
7,uri,string,Where a persistent URI (https://en.wikipedia.o...,No,
8,formedByStatute.name,string,The name of the law.,No,
9,formedByStatute.date,string,The date on which the law came into force. The...,No,


In [63]:
def extract_schema_data_relationship(schema, definitions, path='', required_fields=[]):
    rows = []
    if 'properties' in schema:
        properties = schema['properties']
        for prop_name, prop_details in properties.items():
            full_path = f"{path}.{prop_name}" if path else prop_name
            is_required = prop_name in required_fields
            if 'oneOf' in prop_details:
                # Handle oneOf by concatenating all possible types
                type_descriptions = []
                for option in prop_details['oneOf']:
                    if '$ref' in option:
                        ref_info = resolve_ref(option['$ref'], definitions)
                        type_description = f"Ref to {option['$ref']}"
                    elif 'type' in option:
                        type_description = option['type']
                    type_descriptions.append(type_description)
                prop_type = 'oneOf: ' + ', '.join(type_descriptions)
            elif '$ref' in prop_details:
                resolved_details = resolve_ref(prop_details['$ref'], definitions)
                rows += extract_schema_data_relationship(resolved_details, definitions, full_path, required_fields)
                prop_type = f"Ref to {prop_details['$ref']}"
            else:
                prop_type = prop_details.get('type', 'N/A')

            description = prop_details.get('description', 'No description available')
            rows.append({
                'Field': full_path,
                'Type': prop_type,
                'Description': description,
                'Required': 'Yes' if is_required else 'No'
            })

    return rows

In [64]:
# Relationship schema processing
url = "https://raw.githubusercontent.com/openownership/data-standard/main/schema/relationship-record.json"
json_schema = load_json_from_url(url)
definitions = json_schema.get('$defs', {})
required_fields = json_schema.get('required', [])
data = extract_schema_data_relationship(json_schema, definitions, required_fields=required_fields)
df_relationship = pd.DataFrame(data)
display(df_relationship)
df_relationship.to_csv('test.csv')


Unnamed: 0,Field,Type,Description,Required
0,isComponent,boolean,Whether this relationship is a component of a ...,Yes
1,componentRecords,array,The `recordId` values of all component records...,No
2,subject,"oneOf: string, Ref to urn:components#/$defs/Un...",The `recordId` for the subject of the relation...,Yes
3,interestedParty,"oneOf: string, Ref to urn:components#/$defs/Un...",The `recordId` for the interested party in the...,Yes
4,interests,array,A description of the interests held by the int...,No


In [65]:
def extract_schema_data_components(schema, definitions, path='', required_fields=[]):
    rows = []
    if 'properties' in schema:
        properties = schema['properties']
        for prop_name, prop_details in properties.items():
            full_path = f"{path}.{prop_name}" if path else prop_name
            is_required = prop_name in required_fields
            if '$ref' in prop_details:
                # Resolve the $ref and continue extraction with the resolved schema
                resolved_details = resolve_ref(prop_details['$ref'], definitions)
                rows += extract_schema_data_components(resolved_details, definitions, full_path, required_fields)
            elif 'properties' in prop_details or 'items' in prop_details:
                # Recursively handle nested properties or arrays of items
                rows += extract_schema_data_components(prop_details, definitions, full_path, required_fields)
            else:
                # Extract basic property information
                prop_type = prop_details.get('type', 'N/A')
                description = prop_details.get('description', 'No description available')
                enums = prop_details.get('enum', [])
                enum_values = ', '.join(map(str, enums)) if enums else 'N/A'
                rows.append({
                    'Field': full_path,
                    'Type': prop_type,
                    'Description': description,
                    'Required': 'Yes' if is_required else 'No',
                    'Allowed Values': enum_values
                })
    return rows


In [66]:
# Components schema processing
url = "https://raw.githubusercontent.com/openownership/data-standard/main/schema/components.json"
json_schema = load_json_from_url(url)
definitions = json_schema.get('$defs', {})
data = []
for component_name, component_schema in definitions.items():
    required_fields = component_schema.get('required', [])
    component_data = extract_schema_data_components(component_schema, definitions, path=component_name, required_fields=required_fields)
    data.extend(component_data)

df_components = pd.DataFrame(data)
display(df_components)


Unnamed: 0,Field,Type,Description,Required,Allowed Values
0,Address.type,string,"The function of the address, using the address...",No,"placeOfBirth, residence, registered, service, ..."
1,Address.address,string,"The address, with each line or component separ...",No,
2,Address.postCode,string,The postal code for this address.,No,
3,Country.name,string,The name of the country,Yes,
4,Country.code,string,The 2-letter country code (ISO 3166-1) for thi...,No,
5,Jurisdiction.name,string,The name of the jurisdiction,Yes,
6,Jurisdiction.code,string,The 2-letter country code (ISO 3166-1) or the ...,No,
7,Identifier.id,string,"The identifier for a person or entity, as issu...",No,
8,Identifier.scheme,string,"For entities, a code from org-id.guide (https:...",No,
9,Identifier.schemeName,string,The name of the identifier-issuing authority.,No,


In [67]:
df_statement['Source'] = 'Statement'
df_entity['Source'] = 'Entity'
df_relationship['Source'] = 'Relationship'
df_components['Source'] = 'Components'

df_final = pd.concat([df_statement, df_entity, df_relationship, df_components], ignore_index=True)

display(df_final)

Unnamed: 0,Field,Type,Description,Required,Allowed Values,Source
0,statementId,string,A persistent globally unique identifier for th...,Yes,,Statement
1,statementDate,string,The date on which this statement was declared ...,Yes,,Statement
2,publicationDetails.publicationDate,string,The date on which this statement was published...,No,,Statement
3,publicationDetails.bodsVersion,string,The version of the Beneficial Ownership Data S...,No,,Statement
4,publicationDetails.license,string,A link to the license that applies to this Sta...,No,,Statement
5,publicationDetails.publisher.name,string,The name of the publisher.,No,,Statement
6,publicationDetails.publisher.url,string,"The URL where details of the full dataset, or ...",No,,Statement
7,declaration,string,An identifier or reference for a declaration w...,No,,Statement
8,declarationSubject,string,A `recordId` value for the subject of a benefi...,Yes,,Statement
9,recordId,string,A unique identifier for the record (within the...,Yes,,Statement


In [68]:
df_final.to_csv('mapping.csv',index=False)