In [1]:
"""
The ingestion has been done in two steps:
    1. SDB metadata into custom schemas to OSDU
    2. Osdu  custom schemas into OSDU well known schemas

Therefore, we must address any record inconsistency (missing records) encountered in these steps versus what there is in SDB.

QC WORKFLOW:

    1. Request records from SDB
        1.1. Make inconsistencies in SDB visible (e.g. id uniqueness/duplicates) so that we know how many records we must expect
        to be ingested into custom schemas.
        ** --> If any inconsistency here we report to SDB/DISKOS
        
    2. Compare which unique records have been ingested into OSDU to custom schemas.
        ** --> If any inconsistency here we address it into the custom schema ingestion pipeline.
        
    3. Compare which records from custom schemas have been transformed successfully into wks records and which ones have not.
        Take custom (sdb original) records relationships here.
        ** --> If any inconsistency here we address it into the custom schema ingestion pipeline.
"""

## Maybe request all ids and connect them in Power BI to see inconsistencies??

'\nThe ingestion has been done in two steps:\n    1. SDB metadata into custom schemas to OSDU\n    2. Osdu  custom schemas into OSDU well known schemas\n\nTherefore, we must address any record incosistency (missing) encounter in these stpes versus what there is in SDB.\n\nQC WORKFLOW:\n\n    1. Request records from SDB\n        1.1. Make visible inconsitencies in SDB like ids uniqueness/duplicates so that we know which many recors we must expect\n        to be ingested into custom schemas. \n        ** --> If any inconsistency here we report to SDB/DISKOS\n        \n    2. Compare which unique records have been ingested into OSDU to custom schemas.\n        ** --> If any inconsistency here we address it into the custom schema ingestion pipeline.\n        \n    3. Compare which recoirds from custom schemas have been transformed succesfully into wks records and which ones have not.\n        Take custom (sdb original) records relationships here.\n        ** --> If any inconsistency here w

In [3]:
"""Uncomment if running in Azure ML"""
import sys
sys.path.append(r'/mnt/batch/tasks/shared/LS_root/mounts/clusters/jopm/code/Users/JOPM/osdu_jopm_rottation')
sys.path

from utils.sdb_diskos_service import LandmarkService
from src.libs.osdu_service.osdu_http_client import OsduHttpClient
from dotenv import load_dotenv
import json
import os

load_dotenv()

True

In [4]:
"""
Utilities
"""

def unique_ids_counts(records, dimension):
    """Return the distinct values stored under ``dimension`` across ``records``.

    Despite the name, the result is the list of distinct values (in arbitrary
    order), not a number; callers take ``len()`` of it to obtain the count.

    Args:
        records: iterable of mappings, each containing the key ``dimension``.
        dimension: key whose values are collected.

    Returns:
        list with one entry per distinct value.
    """
    distinct_values = {entry[dimension] for entry in records}
    return list(distinct_values)

def get_non_unique_values(records, dimension):
    """Return the values of ``dimension`` that occur in more than one record.

    Args:
        records: iterable of mappings, each containing the key ``dimension``.
        dimension: key whose values are checked for repetition.

    Returns:
        list of repeated values, ordered by their first appearance in ``records``.
    """
    occurrences = {}
    for entry in records:
        seen_value = entry[dimension]
        if seen_value in occurrences:
            occurrences[seen_value] += 1
        else:
            occurrences[seen_value] = 1

    return [seen_value for seen_value, total in occurrences.items() if total > 1]

def derive_osdu_id_from_source_record(record, entity_name):
    """
    Build the OSDU record id that corresponds to a source (sdb/diskos) record, so that
    source ids and OSDU ids can be compared to find missing or removed records.

    The id has the form ``<osdu_env>:<relativePath>:<source unique id>``. It relies on the
    notebook-level ``entities`` configuration and ``osdu_env`` value.
    """
    entity_config = entities.get(entity_name)
    relative_path = entity_config.get('relativePath')
    identifier_field = entity_config.get('enitity_unique_identifier')

    return f"{osdu_env}:{relative_path}:{record[identifier_field]}"

def compare_dictionaries(dict1, dict2, path=""):
    """Recursively diff two dictionaries.

    Args:
        dict1: first dictionary ("record1").
        dict2: second dictionary ("record2").
        path: dotted prefix of the keys already traversed; empty at the top level.

    Returns:
        dict mapping the dotted path of every differing key to
        ``{"record1_value": ..., "record2_value": ...}``. A key that exists on only
        one side is reported with ``None`` for the other side. Nested dictionaries
        are compared key by key; any other differing value is reported whole.
    """

    def _qualify(key):
        # Top-level keys are kept as-is; nested keys get the dotted prefix.
        return f"{path}.{key}" if path else key

    differences = {}

    for key, left_value in dict1.items():
        location = _qualify(key)

        if key not in dict2:
            differences[location] = {
                "record1_value": left_value,
                "record2_value": None
            }
            continue

        right_value = dict2[key]
        if left_value != right_value:
            both_are_dicts = isinstance(left_value, dict) and isinstance(right_value, dict)
            if both_are_dicts:
                differences.update(compare_dictionaries(left_value, right_value, location))
            else:
                differences[location] = {
                    "record1_value": left_value,
                    "record2_value": right_value
                }

    # Keys that only exist in the second dictionary.
    for key, right_value in dict2.items():
        if key not in dict1:
            differences[_qualify(key)] = {
                "record1_value": None,
                "record2_value": right_value
            }

    return differences

#### 1. Get records from SDB

In [5]:
# Cut-off timestamp handed to every entity's source request (see "lastrundatetime" in `entities`).
# None is the value in effect. To use a cut-off, assign an ISO timestamp instead,
# e.g. "2023-07-25T09:10:16.945Z".
# NOTE(review): the exact meaning of None is defined by LandmarkService.get_records - confirm.
last_run_date_time = None
# Passed to the source request as "before_rundate" for every entity.
query_records_created_prior_latsrundate = True

# OSDU environment name; also used as the prefix of the derived OSDU record ids.
osdu_env = "npequinor-dev"
# Source system under QC; selects the environment variables and the custom schema authority.
source = "sdb"
# source = "diskos"

osdu_client = OsduHttpClient(osdu_env, client_type="token-client")

In [6]:
# Per-entity QC configuration, keyed by entity name.
# Each entry is unpacked as keyword arguments into `landmark.get_records(**entry)` when the
# source records are requested, and is also read by the OSDU comparison cells.
# NOTE: the key "enitity_unique_identifier" is misspelled, but it is looked up with exactly this
# spelling elsewhere in the notebook, so it must not be corrected here alone.
entities = {
    "seismicsurveys": {
        # Attributes to make the source API request
        "relativePath": "seismicsurveys",
        "filters_dict": {"surveyDimension": ["eq", "'3D'"]},
        "return_cols": ["surveyId"],
        "expand_on": None,
        "lastrundatetime": last_run_date_time,
        "before_rundate": query_records_created_prior_latsrundate,
        "enitity_unique_identifier": "surveyId",

        # Attributes to connect to osdu
        "custom_schema": f"eqnr:iEnergy-{source}:seismicsurveys:1.0.0",
        "osdu_well_known_schema": [
            "osdu:wks:master-data--SeismicAcquisitionSurvey:1.2.0"
        ]
    },
    "seismicpoststackdatasets": {
        # Attributes to make the source API request
        "relativePath": "seismicpoststackdatasets",
        "filters_dict": {"surveyDimension": ["eq", "'3D'"]},
        "return_cols": ["seismicPoststackDatasetId"],
        "expand_on": None,
        "lastrundatetime": last_run_date_time,
        "before_rundate": query_records_created_prior_latsrundate,
        "enitity_unique_identifier": "seismicPoststackDatasetId",

        # Attributes to connect to osdu
        "custom_schema": f"eqnr:iEnergy-{source}:seismicpoststackdatasets:1.0.0",
        "osdu_well_known_schema": [
            "osdu:wks:dataset--FileCollection.SEGY:1.0.0",                    # --->> missing in ingestion workflow - to be fixed
            "osdu:wks:work-product-component--SeismicTraceData:1.3.0"
        ]
    },
    "poststackcubes": {
        # Attributes to make the source API request
        "relativePath": "poststackcubes",
        "filters_dict": None,
        "return_cols": ["poststackCubeId"],
        "expand_on": None,
        "lastrundatetime": last_run_date_time,
        "before_rundate": query_records_created_prior_latsrundate,
        "enitity_unique_identifier": "poststackCubeId",

        # Attributes to connect to osdu
        "custom_schema": f"eqnr:iEnergy-{source}:poststackcubes:1.0.0",
        "osdu_well_known_schema": [
            "osdu:wks:work-product-component--SeismicTraceData:1.3.0"
        ]
    },
    "poststackcubegeometries": {
        # Attributes to make the source API request
        "relativePath": "poststackcubegeometries",
        "filters_dict": None,
        "return_cols": ["poststackCubeId"],
        "expand_on": None,
        "lastrundatetime": last_run_date_time,
        "before_rundate": query_records_created_prior_latsrundate,
        "enitity_unique_identifier": "poststackCubeId",

        # Attributes to connect to osdu
        "custom_schema": f"eqnr:iEnergy-{source}:poststackcubegeometries:1.0.0",
        "osdu_well_known_schema": [
            "osdu:wks:work-product-component--SeismicTraceData:1.3.0",
            "osdu:wks:dataset--FileCollection.SEGY:1.0.0"
        ]
    },
    "seismicprojects": {
        # Attributes to make the source API request
        "relativePath": "seismicprojects",
        "filters_dict": {"seismicDimension": ["eq", "'3D'"]},
        "return_cols": ["projectId"],
        "expand_on": None,
        "lastrundatetime": last_run_date_time,
        "before_rundate": query_records_created_prior_latsrundate,
        "enitity_unique_identifier": "projectId",

        # Attributes to connect to osdu
        "custom_schema": f"eqnr:iEnergy-{source}:seismicprojects:1.0.0",
        "osdu_well_known_schema": [
            "osdu:wks:master-data--SeismicProcessingProject:1.2.0"
        ],
    },
    "navigationsets": {
        # Attributes to make the source API request
        "relativePath": "navigationsets",
        "filters_dict": {"surveyDimension": ["eq", "'3D'"]},
        "return_cols": ["navigationSetId"],
        "expand_on": None,
        "lastrundatetime": last_run_date_time,
        "before_rundate": query_records_created_prior_latsrundate,
        "enitity_unique_identifier": "navigationSetId",

        # Attributes to connect to osdu
        "custom_schema": f"eqnr:iEnergy-{source}:navigationsets:1.0.0",
        "osdu_well_known_schema": [
            "osdu:wks:dataset--FileCollection.Generic:1.0.0"                      # --->> Issue they do not have ancestry custom records in DEV - Uncomment if running in test
        ]
    },
    "binsetgrids": {
        # Attributes to make the source API request
        "relativePath": "binsetgrids",
        "filters_dict": None,
        "return_cols": ["binsetGrid3dId"],
        "expand_on": None,
        "lastrundatetime": last_run_date_time,
        "before_rundate": query_records_created_prior_latsrundate,
        "enitity_unique_identifier": "binsetGrid3dId",

        # Attributes to connect to osdu
        "custom_schema": f"eqnr:iEnergy-{source}:binsetgrids:1.0.0",
        "osdu_well_known_schema": [
            "osdu:wks:work-product-component--SeismicBinGrid:1.0.0"               # --->> Issue they do not have ancestry custom records in DEV - Uncomment if running in test
        ]
    }
}

In [7]:
"""
Requesting records from source system.
"""

# Source records per entity name, filled one entity at a time.
sdb_records = {}

# Endpoints and the refresh token come from environment variables named after the active source.
with LandmarkService(
    os.environ[f"{source}_metadata_url"],
    os.environ[f"{source}_ds_security_url"],
    os.environ[f"{source}_refresh_token"],
) as landmark:

    for entity_name, entity_config in entities.items():
        # The whole entity configuration is forwarded as keyword arguments.
        sdb_records[entity_name] = landmark.get_records(**entity_config)

    ## TODO: Leave the query by batch option commented out as it might be needed when we switch into requesting data from DISKOS (larger data)



In [8]:
"""2. Records ids uniqueness"""

# Entity names and their unique-identifier field, kept as two parallel lists
# because the following cells zip them together again.
entities_names = [*entities]
entities_unique_identifiers = [
    entity_config.get('enitity_unique_identifier') for entity_config in entities.values()
]

# Number of distinct identifier values found in the source records of each entity.
ids_counts = {
    name: len(unique_ids_counts(sdb_records[name], identifier_field))
    for name, identifier_field in zip(entities_names, entities_unique_identifiers)
}

for name, distinct_total in ids_counts.items():
    print(f"Entity: {name} has {len(sdb_records[name])} records, from which {distinct_total} are have unique identifiers (ids)")

Entity: seismicsurveys has 1975 records, from which 1975 are have unique identifiers (ids)
Entity: seismicpoststackdatasets has 19464 records, from which 19452 are have unique identifiers (ids)
Entity: poststackcubes has 19383 records, from which 19371 are have unique identifiers (ids)
Entity: poststackcubegeometries has 19383 records, from which 19371 are have unique identifiers (ids)
Entity: seismicprojects has 2403 records, from which 2403 are have unique identifiers (ids)
Entity: navigationsets has 3248 records, from which 3248 are have unique identifiers (ids)
Entity: binsetgrids has 2143 records, from which 2143 are have unique identifiers (ids)


In [9]:
"""
2. Records ids uniqueness.
Let's find the non-unique ids for each entity.
"""

# `get_non_unique_values` is already defined in the Utilities cell; it is reused here
# instead of being declared a second time (a re-declaration silently shadows the first one).

# Identifier values that appear in more than one source record, per entity.
ids_duplicates = {
    name: get_non_unique_values(sdb_records[name], identifier_field)
    for name, identifier_field in zip(entities_names, entities_unique_identifiers)
}

for name, duplicated_ids in ids_duplicates.items():
    print(f"Entity: {name} - Duplicated ids: {duplicated_ids}")

Entity: seismicsurveys - Duplicated ids: []
Entity: seismicpoststackdatasets - Duplicated ids: ['31361036', '31361032', '31361024', '31361022', '31361018', '31361016', '31361037', '31361033', '31361025', '31361023', '31361019', '31361017']
Entity: poststackcubes - Duplicated ids: ['1657858064', '1657858063', '1657858062', '1657858061', '1657858070', '1657858069', '1657858068', '1657858067', '1657858082', '1657858081', '1657858078', '1657858077']
Entity: poststackcubegeometries - Duplicated ids: ['1657858064', '1657858063', '1657858062', '1657858061', '1657858070', '1657858069', '1657858068', '1657858067', '1657858082', '1657858081', '1657858078', '1657858077']
Entity: seismicprojects - Duplicated ids: []
Entity: navigationsets - Duplicated ids: []
Entity: binsetgrids - Duplicated ids: []


#### 2. Get records from OSDU - custom schemas

In [10]:
"""
Let's first request the custom records from OSDU and store them in a dictionary.
"""

# OSDU custom-schema records (ids only) per entity name.
osdu_ingested_custom_records = {}

for entity, entity_config in entities.items():

    schema_kind = entity_config.get("custom_schema")
    print(f"---- Working on entity: {entity} of custom schema kind: {schema_kind} ----")

    # Only the record id is requested; that is all the comparison cells read.
    fetched_records = osdu_client.app_query_returning_json(
        "search/v2/query_with_cursor",
        {
            "kind": schema_kind,
            "returnedFields": ["id"]
        },
    )
    osdu_ingested_custom_records[entity] = fetched_records

    print(f"> Retrieved {len(fetched_records)} records from OSDU of kind {schema_kind} \n")

---- Working on entity: seismicsurveys of custom schema kind: eqnr:iEnergy-sdb:seismicsurveys:1.0.0 ----
> Retrieved 1971 records from OSDU of kind eqnr:iEnergy-sdb:seismicsurveys:1.0.0 

---- Working on entity: seismicpoststackdatasets of custom schema kind: eqnr:iEnergy-sdb:seismicpoststackdatasets:1.0.0 ----
> Retrieved 18886 records from OSDU of kind eqnr:iEnergy-sdb:seismicpoststackdatasets:1.0.0 

---- Working on entity: poststackcubes of custom schema kind: eqnr:iEnergy-sdb:poststackcubes:1.0.0 ----
> Retrieved 18828 records from OSDU of kind eqnr:iEnergy-sdb:poststackcubes:1.0.0 

---- Working on entity: poststackcubegeometries of custom schema kind: eqnr:iEnergy-sdb:poststackcubegeometries:1.0.0 ----
> Retrieved 18824 records from OSDU of kind eqnr:iEnergy-sdb:poststackcubegeometries:1.0.0 

---- Working on entity: seismicprojects of custom schema kind: eqnr:iEnergy-sdb:seismicprojects:1.0.0 ----
> Retrieved 2401 records from OSDU of kind eqnr:iEnergy-sdb:seismicprojects:1.0.0

Bad pipe message: %s [b'\xdd\xba*\x1d\x13rF&\xfaD8w ]}\xcff\xa7 h\xf0n\xdf\xab\xe5\x04\x06\x86\xbb\n\xc9E\xc2\x9fz\x82\xde25\x07Bn\x8c\x90Tz\x08\x9c\xf3\xb1\x1d\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \x98\x99\xad\x9b']
Bad pipe message: %s [b'\xd2\xc0\xa2H\x88m]\ny\xc8\xa5-q\xec\x93!\x13\xcd\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad', b"\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0s\xc0w\x00\xc4\x00\xc3\xc0#\xc0'\x00g\x00@\xc0r\xc0v\x00\xbe\x00\xbd\xc0\n\xc0\x14\x009\x008\x

In [11]:
"""
Let's find out which record ids from the source (SDB or DISKOS) are missing in the OSDU custom schemas and the other way around.
"""

# Per-entity comparison between source ids (converted to OSDU id form) and the ids
# actually ingested into the OSDU custom schemas.
sdb_osdu_unique_ids_custom_records = dict()

for entity_name, entities_unique_identifier in zip(entities_names, entities_unique_identifiers):

    print(f"Comparing {entity_name} entity {source.upper()} source records versus OSDU cutom records")

    original_records_source = sdb_records.get(entity_name)
    original_records_source_unique = {record.get(entities_unique_identifier) for record in original_records_source}

    # Source ids rewritten in OSDU id form, de-duplicated.
    osdu_records_derived_from_source = list({derive_osdu_id_from_source_record(record, entity_name) for record in original_records_source})
    derived_ids_lookup = set(osdu_records_derived_from_source)

    # dict.fromkeys de-duplicates while keeping the order returned by OSDU, so the
    # "unique ids count" below really is a count of unique ids.
    ingested_records_osdu = list(dict.fromkeys(record.get('id') for record in osdu_ingested_custom_records.get(entity_name)))
    ingested_ids_lookup = set(ingested_records_osdu)

    common_record_ids = derived_ids_lookup & ingested_ids_lookup

    # Membership is tested against sets: the previous list-based tests were quadratic
    # in the number of records (tens of thousands per entity).
    missing_in_osdu = [record_id for record_id in osdu_records_derived_from_source if record_id not in ingested_ids_lookup]
    missing_in_source = [record_id for record_id in ingested_records_osdu if record_id not in derived_ids_lookup]

    sdb_osdu_unique_ids_custom_records[entity_name] = {
        "Source records count": len(original_records_source),
        "Source unique records ids count": len(original_records_source_unique),
        "OSDU custom ingested unique ids count": len(ingested_records_osdu),
        "Common records count (source-custom)": len(common_record_ids),
        "Ingeted into OSDU (custom) but missing in source": {
            "count": len(missing_in_source),
            "ids": missing_in_source
        },
        "Present in source but missing in OSDU (custom)": {
            "count": len(missing_in_osdu),
            "ids": missing_in_osdu
        }
    }

Comparing seismicsurveys entity SDB source records versus OSDU cutom records
Comparing seismicpoststackdatasets entity SDB source records versus OSDU cutom records
Comparing poststackcubes entity SDB source records versus OSDU cutom records
Comparing poststackcubegeometries entity SDB source records versus OSDU cutom records
Comparing seismicprojects entity SDB source records versus OSDU cutom records
Comparing navigationsets entity SDB source records versus OSDU cutom records
Comparing binsetgrids entity SDB source records versus OSDU cutom records


In [12]:
# Show the source-vs-custom comparison of every entity as indented JSON.
print(json.dumps(sdb_osdu_unique_ids_custom_records, indent=4))

{
    "seismicsurveys": {
        "Source records count": 1975,
        "Source unique records ids count": 1975,
        "OSDU custom ingested unique ids count": 1971,
        "Common records count (source-custom)": 1971,
        "Ingeted into OSDU (custom) but missing in source": {
            "count": 0,
            "ids": []
        },
        "Present in source but missing in OSDU (custom)": {
            "count": 4,
            "ids": [
                "npequinor-dev:seismicsurveys:1518672454",
                "npequinor-dev:seismicsurveys:1518355860",
                "npequinor-dev:seismicsurveys:1518607427",
                "npequinor-dev:seismicsurveys:1518815961"
            ]
        }
    },
    "seismicpoststackdatasets": {
        "Source records count": 19464,
        "Source unique records ids count": 19452,
        "OSDU custom ingested unique ids count": 18886,
        "Common records count (source-custom)": 18875,
        "Ingeted into OSDU (custom) but missing in sou

#### 3. Get records from OSDU - well known schemas

In [13]:
"""
Let's first request the well-known-records (ids and ancestry attributes) from OSDU and store them in a dictionary.
"""

# Every well-known schema kind referenced by any entity (duplicates included at this point).
wks_osdu = [
    wks_kind
    for entity_config in entities.values()
    for wks_kind in entity_config.get('osdu_well_known_schema')
]

osdu_ingested_wks_records = {}
columns = ["tags.sourceSystem", "tags.EqnrSource", "data.Source"]  # Three possible columns to query to match the source

for wks_kind in set(wks_osdu):
    print(f"\n--- Working in wks: {wks_kind} ---")

    for column in columns:
        osdu_wks_records = osdu_client.app_query_returning_json(
            "search/v2/query_with_cursor",
            {
                "kind": wks_kind,
                "query": f"{column}:\"{source.upper()}\"",
                "returnedFields": ["id", "ancestry"]
            },
        )

        # Keep, per kind, the result of whichever column returned the most records:
        # the first result is always stored, later ones only when strictly larger.
        if wks_kind not in osdu_ingested_wks_records or len(osdu_wks_records) > len(osdu_ingested_wks_records.get(wks_kind)):
            osdu_ingested_wks_records[wks_kind] = osdu_wks_records
            print(f"> Retrieved {len(osdu_wks_records)} records from OSDU of kind {wks_kind}")


--- Working in wks: osdu:wks:dataset--FileCollection.SEGY:1.0.0 ---
> Retrieved 18885 records from OSDU of kind osdu:wks:dataset--FileCollection.SEGY:1.0.0

--- Working in wks: osdu:wks:dataset--FileCollection.Generic:1.0.0 ---
> Retrieved 3245 records from OSDU of kind osdu:wks:dataset--FileCollection.Generic:1.0.0

--- Working in wks: osdu:wks:master-data--SeismicProcessingProject:1.2.0 ---
> Retrieved 2400 records from OSDU of kind osdu:wks:master-data--SeismicProcessingProject:1.2.0

--- Working in wks: osdu:wks:work-product-component--SeismicBinGrid:1.0.0 ---
> Retrieved 2072 records from OSDU of kind osdu:wks:work-product-component--SeismicBinGrid:1.0.0

--- Working in wks: osdu:wks:master-data--SeismicAcquisitionSurvey:1.2.0 ---
> Retrieved 1971 records from OSDU of kind osdu:wks:master-data--SeismicAcquisitionSurvey:1.2.0

--- Working in wks: osdu:wks:work-product-component--SeismicTraceData:1.3.0 ---
> Retrieved 18832 records from OSDU of kind osdu:wks:work-product-component-

In [14]:
"""
Let's now find out which custom records are actually referenced by the well known records and let's check for their relationship.

    - X number of custom records/total of this custom kind found in ancestry of this wks kind.
    - Custom records by kind that are not found in any ancestry of their respective wks.
"""

def get_ancestry_unique_ids(osdu_wks_kind):
    """Collect the record ids and the distinct ancestry parent ids of one WKS kind.

    Reads the records previously stored in ``osdu_ingested_wks_records`` for
    ``osdu_wks_kind``.

    Args:
        osdu_wks_kind: well-known schema kind whose fetched records are inspected.

    Returns:
        tuple ``(record_ids, parent_ids)`` where ``record_ids`` lists the ``id`` of every
        WKS record and ``parent_ids`` lists, without duplicates and in arbitrary order,
        every ``ancestry.parents`` entry with its last ``:``-separated segment removed.
    """

    osdu_wks_records = osdu_ingested_wks_records.get(osdu_wks_kind)
    osdu_wks_record_ids = [record.get('id') for record in osdu_wks_records]

    osdu_ancestry_unique_ids = set()
    for wks_record in osdu_wks_records:
        # A record without ancestry (or with a null parents list) simply contributes
        # no parents instead of aborting the whole comparison.
        parent_refs = (wks_record.get('ancestry') or {}).get('parents') or []
        for parent_ref in parent_refs:
            # Drops the trailing segment, expected to be the record version.
            # NOTE(review): a parent reference stored without a version would lose the
            # last part of its id here - confirm all parents carry a version.
            osdu_ancestry_unique_ids.add(parent_ref.rsplit(":", maxsplit=1)[0])

    return osdu_wks_record_ids, list(osdu_ancestry_unique_ids)

def contrast_custom_to_wks_records(osdu_wks_name, entity_name):
    """Check which custom records of an entity are referenced in the ancestry of a WKS kind.

    Args:
        osdu_wks_name: well-known schema kind whose ancestry is inspected.
        entity_name: entity whose ingested custom records are checked.

    Returns:
        tuple of four lists:
        ``(custom_ids, wks_ids, custom_ids_found_in_ancestry, custom_ids_missing_in_ancestry)``.
        The last two contain no duplicates and are in arbitrary order.
    """

    osdu_wks_total_record_ids, ancestry_ids = get_ancestry_unique_ids(osdu_wks_name)
    osdu_custom_records_ids = [record.get('id') for record in osdu_ingested_custom_records.get(entity_name)]

    # Set algebra instead of `x not in list` per record: the list-based test was
    # quadratic in the number of records.
    custom_ids_lookup = set(osdu_custom_records_ids)
    ancestry_ids_lookup = set(ancestry_ids)

    custom_ids_found_wks_ancestry = list(custom_ids_lookup & ancestry_ids_lookup)
    custom_ids_missing_in_wks_ansestry = list(custom_ids_lookup - ancestry_ids_lookup)

    return osdu_custom_records_ids, osdu_wks_total_record_ids, custom_ids_found_wks_ancestry, custom_ids_missing_in_wks_ansestry

In [15]:
# NOTE: Most likely these datasets do not have a wks record because they do not have a related cube/cubegeometry (filepath), therefore no SEGY file generic can be created
# this means that we can create the seismic trace wks records but they wont have a cubegeometry and therefore filepath and therefore SEGY file to be referenced.

# For every entity, one summary dictionary per related well-known schema.
osdu_custom_to_wks_summary = {}

for entity_name, entity_config in entities.items():

    print(f"----- Working in entity: {entity_name} -----")

    osdu_related_wks = entity_config.get('osdu_well_known_schema')
    print(f"Schemas that should be analysed: {osdu_related_wks}")

    entity_summary = []
    for wks_i in osdu_related_wks:
        print(f"> Working in wks: {wks_i} -----")

        custom_ids, wks_ids, found_in_ancestry, missing_in_ancestry = contrast_custom_to_wks_records(wks_i, entity_name)

        # Counts for everything except the missing ids, which are listed in full.
        entity_summary.append({
            "Osdu related WKS": wks_i,
            "Osdu custom records ids (ingested)": len(custom_ids),
            "Osdu wks records ids (total)": len(wks_ids),
            "Osdu custom ids found in wks ancestry (mapped properly)": len(found_in_ancestry),
            "Osdu custom ids NOT found in wks ancestry (poptential issues during mapping)": missing_in_ancestry
        })

    osdu_custom_to_wks_summary[entity_name] = entity_summary

----- Working in entity: seismicsurveys -----
Schemas that should be analysed: ['osdu:wks:master-data--SeismicAcquisitionSurvey:1.2.0']
> Working in wks: osdu:wks:master-data--SeismicAcquisitionSurvey:1.2.0 -----
----- Working in entity: seismicpoststackdatasets -----
Schemas that should be analysed: ['osdu:wks:dataset--FileCollection.SEGY:1.0.0', 'osdu:wks:work-product-component--SeismicTraceData:1.3.0']
> Working in wks: osdu:wks:dataset--FileCollection.SEGY:1.0.0 -----
> Working in wks: osdu:wks:work-product-component--SeismicTraceData:1.3.0 -----
----- Working in entity: poststackcubes -----
Schemas that should be analysed: ['osdu:wks:work-product-component--SeismicTraceData:1.3.0']
> Working in wks: osdu:wks:work-product-component--SeismicTraceData:1.3.0 -----
----- Working in entity: poststackcubegeometries -----
Schemas that should be analysed: ['osdu:wks:work-product-component--SeismicTraceData:1.3.0', 'osdu:wks:dataset--FileCollection.SEGY:1.0.0']
> Working in wks: osdu:wks:wo

In [16]:
# Show the custom-vs-WKS summary of every entity as indented JSON.
print(json.dumps(osdu_custom_to_wks_summary, indent=4))

{
    "seismicsurveys": [
        {
            "Osdu related WKS": "osdu:wks:master-data--SeismicAcquisitionSurvey:1.2.0",
            "Osdu custom records ids (ingested)": 1971,
            "Osdu wks records ids (total)": 1971,
            "Osdu custom ids found in wks ancestry (mapped properly)": 1971,
            "Osdu custom ids NOT found in wks ancestry (poptential issues during mapping)": []
        }
    ],
    "seismicpoststackdatasets": [
        {
            "Osdu related WKS": "osdu:wks:dataset--FileCollection.SEGY:1.0.0",
            "Osdu custom records ids (ingested)": 18886,
            "Osdu wks records ids (total)": 18885,
            "Osdu custom ids found in wks ancestry (mapped properly)": 18885,
            "Osdu custom ids NOT found in wks ancestry (poptential issues during mapping)": [
                "npequinor-dev:seismicpoststackdatasets:1507739908"
            ]
        },
        {
            "Osdu related WKS": "osdu:wks:work-product-component--SeismicTr

In [17]:
"""
Let's combine both summaries, source to custom AND custom to wks.
"""

# One entry per entity, pairing the source-to-custom and custom-to-WKS results.
full_summary = {
    entity_name: {
        "Source to custom summary": sdb_osdu_unique_ids_custom_records.get(entity_name),
        "Custom to WKS summary": osdu_custom_to_wks_summary.get(entity_name)
    }
    for entity_name in entities
}

print(json.dumps(full_summary, indent=4))

{
    "seismicsurveys": {
        "Source to custom summary": {
            "Source records count": 1975,
            "Source unique records ids count": 1975,
            "OSDU custom ingested unique ids count": 1971,
            "Common records count (source-custom)": 1971,
            "Ingeted into OSDU (custom) but missing in source": {
                "count": 0,
                "ids": []
            },
            "Present in source but missing in OSDU (custom)": {
                "count": 4,
                "ids": [
                    "npequinor-dev:seismicsurveys:1518672454",
                    "npequinor-dev:seismicsurveys:1518355860",
                    "npequinor-dev:seismicsurveys:1518607427",
                    "npequinor-dev:seismicsurveys:1518815961"
                ]
            }
        },
        "Custom to WKS summary": [
            {
                "Osdu related WKS": "osdu:wks:master-data--SeismicAcquisitionSurvey:1.2.0",
                "Osdu custom records 

In [20]:
"""

NOTE for myself for later: 

Looking at the poststackcubegeometries, I can guess that some of the datasets have been created but the trace record failed for these 8 records.
Potential explanations:
    - The reason could be that some of the datasets failed first and that caused the trace records to fail too.
    - OR the trace records themselves failed although the datasets were created properly.

    #TODO: Try ingesting these records in DEV

"""

'\n\nNOTE for myself for later: \n\nBy seeing the poststackcubegeometries I can guess that some of the datasets have been created but the tarce record failed for these 8 records.\nPotential explanations:\n    - The reason could be that some the datasets failed first and that caused the trace records to fail too.\n    - OR the trace records thenselves failed although the datasets were created properly.\n\n    #TODO: Try ingesting these records in DEV\n\n'