## Check CRS's from SDB in OSDU

Simple QC notebook to check which CRSs found in SDB can also be found in OSDU.

In [None]:
"""Uncomment if running in Azure ML"""
# import sys
# sys.path.append(r'/mnt/batch/tasks/shared/LS_root/mounts/clusters/jopm/code/Users/JOPM/osdu_jopm_rottation')
# sys.path

In [1]:
import json
import os
import re
from urllib.parse import urljoin

from dotenv import load_dotenv
from fuzzywuzzy import fuzz

from libs.ladmark_service.landmark_client import LandmarkService
from libs.osdu_service.osdu_http_client import OsduHttpClient

# Adding .env file variables as environment variables
load_dotenv()



True

### Additional Utilities

In [2]:
"""
OSDU search utilities.
"""

def search_cursor(osdu_client: "OsduHttpClient", search_url_cursor, search_payload, limit=1):
    """Collect all results of a cursor-paginated OSDU search.

    The request is re-posted with the cursor returned by the previous page
    until the service stops returning a cursor.

    Args:
        osdu_client: Client wrapper whose ``osdu_client.post_returning_json``
            performs the HTTP POST and returns the decoded JSON body.
        search_url_cursor: URL of the cursor-based search endpoint.
        search_payload: Base search payload. It is NOT modified; a shallow
            copy is used for the ``limit`` and ``cursor`` bookkeeping.
        limit: Page size sent with every request. Defaults to 1, which is
            what this helper has always sent; pass a larger value to reduce
            the number of round trips.

    Returns:
        A list with the concatenated ``results`` of every page.
    """
    # Shallow copy: the caller's dict must not pick up "limit"/"cursor" keys.
    payload = dict(search_payload)
    payload["limit"] = limit

    results = []
    cursor = ""
    while True:
        payload["cursor"] = cursor
        source_data = osdu_client.osdu_client.post_returning_json(
            search_url_cursor, payload)
        page = source_data.get("results")
        if page:
            results.extend(page)
        cursor = source_data.get("cursor")
        # Stop on a missing, None or empty cursor. Re-posting an empty cursor
        # would restart the search and never terminate.
        if not cursor:
            break
    return results


"""
Search for CRS in OSDU utilities.
"""

def search_reference(osdu_client: "OsduHttpClient", schema_kind, query_criteria, query_criteria_equals, returned_fields, limit=1000):
    """Run a single OSDU search query and return the matching records.

    Args:
        osdu_client: Client wrapper exposing ``env_variables["url"]`` (the
            OSDU base URL) and ``osdu_client.post_returning_json``.
        schema_kind: Value sent as the ``kind`` of the search payload.
        query_criteria: Field expression placed on the left of the ``:``.
        query_criteria_equals: Text placed, after a backslash, inside the
            double quotes on the right of the ``:``.
        returned_fields: List of fields sent as ``returnedFields``.
        limit: Maximum number of records requested (default 1000).

    Returns:
        The ``results`` list of the response, or an empty list when the
        response carries no results.
    """
    osdu_base_url = osdu_client.env_variables["url"]
    search_url = urljoin(osdu_base_url, "api/search/v2/query")

    # Produces <criteria>:"\<value>". The backslash is written as an explicit
    # "\\" because "\{" is an invalid escape sequence; the resulting text is
    # exactly the same as before.
    query = '{}:"\\{}"'.format(query_criteria, query_criteria_equals)

    payload = {
        "kind": schema_kind,
        "returnedFields": returned_fields,
        "limit": limit,
        "query": query
    }

    resp = osdu_client.osdu_client.post_returning_json(
        search_url,
        payload,
    )
    # A response without "results" is treated as "nothing found".
    return resp.get("results") or []


def find_best_match(value, match_on, possible_options, start_prob=50):
    """Pick the option whose ``data[match_on]`` is most similar to ``value``.

    Similarity is scored with ``fuzz.ratio``. An option is accepted only when
    its score is at least ``start_prob``; when several options share the top
    score, the one appearing last in ``possible_options`` wins.

    Args:
        value: Text to compare against.
        match_on: Key looked up inside each option's ``"data"`` dict.
        possible_options: Sequence of record dicts, each with a ``"data"`` dict.
        start_prob: Minimum score an option needs to be returned.

    Returns:
        ``(option, score)`` for the best option, or ``({}, None)`` when the
        sequence is empty or no option reaches ``start_prob``.
    """
    best_option = None
    best_score = start_prob

    for candidate in possible_options:
        score = fuzz.ratio(candidate.get("data").get(match_on), value)
        if score < best_score:
            continue
        # ">=" semantics: a tie replaces the previously selected option.
        best_score, best_option = score, candidate

    if best_option is None:
        return dict(), None
    return best_option, best_score
        
        
def process_crs_string(input_string) -> str:
    """Return the first purely alphabetic word found in ``input_string``.

    A "word" is a run of ASCII letters delimited by word boundaries, so tokens
    that mix letters with digits or underscores do not qualify. When no such
    word exists the input is returned unchanged.
    """
    first_word = re.search(r'\b[A-Za-z]+\b', input_string)
    return input_string if first_word is None else first_word.group(0)
        
        
def remove_spaces(value):
    """Collapse every run of whitespace in ``value`` into one space.

    Despite the name, whitespace is not removed altogether: each run of
    spaces, tabs or newlines is replaced by a single space.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', value)


def format_for_search(value):
    """Turn whitespace-separated text into an OR group of quoted terms.

    Example: ``"SAD69    T1867"`` becomes ``("SAD69" OR "T1867")``.

    Leading and trailing whitespace is ignored, so it no longer produces
    empty ``""`` terms in the group. Input with no terms at all (empty or
    whitespace only) yields ``("")``.

    Args:
        value: Text to split into search terms.

    Returns:
        The terms, each wrapped in double quotes, joined with ``" OR "`` and
        enclosed in parentheses.
    """
    # str.split() with no argument splits on whitespace runs and drops the
    # empty strings that leading/trailing whitespace would otherwise create.
    terms = value.split()
    if not terms:
        terms = [""]
    quoted_terms = ['"' + term + '"' for term in terms]
    return "(" + " OR ".join(quoted_terms) + ")"

In [36]:
# Value forwarded as "lastrundatetime" in the entity configuration below.
# None is what this notebook runs with; a timestamp string such as
# "2023-07-26T09:10:16.945Z" can be set instead.
# NOTE(review): presumably restricts the request to records relative to that
# date - confirm against LandmarkService.
last_run_date_time = None
query_records_created_prior_latsrundate = True

# Target OSDU environment and the prefix of the source-system environment
# variables (e.g. "sdb_metadata_url").
osdu_env = "npequinor-dev"
source = "sdb"

# Keyword arguments passed to the LandmarkService record generators,
# keyed by entity name.
entities = {
    "poststackcubegeometries": {
        "relativePath": "poststackcubegeometries",
        "filters_dict": None,
        "lastrundatetime": last_run_date_time,
        "expand_on": None,
        "id_key": "poststackCubeId",
        "mini_batch_size": 500,
        "return_cols": None
    }
}

#### 1. CRS's from post stack cube geometries

In [37]:
"""
Let's get all the post-stack cube geometry records.
Note: We loop using the seismic survey IDs ONLY because the geometry cubes endpoint
fails for some reason when requesting all the geometry cubes at once.
"""

source_records = []

# Connection settings are read from "<source>_..." environment variables.
landmark_settings = (
    os.environ[f"{source}_metadata_url"],
    os.environ[f"{source}_ds_security_url"],
    os.environ[f"{source}_refresh_token"],
)

with LandmarkService(*landmark_settings) as landmark:

    # generator = landmark.get_records_by_batch_server_pagination(**entities.get("poststackcubegeometries"))
    # generator = landmark.get_records_by_mini_batch(**entities.get("poststackcubegeometries"))
    generator = landmark.get_records_by_batch_client_pagination(**entities.get("poststackcubegeometries"))

    # Only the first batch is consumed; an exhausted generator simply yields
    # no iterations.
    for records in generator:
        source_records.extend(records)
        break                           # remove break statement to request all batches




In [38]:
"""
Let's now check the geometries to see how many records have all m and z elements equal to 0.0.
"""

poststackCubeId = [cube["poststackCubeId"] for cube in source_records]

all_zeros_m = 0
all_zeros_z = 0
skipped_geometries = 0   # records whose geometry could not be inspected

for cube_geom in source_records:
    try:
        # Only the first "linestringzm" element of each geometry is inspected.
        first_linestring = cube_geom["geometry"].get("linestringzm")[0]
        m = first_linestring.get("m")
        z = first_linestring.get("z")

        if all(float(x) == 0.0 for x in m):
            all_zeros_m += 1
        if all(float(x) == 0.0 for x in z):
            all_zeros_z += 1
    except (KeyError, AttributeError, IndexError, TypeError, ValueError):
        # Missing or malformed geometry: skip the record, but keep count so
        # the gap is visible. Unlike a bare "except", this lets
        # KeyboardInterrupt and unexpected errors surface.
        skipped_geometries += 1
        continue

print(
    f"""
    Total records: {len(source_records)}
    Total unique records: {len(set(poststackCubeId))}
    Records with all z values equal to zeros: {all_zeros_z}
    Records with all m values equal to zeros:: {all_zeros_m}
    """
)

if skipped_geometries:
    print(f"Records skipped because the geometry could not be read: {skipped_geometries}")


    Total records: 500
    Total unique records: 500
    Records with all z values equal to zeros: 500
    Records with all m values equal to zeros:: 500
    


In [39]:
"""
Let's now check the CRSs to see if we can find a good way to map them:
"""

# Distinct "coordinateSystem" values across all retrieved records.
unique_crs = list({record["coordinateSystem"] for record in source_records})

print(f"Unique crs in SDB post stack cube geometry data: {len(unique_crs)}")

Unique crs in SDB post stack cube geometry data: 16


In [40]:
"""
Let's bring in all the CoordinateReferenceSystem reference data currently in OSDU DEV.
Note: We will use the OSDU CLI (public client) for this.
"""

# Use the environment chosen in the configuration cell instead of repeating
# the literal, so switching environments needs a single change.
osdu_client = OsduHttpClient(osdu_env, client_type="public-client")

"""
Example: Finding best match for CRS = "SAD69          T1867"
Note that the start_prob parameter can be used as threshold
for the minimum matching probability to return results.
"""
original_sdb_crs = "SAD69          T1867"

# Candidates: OSDU CRS records matching any of the whitespace-separated
# terms of the SDB name, searched across all "data.*" fields.
osdu_candidates = search_reference(
    osdu_client=osdu_client,
    schema_kind="osdu:wks:reference-data--CoordinateReferenceSystem:*",
    query_criteria="data.\\*",
    query_criteria_equals=format_for_search(original_sdb_crs),
    returned_fields=["id", "data.Name"],
)

# Rank the candidates by fuzzy similarity of their Name to the SDB name.
osdu_matched_crs, match_probabilty = find_best_match(
    remove_spaces(original_sdb_crs),
    match_on="Name",
    possible_options=osdu_candidates,
    start_prob=70,
)

print(f"""
      Sdb crs searched for: {original_sdb_crs},
      OSDU match crs: {osdu_matched_crs},
      Matching probability: {match_probabilty}
      """)


      Sdb crs searched for: SAD69          T1867,
      OSDU match crs: {'data': {'Name': 'ST_SAD69_T1877'}, 'id': 'npequinor-dev:reference-data--CoordinateReferenceSystem:BoundGeographic2D:EPSG::4618_EPSG::1877'},
      Matching probability: 72
      


In [41]:
"""
Let's check which SDB unique CRS's can be found in OSDU.
"""

crs_mapping_list = []
match_count = 0   # number of SDB CRSs for which an OSDU match was found

for original_crs in unique_crs:
    # Candidates: OSDU CRS records matching any term of the SDB name.
    osdu_candidates = search_reference(
        osdu_client=osdu_client,
        schema_kind="osdu:wks:reference-data--CoordinateReferenceSystem:*",
        query_criteria="data.\\*",
        query_criteria_equals=format_for_search(original_crs),
        returned_fields=["id", "data.Name"],
    )
    osdu_matched_crs, match_probabilty = find_best_match(
        remove_spaces(original_crs),
        match_on="Name",
        possible_options=osdu_candidates,
        start_prob=70,
    )

    # find_best_match returns an empty dict when nothing qualifies, so both
    # lookups fall back to None in that case.
    osdu_crs_name = osdu_matched_crs.get("data", {}).get("Name")
    osdu_crs_id = osdu_matched_crs.get("id")

    crs_mapping_list.append({
        "SDB CRS": original_crs,
        "OSDU CRS": {
            "Name": osdu_crs_name,
            "id": osdu_crs_id,
            "Matching prob": match_probabilty
        }
    })

    # Count actual matches only (previously every CRS was counted).
    if osdu_matched_crs:
        match_count += 1

    print(f"""
        SDB crs name: {original_crs},
        MATCH-PROB: {match_probabilty},
        OSDU crs id: {osdu_crs_id},
        OSDU CRS NAME: {osdu_crs_name}
        """
        )


        SDB crs name: Pulkovo1942    T1808,
        MATCH-PROB: None,
        OSDU crs id: None,
        OSDU CRS NAME: None
        

        SDB crs name: ED50           T1311,
        MATCH-PROB: 78,
        OSDU crs id: npequinor-dev:reference-data--CoordinateReferenceSystem:BoundGeographic2D:EPSG::4230_EPSG::1311,
        OSDU CRS NAME: ST_ED50_T1311
        

        SDB crs name: Camacupa       T1327,
        MATCH-PROB: 74,
        OSDU crs id: npequinor-dev:reference-data--CoordinateReferenceSystem:BoundGeographic2D:EPSG::4220_EPSG::1327,
        OSDU CRS NAME: ST_Camacupa1948_T1327
        

        SDB crs name: Camacupa       T1324,
        MATCH-PROB: 81,
        OSDU crs id: npequinor-dev:reference-data--CoordinateReferenceSystem:Geographic2D:EPSG::4220,
        OSDU CRS NAME: Camacupa 1948
        

        SDB crs name: ED50_N62       T1612,
        MATCH-PROB: None,
        OSDU crs id: None,
        OSDU CRS NAME: None
        

        SDB crs name: ST_Camacupa_T132

In [42]:
"""
Let's have a look at what the CRS mapping looks like for further QC.
"""
# json is imported once in the notebook's import cell.
print(json.dumps(crs_mapping_list, indent=4))

[
    {
        "SDB CRS": "Pulkovo1942    T1808",
        "OSDU CRS": {
            "Name": null,
            "id": null,
            "Matching prob": null
        }
    },
    {
        "SDB CRS": "ED50           T1311",
        "OSDU CRS": {
            "Name": "ST_ED50_T1311",
            "id": "npequinor-dev:reference-data--CoordinateReferenceSystem:BoundGeographic2D:EPSG::4230_EPSG::1311",
            "Matching prob": 78
        }
    },
    {
        "SDB CRS": "Camacupa       T1327",
        "OSDU CRS": {
            "Name": "ST_Camacupa1948_T1327",
            "id": "npequinor-dev:reference-data--CoordinateReferenceSystem:BoundGeographic2D:EPSG::4220_EPSG::1327",
            "Matching prob": 74
        }
    },
    {
        "SDB CRS": "Camacupa       T1324",
        "OSDU CRS": {
            "Name": "Camacupa 1948",
            "id": "npequinor-dev:reference-data--CoordinateReferenceSystem:Geographic2D:EPSG::4220",
            "Matching prob": 81
        }
    },
    {
      

In [43]:
"""
Let's see which are the found and non-found crs's within OSDU.
"""
# A mapping counts as "found" when an OSDU CRS name was resolved for it.
# None is checked by identity, as recommended for singletons.
crs_found = [
    mapping for mapping in crs_mapping_list
    if mapping.get("OSDU CRS").get("Name") is not None
]
crs_non_found = [
    mapping for mapping in crs_mapping_list
    if mapping.get("OSDU CRS").get("Name") is None
]

print(f"""
      Total unique crs's: {len(unique_crs)}
      Crs's found in OSDU: {len(crs_found)}
      Crs's not found in OSDU: {len(crs_non_found)}
      """)


      Total unique crs's: 16
      Crs's found in OSDU: 9
      Crs's not found in OSDU: 7
      


In [44]:
"""
Let's see how many post-stack cube geometries have a CRS
that can be referenced based on the current OSDU CRS data.
"""

# SDB CRS names that resolved to an OSDU CRS. Built once, as a set, instead
# of rebuilding a list for every record inside the loop.
referenced_sdb_crs = {
    mapping.get("SDB CRS")
    for mapping in crs_mapping_list
    if mapping.get("OSDU CRS").get("Name") is not None
}

geo_cubes_with_ref_crs = 0
geo_cubes_with_no_ref_crs = 0
for cube in source_records:
    if cube["coordinateSystem"] in referenced_sdb_crs:
        geo_cubes_with_ref_crs += 1
    else:
        geo_cubes_with_no_ref_crs += 1

print(f"""
      Total post stack cube geometries: {len(source_records)}
      Records with referenced crs in osdu: {geo_cubes_with_ref_crs}
      Records with NO referenced crs in osdu: {geo_cubes_with_no_ref_crs}
      """)


      Total post stack cube geometries: 500
      Records with referenced crs in osdu: 403
      Records with NO referenced crs in osdu: 97
      


#### 2. CRS's from binsetgrids

In [46]:
# Replaces the previous entity configuration with the one for binsetgrids.
entities = {
    "binsetgrids": dict(
        # Attributes used to make the source API request
        relativePath="binsetgrids",
        filters_dict=None,
        lastrundatetime=last_run_date_time,
        expand_on=None,
        id_key="binsetGrid3dId",
        mini_batch_size=500,
        return_cols=None,
    )
}

In [48]:
source_records = []
with LandmarkService(
    os.environ[f"{source}_metadata_url"],
    os.environ[f"{source}_ds_security_url"],
    os.environ[f"{source}_refresh_token"]
    ) as landmark:

    # Alternative retrieval strategies offered by the same service:
    # generator = landmark.get_records_by_batch_server_pagination(**entities.get("binsetgrids"))
    # generator = landmark.get_records_by_mini_batch(**entities.get("binsetgrids"))
    generator = landmark.get_records_by_batch_client_pagination(**entities.get("binsetgrids"))

    # Only the first batch is consumed; an exhausted generator simply yields
    # no iterations.
    for records in generator:
        source_records.extend(records)
        break                           # remove break statement to request all batches

# Reported unconditionally, so an empty result shows up as 0 instead of
# producing no output at all.
print(f"Total Binsetgrids retrieved: {len(source_records)}")



Total Binsetgrids retrieved: 500


In [49]:
"""
Let's now check the CRSs to see if we can find a good way to map them:
"""

# Distinct "originalCRS" values across all retrieved binsetgrids.
unique_crs2 = list({record["originalCRS"] for record in source_records})

print(f"Unique crs in SDB binsetgrids: {len(unique_crs2)}")

Unique crs in SDB binsetgrids: 105


In [50]:
"""
Example: Finding best match for CRS = "ST_NZGD2000_NZTM_P2193_T1565"
Note that the start_prob parameter can be used as a threshold
for the minimum matching probability to return results.
"""
original_sdb_crs = "ST_NZGD2000_NZTM_P2193_T1565"

# The binsetgrid CRS name is used as-is, both as the search text and as the
# value the candidate names are compared against.
osdu_matched_crs, match_probabilty = find_best_match(
    value=original_sdb_crs,
    match_on="Name",
    possible_options=search_reference(
        osdu_client=osdu_client,
        schema_kind="osdu:wks:reference-data--CoordinateReferenceSystem:*",
        query_criteria="data.\\*",
        query_criteria_equals=original_sdb_crs,
        returned_fields=["id", "data.Name"],
    ),
    start_prob=50,
)

print(f"""
      Sdb crs searched for: {original_sdb_crs},
      OSDU match crs: {osdu_matched_crs},
      Matching probability: {match_probabilty}
      """)


      Sdb crs searched for: ST_NZGD2000_NZTM_P2193_T1565,
      OSDU match crs: {'data': {'Name': 'ST_NZGD2000_NZTM_P2193_T1565'}, 'id': 'npequinor-dev:reference-data--CoordinateReferenceSystem:BoundProjected:EPSG::2193_EPSG::1565'},
      Matching probability: 100
      


In [51]:
"""
Let's check which SDB unique CRS's can be found in OSDU.
"""

crs_mapping_list2 = []
match_count = 0   # number of SDB CRSs for which an OSDU match was found

for original_crs in unique_crs2:
    # Candidates: OSDU CRS records returned when searching all "data.*"
    # fields for the SDB CRS name as-is.
    osdu_candidates = search_reference(
        osdu_client=osdu_client,
        schema_kind="osdu:wks:reference-data--CoordinateReferenceSystem:*",
        query_criteria="data.\\*",
        query_criteria_equals=original_crs,
        returned_fields=["id", "data.Name"],
    )
    osdu_matched_crs, match_probabilty = find_best_match(
        original_crs,
        match_on="Name",
        possible_options=osdu_candidates,
        start_prob=50,
    )

    # find_best_match returns an empty dict when nothing qualifies, so both
    # lookups fall back to None in that case.
    osdu_crs_name = osdu_matched_crs.get("data", {}).get("Name")
    osdu_crs_id = osdu_matched_crs.get("id")

    crs_mapping_list2.append({
        "SDB CRS": original_crs,
        "OSDU CRS": {
            "Name": osdu_crs_name,
            "id": osdu_crs_id,
            "Matching prob": match_probabilty
        }
    })

    # Count actual matches only (previously every CRS was counted).
    if osdu_matched_crs:
        match_count += 1

    print(f"""
        SDB crs name: {original_crs},
        MATCH-PROB: {match_probabilty},
        OSDU crs id: {osdu_crs_id},
        OSDU CRS NAME: {osdu_crs_name}
        """
        )


        SDB crs name: SAD69_UTM24S_P29194_T1864,
        MATCH-PROB: None,
        OSDU crs id: None,
        OSDU CRS NAME: None
        

        SDB crs name: ST_WGS84_UTM34N_P32634,
        MATCH-PROB: 100,
        OSDU crs id: npequinor-dev:reference-data--CoordinateReferenceSystem:Projected:EPSG::32634,
        OSDU CRS NAME: ST_WGS84_UTM34N_P32634
        

        SDB crs name: ST_Pulkovo1942_3GK51E_P2592_T1808_Depr,
        MATCH-PROB: None,
        OSDU crs id: None,
        OSDU CRS NAME: None
        

        SDB crs name: Aratu_UTM23S_P20823_T1549,
        MATCH-PROB: None,
        OSDU crs id: None,
        OSDU CRS NAME: None
        

        SDB crs name: ST_CampoInchauspe_Argentina_P22192_T1127,
        MATCH-PROB: None,
        OSDU crs id: None,
        OSDU CRS NAME: None
        

        SDB crs name: ST_WGS84_UTM37S_P32737,
        MATCH-PROB: 100,
        OSDU crs id: npequinor-dev:reference-data--CoordinateReferenceSystem:Projected:EPSG::32737,
        OSDU 

In [52]:
"""
Let's have a look at what the CRS mapping looks like for further QC.
"""
# json is imported once in the notebook's import cell.
print(json.dumps(crs_mapping_list2, indent=4))

[
    {
        "SDB CRS": "SAD69_UTM24S_P29194_T1864",
        "OSDU CRS": {
            "Name": null,
            "id": null,
            "Matching prob": null
        }
    },
    {
        "SDB CRS": "ST_WGS84_UTM34N_P32634",
        "OSDU CRS": {
            "Name": "ST_WGS84_UTM34N_P32634",
            "id": "npequinor-dev:reference-data--CoordinateReferenceSystem:Projected:EPSG::32634",
            "Matching prob": 100
        }
    },
    {
        "SDB CRS": "ST_Pulkovo1942_3GK51E_P2592_T1808_Depr",
        "OSDU CRS": {
            "Name": null,
            "id": null,
            "Matching prob": null
        }
    },
    {
        "SDB CRS": "Aratu_UTM23S_P20823_T1549",
        "OSDU CRS": {
            "Name": null,
            "id": null,
            "Matching prob": null
        }
    },
    {
        "SDB CRS": "ST_CampoInchauspe_Argentina_P22192_T1127",
        "OSDU CRS": {
            "Name": null,
            "id": null,
            "Matching prob": null
        }
 

In [53]:
"""
Let's see which are the found and non-found crs's within OSDU.
"""
# A mapping counts as "found" when an OSDU CRS name was resolved for it.
# None is checked by identity, as recommended for singletons.
crs_found2 = [
    mapping for mapping in crs_mapping_list2
    if mapping.get("OSDU CRS").get("Name") is not None
]
crs_non_found2 = [
    mapping for mapping in crs_mapping_list2
    if mapping.get("OSDU CRS").get("Name") is None
]

print(f"""
      Total unique crs's: {len(unique_crs2)}
      Crs's found in OSDU: {len(crs_found2)}
      Crs's not found in OSDU: {len(crs_non_found2)}
      """)


      Total unique crs's: 105
      Crs's found in OSDU: 62
      Crs's not found in OSDU: 43
      


In [54]:
"""
Let's see how many binsetgrids have a CRS
that can be referenced based on the current OSDU CRS data.
"""

# SDB CRS names that resolved to an OSDU CRS. Built once, as a set, instead
# of rebuilding a list for every record inside the loop.
referenced_sdb_crs2 = {
    mapping.get("SDB CRS")
    for mapping in crs_mapping_list2
    if mapping.get("OSDU CRS").get("Name") is not None
}

geo_cubes_with_ref_crs2 = 0
geo_cubes_with_no_ref_crs2 = 0
for binsetgrid in source_records:
    if binsetgrid["originalCRS"] in referenced_sdb_crs2:
        geo_cubes_with_ref_crs2 += 1
    else:
        geo_cubes_with_no_ref_crs2 += 1

print(f"""
      Total binsetgrids: {len(source_records)}
      Records with referenced crs in osdu: {geo_cubes_with_ref_crs2}
      Records with NO referenced crs in osdu: {geo_cubes_with_no_ref_crs2}
      """)


      Total binsetgrids: 500
      Records with referenced crs in osdu: 370
      Records with NO referenced crs in osdu: 130
      
