## Aggregations by TAGS dimension - OSDU

In [1]:
from libs.osdu_service.osdu_http_client import OsduHttpClient
from datetime import datetime
import pandas as pd

# Adding .env file variables as environment variables
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
"""Utility functions."""

def get_all_schemas(osdu_client):
    schema_endpoint = "schema-service/v1/schema?limit=10000"
    
    res = osdu_client.app_get_returning_json(
        service_relative_uri=schema_endpoint
    )
    
    return res


def record_counts_by_kind(osdu_client, schema_kind=None):
    if not schema_kind:
        schema_kind = "*:*:*:*"
    
    payload = {
        "kind": schema_kind,
        "limit": 1,
        'aggregateBy': "kind",
        "query": "*",
        "returnedFields":["aggregateBy"]
    }
    
    resp = osdu_client.app_post_returning_json(
            "search/v2/query",
            payload
        )
    
    return resp['aggregations']


def get_keys_from_records(response)-> list:
    
    aggregated_by = []
    try:
        records = response['results']
        for record in records:
            record_keys = [f"tags.{dim}" for dim in record['tags'].keys()]
            aggregated_by.extend(record_keys)
            # print(record_keys)
        return list(set(aggregated_by))
    except:
        return list(set(aggregated_by))    


def add_environment(records, schema_kind, counts, aggregation_dim, osdu_env):
    
    if records != None:
        for record in records:
            record['osdu_environment'] = osdu_env
            record['schema_kind'] = schema_kind
            record['aggregatedBy'] = aggregation_dim
            record['recordCounts'] = counts
        
        return records
    
    records = {}
    records['key'] = "null"
    records['count'] = counts
    records['osdu_environment'] = osdu_env
    records['schema_kind'] = schema_kind
    records['aggregatedBy'] = aggregation_dim
    records['recordCounts'] = counts
    
    return [records]


def _hanlde_aggregations(aggregation_dim, response, counts, schema_kind, osdu_env):
    aggregations = response['aggregations']
    aggregations = add_environment(aggregations, schema_kind, counts, aggregation_dim, osdu_env)
    return aggregations


def get_count_by_schema_kind(schema_kind, all_aggregated):
    counts = 0
    for schema in all_aggregated:
        if schema['key'] == schema_kind:
            counts = schema['count']
            return counts
    

In [3]:
"""Get all schemas ids."""

import json

osdu_envs = ["npequinor-test"] #, "npequinor-dev", "equinor-data"]

for env in osdu_envs:
    
    print(f'Working on OSDU environment {env}')
    
    schema_ids = []
    osdu_env = OsduHttpClient(env, client_type='public-client')
    all_schemas = get_all_schemas(osdu_env)    
    schema_ids = [schema['schemaIdentity']['id'] for schema in all_schemas['schemaInfos']]
    all_records_agg_by_schema = record_counts_by_kind(osdu_env)
    
    """Let's work on the custom tags first. We assume all records have the same
    custom tags for a particular schema, but this might change later as we add more 
    custom tags to OSDU records."""

    complete_recods_added = []
    # sample_schema = ['osdu:wks:reference-data--ActivityCode:1.0.0', "osdu:wks:work-product-component--WellLog:1.2.0"]
    # for schema in sample_schema:
    # for schema in [schema for schema in schema_ids if 'osdu:wks:work-product-component--WellLog' in schema]:
    for schema in schema_ids[:]:
        # print(f"Working on schema: {schema}")
        print(f'\r{schema_ids.index(schema)+1}/{len(schema_ids)} - Working on schema: {schema}', end='', flush=True)

        payload = {
            'kind': f"{schema}",
            'limit': 1000,
            "returnedFields":["tags"]
        }

        sample_schema_record = osdu_env.app_post_returning_json(
            service_relative_uri="search/v2/query",
            payload=payload
        )

        if sample_schema_record['totalCount'] > 0:
            aggregated_by = get_keys_from_records(sample_schema_record)
            removed_keys = ['path', 'date']     # remove keys that refer to dates or file paths
            aggregated_by = [i for i in aggregated_by if not any(val in i.lower() for val in removed_keys)]
            
            if not aggregated_by:
                record_counts = get_count_by_schema_kind(schema, all_records_agg_by_schema)
                recods_added = add_environment(None, schema, record_counts, 'tags', env)
                complete_recods_added.extend(recods_added)
            
            else:
                for aggregation in aggregated_by:
                    payload['aggregateBy'] = aggregation
                    resp_aggregated = []
                    resp_aggregated = osdu_env.app_post_returning_json(
                        service_relative_uri="search/v2/query",
                        payload=payload
                    )
                    
                    record_counts = get_count_by_schema_kind(schema, all_records_agg_by_schema)
                    recods_added = _hanlde_aggregations(aggregation, resp_aggregated, record_counts, schema, env)
                    complete_recods_added.extend(recods_added)
        else:
            # print(f"\rNo results found in schema {schema}" , end='', flush=True)
            continue
        
        # break

Working on OSDU environment npequinor-test
158/525 - Working on schema: osdu:wks:dataset--File.OGC.GeoTIFF:1.0.0.0.0rs:1.0.0:1.0.01.0.0

In [None]:
df_aggergations = pd.DataFrame.from_records(complete_recods_added)
df_aggergations

Unnamed: 0,key,count,osdu_environment,schema_kind,aggregatedBy,recordCounts
0,,7394,npequinor-test,osdu:wks:reference-data--ActivityCode:1.0.0,tags,7394
1,,10,npequinor-test,osdu:wks:reference-data--ActivityLevel:1.0.0,tags,10
2,,2,npequinor-test,osdu:wks:reference-data--ActivityOutcome:1.0.0,tags,2
3,,9,npequinor-test,osdu:wks:reference-data--ActivityOutcomeDetail...,tags,9
4,,48,npequinor-test,osdu:wks:reference-data--AdditiveType:1.0.0,tags,48
...,...,...,...,...,...,...
331,No,1136,npequinor-test,eqnr:smda-api-v2.0:fields:1.0.0,tags.inside,1136
332,No,1136,npequinor-test,eqnr:smda-api-v2.0:fields:1.0.0,tags.exportControl,1136
333,,1136,npequinor-test,eqnr:smda-api-v2.0:fields:1.0.0,tags.sensitivityFlag,1136
334,Not assessed yet,1136,npequinor-test,eqnr:smda-api-v2.0:fields:1.0.0,tags.intellectualPropertyRight,1136


In [None]:
# df_aggergations[df_aggergations.aggregatedBy == "tags"]#[:30]
# df_aggergations[df_aggergations.aggregatedBy != "tags"]
# len(df_aggergations[df_aggergations.aggregatedBy != "tags"].schema_kind.unique())
# len(df_aggergations.schema_kind.unique())

174