In [0]:
import json
import pandas as pd
from pyspark.sql import Row

In [0]:
# Destination of the tag dimension; the write cell combines these into
# "<catalog>.<schema>.<table_name>".
catalog = 'operations'
schema = 'finance'
table_name = 'dim_tag'

# Base names of the tag-hierarchy files; each one is read from json/<name>.json.
json_files = ['dim_tag_assets', 'dim_tag_liabilities', 'dim_tag_equity']


def _flatten_tag_tree(tree):
    """Flatten one nested tag hierarchy into a list of flat row dicts.

    The traversal expects the shape
    ``{main_category: {sub_category: [{item_name: [label, ...]}, ...]}}``
    and emits one dict per label with the keys ``main_category``,
    ``sub_category``, ``item_category`` and ``item_label``.

    NOTE(review): ``labels`` is iterated directly, so a bare string there would
    be split into single characters -- confirm the files always hold lists.
    """
    return [
        {
            "main_category": main_category,
            "sub_category": sub_category,
            "item_category": item_name,
            "item_label": label,
        }
        for main_category, subcats in tree.items()
        for sub_category, items_list in subcats.items()
        for obj in items_list
        for item_name, labels in obj.items()
        for label in labels
    ]


df_x = []  # one pandas DataFrame per input file

for file in json_files:
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(f"json/{file}.json", "r", encoding="utf-8") as f:
        qry = json.load(f)

    df_x.append(pd.DataFrame(_flatten_tag_tree(qry)))

# createOrReplaceTempView() returns None, so keep the Spark DataFrame in `df`
# and register the view in a separate statement (previously `df` ended up None).
df = spark.createDataFrame(pd.concat(df_x, ignore_index=True))
df.createOrReplaceTempView('tags_df')

In [0]:
# Build the tag dimension rows from the `tags_df` temp view (note: the Python
# variable below reuses the view's name; the SQL refers to the view).
#
# Changes relative to the earlier version of this query:
#   * "total" detection is case-insensitive. The old pattern list mixed
#     '%Total%', '%total%' and a suffix-only '%TOTAL', so labels such as
#     'TOTAL ASSETS' or 'ToTal ...' were not flagged.
#   * the lookup subquery is aggregated to ONE row per terse label. The old
#     `select distinct` included the presented label, which fanned out the
#     left join and duplicated each tag row (and its hash keys) once per
#     distinct presented label. A tag is now flagged (1) if any of its
#     presented labels contains "total".
# Tags without a match in the staging tables keep total_indicator = NULL.
tags_df = spark.sql("""
select   sha2(concat_ws('|', tags_df.item_label), 256) AS tag_key_hash
,bigint(substr(xxhash64(concat_ws('|', tags_df.item_label)), 1, 18)) AS tag_bigint_key
,'Balance Sheet' as tag_level_0
,main_category as tag_level_1 
,sub_category as tag_level_2 
,item_category as tag_level_3
,tags_df.item_label as terse_label
,totals_indicator.total_indicator as total_indicator
from tags_df 
left join 
    (select 
    tag.tlabel as item_label --terse label
    ,max(case when lower(pre.plabel) like '%total%' then 1 else 0 end) as total_indicator
    from operations.finance_staging.raw_pre_tbl pre
    left join operations.finance_staging.raw_tag_tbl tag on tag.tag = pre.tag
    group by tag.tlabel ) totals_indicator on totals_indicator.item_label = tags_df.item_label
""")

In [0]:
# Persist the dimension as a managed Delta table, replacing both the existing
# rows and the existing schema. The format, mode and writer option are passed
# through saveAsTable's keyword arguments instead of a builder chain.
tags_df.write.saveAsTable(
    f"{catalog}.{schema}.{table_name}",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)