## 
#### Alright: you need to hash out a legit dimension for tags/labels (whatever it's called) in this notebook
- Observe all variations/combinations in standard_label, presented_label, and terse_label, then, using the most logical one (probably presented_label?), fit all variations into a higher-level category
- ALSO: Using a forced schema, assert specific categories that are not always recorded in the balance sheet (Non Current Assets, for example)

In [0]:
import pandas as pd


#SCHEMA

In [0]:
# Below is the schema that you need to fill out for the balance sheet. It includes all major
# categories (report_label column) that you need for balance sheet historical analysis.
# Eventually report_label gets pivoted so each label is its own column in the final balance sheet table.

# One (report_sub_class, report_label) pair per dimension row, in output order.
# Add new categories here only; the constant columns below size themselves.
_asset_rows = [
    ('current_asset', 'cash_and_equivalents'),
    ('current_asset', 'short_term_investment'),
    ('current_asset', 'accounts_receivable'),
    ('current_asset', 'inventory'),
    ('current_asset', 'other_current_assets'),
    ('total_current_asset', 'total_current_asset'),
    ('non_current_asset', 'property_plant_equipment'),
    ('non_current_asset', 'intangible_assets'),
    ('non_current_asset', 'goodwill'),
    ('non_current_asset', 'other_non_current_assets'),
    ('total_non_current_asset', 'total_non_current_assets'),
    ('total_assets', 'total_assets'),
]

# Column-oriented dict consumed by pandas. 'report' and 'report_class' are the
# same for every row, so they are derived from the row count instead of being
# repeated by hand (mismatched lengths would make pd.DataFrame raise).
schema = {
    'report': ['balance_sheet'] * len(_asset_rows),
    'report_class': ['asset'] * len(_asset_rows),
    'report_sub_class': [sub_class for sub_class, _ in _asset_rows],
    'report_label': [label for _, label in _asset_rows],
}

# createOrReplaceTempView() returns None, so chaining it onto the assignment
# left `df` bound to None. Keep the DataFrame and register the view separately.
df = spark.createDataFrame(pd.DataFrame(schema))
df.createOrReplaceTempView('totals')

#### WRITE BALANCE SHEET SCHEMA TABLE

In [0]:
# Read the 'totals' temp view and append a bigint key computed from
# report_sub_class (first 18 characters of its xxhash64, cast to bigint).
balance_sheet_schema = spark.sql("""select distinct 
totals.*
,cast(bigint(substr(xxhash64(concat_ws('|', totals.report_sub_class)), 1, 18)) as bigint) AS report_sub_class_bigint_key 
from totals""")

# Persist as a Delta table, replacing both the existing rows and the schema.
balance_sheet_schema.write.saveAsTable(
    "operations.finance_staging.dim_balance_sheet",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

## PRESENTED LABELS
- Organize the most common presented labels (count results from the main fact table) into their respective categories.
- Example: "Total currents assets", "Total Current Assets", "Total Current assets", "Total current assets" are the four most common versions of total current assets available in presented label. Organize them into the report_sub_class of total_current_asset, the report_class of asset, and the report of balance_sheet.

- BONUS NOTE: Why did I choose Presented Label rather than Standard Label or Terse Label? Both of those attempt to standardize themselves. I need the most raw form of this data, as it is shown on the financial statement before it is standardized by GAAP taxonomies. This sort of circumvents the complexity of adding GAAP taxonomies by year.

In [0]:
# Record here every variation of 'Total Current Assets' that could be used as a
# presented label on the balance sheet. This list is the only thing to edit when
# adding a variant; the classification columns below are sized from it.
_total_current_asset_labels = [
    "Total Current Assets",
    "Total Current assets",
    "Total current assets",
]

# Column-oriented dict: every listed label maps to the same
# report / report_class / report_sub_class triple.
presented_labels = {
    "report": ['balance_sheet'] * len(_total_current_asset_labels),
    "report_class": ['asset'] * len(_total_current_asset_labels),
    "report_sub_class": ['total_current_asset'] * len(_total_current_asset_labels),
    "presented_label": _total_current_asset_labels,
}

# createOrReplaceTempView() returns None; assigning its result back to
# `presented_labels` replaced the dict with None. Register the view as a plain
# statement so the name keeps pointing at the mapping.
spark.createDataFrame(pd.DataFrame(presented_labels)).createOrReplaceTempView('presented_labels')

In [0]:
# Read the 'presented_labels' temp view and append two bigint keys, one computed
# from report_sub_class and one from presented_label (first 18 characters of
# each value's xxhash64, converted to bigint).
presented_labels = spark.sql("""
select distinct 
presented_labels.*
,bigint(substr(xxhash64(concat_ws('|', report_sub_class)), 1, 18)) AS report_sub_class_bigint_key 
,bigint(substr(xxhash64(concat_ws('|', presented_label)), 1, 18)) AS presented_label_bigint_key 
from presented_labels""")

# Persist as a Delta table, replacing both the existing rows and the schema.
presented_labels.write.saveAsTable(
    "operations.finance_staging.dim_presented_labels",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

In [0]:
%sql 
-- Lists, for balance sheet ('BS') fact rows, each presented label next to the
-- report_label it resolves to through dim_presented_labels -> dim_balance_sheet.
-- Labels with no match in dim_presented_labels come back with a null report_label.
with cte as ( 
select 
distinct 
bs.reported_period as date_key
,dc.company_name
,dc.company_stock_symbol

,bs.presented_label as presented_label
,bs.terse_label  as terse_label

-- Both dimensions expose report_sub_class; alias the presented-label one so the
-- CTE does not carry two columns with the same name.
,dpl.report_sub_class as presented_report_sub_class
,dbs.report_sub_class
,dbs.report_label

,bs.report_number
,bs.report_line_number
,bs.value
from 
operations.finance_staging.fact_staging_financial_statement_tbl bs
left join operations.finance.dim_company dc on dc.company_bigint_key = bs.company_bigint_key
left join operations.finance_staging.dim_presented_labels dpl on dpl.presented_label_bigint_key = bs.presented_label_bigint_key
left join operations.finance_staging.dim_balance_sheet dbs on dbs.report_sub_class_bigint_key = dpl.report_sub_class_bigint_key
where 
-- NOTE(review): these unqualified columns presumably come from bs; confirm and prefix them.
financial_statement = 'BS'  and value_segment is null
and 
reported_period = end_reported_period 
and 
-- Filtering on dc here drops fact rows with no dim_company match, so the
-- left join to dc effectively behaves as an inner join.
dc.sp_500_indicator = 1)

-- NOTE(review): the disabled pivot below references tag_level_3_combined, which
-- is not a column produced by cte; it needs updating before it can be re-enabled.
/*
,pivot_cte AS (
    SELECT *
    FROM cte
    PIVOT (
        SUM(value) AS total
        FOR tag_level_3_combined IN (
            'Total Current Assets' as total_current_assets
        )
    )
)
select distinct * from pivot_cte */
select distinct cte.presented_label, report_label from cte