# Imports and Variables

In [0]:
import json
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import explode
from pyspark.sql.functions import col, to_date, concat_ws, sha2, explode

# Function Definition

In [0]:
def updateGoldTable(table_name, columns_to_extract, columns_alias, MonthYear, recent_dataframe):
    """Replace one month of rows in a gold table and update the control table.

    The rows of ``recent_dataframe`` are exposed as the temp view
    ``vw_LatestSilverRecords``; the rows of ``gold.<table_name>`` whose
    ``LastExtractionMonth`` equals ``MonthYear`` are deleted, the view is
    inserted in their place, and ``LastExtractionDate`` of the matching Gold
    row in ``control.integrationcontrolcosts`` is set to the view's maximum
    ``LastExtractionDate``.

    Args:
        table_name: Name of the table, used both as ``gold.<table_name>`` and
            to locate the row in ``control.integrationcontrolcosts``.
        columns_to_extract: Currently unused; kept for interface compatibility.
        columns_alias: Currently unused; kept for interface compatibility.
        MonthYear: Value of ``LastExtractionMonth`` identifying the month to replace.
        recent_dataframe: DataFrame holding the rows to load. The insert uses
            ``SELECT *``, so its columns must line up with the gold table.

    Returns:
        0 when the gold table was refreshed, ``None`` when ``recent_dataframe``
        is empty (in which case nothing is modified).
    """
    # take(1) fetches at most one row, so emptiness is checked without
    # counting the whole DataFrame.
    if not recent_dataframe.take(1):
        return None

    recent_dataframe.createOrReplaceTempView("vw_LatestSilverRecords")

    # The month is quoted, matching how the silver queries filter on
    # LastExtractionMonth. DELETE and INSERT run as two separate statements.
    spark.sql(f"DELETE FROM gold.{table_name} WHERE LastExtractionMonth = '{MonthYear}'")
    spark.sql(f"INSERT INTO gold.{table_name} SELECT * FROM vw_LatestSilverRecords")

    # Update Integration Control LastExtractionDate.
    # Equality instead of LIKE: '_' is a single-character wildcard in LIKE, so a
    # name such as 'azure_costs_2' could also match other control rows.
    spark.sql(
        f"UPDATE control.integrationcontrolcosts "
        f"SET LastExtractionDate = (SELECT MAX(LastExtractionDate) FROM vw_LatestSilverRecords) "
        f"WHERE control.integrationcontrolcosts.TableName = '{table_name}' AND Gold = 1"
    )

    return 0

# Getting the elements to be extracted from the control table

In [0]:
# Read the active Gold entries of the control table, ordered by table name.
# The result is cached; it is displayed here and collected again by the
# extraction loop.
control_query = (
    "SELECT TableName, ColumnsToExtract, ColumnsAlias, ColumnsMergeSource, ColumnsMergeTarget, LastExtractionDate "
    "FROM control.integrationcontrolcosts "
    "WHERE Active = 1 AND Gold = 1 "
    "ORDER BY TableName ASC"
)
tables_to_extract = spark.sql(control_query).cache()
display(tables_to_extract)

TableName,ColumnsToExtract,ColumnsAlias,ColumnsMergeSource,ColumnsMergeTarget,LastExtractionDate
azure_costs_2,"DepartmentName,AccountName,AccountOwnerId,SubscriptionGuid,SubscriptionName,ResourceGroup,ResourceLocation,AvailabilityZone,UsageDateTime,ProductName,MeterCategory,MeterSubcategory,MeterId,MeterName,MeterRegion,UnitOfMeasure,UsageQuantity,ResourceRate,PreTaxCost,CostCenter,ConsumedService,ResourceType,InstanceId,Tags,OfferId,AdditionalInfo,ServiceInfo1,ServiceInfo2,Currency,FilePath,LastExtractionDate,LastExtractionMonth","DepartmentName,AccountName,AccountOwnerId,SubscriptionGuid,SubscriptionName,ResourceGroup,ResourceLocation,AvailabilityZone,UsageDateTime,ProductName,MeterCategory,MeterSubcategory,MeterId,MeterName,MeterRegion,UnitOfMeasure,UsageQuantity,ResourceRate,PreTaxCost,CostCenter,ConsumedService,ResourceType,InstanceId,Tags,OfferId,AdditionalInfo,ServiceInfo1,ServiceInfo2,Currency,FilePath,LastExtractionDate,LastExtractionMonth,UniqueID",,,2025-02-02T01:56:28Z


# Explode Silver Tables "Values" into a Gold Table

In [0]:
from pyspark.sql.functions import col, to_date, concat_ws, sha2, lit

# Columns cast to a numeric type when present; defined once because the list
# is the same for every table and month.
numeric_columns = ['UsageQuantity', 'ResourceRate', 'PreTaxCost']

# Iterate over the tables to be extracted
for table in tables_to_extract.collect():
    # Months that have silver rows newer than the table's LastExtractionDate.
    # They are processed in ascending order of their newest LastExtractionDate:
    # updateGoldTable overwrites the control LastExtractionDate on every call,
    # so the last month processed must carry the highest timestamp, otherwise
    # already-loaded months are picked up again on the next run.
    month_to_extract = spark.sql(
        f"""
        SELECT LastExtractionMonth, MAX(LastExtractionDate) AS MaxExtractionDate
        FROM silver.{table['TableName']}
        WHERE LastExtractionDate > '{table['LastExtractionDate']}'
        GROUP BY LastExtractionMonth
        ORDER BY MaxExtractionDate ASC
        """
    )
    # Iterate over each month to be extracted
    for month in month_to_extract.collect():
        # Select the rows of the month that belong to its latest extraction
        recent_entries = spark.sql(
            f"""
            SELECT *
            FROM silver.{table['TableName']}
            WHERE LastExtractionMonth = '{month['LastExtractionMonth']}'
            AND LastExtractionDate = (SELECT MAX(LastExtractionDate) FROM silver.{table['TableName']} WHERE LastExtractionMonth = '{month['LastExtractionMonth']}')
            """
        )

        # Cast the numeric columns that exist in this table.
        # NOTE(review): 'float' is single precision; confirm it matches the
        # column types of the gold table.
        for col_name in numeric_columns:
            if col_name in recent_entries.columns:
                recent_entries = recent_entries.withColumn(col_name, col(col_name).cast('float'))

        # Convert the usage timestamp to a date, if the column exists
        if 'UsageDateTime' in recent_entries.columns:
            recent_entries = recent_entries.withColumn('UsageDateTime', to_date(col('UsageDateTime'), 'yyyy-MM-dd'))

        # Build the helper column 'UniqueCombination' when both inputs exist.
        # NOTE(review): this runs before null ResourceGroup values are filled
        # below, and concat_ws skips nulls, so such rows are keyed on
        # SubscriptionName alone - confirm this is intended.
        if 'SubscriptionName' in recent_entries.columns and 'ResourceGroup' in recent_entries.columns:
            recent_entries = recent_entries.withColumn('UniqueCombination', concat_ws('_', col('SubscriptionName'), col('ResourceGroup')))

        # Fill null values in 'ResourceGroup'
        if 'ResourceGroup' in recent_entries.columns:
            recent_entries = recent_entries.fillna({"ResourceGroup": "N/A"})

        # Derive 'UniqueID' as the SHA-256 of the helper column, then drop the helper
        if 'UniqueCombination' in recent_entries.columns:
            recent_entries = recent_entries.withColumn('UniqueID', sha2(col('UniqueCombination'), 256))
            recent_entries = recent_entries.drop('UniqueCombination')

        # Ensure the DataFrame has the same columns as the target table
        #target_table_columns = spark.table(f"gold.{table['TableName']}").columns
        #for col_name in target_table_columns:
        #    if col_name not in recent_entries.columns:
        #        recent_entries = recent_entries.withColumn(col_name, lit(None).cast("string"))

        # Call the function that refreshes this month in the Gold table
        tn = updateGoldTable(
            table['TableName'],
            table['ColumnsToExtract'],
            table['ColumnsAlias'],
            month['LastExtractionMonth'],
            recent_entries
        )

        # display(recent_entries)

In [0]:
%sql
-- List each distinct (UsageDateTime, LastExtractionDate) pair in
-- gold.azure_costs_2, latest UsageDateTime first.
SELECT DISTINCT
  UsageDateTime,
  LastExtractionDate
FROM gold.azure_costs_2
ORDER BY UsageDateTime DESC

UsageDateTime,LastExtractionDate
2025-02-02,2025-02-03T01:57:18Z
2025-02-01,2025-02-03T01:57:18Z
2025-01-31,2025-02-03T01:58:09Z
2025-01-30,2025-02-03T01:58:09Z
2025-01-29,2025-02-03T01:58:09Z
2025-01-28,2025-02-03T01:58:09Z
2025-01-27,2025-02-03T01:58:09Z
2025-01-26,2025-02-03T01:58:09Z
2025-01-25,2025-02-03T01:58:09Z
2025-01-24,2025-02-03T01:58:09Z


In [0]:
%sql
-- List each distinct (FilePath, LastExtractionDate, LastExtractionMonth)
-- combination in gold.azure_costs_2, latest LastExtractionMonth first.
SELECT DISTINCT
  FilePath,
  LastExtractionDate,
  LastExtractionMonth
FROM gold.azure_costs_2
ORDER BY LastExtractionMonth DESC

FilePath,LastExtractionDate,LastExtractionMonth
abfss://costs@enxpowerbidatalakeqa.dfs.core.windows.net/powerbi-costs/enxcf-CF_Subscription_Costs/20250201-20250228/enxcf-CF_Subscription_Costs_752b65e9-3e08-49e2-a7b4-54d38fbc5512.csv,2025-02-03T01:57:18Z,202502
abfss://costs@enxpowerbidatalakeqa.dfs.core.windows.net/powerbi-costs/enxcf-CF_Subscription_Costs/20250101-20250131/enxcf-CF_Subscription_Costs_308e1884-6ed9-4823-bd81-cd59140c5dd9.csv,2025-02-03T01:58:09Z,202501
abfss://costs@enxpowerbidatalakeqa.dfs.core.windows.net/powerbi-costs/powerbi-costs/20241201-20241231/dez2024,2025-01-10T17:00:53Z,202412
abfss://costs@enxpowerbidatalakeqa.dfs.core.windows.net/powerbi-costs/powerbi-costs/20241101-20241130/nov2024,2025-01-12T14:57:38Z,202411
abfss://costs@enxpowerbidatalakeqa.dfs.core.windows.net/powerbi-costs/powerbi-costs/20241001-20241031/powerbi-costs_October.csv,2024-11-12T13:56:21Z,202410
abfss://costs@enxpowerbidatalakeqa.dfs.core.windows.net/powerbi-costs/powerbi-costs/20240901-20240930/powerbi-costs_september.csv,2024-11-12T13:56:45Z,202409
abfss://costs@enxpowerbidatalakeqa.dfs.core.windows.net/powerbi-costs/powerbi-costs/20240801-20240831/powerbi-costs_ago2024.csv,2024-12-02T14:37:13Z,202408
abfss://costs@enxpowerbidatalakeqa.dfs.core.windows.net/powerbi-costs/powerbi-costs/20240701-20240731/powerbi-costs_jul2024.csv,2024-12-02T14:38:22Z,202407
abfss://costs@enxpowerbidatalakeqa.dfs.core.windows.net/powerbi-costs/powerbi-costs/20240601-20240630/powerbi-costs_jun2024.csv,2024-12-02T14:38:52Z,202406
abfss://costs@enxpowerbidatalakeqa.dfs.core.windows.net/powerbi-costs/powerbi-costs/20240501-20240531/powerbi-costs_may2024.csv,2024-12-02T14:39:26Z,202405
