In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
import pandas as pd
from abc import ABC, abstractmethod

from lib.template.audit_to_historic import *

In [0]:
# Run configuration: identifies the target table and the catalog it lives in.
rundate = '20130208_000000'

domain = 'domain'
environment = 'dev'

table_name = 'historic_visitas'
schema_name = 'silver_analytics'
catalog_name = f'{domain}_{environment}'

print(f'Processing {table_name}')

# Build the component factory, hand it to the template and run the pipeline.
factory_audit_to_historic = FactoryAuditToHistoric()
factory_audit_to_historic.create()

table_processor = AuditToHistoricTemplate(
    catalog_name=catalog_name,
    schema_name=schema_name,
    table_name=table_name,
)
table_processor.set_component_factory(factory_audit_to_historic)
table_processor.process()

# Show the collected metrics and a small preview, separated by blank lines.
print()
print(table_processor.metrics)
print()
table_processor.dataframe.show(3)
print()

In [0]:
# Quick sanity check of the processed dataframe: row count plus a 3-row preview.
df = table_processor.dataframe
print(df.count())
# show() prints the table itself and returns None, so it must not be wrapped in
# print() (that would emit a stray "None" line after the table).
df.show(3)

In [0]:


# EXTRACT OR CREATE HISTORIC TABLE
#
# Compares the freshly processed snapshot (table_processor.dataframe) with the
# stored historic table and assembles `dataframe_historic` from five parts:
# unchanged rows, brand-new rows, rows closed because they are missing from the
# snapshot, the new version of changed rows, and the closed old version of
# changed rows. The only write performed here is the first-time creation of the
# historic table; `dataframe_historic` itself is only counted.
primary_key = 'Email'
source_prefix = 'source_'

# Both validity bounds are naive UTC datetimes so that `valid_from` and
# `valid_to` become timestamp columns and the unions below never mix string and
# timestamp values. Seconds precision matches the previous '%Y-%m-%d %H:%M:%S'.
valid_from = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None, microsecond=0)
valid_to = datetime.datetime(2200, 1, 1)  # open-ended "still valid" sentinel

# Resolve the fully qualified name once. The subscripts use a different quote
# style than the f-string so the cell also parses on Python < 3.12.
metadata = table_processor.metadata
full_table_name = f"{metadata['catalog_name']}.{metadata['schema_name']}.{metadata['table_name']}"

# Create the table only when it really does not exist. A bare `except:` around
# the read would treat *any* failure (permissions, transient errors, typos) as
# "table missing" and overwrite the stored history.
if not spark.catalog.tableExists(full_table_name):
    print('Historic table not found; creating ' + full_table_name)
    new_original_dataframe = (
        table_processor.dataframe
        .withColumn('is_current', lit(True))
        .withColumn('valid_from', lit(valid_from))
        .withColumn('valid_to', lit(valid_to))
    )
    new_original_dataframe.write.format('delta').mode('overwrite').saveAsTable(full_table_name)

# READ CURRENT HISTORIC FILE
# Only the columns flagged with track_changes take part in change detection.
field_data = metadata['field_data']
array_history_columns = [
    column for column in field_data if field_data[column]['track_changes'] == True
]
array_source_columns = [source_prefix + column for column in array_history_columns]

# Tables created by the earlier version of this cell stored valid_from as a
# string; the cast is a no-op when the column is already a timestamp.
original_dataframe = (
    spark.read.table(full_table_name)
    .withColumn('valid_from', col('valid_from').cast('timestamp'))
)
table_processor.metrics['initial_rows'] = original_dataframe.count()
print(table_processor.metrics['initial_rows'])

# Prefix every snapshot column so both sides of the join stay distinguishable.
new_dataframe = table_processor.dataframe.select(
    [col(column).alias(source_prefix + column) for column in table_processor.dataframe.columns]
)

new_dataframe_2 = (
    new_dataframe
    .withColumn('source_is_current', lit(True))
    .withColumn('source_valid_from', lit(valid_from))
    .withColumn('source_valid_to', lit(valid_to))
)

source_primary_key = source_prefix + primary_key
merge_dataframe = original_dataframe.join(
    new_dataframe_2,
    new_dataframe_2[source_primary_key] == original_dataframe[primary_key],
    how='fullouter',
)

# Fingerprint of the tracked columns on each side.
# NOTE(review): concat_ws skips NULLs, so ('a', NULL) and (NULL, 'a') produce
# the same fingerprint - confirm this is acceptable for the tracked columns.
merge_dataframe = merge_dataframe.withColumn('concat_ws', concat_ws('+', *array_history_columns))
merge_dataframe = merge_dataframe.withColumn('concat_ws_source', concat_ws('+', *array_source_columns))

# Classify every joined row. Already-expired history rows must carry the same
# 'NO_ACTION' label as unchanged rows: the previous 'NO ACTION' spelling matched
# none of the filters below, so those rows were silently dropped from the result.
# NOTE(review): a key whose history rows are all expired and that reappears in
# the snapshot is matched to those expired rows and never reaches 'INSERT'.
merge_dataframe = merge_dataframe.withColumn(
    'action',
    when(col('concat_ws') == col('concat_ws_source'), 'NO_ACTION')
    .when(col('is_current') == False, 'NO_ACTION')
    .when(col(source_primary_key).isNull() & col('is_current'), 'DELETE')
    .when(col(primary_key).isNull(), 'INSERT')
    .otherwise('UPDATE'),
)

# One aggregation instead of four separate count() jobs over the join.
action_counts = {
    row['action']: row['count']
    for row in merge_dataframe.groupBy('action').count().collect()
}
print('Rows with no action: ' + str(action_counts.get('NO_ACTION', 0)))
print('Rows to insert: ' + str(action_counts.get('INSERT', 0)))
print('Rows to update: ' + str(action_counts.get('UPDATE', 0)))
print('Rows to delete: ' + str(action_counts.get('DELETE', 0)))

array_history_columns = array_history_columns + ['is_current', 'valid_from', 'valid_to']
array_source_columns = array_source_columns + ['source_is_current', 'source_valid_from', 'source_valid_to']

# RECORDS WITH NO ACTION
df_merge_part_01 = merge_dataframe.filter(col('action') == 'NO_ACTION').select(array_history_columns)

# RECORDS TO INSERT
# Strip only the leading prefix; str.replace would also remove 'source_'
# occurring in the middle of a column name.
df_merge_part_02A = merge_dataframe.filter(col('action') == 'INSERT').select(array_source_columns)
df_merge_part_02B = df_merge_part_02A.select(
    [col(column).alias(column[len(source_prefix):]) for column in df_merge_part_02A.columns]
)

# RECORDS TO DELETE
# Close the validity window at the processing time. Writing the open-ended
# sentinel back left deleted rows looking valid until the year 2200.
df_merge_part_03 = (
    merge_dataframe.filter(col('action') == 'DELETE')
    .select(array_history_columns)
    .withColumn('is_current', lit(False))
    .withColumn('valid_to', lit(valid_from))
)

# RECORDS TO EXPIRE AND INSERT
# 04B is the new current version taken from the snapshot side.
df_merge_part_04A = merge_dataframe.filter(col('action') == 'UPDATE').select(array_source_columns)
df_merge_part_04B = df_merge_part_04A.select(
    [col(column).alias(column[len(source_prefix):]) for column in df_merge_part_04A.columns]
)

# 04C is the previous version, closed at the moment the new one starts.
df_merge_part_04C = (
    merge_dataframe.filter(col('action') == 'UPDATE')
    .withColumn('valid_to', col('source_valid_from'))
    .withColumn('is_current', lit(False))
    .select(array_history_columns)
)

# UNION
# All parts expose the same column names, so match them by name rather than by
# position.
dataframe_historic = (
    df_merge_part_01
    .unionByName(df_merge_part_02B)
    .unionByName(df_merge_part_03)
    .unionByName(df_merge_part_04B)
    .unionByName(df_merge_part_04C)
)

print('>>>>> HIST')
count = dataframe_historic.count()
print(count)

In [0]:
# Spot checks on the assembled history: one specific key, then expired rows.
single_key_rows = dataframe_historic.where(col('Email') == 'migue_235@yahoo.com')
single_key_rows.show(10)

expired_rows = dataframe_historic.where(col('is_current') == False)
expired_rows.show(10)

In [0]:
# DESTRUCTIVE: removes the historic table so the next run recreates it from
# scratch. IF EXISTS keeps the cell re-runnable when the table is already gone.
# NOTE(review): on "Run All" this drops the table before the later SELECT cell
# reads it - consider running this cell manually only.
spark.sql('DROP TABLE IF EXISTS domain_dev.silver_analytics.historic_visitas')

In [0]:
# Preview of the ingestion metrics table, untruncated.
result = spark.sql('SELECT * FROM governance_prod.metrics.ingestions')
# show() prints the rows and returns None; wrapping it in print() added a
# stray "None" line to the output.
result.show(10, truncate=False)

In [0]:
# Read the stored historic table back: total row count and a 3-row preview.
historic_query = 'SELECT * FROM domain_dev.silver_analytics.historic_visitas'
result = spark.sql(historic_query)
stored_row_count = result.count()
print(stored_row_count)
result.show(3)

In [0]:
# Preview the audit rows that belong to a single ingestion batch.
audit_batch_query = "SELECT * FROM domain_dev.silver_analytics.audit_visitas WHERE metadata_batch_id='8'"
result = spark.sql(audit_batch_query)
result.show(3)

In [0]:
samanta.llaguno@a