In [0]:
import pyspark
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from delta.tables import DeltaTable

# Notebook parameter `rundate`: the default below suggests a YYYYMMDD_HHMMSS
# layout (TODO confirm with the job that schedules this notebook).
# Sample values for manual runs: '20130208_000000', '20130214_000000'.
rundate_widget = 'rundate'

dbutils.widgets.text(rundate_widget, '20130208_000000', 'rundate of the execution')
rundate = dbutils.widgets.get(rundate_widget)

In [0]:
# COMPUTE TABLE WITH VISITOR METRICS
# Produces `visitor`: one row per Email with the earliest and latest FechaEnvio
# plus the number of visits overall, in the reference year and in the
# reference month. Only visits flagged metadata_audit_passed = TRUE are used.

# The reference period is read from the first six characters of `rundate`
# (characters 0-3 -> year, characters 4-5 -> month).
current_year = int(rundate[0:4])
current_month = int(rundate[4:6])
if not 1 <= current_month <= 12:
    # An out-of-range month would not fail downstream; it would silently
    # yield zero "current month" visits for every visitor.
    raise ValueError(f"rundate {rundate!r} does not contain a valid month at positions 4-5")

visits = spark.sql('SELECT * FROM domain_dev.silver_analytics.audit_visitas WHERE metadata_audit_passed = TRUE')

# Row-level predicates selecting visits inside the reference year / month.
# They evaluate to NULL when FechaEnvio is NULL, so such rows are not counted.
sent_in_current_year = F.year('FechaEnvio') == current_year
sent_in_current_month = sent_in_current_year & (F.month('FechaEnvio') == current_month)

# Single aggregation pass: count() ignores NULLs, and when() without
# otherwise() yields NULL for non-matching rows, so each conditional count
# only tallies the matching visits and is 0 (never NULL) when none match.
# This replaces separate per-period aggregations joined back on Email.
visitor = visits.groupBy('Email').agg(
    F.min('FechaEnvio').alias('fechaPrimeraVisita'),
    F.max('FechaEnvio').alias('fechaUltimaVisita'),
    F.count('Email').alias('visitasTotales'),
    F.count(F.when(sent_in_current_year, F.col('Email'))).alias('visitasAnioActual'),
    F.count(F.when(sent_in_current_month, F.col('Email'))).alias('visitasMesActual'),
)


In [0]:
# EXPORT TABLE
# Persist `visitor` as a Delta table, overwriting whatever the target
# table currently holds.
complete_name = 'domain_dev.gold_analytics.visitor'

(
    visitor.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable(complete_name)
)