In [0]:
%load_ext autoreload
%autoreload 2
# Autoreload is enabled in mode 2 so that edits to imported Python modules are
# picked up without restarting the session. Background:
# https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
%pip install duckdb

In [0]:
import duckdb

import pyspark
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from delta.tables import DeltaTable

In [0]:
# Identifier of the run being processed. Characters 0-3 are parsed as the year and
# characters 4-5 as the month further down; the full value is also used to look up
# ingestion batches. Presumably formatted as YYYYMMDD_HHMMSS — TODO confirm.
rundate = "20130208_000000"

In [0]:
# Read every audit_visitas row flagged metadata_audit_passed = TRUE, then print the
# row count and preview the first three rows.
visits = spark.sql(
    'SELECT * FROM domain_dev.silver_analytics.audit_visitas WHERE metadata_audit_passed = TRUE'
)
print(visits.count())
visits.show(n=3)

In [0]:
# Build one row per Email with first/last visit timestamp and visit counts
# (all-time, current year, current month). "Current" is derived from rundate.
current_year = int(rundate[0:4])   # first four characters of rundate
current_month = int(rundate[4:6])  # following two characters

# min / max / count below resolve to the pyspark.sql.functions versions pulled in
# by the star import, not the Python builtins.
visitor = visits.groupBy('Email').agg(
    min('FechaEnvio').alias('fechaPrimeraVisita'),
    max('FechaEnvio').alias('fechaUltimaVisita'),
    count('Email').alias('visitasTotales'),
)

# Per-visit rows annotated with the calendar year and month of FechaEnvio.
visits_by_month = (
    visits.select('Email', 'FechaEnvio')
    .withColumn('year', year('FechaEnvio'))
    .withColumn('month', month('FechaEnvio'))
)

# Filter predicates are built once so the year condition is not repeated.
in_current_year = visits_by_month['year'] == current_year
in_current_month = in_current_year & (visits_by_month['month'] == current_month)

visits_on_this_year = (
    visits_by_month.filter(in_current_year)
    .groupBy('Email')
    .agg(count('Email').alias('visitasAnioActual'))
)

visits_on_this_month = (
    visits_by_month.filter(in_current_month)
    .groupBy('Email')
    .agg(count('Email').alias('visitasMesActual'))
)

# Left joins keep every Email from the all-time aggregate; an Email with no visits
# in the current year/month gets NULL from the join, which is replaced by 0.
visitor = (
    visitor.join(visits_on_this_year, on='Email', how='left')
    .join(visits_on_this_month, on='Email', how='left')
    .fillna(0, subset=['visitasAnioActual', 'visitasMesActual'])
)

# show() prints the preview itself and returns None, so it must not be wrapped in
# print() (that emitted a stray "None" line after the table).
visitor.show(3)

In [0]:
# Print the total row count of visitor followed by the number of distinct Email
# values, so the two figures can be compared by eye.
print(visitor.count())
print(visitor.select('Email').distinct().count())

In [0]:
# Column reference (notes only, not executable code)
# visitante: email, fechaPrimeraVisita, fechaUltimaVisita, visitasTotales, visitasAnioActual, visitasMesActual
# estadística: email, jyv, Badmail, Baja, Fecha envío, Fecha open, Opens, Opens virales, Fecha click, Clicks, Clicks virales, Links, IPs, Navegadores, Plataformas


In [0]:
# Save the visitor aggregate as a Delta table, overwriting whatever the table
# currently holds.
complete_name = 'domain_dev.gold_analytics.visitor'
(
    visitor.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable(complete_name)
)

In [0]:
# Pick the gold visitor rows whose Email occurs in an audit_visitas batch that
# governance_prod.metrics.ingestions lists for this rundate, and bring them to the
# driver as a pandas DataFrame. The name `output` matters: the DuckDB MERGE
# statement later in the notebook refers to it as its source.
query = f"""WITH filter AS (
                SELECT DISTINCT Email FROM domain_dev.silver_analytics.audit_visitas a
                WHERE a.metadata_batch_id IN 
                (
                    SELECT batch_id FROM governance_prod.metrics.ingestions 
                    WHERE rundate = '{rundate}' AND catalog_name='domain_dev' AND schema_name='silver_analytics' 
                    AND table_name='audit_visitas'
                )
            )
            SELECT v.* FROM domain_dev.gold_analytics.visitor v
            INNER JOIN filter f ON v.Email = f.Email
         """

# Run on Spark and convert straight to pandas.
output = spark.sql(query).toPandas()
print(output.shape)
output.head(3)

In [0]:
# Make sure the visitor table exists in the DuckDB database file. The statement is
# a no-op when the table is already there (CREATE TABLE IF NOT EXISTS).
query = """CREATE TABLE IF NOT EXISTS visitor (
                Email STRING PRIMARY KEY,
                fechaPrimeraVisita TIMESTAMP,
                fechaUltimaVisita TIMESTAMP,
                visitasTotales INTEGER,
                visitasAnioActual INTEGER,
                visitasMesActual INTEGER)"""

# The context manager closes the connection even if execute() raises, so a failed
# statement cannot leave the database file held open.
with duckdb.connect('/Workspace/Users/armando.n90@gmail.com/users_case/local_device/home/mysql/analytics.db') as conn:
    conn.execute(query)

In [0]:
# Upsert the rows held in `output` into the DuckDB visitor table, matching on Email:
# matching rows get their date and count columns updated, the rest are inserted.
# `output` is not a DuckDB table; the statement relies on DuckDB resolving the name
# to the Python variable of that name, so that variable must exist when this runs.
query = """MERGE INTO visitor AS target USING output AS source
           ON target.Email = source.Email
           WHEN MATCHED THEN
                UPDATE SET
                    fechaPrimeraVisita = source.fechaPrimeraVisita,
                    fechaUltimaVisita = source.fechaUltimaVisita,
                    visitasTotales = source.visitasTotales,
                    visitasAnioActual = source.visitasAnioActual,
                    visitasMesActual = source.visitasMesActual
           WHEN NOT MATCHED THEN INSERT
        """

# The context manager closes the connection even if the MERGE fails.
with duckdb.connect('/Workspace/Users/armando.n90@gmail.com/users_case/local_device/home/mysql/analytics.db') as conn:
    conn.execute(query)

In [0]:
# Read the whole DuckDB visitor table back into a pandas DataFrame for inspection.
query = """SELECT * FROM visitor"""

# The rows are materialised with .df() inside the block, before the context manager
# closes the connection; the close also happens if the query raises.
with duckdb.connect('/Workspace/Users/armando.n90@gmail.com/users_case/local_device/home/mysql/analytics.db') as conn:
    results = conn.execute(query).df()

In [0]:
# Print the (rows, columns) shape of the frame read back from DuckDB, then render
# the frame itself.
n_rows, n_cols = results.shape
print((n_rows, n_cols))
display(results)

In [0]:
# Remove every row from the DuckDB visitor table (the table itself is kept).
# The context manager closes the connection even if the DELETE fails. The result
# of the statement was never used, so it is no longer converted to a DataFrame.
with duckdb.connect('/Workspace/Users/armando.n90@gmail.com/users_case/local_device/home/mysql/analytics.db') as conn:
    conn.execute('DELETE FROM visitor')

In [0]:
# Renders the `results` frame again. `results` was read before the DELETE above and
# is not re-queried here, so this shows the earlier snapshot, not the emptied table.
display(results)

In [0]:
# Final cleanup of the DuckDB handle. Every cell above that opens `conn` also closes
# it, so this acts on an already-closed connection.
# NOTE(review): looks redundant — confirm before removing.
conn.close()