In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
"""
Extracts campaign response data (Sends, Opens, Clicks & Bounces) for email newsletters into one view.
"""

In [0]:
"""
1. contactPersonOhubId is the ID used to communicate with Wunderman.
2. Join opens, clicks and bounces with sends using deliveryLogId or campaignConcatId (as of 16 April 2019).
   (In the past, contactPersonOhubId and deliveryLogId were used.)
3. Filters used:
   - Sends - isActive = true
   - CommunicationChannel = Email
   - bounces.bounceDate IS NULL (Only successful deliveries)
"""

In [0]:
# Fully qualified name (database.table) of the table the processed result is written to.
destination_table = "data_acm.ohub2_campaign_response_data_processed"

In [0]:
# Build one row per email send (grouped on the send attributes) together with
# its open / click response metrics.
#
# - Bounces, opens and clicks are LEFT JOINed to sends on deliveryLogId, so a
#   send without any open or click is kept (counts are 0, min/max dates NULL).
# - The WHERE clause keeps only joined rows whose bounceDate is NULL, i.e. sends
#   with no matching bounce row or a bounce row without a bounceDate.
# - Joining opens and clicks on the same key yields opens x clicks rows per
#   send before aggregation. The metrics are unaffected because only DISTINCT
#   counts and MIN/MAX are computed.
# - NoofOpens / NoofClicks count DISTINCT open / click timestamps; COUNT already
#   ignores NULLs, so no CASE guard is needed.
# - Every column is qualified with its table alias so that a same-named column
#   appearing in another joined table cannot make the reference ambiguous.
# - String literals use single quotes so the query also parses when
#   double-quoted identifiers are enabled.
# NOTE(review): isActive is compared with the string 'true' - confirm the
# column type in campaignsends.
df = spark.sql("""
SELECT sends.countryCode,
       sends.campaignConcatId,
       sends.contactPersonConcatId,
       sends.contactPersonOhubId,
       sends.deliveryLogId,
       sends.deliveryId,
       sends.campaignId,
       sends.campaignName,
       sends.deliveryName,
       sends.waveName,
       sends.sendDate,
       COUNT(DISTINCT opens.openDate)   AS NoofOpens,
       COUNT(DISTINCT clicks.clickDate) AS NoofClicks,
       MIN(opens.openDate)              AS Min_OpenDate,
       MAX(opens.openDate)              AS Max_OpenDate,
       MIN(clicks.clickDate)            AS Min_ClickDate,
       MAX(clicks.clickDate)            AS Max_ClickDate
FROM data_datascience_prod.campaignsends sends
LEFT JOIN data_datascience_prod.campaignbounces bounces ON sends.deliveryLogId = bounces.deliveryLogId
LEFT JOIN data_datascience_prod.campaignopens   opens   ON sends.deliveryLogId = opens.deliveryLogId
LEFT JOIN data_datascience_prod.campaignclicks  clicks  ON sends.deliveryLogId = clicks.deliveryLogId
WHERE sends.isActive = 'true'
AND   sends.communicationChannel = 'Email'
AND   bounces.bounceDate IS NULL
GROUP BY sends.countryCode,
         sends.campaignConcatId,
         sends.contactPersonConcatId,
         sends.contactPersonOhubId,
         sends.deliveryLogId,
         sends.deliveryId,
         sends.campaignId,
         sends.campaignName,
         sends.deliveryName,
         sends.waveName,
         sends.sendDate
""")

In [0]:
# Flag newsletter sends based on the (lower-cased) wave name.
#
# A send is marked isNewsletter = 1 when its wave name contains "newsletter",
# "inspiration_dan" or "promotion_dan", or when the first four characters of
# the wave name contain "20"; otherwise 0.
# NOTE(review): the "20" check presumably targets wave names that start with a
# year such as 2019 - confirm against the naming convention.

# Lower-case once so every keyword check below is case-insensitive.
wave_name = lower(col("waveName"))

# First four characters of the wave name. Spark substring positions are
# 1-based (position 0 is treated the same as 1), so start explicitly at 1.
wave_prefix = wave_name.substr(1, 4)

is_newsletter = (
    wave_name.contains("newsletter")
    | wave_prefix.contains("20")
    | wave_name.contains("inspiration_dan")
    | wave_name.contains("promotion_dan")
)

# The prefix is used as an expression only, so no scratch column has to be
# added to and dropped from the frame.
df = (
    df
    .withColumn("waveName", wave_name)
    .withColumn("isNewsletter", when(is_newsletter, 1).otherwise(0))
)

In [0]:
# Persist the result to the destination table; "overwrite" mode replaces any
# data already stored there.
(
    df.write
    .mode("overwrite")
    .saveAsTable(destination_table)
)