In [1]:
import json
from os.path import abspath
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W
from pyspark.sql.types import MapType,StringType,ArrayType


from datetime import datetime
from datetime import timedelta

# Run-date stamps, formatted YYYY-MM-DD from the local clock.
today = datetime.now().strftime("%Y-%m-%d")
yesterday = (datetime.now() - timedelta(1)).strftime("%Y-%m-%d")

print(today, yesterday, "starting tasks at:", datetime.now())

# pandas display: allow up to 1000 rows and render floats with two decimals.
pd.set_option('display.max_rows', 1000)
pd.options.display.float_format = '{:.2f}'.format

# Directory given to Spark as its SQL warehouse dir.
warehouse_location_path = '/home/jovyan/work/spark-warehouse'
warehouse_location = abspath(warehouse_location_path)

print(warehouse_location)

# The driver and the executors receive the same extra JVM options.
jvm_flags = "-Dio.netty.tryReflectionSetAccessible=true -Xms4096m"

# Build (or reuse) a Hive-enabled session. Both spellings of the
# maxToStringFields key are set, each with its own value.
spark = (
    SparkSession.builder
    .appName("shopee-category-search-extract")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.debug.maxToStringFields", 200)
    .config("spark.sql.debug.maxToStringFields", 2000)
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.extraJavaOptions", jvm_flags)
    .config("spark.driver.extraJavaOptions", jvm_flags)
    .enableHiveSupport()
    .getOrCreate()
)

# Dynamic partition overwrite; per the Spark SQL docs an "overwrite" then
# replaces only the partitions that receive new data.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")


2022-11-03 2022-11-02 starting tasks at: 2022-11-03 10:09:35.896232
/home/jovyan/work/spark-warehouse


In [2]:
import prestodb

# Column names applied to the rows fetched from Presto; the order must match
# the SELECT list of the query whose rows are turned into a DataFrame.
cols = [
    'date_info',
    'account_id',
    'campaign_id',
    'campaign_name',
    'adset_id',
    'adset_name',
    'ad_id',
    'ad_name',
    'impressions',
    'spend',
    'frequency',
    'reach',
    'link_clicks',
    'unique_link_clicks',
    'unique_page_engagement',
    'landing_page_view',
    'web_gmv',
    'web_purchase',
]

def get_presto_data(sql,
                    host='ec2-54-218-99-163.us-west-2.compute.amazonaws.com',
                    port=8889,
                    user='root',
                    catalog='hive',
                    schema='marketing'):
    """Run `sql` on Presto and return every result row.

    Args:
        sql: Query text passed unchanged to the cursor's ``execute``.
        host, port, user, catalog, schema: Connection settings forwarded to
            ``prestodb.dbapi.connect``. The defaults are the values this
            notebook has always used, so ``get_presto_data(sql)`` behaves
            as before.

    Returns:
        Whatever ``cursor.fetchall()`` returns for the query.

    Raises:
        Any exception raised by the driver while connecting, executing or
        fetching. The connection is closed before the exception propagates.
    """
    conn = prestodb.dbapi.connect(
        host=host,
        port=port,
        user=user,
        catalog=catalog,
        schema=schema,
    )
    try:
        cur = conn.cursor()
        cur.execute(sql)
        return cur.fetchall()
    finally:
        # Always release the connection, including when the query fails.
        conn.close()

In [5]:
def get_last_x_day_str(x = 1):
    """Return the date `x` days before the current local time as 'YYYY-MM-DD'."""
    target_day = datetime.now() - timedelta(days=x)
    return target_day.strftime('%Y-%m-%d')

# Date bounds (YYYY-MM-DD strings) relative to now. Only last_60d and last_1d
# are used by the query below.
last_60d = get_last_x_day_str(60)
last_7d = get_last_x_day_str(7)
last_2d = get_last_x_day_str(2)
last_1d = get_last_x_day_str(1)

# Ad-level Facebook report (subquery a) left-joined to on-site order totals
# (subquery b) on date, ad id (utm_content) and campaign id (utm_id).
# The action counts are pulled out of the `actions` / `unique_actions` text
# with regexp_extract + split; Presto array subscripts start at 1, so `[2]`
# is the piece after the '":"' delimiter.
# This is a raw string so that `\d` reaches Presto as written instead of being
# treated as an (invalid) Python escape sequence.
sql=r"""
select 
    a.*,b.web_gmv,b.web_purchase
from
(
    select date_info,account_id,campaign_id,campaign_name,adset_id,
        adset_name,ad_id,ad_name,
        sum(cast(impressions as int)) impressions,
        sum(cast(spend as double)) spend,
        sum(cast(frequency as double)) frequency,
        sum(cast(reach as int)) reach,
        sum(cast(split(regexp_extract(actions,'"link_click","value":"\d+',0),'":"')[2] as int)) as link_clicks,
        sum(cast(split(regexp_extract(unique_actions,'"link_click","value":"\d+',0),'":"')[2] as int)) as unique_link_clicks,
        sum(cast(split(regexp_extract(unique_actions,'"page_engagement","value":"\d+',0),'":"')[2] as int)) as unique_page_engagement,
        sum(cast(split(regexp_extract(actions,'"landing_page_view","value":"\d+',0),'":"')[2] as int)) as landing_page_view
    from dw_ods.marketing_facebook_marketing_report
    where date_info >='{start_date}'
        and date_info <='{end_date}'
        and breakdown = 'none'
    group by 1,2,3,4,5,6,7,8
    ) a
left join(
    select da.date_id,
        utm_id,
        utm_content,
        sum(ord.amount_total) as web_gmv,
        count(distinct da.so_name) web_purchase
    from dw_mid.amoeba_optimizer_profit_msite_attri_realtime_da da,
        dw_dwd.sale_msite_order_amoeba_optimizer_profit_realtime_da ord
    where da.date_id >='{start_date}'
        and da.date_id <='{end_date}'
        and ord.date_id >='{start_date}'
        and ord.date_id <='{end_date}'
        and da.so_name = ord.so_name
        and utm_id is not null
        and utm_term is not null
        and lower(utm_source) = 'facebook'
    group by 1,2,3
)b on a.date_info = b.date_id and a.ad_id = b.utm_content and a.campaign_id = b.utm_id
order by spend desc
""".format(start_date=last_60d,end_date=last_1d)

print("getting data with sql:\n",sql)

rows = get_presto_data(sql)
if not rows:
    # Fail with a clear message before touching the stored data; an empty
    # result would otherwise surface as an IndexError on rows[0] below.
    raise ValueError(
        "Presto returned no rows for {} .. {}; nothing to write".format(last_60d, last_1d)
    )
print("how many data:", len(rows),"\n sample line:", rows[0])

# Turn the fetched rows into a Spark DataFrame, naming the columns from `cols`.
rdd = spark.sparkContext.parallelize(rows)
df=rdd.toDF(cols)

# Stamp each row with the date this extract was written; used as the partition column.
df=df.withColumn("date_written", F.lit(datetime.now().strftime('%Y-%m-%d')))

df.printSchema()
# Write parquet partitioned by date_written in "overwrite" mode.
df.write.format("parquet").mode("overwrite").partitionBy("date_written").save(warehouse_location_path + "/fb-ad-level-data/")

getting data with sql:
 
select 
    a.*,b.web_gmv,b.web_purchase
from
(
    select date_info,account_id,campaign_id,campaign_name,adset_id,
        adset_name,ad_id,ad_name,
        sum(cast(impressions as int)) impressions,
        sum(cast(spend as double)) spend,
        sum(cast(frequency as double)) frequency,
        sum(cast(reach as int)) reach,
        sum(cast(split(regexp_extract(actions,'"link_click","value":"\d+',0),'":"')[2] as int)) as link_clicks,
        sum(cast(split(regexp_extract(unique_actions,'"link_click","value":"\d+',0),'":"')[2] as int)) as unique_link_clicks,
        sum(cast(split(regexp_extract(unique_actions,'"page_engagement","value":"\d+',0),'":"')[2] as int)) as unique_page_engagement,
        sum(cast(split(regexp_extract(actions,'"landing_page_view","value":"\d+',0),'":"')[2] as int)) as landing_page_view
    from dw_ods.marketing_facebook_marketing_report
    where date_info >='2022-09-04'
        and date_info <='2022-11-02'
        and breakdown 

In [9]:
# Spark SQL CASE expression that assigns a performance label from impression,
# add-to-cart and purchase columns. It is only defined here; nothing in this
# cell applies it.
model_expr="""
CASE WHEN total_impressions >= 5000 AND total_web_purchase <= 0 THEN 'poor-no-trans'
WHEN imp_per_purchase >= 5000 AND total_web_purchase >4 AND daily_impression >= 7000 THEN 'poor-with-trans:top'
WHEN imp_per_purchase >= 5000 AND total_web_purchase >4 AND daily_impression >= 3000 THEN 'poor-with-trans:middle'
WHEN imp_per_purchase >= 5000 AND total_web_purchase >4 THEN 'poor-with-trans:tail'
WHEN total_impressions >= 3000 
    AND (add_cart <=0 or total_impressions/add_cart >= 500)
    AND total_web_purchase <= 0 THEN 'poor-low-site-usage'
WHEN total_web_purchase > 0 AND daily_impression >= 7000 THEN 'normal-with-trans:top'
WHEN total_web_purchase > 0 THEN 'normal-with-trans:middle-tail'
ELSE 'normal-no-trans-yet' END
"""

# Load the stored ad-level data, keep rows with spend whose campaign name passes
# the LIKE filters, and derive three fields from the '_'-separated campaign name
# (pieces 0, 3 and 4).
# NOTE(review): in a LIKE pattern `_` matches any single character, so
# '%_msite_%' and '%shopify_%' are looser than a literal-underscore match.
# Confirm whether literal underscores were intended.
df = (
    spark.read.parquet(warehouse_location_path + "/fb-ad-level-data/")
    .where("""lower(campaign_name) like '%_msite_%' 
        and lower(campaign_name) not like '%shopify_%' 
        and lower(campaign_name) not like '%独立站%' 
        and spend > 0.0""")
    .selectExpr(
        "*",
        'split(campaign_name,"_")[0] as targeted_country',
        'split(campaign_name,"_")[3] as audience_type',
        'split(campaign_name,"_")[4] as opt_type',
    )
)

# One window spanning every row (constant partition key), used for grand totals.
all_rows_window = W.partitionBy(F.lit(0))


def _pct_of_total(col_name):
    """Return `col_name` as a percentage of its sum over all rows, rounded to 2 decimals."""
    grand_total = F.sum(col_name).over(all_rows_window)
    return F.round(100.00*F.col(col_name)/grand_total,2)


# Per audience_type: ad count, spend, reach, GMV, purchases and derived ratios,
# plus each group's share of total spend and GMV, largest spend first.
audience_summary = (
    df.groupBy('audience_type')
    .agg(
        F.countDistinct("ad_id").alias("ads"),
        F.round(F.sum("spend"),2).alias("spend"),
        F.sum("reach").alias("reach"),
        F.round(1000.00*F.sum("spend")/F.sum("reach"),2).alias("cost_per_1k_reach"),
        F.round(F.sum("web_gmv"),2).alias("web_gmv"),
        F.sum("web_purchase").alias("web_purchase"),
        F.round(F.sum("web_gmv")/F.sum("spend"),2).alias("roi"),
        F.round(F.sum("spend")/F.sum("web_purchase"),2).alias("cpp"),
        F.round(F.sum("web_gmv")/F.sum("web_purchase"),2).alias("aov"),
    )
    .withColumn("spend_%", _pct_of_total("spend"))
    .withColumn("gmv_%", _pct_of_total("web_gmv"))
    .orderBy(F.desc("spend"))
)
audience_summary.show(100,truncate=False)

+-------------+----+----------+---------+-----------------+----------+------------+----+-----+-----+-------+-----+
|audience_type|ads |spend     |reach    |cost_per_1k_reach|web_gmv   |web_purchase|roi |cpp  |aov  |spend_%|gmv_%|
+-------------+----+----------+---------+-----------------+----------+------------+----+-----+-----+-------+-----+
|A            |1188|2598623.74|210965899|12.32            |3137411.55|85106       |1.21|30.53|36.86|71.27  |61.28|
|E            |86  |582404.92 |46201812 |12.61            |928360.1  |28452       |1.59|20.47|32.63|15.97  |18.13|
|G            |189 |222880.5  |5823836  |38.27            |294741.95 |8027        |1.32|27.77|36.72|6.11   |5.76 |
|B            |240 |190124.47 |13159786 |14.45            |302168.76 |7793        |1.59|24.4 |38.77|5.21   |5.9  |
|Active-3d    |4   |50176.42  |1780224  |28.19            |457061.05 |9978        |9.11|5.03 |45.81|1.38   |8.93 |
|AG           |6   |1812.37   |110710   |16.37            |null      |null      