In [1]:
import json
from os.path import abspath
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W
from pyspark.sql.types import MapType,StringType,ArrayType


from datetime import datetime
from datetime import timedelta

# Date stamps (local clock, YYYY-MM-DD) for this run.
today = datetime.now().strftime("%Y-%m-%d")
yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")

print(today, yesterday, "starting tasks at:", datetime.now())

# pandas display: show up to 1000 rows, floats with two decimals.
pd.set_option("display.max_rows", 1000)
pd.options.display.float_format = "{:.2f}".format

# Warehouse directory handed to Spark and reused later as the parquet output root.
warehouse_location_path = "/home/jovyan/work/spark-warehouse"
warehouse_location = abspath(warehouse_location_path)
print(warehouse_location)

# Same JVM flags are applied to both the executors and the driver.
_jvm_options = "-Dio.netty.tryReflectionSetAccessible=true -Xms4096m"

spark = (
    SparkSession.builder
    .appName("shopee-category-search-extract")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.debug.maxToStringFields", 200)
    .config("spark.sql.debug.maxToStringFields", 2000)
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.extraJavaOptions", _jvm_options)
    .config("spark.driver.extraJavaOptions", _jvm_options)
    .enableHiveSupport()
    .getOrCreate()
)

spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")


2022-10-31 2022-10-30 starting tasks at: 2022-10-31 08:33:15.714589
/home/jovyan/work/spark-warehouse


In [23]:
import prestodb

# Column names applied to the Presto result rows when they are turned into a
# Spark DataFrame; the order has to follow the SELECT list of the query.
cols = [
    # 2021 aggregates
    'product_no',
    'days_has_spend_2021',
    'ads_using_this_product_2021',
    'impressions_2021',
    'clicks_2021',
    'spend_2021',
    # derived / catalog / product-profile fields
    'spend_gap_2021_2022',
    'is_active_2022',
    'why_inactive_2022',
    'product_type',
    'image_link',
    'suppliyer_type',
    'imp_30d',
    'imp_detail_30d',
    'illegal_tags',
    # 2022 aggregates
    'product_no_2022',
    'days_has_spend_2022',
    'ads_using_this_product_2022',
    'impressions_2022',
    'clicks_2022',
    'spend_2022',
]

def get_presto_data(
    sql,
    host='ec2-54-218-99-163.us-west-2.compute.amazonaws.com',
    port=8889,
    user='root',
    catalog='hive',
    schema='marketing',
):
    """Run ``sql`` through a new Presto connection and return all result rows.

    Args:
        sql: Query text, passed unchanged to the cursor's ``execute``.
        host, port, user, catalog, schema: Connection settings forwarded to
            ``prestodb.dbapi.connect``. The defaults are the values this
            notebook has always used, so ``get_presto_data(sql)`` behaves as before.

    Returns:
        The value of ``cursor.fetchall()`` for the query.

    Raises:
        Whatever ``prestodb`` raises while connecting, executing or fetching.
    """
    conn = prestodb.dbapi.connect(
        host=host,
        port=port,
        user=user,
        catalog=catalog,
        schema=schema,
    )
    try:
        cur = conn.cursor()
        cur.execute(sql)
        return cur.fetchall()
    finally:
        # Close the connection on the error path too; previously a failing
        # execute()/fetchall() skipped conn.close() and leaked the connection.
        conn.close()

In [24]:
def get_last_x_day_str(x = 1):
    """Return the local date ``x`` days before now, formatted as ``YYYY-MM-DD``."""
    target_day = datetime.now() - timedelta(days=x)
    return target_day.strftime('%Y-%m-%d')

# Reference dates (YYYY-MM-DD strings) 60, 7 and 1 day(s) before now.
last_60d = get_last_x_day_str(60)
last_7d = get_last_x_day_str(7)
last_1d = get_last_x_day_str(1)

# Presto query comparing per-product Facebook ad stats for 2021-10-01..2021-10-30
# against 2022-10-01..2022-10-30:
#   a - 2021 aggregates per product (rows with breakdown='product_id'), kept only
#       when the summed 2021 spend is >= 100
#   b - the same aggregates for 2022 (summed spend > 0), left-joined on product number
#   c - catalog rows for dt = current_date - 1 day, labelled 'active' only when
#       availability is 'in stock' and status is 'active'
#   d - product-profile rows for date_id = current_date - 1 day
# NOTE(review): the query text contains no {last_60d}/{last_7d}/{last_1d}
# placeholders, so the trailing .format(...) returns it unchanged; every date
# window is written directly in the SQL.
sql="""
select a.*,
    case when spend_2022 is not null then a.spend_2021 - b.spend_2022 else a.spend_2021 end as spend_gap_2021_2022,
    case when c.is_active_2022 is not null then c.is_active_2022 else 'not-found-in-catalog-2022' end as is_active_2022,
    why_inactive_2022,
    case when product_type is not null then product_type else 'not-found-in-catalog-2022' end as product_type,
    image_link,suppliyer_type,imp_30d,imp_detail_30d,illegal_tags,
    b.*
from
(
    select split(breakdown_value,',')[1] as product_no,
        count(distinct date_info) as days_has_spend_2021,
        count(distinct adset_id) as ads_using_this_product_2021,
        sum(cast(impressions as int)) as impressions_2021,
        sum(cast(clicks as int)) as clicks_2021,
        sum(cast(spend as double)) as spend_2021
    from dw_ods.marketing_facebook_marketing_report
    where date_info between '2021-10-01' and '2021-10-30'
        and breakdown='product_id'
        and clicks > '0' 
        and impressions >'0'
        and spend > '0'
        and cast(clicks as int) > 0
    group by 1
    having sum(cast(spend as double)) >= 100) a
left join
(
    select split(breakdown_value,',')[1] as product_no_2022,
        count(distinct date_info) as days_has_spend_2022,
        count(distinct adset_id) as ads_using_this_product_2022,
        sum(cast(impressions as int)) as impressions_2022,
        sum(cast(clicks as int)) as clicks_2022,
        sum(cast(spend as double)) as spend_2022
    from dw_ods.marketing_facebook_marketing_report
    where date_info between '2022-10-01' and '2022-10-30'
        and breakdown='product_id'
        and clicks > '0' 
        and impressions >'0'
        and spend > '0'
        and cast(clicks as int) > 0
    group by 1
    having sum(cast(spend as double)) > 0
)b on a.product_no=b.product_no_2022
left join(
    select id,product_type,image_link,
        case when availability = 'in stock' and status = 'active' then 'active' else 'in-active' end is_active_2022,
        case when availability = 'in stock' and status = 'active' then '' else filter_reason end why_inactive_2022
    from marketing.facebook_catalog_app_main a 
    where dt= date_format(current_date - interval '1' day, '%Y-%m-%d')
)c on a.product_no = c.id
left join (
    select pno,
           cast(imp_30d as int)imp_30d,
           cast(imp_detail_pv_30d as int)imp_detail_30d,
           case when write_uid = 8 then '0货源' else write_name end suppliyer_type,
           illegal_tags
    from jiayundw_dm.product_profile_df
        where date_id=date_format(current_date - interval '1' day,'%Y-%m-%d')
)d on a.product_no=d.pno
""".format(last_60d=last_60d,last_7d=last_7d,last_1d=last_1d)

print("getting data with sql:\n",sql)

rows = get_presto_data(sql)

# Fail with a clear message instead of the IndexError from rows[0] (or the
# conversion error from toDF) that an empty result would otherwise produce.
if not rows:
    raise ValueError("Presto query returned no rows; nothing to convert or persist")

# The names in `cols` are applied to the rows by position, so a SELECT list that
# has drifted from `cols` would end up with wrongly labelled columns.
if len(rows[0]) != len(cols):
    raise ValueError(
        "Presto rows have {got} fields but cols lists {want} names".format(
            got=len(rows[0]), want=len(cols)
        )
    )

print("how many data:", len(rows),"\n sample line:", rows[0])

# Distribute the fetched rows and name the columns positionally.
rdd = spark.sparkContext.parallelize(rows)
df = rdd.toDF(cols)

df.printSchema()

# Replace the parquet snapshot under the warehouse directory.
df.write.format("parquet").mode("overwrite").save(warehouse_location_path + "/fb-his-data/")

getting data with sql:
 
select a.*,
    case when spend_2022 is not null then a.spend_2021 - b.spend_2022 else a.spend_2021 end as spend_gap_2021_2022,
    case when c.is_active_2022 is not null then c.is_active_2022 else 'not-found-in-catalog-2022' end as is_active_2022,
    why_inactive_2022,
    case when product_type is not null then product_type else 'not-found-in-catalog-2022' end as product_type,
    image_link,suppliyer_type,imp_30d,imp_detail_30d,illegal_tags,
    b.*
from
(
    select split(breakdown_value,',')[1] as product_no,
        count(distinct date_info) as days_has_spend_2021,
        count(distinct adset_id) as ads_using_this_product_2021,
        sum(cast(impressions as int)) as impressions_2021,
        sum(cast(clicks as int)) as clicks_2021,
        sum(cast(spend as double)) as spend_2021
    from dw_ods.marketing_facebook_marketing_report
    where date_info between '2021-10-01' and '2021-10-30'
        and breakdown='product_id'
        and clicks > '0' 
   

In [25]:
df.printSchema()

# Select and rename the report columns, sort by 2021 spend (highest first) and
# pull the result into pandas for HTML rendering.
html_df = (
    df.selectExpr(
        "product_no",
        "round(spend_2021,2) as spend_2021",
        "case when spend_2022 is null then 0.0 else round(spend_2022,2) end as spend_2022",
        "spend_gap_2021_2022",
        "case when spend_gap_2021_2022/spend_2021 >=0.95 then 'no-spend-in-2022' else 'normal' end as has_spend_2022",
        "is_active_2022",
        "why_inactive_2022",
        'illegal_tags',
        "imp_30d as imp_30d_2022",
        "`imp_detail_30d` as imp_detail_30d_2022",
        "replace(image_link,'.jpg','_350x350.jpg') as img_link",
        "suppliyer_type",
        "product_type",
    )
    .orderBy(F.desc(F.col("spend_2021")))
    .toPandas()
)

# DataFrame.fillna returns a new frame, so the previous bare `html_df.fillna(0)`
# call had no effect. Fill per column and keep the result: 0 for the impression
# counts, empty string for the free-text columns. img_link and product_no are
# deliberately left alone because their formatters (path_to_image_html,
# pno_to_link) concatenate strings and cannot take a numeric 0.
html_df = html_df.fillna({
    "imp_30d_2022": 0,
    "imp_detail_30d_2022": 0,
    "why_inactive_2022": "",
    "illegal_tags": "",
    "suppliyer_type": "",
})

# Import display/HTML from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
import re

def path_to_image_html(path):
    """Wrap an image URL in a 100px-wide, lazily loaded ``<img>`` tag."""
    pieces = ('<img src="', path, '" width="100" loading="lazy" />')
    return "".join(pieces)

def pno_to_link(pno):
    """Render a product number as a link to its product page.

    The page URL is built from the first run of digits in ``pno``; the link
    text is ``pno`` itself.

    Args:
        pno: Product number string.

    Returns:
        An ``<a>`` tag that opens in a new tab, or ``pno`` unchanged when it
        contains no digits (this case used to raise ``IndexError``).
    """
    digits = re.search(r'\d+', pno)
    if digits is None:
        # Nothing to build a product URL from; show the raw value unlinked.
        return pno
    link = 'https://www.wholeeshopping.com/product/' + digits.group(0)
    return '<a href="' + link + '" target="_blank" >' + pno + '</a>'

def prettier_category(product_type = ""):
    """Put each level of a ``" > "``-separated category path on its own line.

    A ``<br/>`` is inserted after every ``" > "`` separator and the result is
    wrapped in ``<p class="product-type">``.
    """
    levels = product_type.split(" > ")
    stacked = " > <br/>".join(levels)
    return "".join(('<p class="product-type">', stacked, '</p>'))

def is_new_formatter(is_new = 'new-product'):
    """Colour a status label: green for ``'active'``, red for anything else."""
    color = 'green;' if is_new == 'active' else 'red;'
    return "".join(('<span style="color:', color, '">', is_new, '</span>'))

def has_spend_2022_formatter(has_spend):
    """Colour a spend-status label: green for ``'normal'``, red for anything else."""
    color = 'green;' if has_spend == 'normal' else 'red;'
    return "".join(('<span style="color:', color, '">', has_spend, '</span>'))

# Per-column renderers; escape=False keeps the markup they return intact in the table.
column_formatters = {
    "img_link": path_to_image_html,
    "product_no": pno_to_link,
    "is_active_2022": is_new_formatter,
    "has_spend_2022": has_spend_2022_formatter,
    "product_type": prettier_category,
}
html = html_df.to_html(escape=False, formatters=column_formatters)

# display(HTML(html))

root
 |-- product_no: string (nullable = true)
 |-- days_has_spend_2021: long (nullable = true)
 |-- ads_using_this_product_2021: long (nullable = true)
 |-- impressions_2021: long (nullable = true)
 |-- clicks_2021: long (nullable = true)
 |-- spend_2021: double (nullable = true)
 |-- spend_gap_2021_2022: double (nullable = true)
 |-- is_active_2022: string (nullable = true)
 |-- why_inactive_2022: string (nullable = true)
 |-- product_type: string (nullable = true)
 |-- image_link: string (nullable = true)
 |-- suppliyer_type: string (nullable = true)
 |-- imp_30d: long (nullable = true)
 |-- imp_detail_30d: long (nullable = true)
 |-- illegal_tags: string (nullable = true)
 |-- product_no_2022: string (nullable = true)
 |-- days_has_spend_2022: long (nullable = true)
 |-- ads_using_this_product_2022: long (nullable = true)
 |-- impressions_2022: long (nullable = true)
 |-- clicks_2022: long (nullable = true)
 |-- spend_2022: double (nullable = true)



  from IPython.core.display import display, HTML


In [27]:
from pathlib import Path
# Current user's home directory as a plain string; the report file path is built from it.
home = str(Path.home())

def table_html_to_formatted_doc(html, date = last_1d):
    """Wrap an HTML table in a complete styled page and write it to disk.

    Args:
        html: HTML fragment (the rendered table) placed in the body after the heading.
        date: Date string used in the page title and in the output file name.
            Defaults to ``last_1d`` as it was when this function was defined.

    Returns:
        Path of the written file, as a string.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    title = 'Facebook DA 2022 has no spend but 2021 spend-well products - {date}'.format(date = date)
    # The charset declaration matches the UTF-8 encoding used when writing the
    # file, so non-ASCII cell values display correctly in the browser.
    html = """
    <!DOCTYPE html>
<html lang="en-US">
  <head>
    <meta charset="utf-8">
    <title>""" + title + """</title>
    <style>
    table, th, td {
    border: 1px solid black;
    border-collapse: collapse;
    }
    th, td {
    padding: 2px 10px;
    text-align: left;
    }
    </style>
  </head>
  <body><h1>"""+title+"</h1>"+html+"</body></html>"

    file = home + '/work/html/fb-no-spend-2022-products-{date}.html'.format(date=date)
    print(file)

    # Explicit encoding avoids depending on the platform default; the context
    # manager closes the handle even if the write fails.
    with open(file, "w", encoding="utf-8") as f:
        f.write(html)

    return file
    
# Write the report page using the default date; as the cell's last expression,
# the returned file path is also echoed in the output.
table_html_to_formatted_doc(html)

/home/jovyan/work/html/fb-no-spend-2022-products-2022-11-02.html


'/home/jovyan/work/html/fb-no-spend-2022-products-2022-11-02.html'

In [16]:
# Summary by 2022 spend status and catalog status: product count, spend totals,
# recent impressions, and each group's share of the overall totals.
whole_table_window = W.partitionBy(F.lit(1))  # one partition covering every group

(
    df.withColumn(
        "has_spend_2022",
        F.expr("case when spend_gap_2021_2022/spend_2021 >=0.95 then 'no-spend-in-2022' else 'normal' end"),
    )
    .groupBy("has_spend_2022", "is_active_2022")
    .agg(
        F.countDistinct("product_no").alias("pids"),
        F.round(F.sum("spend_2021"), 2).alias("total_spend_2021"),
        F.round(F.sum("spend_2022"), 2).alias("total_spend_2022"),
        F.round(F.sum("imp_30d"), 2).alias("total_imp_30d_2022"),
    )
    .withColumn(
        "total_spend_2021_%",
        F.round(100.00 * F.col("total_spend_2021") / F.sum('total_spend_2021').over(whole_table_window), 2),
    )
    .withColumn(
        "total_imp_30d_2022_%",
        F.round(100.00 * F.col("total_imp_30d_2022") / F.sum('total_imp_30d_2022').over(whole_table_window), 2),
    )
    .orderBy("has_spend_2022", "is_active_2022")
    .show(truncate=False)
)

+----------------+-------------------------+----+----------------+----------------+------------------+------------------+--------------------+
|has_spend_2022  |is_active_2022           |pids|total_spend_2021|total_spend_2022|total_imp_30d_2022|total_spend_2021_%|total_imp_30d_2022_%|
+----------------+-------------------------+----+----------------+----------------+------------------+------------------+--------------------+
|no-spend-in-2022|active                   |535 |538777.59       |1443.05         |6488018           |45.23             |18.65               |
|no-spend-in-2022|in-active                |274 |290024.03       |57.36           |2212076           |24.35             |6.36                |
|no-spend-in-2022|not-found-in-catalog-2022|202 |166536.09       |null            |348               |13.98             |0.0                 |
|normal          |active                   |241 |184016.89       |116473.91       |24665935          |15.45             |70.92               |

In [8]:
# Print df's column names and types as a quick schema check.
df.printSchema()

root
 |-- product_no: string (nullable = true)
 |-- days_has_spend_2021: long (nullable = true)
 |-- ads_using_this_product_2021: long (nullable = true)
 |-- impressions_2021: long (nullable = true)
 |-- clicks_2021: long (nullable = true)
 |-- spend_2021: double (nullable = true)
 |-- spend_gap_2021_2022: double (nullable = true)
 |-- is_active_2022: string (nullable = true)
 |-- product_type: string (nullable = true)
 |-- image_link: string (nullable = true)
 |-- suppliyer_type: string (nullable = true)
 |-- imp_30d: long (nullable = true)
 |-- imp_detail_30d,: long (nullable = true)
 |-- product_no_2022: string (nullable = true)
 |-- days_has_spend_2022: long (nullable = true)
 |-- ads_using_this_product_2022: long (nullable = true)
 |-- impressions_2022: long (nullable = true)
 |-- clicks_2022: long (nullable = true)
 |-- spend_2022: double (nullable = true)

