In [1]:
import json
from os.path import abspath
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
import pandas as pd
import pyspark.sql.functions as F
import pyspark.sql.window as W
from pyspark.sql.types import MapType,StringType,ArrayType


from datetime import datetime
from datetime import timedelta

# Run dates as YYYY-MM-DD strings, taken from the kernel's local clock.
today = datetime.now().strftime("%Y-%m-%d")
yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
print(today, yesterday, datetime.now())

# Pandas display options: show up to 1000 rows, floats with two decimals.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.float_format', '{:.2f}'.format)

# Spark SQL warehouse directory; also the root under which this notebook saves parquet data.
warehouse_location_path = '/home/jovyan/work/spark-warehouse'
warehouse_location = abspath(warehouse_location_path)
print(warehouse_location)

# The same JVM flags are passed to both the executors and the driver.
jvm_options = "-Dio.netty.tryReflectionSetAccessible=true -Xms4096m"

spark = (
    SparkSession.builder
    .appName("shopee-category-search-extract")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.debug.maxToStringFields", 200)
    .config("spark.sql.debug.maxToStringFields", 2000)
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.extraJavaOptions", jvm_options)
    .config("spark.driver.extraJavaOptions", jvm_options)
    .enableHiveSupport()
    .getOrCreate()
)

# With "dynamic" mode an overwrite replaces only the partitions present in the written data.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")


2022-10-21 2022-10-20 2022-10-21 10:02:39.605913
/home/jovyan/work/spark-warehouse


In [2]:
import prestodb

# Column names given to the Presto result rows when they are turned into a Spark
# DataFrame; names are assigned by position, so the order must follow the query's select list.
cols = [
    'product_no',
    'day_has_session',
    'last_day_has_session',
    'total_impressions',
    'daily_impression',
    'clicks',
    'users',
    'cost',
    'gmv',
    'total_web_purchase',
    'add_cart',
    'avg_user_gmv',
    'last_7d_cost',
    'last_7d_gmv',
    'image_link',
    'product_type',
    'is_zero_suppliyers',
]

def get_presto_data(sql):
    """Run ``sql`` on the Presto cluster and return all result rows.

    Args:
        sql: complete SQL statement to execute.

    Returns:
        The value of ``cursor.fetchall()`` for the statement.

    The connection is closed even when executing or fetching raises, so a
    failing query does not leave a connection open in the kernel.
    """
    # NOTE(review): host, port and user are hard-coded here; consider moving them to
    # configuration / environment variables.
    conn = prestodb.dbapi.connect(
        host='ec2-54-218-99-163.us-west-2.compute.amazonaws.com',
        port=8889,
        user='root',
        catalog='hive',
        schema='marketing',
    )
    try:
        cur = conn.cursor()
        cur.execute(sql)
        return cur.fetchall()
    finally:
        conn.close()

In [3]:
def get_last_x_day_str(x = 1):
    """Return the date ``x`` days before the current local time as ``YYYY-MM-DD``."""
    target_day = datetime.now() - timedelta(days=x)
    return target_day.strftime('%Y-%m-%d')

# Date boundaries (YYYY-MM-DD strings) substituted into the query below.
last_60d = get_last_x_day_str(60)
last_7d = get_last_x_day_str(7)
last_1d = get_last_x_day_str(1)

# Per-product Facebook ad metrics for dt >= last_60d (subquery a), left-joined to the
# catalog snapshot (b) and the product-profile snapshot (c) of last_1d. Only products
# with at least 100 summed impressions are returned.
# The fourth output column is named `impression` here; it becomes `total_impressions`
# when the rows are labelled positionally with `cols`.
# NOTE(review): in a LIKE pattern `_` matches any single character, so e.g. '%_msite_%'
# also matches names without literal underscores around "msite" — confirm whether
# escaped underscores were intended.
# NOTE(review): the `gmv` output column sums `web_gmv`, while avg_user_gmv and
# last_7d_gmv sum `gmv` — confirm this mix of source columns is intended.
# NOTE(review): subquery c groups by (pno, is_zero_suppliyers); a pno that has rows with
# and without write_uid = 8 would produce two joined rows per product — confirm uniqueness.
sql="""
select a.*
    ,b.image_link
    ,b.product_type
    ,c.is_zero_suppliyers
from
(select 
    product_no,
    count(distinct dt) as day_has_session,
    max(dt) as last_day_has_session,
    sum(cast(impression as int)) as impression,
    sum(cast(impression as int))/count(distinct dt) as daily_impression,
    sum(cast(click_pv as int)) as clicks,
    sum(cast(users as int)) as users,
    sum(cast(cost as double)) as cost,
    sum(cast(web_gmv as double)) as gmv,
    sum(cast(web_purchase as int)) as total_web_purchase,
    sum(cast(add_cart_pv as int)) as add_cart,
    round(1.000*sum(cast(gmv as double))/sum(cast(users as int)),3) as avg_user_gmv,
    round(sum(case when dt >= '{last_7d}' then cast(cost as double) else 0.0 end),2)last_7d_cost,
    round(sum(case when dt >= '{last_7d}' then cast(gmv as double) else 0.0 end),2)last_7d_gmv
from marketing.ad_report_analysis_base a
where dt >= '{last_60d}'
    and ad_channel = 'facebook'
    and lower(campaign_name) not like '%_shopify_%'
    and lower(campaign_name) not like '%_deshopify_%'
    and lower(campaign_name) not like '%独立站%'
    and lower(campaign_name) like '%_msite_%'
    and cast(cost as double) > 0
    and cast(impression as int) > 0
group by 1)a
left join(
    select id,image_link,product_type
    from marketing.facebook_catalog_app_main
    where dt = '{last_1d}'
)b on a.product_no = b.id
left join(
    select pno,
        case when write_uid = 8 then 'zero-suppliyers' else 'other-source' end as is_zero_suppliyers,
        count(1) cnt
    from jiayundw_dm.product_profile_df
    where date_id = '{last_1d}'
    group by 1,2
)c on a.product_no = c.pno
where impression>=100
""".format(last_60d=last_60d,last_7d=last_7d,last_1d=last_1d)

print("getting data with sql:\n",sql)

rows = get_presto_data(sql)

# An empty result would otherwise surface as a bare IndexError on rows[0] below;
# stop here with a message that says what actually happened.
if not rows:
    raise ValueError("Presto query returned no rows; nothing to load or write")

print("how many data:", len(rows),"\n sample line:", rows[0])

# Column names are assigned by position from `cols`; Spark infers the column types.
rdd = spark.sparkContext.parallelize(rows)
df=rdd.toDF(cols)

# Tag every row with the run date; it is also the partition column of the saved data.
df=df.withColumn("date_written", F.lit(datetime.now().strftime('%Y-%m-%d')))

df.printSchema()
df.write.format("parquet").mode("overwrite").partitionBy("date_written").save(warehouse_location_path + "/fb-data/")

getting data with sql:
 
select a.*
    ,b.image_link
    ,b.product_type
    ,c.is_zero_suppliyers
from
(select 
    product_no,
    count(distinct dt) as day_has_session,
    max(dt) as last_day_has_session,
    sum(cast(impression as int)) as impression,
    sum(cast(impression as int))/count(distinct dt) as daily_impression,
    sum(cast(click_pv as int)) as clicks,
    sum(cast(users as int)) as users,
    sum(cast(cost as double)) as cost,
    sum(cast(web_gmv as double)) as gmv,
    sum(cast(web_purchase as int)) as total_web_purchase,
    sum(cast(add_cart_pv as int)) as add_cart,
    round(1.000*sum(cast(gmv as double))/sum(cast(users as int)),3) as avg_user_gmv,
    round(sum(case when dt >= '2022-10-14' then cast(cost as double) else 0.0 end),2)last_7d_cost,
    round(sum(case when dt >= '2022-10-14' then cast(gmv as double) else 0.0 end),2)last_7d_gmv
from marketing.ad_report_analysis_base a
where dt >= '2022-08-22'
    and ad_channel = 'facebook'
    and lower(campaign_nam

In [4]:
# Spark SQL CASE expression that labels each product. Branches are evaluated top to
# bottom, so the first matching rule wins; labels starting with 'poor' are the ones
# the report filters on.
model_expr="""
CASE WHEN total_impressions >= 5000 AND total_web_purchase <= 0 THEN 'poor-no-trans'
WHEN imp_per_purchase >= 5000 AND total_web_purchase >4 AND daily_impression >= 7000 THEN 'poor-with-trans:top'
WHEN imp_per_purchase >= 5000 AND total_web_purchase >4 AND daily_impression >= 3000 THEN 'poor-with-trans:middle'
WHEN imp_per_purchase >= 5000 AND total_web_purchase >4 THEN 'poor-with-trans:tail'
WHEN total_impressions >= 3000 
    AND (add_cart <=0 or total_impressions/add_cart >= 500)
    AND total_web_purchase <= 0 THEN 'poor-low-site-usage'
WHEN total_web_purchase > 0 AND daily_impression >= 7000 THEN 'normal-with-trans:top'
WHEN total_web_purchase > 0 THEN 'normal-with-trans:middle-tail'
ELSE 'normal-no-trans-yet' END
"""

# Derived columns, added in this order because `is_poor` reads `imp_per_purchase`.
derived_columns = [
    ("roi", "gmv/cost"),
    ('imp_per_purchase', 'case when total_web_purchase>0 then total_impressions/total_web_purchase else 0 end'),
    ("last_7d_roi", "last_7d_gmv/last_7d_cost"),
    ('is_poor', model_expr),
]
for column_name, column_sql in derived_columns:
    df = df.withColumn(column_name, F.expr(column_sql))

df.printSchema()

root
 |-- product_no: string (nullable = true)
 |-- day_has_session: long (nullable = true)
 |-- last_day_has_session: string (nullable = true)
 |-- total_impressions: long (nullable = true)
 |-- daily_impression: long (nullable = true)
 |-- clicks: long (nullable = true)
 |-- users: long (nullable = true)
 |-- cost: double (nullable = true)
 |-- gmv: double (nullable = true)
 |-- total_web_purchase: long (nullable = true)
 |-- add_cart: long (nullable = true)
 |-- avg_user_gmv: double (nullable = true)
 |-- last_7d_cost: double (nullable = true)
 |-- last_7d_gmv: double (nullable = true)
 |-- image_link: string (nullable = true)
 |-- product_type: string (nullable = true)
 |-- is_zero_suppliyers: string (nullable = true)
 |-- date_written: string (nullable = false)
 |-- roi: double (nullable = true)
 |-- imp_per_purchase: double (nullable = true)
 |-- last_7d_roi: double (nullable = true)
 |-- is_poor: string (nullable = false)



In [5]:
# Report rows: the 100 products labelled 'poor*' (excluding 'Shoes*' categories) with
# the highest spend in the last 7 days.
# `limit(100)` runs in Spark, so only those 100 rows are collected to the driver; the
# earlier `.toPandas().head(100)` collected every matching row before truncating.
# NOTE(review): `product_type not like 'Shoes%'` is NULL for a NULL product_type, so
# products without a catalog match are excluded as well — confirm that is intended.
html_df = (
    df.where("is_poor like 'poor%' and last_7d_cost >0 and product_type not like 'Shoes%'")
    .selectExpr(
        "product_no",
        "round(cost,2) as all_time_cost",
        "last_7d_cost",
        # 'new-product' when at least 80% of the product's total cost falls in the last 7 days.
        "case when last_7d_cost/cost >=0.8 then 'new-product' else 'not-new' end as is_new",
        "roi",
        # Rewrites '.jpg' to '_350x350.jpg' (presumably a resized image variant — confirm).
        "replace(image_link,'.jpg','_350x350.jpg') as img_link",
        "is_zero_suppliyers",
        "is_poor as is_poor_performence",
        "product_type",
    )
    .orderBy(F.desc(F.col("last_7d_cost")))
    .limit(100)
    .toPandas()
)

# `display` and `HTML` are part of the public IPython.display API; importing them from
# IPython.core.display is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML
import re

def path_to_image_html(path):
    """Render an image URL as a 100px-wide ``<img>`` tag for the HTML table.

    Args:
        path: image URL; may be missing (``None``/NaN) when a product has no image link.

    Returns:
        The ``<img>`` markup, or an empty string when ``path`` is not a non-empty
        string (instead of raising ``TypeError`` on the string concatenation).
    """
    # Local import: the notebook also binds the name `html` to the rendered table markup.
    from html import escape

    if not isinstance(path, str) or not path:
        return ''
    # Escape the URL so quotes or '&' in it cannot break out of the src attribute.
    return '<img src="' + escape(path, quote=True) + '" width="100" />'

def pno_to_link(pno):
    """Render a product number as a link to its product page.

    The first run of digits in ``pno`` is used as the id in the product URL.

    Args:
        pno: product number string, e.g. ``'WDR025210602N'``.

    Returns:
        An ``<a>`` tag that opens the product page in a new tab, or ``pno`` unchanged
        when it contains no digits (instead of raising ``IndexError``).
    """
    first_digits = re.search(r'\d+', pno)
    if first_digits is None:
        # Nothing to link to: show the plain product number.
        return pno
    link = 'https://www.wholeeshopping.com/product/' + first_digits.group(0)
    return '<a href="'+ link + '" target="_blank" >' + pno + '</a>'

def prettier_category(product_type = ""):
    """Wrap a category path in ``<p class="product-type">`` with a ``<br/>`` after each ``" > "`` separator."""
    levels = product_type.split(" > ")
    return '<p class="product-type">' + " > <br/>".join(levels) + '</p>'

def is_new_formatter(is_new = 'new-product'):
    """Render the ``is_new`` label as a coloured ``<span>``: red for 'new-product', green for anything else."""
    color = 'red;' if is_new == 'new-product' else 'green;'
    return ''.join(['<span style="color:', color, '">', is_new, '</span>'])

# Per-column renderers that turn raw cell values into HTML snippets.
column_formatters = {
    'img_link': path_to_image_html,
    'product_no': pno_to_link,
    'is_new': is_new_formatter,
    'product_type': prettier_category,
}

# escape=False keeps the markup produced by the formatters from being HTML-escaped.
html = html_df.to_html(escape=False, formatters=column_formatters)

display(HTML(html))

  from IPython.core.display import display, HTML


Unnamed: 0,product_no,all_time_cost,last_7d_cost,is_new,roi,img_link,is_zero_suppliyers,is_poor_performence,product_type
0,XXX021707564N,16219.15,4018.27,not-new,0.7,,other-source,poor-with-trans:top,Men's Clothing > Sweatshirts & Hoodiess
1,MWB022333318N,25923.34,2981.16,not-new,1.17,,other-source,poor-with-trans:top,Men's Clothing > Waistcoat > Men's Casual Vests
2,MOV021797060N,4533.09,2154.89,not-new,1.07,,other-source,poor-with-trans:top,Men's Clothing > Overcoats
3,WDR025210602N,1912.1,1909.71,new-product,0.36,,zero-suppliyers,poor-with-trans:top,Women's Clothing > Dress
4,WDR025225077N,2011.74,1698.47,new-product,0.42,,zero-suppliyers,poor-with-trans:top,Women's Clothing > Dress
5,WDR023485340N,1547.35,1537.72,new-product,0.43,,zero-suppliyers,poor-with-trans:top,Women's Clothing > Dress
6,MJA022991875N,6197.89,1275.21,not-new,0.97,,zero-suppliyers,poor-with-trans:top,Men's Clothing > Down Jackets
7,WDR023436849N,2401.8,1061.53,not-new,0.33,,other-source,poor-with-trans:tail,Women's Clothing > Dress
8,MTT024207352N,16734.33,1026.07,not-new,0.87,,other-source,poor-with-trans:top,Men's Clothing > T-shirts
9,CBA023850566N,979.53,979.53,new-product,0.16,,zero-suppliyers,poor-with-trans:top,Clothing Accessories & Jewelry > Bracelets & Accessories > Other Finger Rings


In [6]:
from pathlib import Path
# Current user's home directory as a string; the report file path is built from it.
home = str(Path.home())

def table_html_to_formatted_doc(html, date = last_1d):
    """Wrap table markup in a standalone HTML page and save it under ``<home>/work/html``.

    Args:
        html: HTML markup to place inside the page body.
        date: label used in the page title and in the file name. The default is
            bound to ``last_1d`` when the function is defined, not when it is called.

    Returns:
        Path of the written file, as a string.
    """
    title = 'Facebook Poor Performence Products - {date}'.format(date = date)
    # The page declares utf-8 to match the encoding the file is written with below.
    html = """
    <!DOCTYPE html>
<html lang="en-US">
  <head>
    <meta charset="utf-8">
    <title>""" + title + """</title>
    <style>
    table, th, td {
    border: 1px solid black;
    border-collapse: collapse;
    }
    th, td {
    padding: 2px 10px;
    text-align: left;
    }
    </style>
  </head>
  <body>"""+html+"</body></html>"

    file = home + '/work/html/fb-poor-performence-{date}.html'.format(date=date)
    print(file)

    # Create the output directory on first use instead of failing with FileNotFoundError.
    Path(file).parent.mkdir(parents=True, exist_ok=True)

    # Explicit encoding keeps the output independent of the kernel's locale; the
    # context manager closes the file even if the write fails.
    with open(file, "w", encoding="utf-8") as f:
        f.write(html)

    return file
    
# Write the report using the default date; the returned file path is the cell's output.
table_html_to_formatted_doc(html)

/home/jovyan/work/html/fb-poor-performence-2022-10-20.html


'/home/jovyan/work/html/fb-poor-performence-2022-10-20.html'