In [31]:
import json
from os.path import abspath
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W
from pyspark.sql.types import MapType,StringType,ArrayType


from datetime import datetime
from datetime import timedelta

# Reference dates taken from the local clock, formatted as YYYY-MM-DD.
today = datetime.now().strftime("%Y-%m-%d")
yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")

print(today, yesterday, "starting tasks at:", datetime.now())

# pandas display settings: show up to 1000 rows, render floats with two decimals.
pd.set_option('display.max_rows', 1000)
pd.options.display.float_format = '{:.2f}'.format

# Spark SQL warehouse directory, resolved to an absolute path.
warehouse_location_path = '/home/jovyan/work/spark-warehouse'
warehouse_location = abspath(warehouse_location_path)
print(warehouse_location)

# Hive-enabled Spark session; getOrCreate() reuses an existing session if one is running.
spark = (
    SparkSession.builder
    .appName("shopee-category-search-extract")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.debug.maxToStringFields", 200)
    .config("spark.sql.debug.maxToStringFields", 2000)
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true -Xms4096m")
    .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true -Xms4096m")
    .enableHiveSupport()
    .getOrCreate()
)

# Set after session creation, as a runtime SQL conf.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")


2022-12-12 2022-12-11 starting tasks at: 2022-12-12 12:13:57.789915
/home/jovyan/work/spark-warehouse


In [32]:
import prestodb

# Column labels applied to the Presto result rows, one per selected column and
# in the same order as the query's SELECT list.
cols = [
    'date_bjt',
    'pid',
    'image_link',
    'is_zero_suppliyer',
    'product_type',
    'click_15d',
    'last_15d_views',
    'last_7d_views',
    'add_15d',
    'sales_15d',
    'sales',
    'is_new',
    'create_date',
    'gmv',
    'gmv_7d',
    'gmv_per_view',
    'is_online',
]

print(cols)

def get_presto_data(sql):
    """Run `sql` on the Presto `hive` catalog / `marketing` schema and return all rows.

    Args:
        sql: Query text, passed unchanged to the cursor's execute().

    Returns:
        The value of the cursor's fetchall(): the complete result set, held in memory.

    Raises:
        Any exception raised while executing or fetching propagates to the
        caller after the connection has been closed.
    """
    conn = prestodb.dbapi.connect(
        host='ec2-54-218-99-163.us-west-2.compute.amazonaws.com',
        port=8889,
        user='root',
        catalog='hive',
        schema='marketing',
    )
    try:
        cur = conn.cursor()
        cur.execute(sql)
        return cur.fetchall()
    finally:
        # Close on the failure path too, so a bad statement does not leave the
        # connection open.
        conn.close()

['date_bjt', 'pid', 'image_link', 'is_zero_suppliyer', 'product_type', 'click_15d', 'last_15d_views', 'last_7d_views', 'add_15d', 'sales_15d', 'sales', 'is_new', 'create_date', 'gmv', 'gmv_7d', 'gmv_per_view', 'is_online']


In [33]:
def get_last_x_day_str(x = 1):
    """Return the local date `x` days before now as a 'YYYY-MM-DD' string."""
    target_day = datetime.now() - timedelta(days=x)
    return target_day.strftime('%Y-%m-%d')

# Reference dates. Only last_1d and last_2d are substituted into the query
# below; last_60d and last_7d are not used in this cell.
last_60d = get_last_x_day_str(60)
last_7d = get_last_x_day_str(7)
last_2d = get_last_x_day_str(2)
last_1d = get_last_x_day_str(1)

# Raw string: the regex contains `\d`, which is not a valid Python escape
# sequence. The text sent to Presto is byte-identical to the non-raw form.
# The product profile is read for date_id = last_2d, the catalog snapshot for
# dt = last_1d, and the page-view counts use the server-side current_date.
sql=r"""
select
    date_id as date_bjt,
    pid,
    cast(pid as varchar) || '|' || image_link as image_link,
    case when write_uid = 8 then '0货源' else write_name end as is_zero_suppliyer,
    product_type,
    click_15d,
    last_15d_views,
    last_7d_views,
    add_15d,
    sales_15d,
    sales,
    case when cast(sales_7d as int) * 100 / cast(sales as int) >= 70 then 'new-product' else 'not-new' end as is_new,
    create_date,
    gmv,
    gmv_7d,
    round(cast(gmv_7d as double) / cast(last_7d_views as int), 3) as gmv_per_view,
    is_online
from
    jiayundw_dm.product_profile_df a
    left join(
        select id,image_link,product_type
        from marketing.facebook_catalog_app_main
        where dt = '{last_1d}'
    ) b on a.pno = b.id
    left join(
        select 
            cast(split(regexp_extract(url,'(items|product)/\d+',0),'/')[2] as bigint) as pid_from_url,
            count(case when date_id>=date_format(current_date - interval '7' day,'%Y-%m-%d') then 1 else null end) as last_7d_views,
            count(case when date_id>=date_format(current_date - interval '15' day,'%Y-%m-%d') then 1 else null end) as last_15d_views
        from dw_dwd.flow_user_trace_details_da
        where date_id>=date_format(current_date - interval '15' day,'%Y-%m-%d')
        and regexp_like(mid, '(10|8|16).5$')
        and event_type = 'pageview'
        group by 1
    )c on a.pid=c.pid_from_url
where
    date_id = '{last_2d}'
    and cast(sales_15d as int) > 0
    and cast(imp_detail_pv as int) > 0
order by
    cast(gmv_7d as double) desc
limit
    2000
""".format(last_1d=last_1d,last_2d=last_2d)

print("getting data with sql:\n",sql)

rows = get_presto_data(sql)
if not rows:
    # rows[0] below would otherwise fail with a bare IndexError that says
    # nothing about the cause.
    raise ValueError("Presto query returned no rows for date_id = '{}'".format(last_2d))
print("how many data:", len(rows),"\n sample line:", rows[0])

# Distribute the fetched rows and label the columns with `cols`.
rdd = spark.sparkContext.parallelize(rows)
df=rdd.toDF(cols)

df.printSchema()

getting data with sql:
 
select
    date_id as date_bjt,
    pid,
    cast(pid as varchar) || '|' || image_link as image_link,
    case when write_uid = 8 then '0货源' else write_name end as is_zero_suppliyer,
    product_type,
    click_15d,
    last_15d_views,
    last_7d_views,
    add_15d,
    sales_15d,
    sales,
    case when cast(sales_7d as int) * 100 / cast(sales as int) >= 70 then 'new-product' else 'not-new' end as is_new,
    create_date,
    gmv,
    gmv_7d,
    round(cast(gmv_7d as double) / cast(last_7d_views as int), 3) as gmv_per_view,
    is_online
from
    jiayundw_dm.product_profile_df a
    left join(
        select id,image_link,product_type
        from marketing.facebook_catalog_app_main
        where dt = '2022-12-11'
    ) b on a.pno = b.id
    left join(
        select 
            cast(split(regexp_extract(url,'(items|product)/\d+',0),'/')[2] as bigint) as pid_from_url,
            count(case when date_id>=date_format(current_date - interval '7' day,'%Y-%m-%d

In [34]:
# Collect the Spark result into a pandas frame for HTML rendering.
html_df = df.toPandas()

# Import from the public IPython.display module; IPython.core.display is a
# deprecated location for these names and emits a DeprecationWarning.
from IPython.display import display, HTML
import re

def path_to_image_html(path):
    """Render a "<pid>|<image url>" cell as a thumbnail linking to the product page.

    Args:
        path: String of the form "<pid>|<image url>". Only the first '|' is
            treated as the separator, so a URL containing '|' is kept whole.

    Returns:
        An <a><img/></a> HTML snippet, or '' when `path` is not a string
        (e.g. None or NaN for a missing image) or contains no '|'.
    """
    if not isinstance(path, str):
        return ''
    pid, separator, img = path.partition("|")
    if not separator:
        return ''
    return '<a target="_blank" href="https://www.wholeeshopping.com/product/{pid}"><img src="{img}" width="100" loading="lazy" /></a>'.format(pid=pid,img=img)

def pno_to_link(pno):
    """Wrap a product number in an anchor that opens its product page in a new tab."""
    product_url = 'https://www.wholeeshopping.com/product/{pno}'.format(pno=pno)
    anchor_template = '<a href="{link}" target="_blank">{pno}</a>'
    return anchor_template.format(link=product_url, pno=pno)

def prettier_category(product_type = ""):
    """Wrap a category path in a paragraph, breaking the line after each " > ".

    Args:
        product_type: Category path whose levels are separated by " > ".
            Any non-string value (e.g. None or NaN for a missing category)
            is rendered as an empty paragraph.

    Returns:
        '<p class="product-type">...</p>' with '<br/>' inserted after each " > ".
    """
    if not isinstance(product_type, str):
        product_type = ""
    return '<p class="product-type">'+product_type.replace(" > ", " > <br/>")+'</p>'

def is_new_formatter(is_new = 'new-product'):
    """Colour the is_new flag: red for 'new-product', green for any other value."""
    color = 'red;' if is_new == 'new-product' else 'green;'
    return ''.join(['<span style="color:', color, '">', is_new, '</span>'])

def is_online_formatter(is_online = 0):
    """Render the is_online flag as a green 'in stock' or red 'out of stock' label."""
    # NOTE(review): the comparison is against the integer 1, so a string '1'
    # renders as 'out of stock'. Confirm the column's actual type.
    if is_online == 1:
        color, label = 'green;', 'in stock'
    else:
        color, label = 'red;', 'out of stock'
    return ''.join(['<span style="color:', color, '">', label, '</span>'])

# Render the frame as an HTML table. escape=False leaves cell text unescaped,
# so the markup returned by the per-column formatters reaches the page as HTML.
html = html_df.to_html(
    escape=False,
    formatters={
        'image_link': path_to_image_html,
        'pid': pno_to_link,
        'is_new': is_new_formatter,
        'is_online': is_online_formatter,
        'product_type': prettier_category,
    },
)

# display(HTML(html))

  from IPython.core.display import display, HTML


In [35]:
from pathlib import Path
# Current user's home directory, kept as a plain string because the output
# path is built from it by string concatenation.
_home_dir = Path.home()
home = str(_home_dir)

def table_html_to_formatted_doc(html, date = last_1d):
    """Wrap an HTML table in a standalone page and write it under <home>/work/html.

    Args:
        html: HTML fragment placed in the page body after the heading.
        date: Date string used in the page title and the file name. The
            default is bound to the value of `last_1d` at definition time.

    Returns:
        Path of the written file, as a string.
    """
    # NOTE(review): the title says "top-1000" while the query in this notebook
    # uses `limit 2000`. Confirm which is intended.
    title = 'Wholee Products top-1000 - {date}'.format(date = date)
    document = """
    <!DOCTYPE html>
<html lang="en-US">
  <head>
    <meta charset="utf-8">
    <title>""" + title + """</title>
    <style>
    table, th, td {
    border: 1px solid black;
    border-collapse: collapse;
    }
    th, td {
    padding: 2px 10px;
    text-align: left;
    }
    </style>
  </head>
  <body><h1>"""+title+"</h1>"+html+"</body></html>"

    file = home + '/work/html/wholee-top-1000-{date}.html'.format(date=date)
    print(file)

    # Create the target directory when it is missing instead of failing in open().
    Path(file).parent.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8, matching the meta charset above, so non-ASCII cell text is
    # written the same way whatever the locale default is. `with` closes the
    # file even if the write fails.
    with open(file, "w", encoding="utf-8") as out:
        out.write(document)

    return file

table_html_to_formatted_doc(html)

/home/jovyan/work/html/wholee-top-1000-2022-12-11.html


'/home/jovyan/work/html/wholee-top-1000-2022-12-11.html'