In [1]:
import json
from os.path import abspath
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
import pandas as pd

from datetime import datetime
from datetime import timedelta

# Read the clock once so `today` and `yesterday` are always exactly one day
# apart, even when this cell happens to run right at midnight.
_run_started_at = datetime.now()
today = _run_started_at.strftime("%Y-%m-%d")
yesterday = (_run_started_at - timedelta(days=1)).strftime("%Y-%m-%d")

print(today, yesterday)

# Notebook-wide pandas display settings: show up to 1000 rows and render
# floats with two decimals.
pd.set_option('display.max_rows', 1000)
pd.options.display.float_format = '{:.2f}'.format

# Directory handed to Spark as its SQL warehouse location.
warehouse_location = abspath('/home/jovyan/work/spark-warehouse')

print(warehouse_location)

# Hive-enabled Spark session shared by every later cell.
spark = (
    SparkSession.builder
    .appName("shopee-category-search-extract")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.debug.maxToStringFields", 200)
    .config("spark.sql.debug.maxToStringFields", 2000)
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true -Xms4096m")
    .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true -Xms4096m")
    .enableHiveSupport()
    .getOrCreate()
)

# Session-level partition overwrite mode used by the later table write.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")


2021-12-31 2021-12-30
/home/jovyan/work/spark-warehouse


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/31 06:30:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## 先解析日志文件

In [2]:
# Execute the companion extract notebook from inside this kernel before any of
# the analysis below runs.
# NOTE(review): presumably this is the step that parses the raw log files (per
# the markdown heading above) and produces the parquet data read later --
# confirm in that notebook.
%run ./shopee-category-search-extract.ipynb

+--------------+
|current_date()|
+--------------+
|    2021-12-31|
+--------------+

[('spark.driver.host', 'ad1515b12d9c'), ('spark.driver.memory', '4g'), ('spark.executor.memory', '4g'), ('spark.driver.port', '37753'), ('spark.executor.extraJavaOptions', '-Dio.netty.tryReflectionSetAccessible=true -Xms4096m'), ('spark.executor.id', 'driver'), ('spark.sql.debug.maxToStringFields', '2000'), ('spark.app.startTime', '1640932225250'), ('spark.app.id', 'local-1640932226506'), ('spark.driver.extraJavaOptions', '-Dio.netty.tryReflectionSetAccessible=true -Xms4096m'), ('spark.sql.catalogImplementation', 'hive'), ('spark.rdd.compress', 'True'), ('spark.debug.maxToStringFields', '200'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.submit.pyFiles', ''), ('spark.submit.deployMode', 'client'), ('spark.sql.warehouse.dir', 'file:/home/jovyan/work/spark-warehouse'), ('spark.app.name', 'shopee-category-search-extract'), ('spark.ui.showConsoleProgress', 'true')]

                                                                                

df size: 1633
root
 |-- adjust: struct (nullable = true)
 |    |-- count: long (nullable = true)
 |-- algorithm: string (nullable = true)
 |-- day: string (nullable = true)
 |-- disclaimer_infos: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- food_item_info: struct (nullable = true)
 |    |-- total_count: long (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- bff_item_tracking: string (nullable = true)
 |    |    |-- item_basic: struct (nullable = true)
 |    |    |    |-- add_on_deal_info: struct (nullable = true)
 |    |    |    |    |-- add_on_deal_id: long (nullable = true)
 |    |    |    |    |-- add_on_deal_label: string (nullable = true)
 |    |    |    |    |-- status: long (nullable = true)
 |    |    |    |    |-- sub_type: long (nullable = true)
 |    |    |    |-- badge_icon_type: long (nullable = true)
 |    |    |    |-- brand: string (nullable = true)
 |    |    |    |-- bun

21/12/31 06:30:45 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
21/12/31 06:30:45 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
21/12/31 06:30:50 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
21/12/31 06:30:50 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.21.0.2
21/12/31 06:30:51 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
21/12/31 06:31:12 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
21/12/31 06:31:12 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
21/12/31 06:31:12 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
21/12/31 06:31:12 W

Unnamed: 0,day,count
0,2021-12-31,1633


In [3]:
# Storage location that backs the `shopee_category_search_list_df` table.
shopee_category_search_list_df = "/home/jovyan/work/spark-warehouse/ods.db/shopee_category_search_list_df/"


def write_to_hive(df):
    """Save ``df`` as the parquet table ``shopee_category_search_list_df``.

    Sets the session's partition overwrite mode to "dynamic", then writes in
    overwrite mode, partitioned by ``day``, to the path held in the
    module-level ``shopee_category_search_list_df`` variable.
    """
    spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

    table_writer = df.write.mode("overwrite").partitionBy("day")
    table_writer = table_writer.option("path", shopee_category_search_list_df).format("parquet")
    table_writer.saveAsTable("shopee_category_search_list_df")
    

## 解析原始文件并存储到磁盘

In [4]:
import pyspark.sql.functions as F
from pyspark.sql.types import MapType,StringType,ArrayType


# Load the raw search responses that were stored as parquet.
df=spark.read.parquet("/home/jovyan/work/spark-warehouse/ods.db/shopee_category_search_raw_df")
print(df.columns)

# Explode the `items` array into one row per item (posexplode yields the
# element's position and the element itself) while keeping the request-level
# columns next to it.
# NOTE(review): 'pos' and 'col' are requested by name here although they are
# not among the df.columns printed above; presumably they are meant to be the
# default output names of posexplode -- confirm this select resolves as intended.
items_df=df.select('adjust', 'algorithm', 'disclaimer_infos', 'food_item_info', 'items', 'json_data', \
                   'low_result', 'need_next_search', 'nomore', 'refer', 'request_time', 'search_tracking', \
                   'show_disclaimer', 'sink_time', 'source', 'total_ads_count', 'total_count', 'url', \
                   'day', 'pos', 'col',F.posexplode(df.items))


# Flatten the exploded item struct (`col`) into plain columns, keeping a few
# request-level columns alongside.
items_exploded_df=items_df.selectExpr("request_time",
    "search_tracking",
    "show_disclaimer",
    "sink_time",
    "total_ads_count",
    "total_count",
    "url",
    "day",
    "pos",
    # Fields read directly from the item struct.
    "col.item_type as item_type",
    "col.itemid as itemid",
    "col.shopid as shopid",
    # item_basic.name is selected twice: here as display_name and again below as name.
    "col.item_basic.name as display_name",
    # NOTE(review): the next two expressions are wrapped in single quotes, which
    # Spark SQL reads as string literals, so both columns hold the constant
    # text instead of a struct field -- confirm whether that is intended.
    "'col.collection_id' as collection_id",
    "'col.campaign_stock' as campaign_stock",
    # Fields read from the nested item_basic struct.
    "col.item_basic.badge_icon_type as badge_icon_type",
    "col.item_basic.brand as brand",
    "col.item_basic.can_use_bundle_deal as can_use_bundle_deal",
    "col.item_basic.can_use_cod as can_use_cod",
    "col.item_basic.can_use_wholesale as can_use_wholesale",
    "col.item_basic.catid as catid",
    "col.item_basic.cb_option as cb_option",
    "col.item_basic.cmt_count as cmt_count",
    "col.item_basic.ctime as ctime",
    "col.item_basic.currency as currency",
    "col.item_basic.discount as discount",
    "col.item_basic.flag as flag",
    "col.item_basic.has_lowest_price_guarantee as has_lowest_price_guarantee",
    "col.item_basic.historical_sold as historical_sold",
    "col.item_basic.image as image",
    "col.item_basic.images as images",
    "col.item_basic.is_adult as is_adult",
    "col.item_basic.is_official_shop as is_official_shop",
    "col.item_basic.is_on_flash_sale as is_on_flash_sale",
    "col.item_basic.is_preferred_plus_seller as is_preferred_plus_seller",
    "col.item_basic.item_status as item_status",
    "col.item_basic.item_rating as item_rating",
    "col.item_basic.liked_count as liked_count",
    "col.item_basic.name as name",
    "col.item_basic.price as price",
    "col.item_basic.price_before_discount as price_before_discount",
    "col.item_basic.price_max as price_max",
    "col.item_basic.price_max_before_discount as price_max_before_discount",
    "col.item_basic.price_min as price_min",
    "col.item_basic.price_min_before_discount as price_min_before_discount",
    "col.item_basic.raw_discount as raw_discount",
    "col.item_basic.reference_item_id as reference_item_id",
    "col.item_basic.shop_location as shop_location",
    "col.item_basic.show_free_shipping as show_free_shipping",
    "col.item_basic.sold as sold",
    "col.item_basic.status as status",
    "col.item_basic.tier_variations as tier_variations",
    "col.item_basic.transparent_background_image as transparent_background_image",
    "col.item_basic.video_info_list as video_info_list",
    "col.item_basic.view_count as view_count")

# Spot-check one flattened row.
items_exploded_df.select("url","name").show(1,truncate=False)

# Counting triggers a full evaluation of the exploded frame.
print("processed data size:", items_exploded_df.count())


['adjust', 'algorithm', 'disclaimer_infos', 'food_item_info', 'items', 'json_data', 'low_result', 'need_next_search', 'nomore', 'refer', 'request_time', 'search_tracking', 'show_disclaimer', 'sink_time', 'source', 'total_ads_count', 'total_count', 'url', 'day']


                                                                                

+----------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------+
|url                                                                                                                                                 |name                                                                            |
+----------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------+
|https://shopee.sg/api/v4/search/search_items?by=sales&limit=60&match_id=11011433&newest=0&order=desc&page_type=search&scenario=PAGE_OTHERS&version=2|DUWEN 4ply Smooth Milk Fiber Knitting Crochet Yarn Milk Cotton Hand Knitted Yarn|
+-----------------------------------------------------------------------



processed data size: 1743813




In [5]:
from urllib.parse import urlparse, parse_qs
from pyspark.sql.types import MapType, StringType
def _first_query_values(url):
    """Return the URL's query parameters as a dict, keeping the first value of each key."""
    parsed_query = parse_qs(urlparse(url).query)
    return {key: values[0] for key, values in parsed_query.items()}


# Spark UDF producing a map<string,string> of query parameters from a URL.
extract_params = F.udf(_first_query_values, MapType(StringType(), StringType()))

# Attach the parsed query parameters to every exploded item row, then persist.
with_params_df = items_exploded_df.withColumn("params", extract_params(items_exploded_df.url))
write_to_hive(with_params_df)

# Row count per day of the table that was just written, newest day first.
rows_per_day_sql = """select day,count(1)
from shopee_category_search_list_df 
group by day 
order by day desc"""
spark.sql(rows_per_day_sql).show()






+----------+--------+
|       day|count(1)|
+----------+--------+
|2021-12-31|   97980|
|2021-12-30|   98080|
|2021-12-29|   97920|
|2021-12-28|   97919|
|2021-12-27|   97840|
|2021-12-24|   63540|
|2021-12-23|  150918|
|2021-12-22|  150858|
|2021-12-21|  150900|
|2021-12-20|  150899|
|2021-12-17|  189560|
|2021-12-16|  134640|
|2021-12-15|   91760|
|2021-12-14|   91799|
|2021-12-13|   79200|
|2021-12-10|   28560|
+----------+--------+



                                                                                

## 分析数据

In [6]:
from pyspark.sql.functions import when,col

df = spark.read.parquet(shopee_category_search_list_df)

# Marketplace URL prefix (SQL LIKE pattern) -> country code, checked in this order.
country_url_patterns = (
    ("https://shopee.sg/%", "SG"),
    ("https://shopee.co.id/%", "ID"),
    ("https://shopee.com.my/%", "MY"),
    ("https://shopee.co.th/%", "TH"),
    ("https://shopee.com.br/%", "BR"),
)

# Fold the patterns into a single when(...).when(...) chain; URLs that match
# none of them are labelled with the string 'None'.
country_expr = None
for url_pattern, country_code in country_url_patterns:
    matches_market = col("url").like(url_pattern)
    if country_expr is None:
        country_expr = when(matches_market, country_code)
    else:
        country_expr = country_expr.when(matches_market, country_code)

df_with_country = df.withColumn("country", country_expr.otherwise('None'))

# Expose the country-tagged rows to the SQL cells below.
df_with_country.createOrReplaceTempView("v_shopee_category_search_list_df")

country_day_counts_sql = """select day,country,count(1) as cnt
from v_shopee_category_search_list_df 
group by day,country"""
spark.sql(country_day_counts_sql).show()



+----------+-------+-----+
|       day|country|  cnt|
+----------+-------+-----+
|2021-12-16|     SG|53040|
|2021-12-16|     MY|42840|
|2021-12-16|     ID|38760|
|2021-12-24|     MY| 9240|
|2021-12-17|     ID|38660|
|2021-12-17|     SG|52980|
|2021-12-17|     TH|55080|
|2021-12-24|     TH|25020|
|2021-12-17|     MY|42840|
|2021-12-24|     SG|16080|
|2021-12-24|     BR|13200|
|2021-12-22|     SG|52998|
|2021-12-21|     TH|55020|
|2021-12-22|     TH|55080|
|2021-12-21|     MY|42840|
|2021-12-21|     SG|53040|
|2021-12-22|     MY|42780|
|2021-12-29|     SG|24480|
|2021-12-20|     TH|55079|
|2021-12-13|     SG|40440|
+----------+-------+-----+
only showing top 20 rows



                                                                                

In [7]:
# Load the category tree and expose it as a temp view.
# NOTE(review): the path string contains no {today}/{yesterday} placeholders,
# so the .format(...) call below leaves it unchanged.
shopee_cate_df=spark.read.parquet("/home/jovyan/work/spark-warehouse/ods.db/shopee_category_tree_df/"\
                                 .format(today=today,yesterday=yesterday))

shopee_cate_df.createOrReplaceTempView("v_shopee_category_tree_df_raw")

# Build v_shopee_category_tree_df: each category joined to its parent (b) and
# grandparent (c), plus a "root > parent > name" style cat_path that falls back
# to shorter forms when an ancestor is missing.
# NOTE(review): the self-joins match on catid only, not on country; if catids
# repeat across countries this would multiply rows -- confirm they are unique.
# The statement is a CREATE VIEW, so the displayed frame carries no data rows.
display(spark.sql("""
create or replace temp view v_shopee_category_tree_df as
select a.country,a.parent_category,a.catid,a.display_name,a.name,a.image
    ,b.name as parent_category_name,b.parent_category as root_parent_category
    ,c.name as root_category_name
    ,c.catid as root_catid
    ,case when (c.name is not null and b.name is not null)
        then c.name||' > '||b.name||' > '||a.name 
    when b.name is not null 
        then b.name||' > '||a.name 
    else a.name
    end as cat_path
from v_shopee_category_tree_df_raw a
left join v_shopee_category_tree_df_raw b on a.parent_category=b.catid
left join v_shopee_category_tree_df_raw c on b.parent_category=c.catid
""").toPandas())

# Peek at two BR rows of the resulting view.
display(spark.sql("select * from v_shopee_category_tree_df where country='BR' limit 2").toPandas())

Unnamed: 0,country,parent_category,catid,display_name,name,image,parent_category_name,root_parent_category,root_category_name,root_catid,cat_path
0,BR,24656,24657,Outros,Outros,,Early Learning,23291,Toys & Hobbies,23291.0,Toys & Hobbies > Early Learning > Outros
1,BR,23291,24656,Brinquedos Primeira Infância,Early Learning,a41a1d784853f03269cd4a2c0e0a73bb,Toys & Hobbies,0,,,Toys & Hobbies > Early Learning


In [8]:
# Per country (ID excluded), aggregate the listing rows by ship-from location
# and rank the locations by total `sold`; only the 10 best-ranked locations of
# each country are displayed.
# The Thai shop_location literal in the query is relabelled as 'Overseas'.
# NOTE(review): sum(sold*price) is divided by 100000 -- presumably the price
# scale of the source data; confirm.
display(spark.sql("""
select *
    ,row_number() over(partition by country order by total_sold desc) as rnk
from (select 
        count(1) data_rows
        ,sum(sold) total_sold
        ,sum(sold*price)/100000 total_gmv_local_currency
        ,count(distinct shopid) shop_ids
        ,case when shop_location='ต่างประเทศ' then 'Overseas' else shop_location end as ship_from
        ,country
    from v_shopee_category_search_list_df
    where country!='ID'
    group by ship_from,country
    )t
""").where(col("rnk")<=10).toPandas())

                                                                                

Unnamed: 0,data_rows,total_sold,total_gmv_local_currency,shop_ids,ship_from,country,rnk
0,66789,7641369,110406028.18,4792,São Paulo,BR,1
1,27633,2103806,28153818.51,2356,Mainland China,BR,2
2,6617,828414,12333984.27,599,Minas Gerais,BR,3
3,6258,633093,8078931.91,646,Rio de Janeiro,BR,4
4,2671,127775,2876653.98,287,Paraná,BR,5
5,1294,104573,1705100.09,146,Santa Catarina,BR,6
6,286,98112,1326751.86,43,Espírito Santo,BR,7
7,796,46758,736887.42,119,Rio Grande do Sul,BR,8
8,458,36235,233962.5,44,Ceará,BR,9
9,298,27014,939788.39,19,Sergipe,BR,10


In [9]:
# Create or replace the temp view v_items_stats for `today`: one row per
# (itemid, shopid, params.match_id, day, country, shop_location) with averaged
# sales, price, like/view counts and position, excluding rows whose
# params.scenario is "PAGE_COLLECTION_SEARCH".
# The same statement is repeated inside create_item_stats_view_for_day, which
# is defined further down in this notebook.
# NOTE(review): the per-country factors are presumably exchange rates to CNY
# and 100000 the price scale of the source data -- confirm both.
# The trailing .show() is applied to the CREATE VIEW result, so it prints an
# empty table.
spark.sql("""
create or replace temp view v_items_stats as
select itemid
        ,shopid
        ,params.match_id as catid
        ,day
        ,country
        ,shop_location
        ,avg(historical_sold) as historical_sold
        ,avg(sold) as sold
        ,avg(price)/100000 as price_local
        ,avg(price*(case when country = 'ID' then 0.00044 
                         when country = 'SG' then 4.65 
                         when country = 'MY' then 1.5055 
                         when country = 'TH' then 0.190647 
                         when country = 'BR' then 1.13 end))/100000 as price_cny
        ,avg(price_before_discount)/100000 as price_before_discount_local
        ,avg(price_before_discount*(case when country = 'ID' then 0.00044 
                         when country = 'SG' then 4.65 
                         when country = 'MY' then 1.5055
                         when country = 'TH' then 0.190647 
                         when country = 'BR' then 1.13 end))/100000 as price_before_discount_cny
        ,round(avg(liked_count)) as liked_count
        ,round(avg(view_count)) as view_count
        ,round(avg(pos+params.newest)) as position
        ,max(image) as image
        ,max(collection_id) as collection_id
        ,max(name) as name
        ,max(url) as url_appears
    from v_shopee_category_search_list_df
    where day='{day}' 
        and params.scenario!="PAGE_COLLECTION_SEARCH"
    group by itemid,shopid,params.match_id,day,country,shop_location
""".format(day=today)).show()

++
||
++
++



In [10]:
# 1.6 is the estimated number of units per order.
# Back-calculating from moniso's order data, sold and historical_sold are both
# order counts.

# Category-level statistics as a pandas frame: v_items_stats joined to the
# category tree and grouped by category, day, country and a position bucket
# (`ranking`: <=19, <=39, <=59, otherwise 'over_60'), with item/shop counts,
# GMV estimates (scaled by 1.6) and price/GMV percentiles.
# NOTE(review): the join to the category tree matches on catid only, not on
# country -- confirm catids do not repeat across countries.
cat_stats=spark.sql("""
select ct.catid,st.catid as item_catid,day,country
    ,ct.cat_path as cat_name_en
    ,ct.name_local as cat_name_local
    ,count(1) as data_row_cnt
    ,case when position <=19 then 'less_20'
        when position <=39 then 'less_40'
        when position <=59 then 'less_60'
        else 'over_60' end as ranking
    ,count(distinct itemid,shopid) items
    ,count(distinct shopid) shops
    ,count(case when sold>100 then itemid end) order_solds_14d_gt_100
    ,sum(sold) order_solds_14d
    ,sum(historical_sold) historical_order_solds
    ,round(1.6*sum(sold*price_cny)) items_gmv_cny
    ,round(1.6*sum(historical_sold*price_cny)) historical_items_gmv_cny
    ,round(sum(sold*price_cny)/sum(sold),2) items_avg_price_cny
    ,percentile(price_cny, 0.25) as price_25percent_cny
    ,percentile(price_cny, 0.50) as price_50percent_cny
    ,percentile(price_cny, 0.75) as price_75percent_cny
    ,percentile(price_cny, 0.95) as price_95percent_cny
    ,percentile(price_cny*sold, 0.25) as gmv14d_25percent_cny
    ,percentile(price_cny*sold, 0.50) as gmv14d_50percent_cny
    ,percentile(price_cny*sold, 0.75) as gmv14d_75percent_cny
    ,percentile(price_cny*sold, 0.95) as gmv14d_95percent_cny
    ,percentile(price_cny*historical_sold, 0.25) as historical_gmv_25percent_cny
    ,percentile(price_cny*historical_sold, 0.50) as historical_gmv_50percent_cny
    ,percentile(price_cny*historical_sold, 0.75) as historical_gmv_75percent_cny
    ,percentile(price_cny*historical_sold, 0.95) as historical_gmv_95percent_cny
from v_items_stats st
left join(select catid,name as name_en,display_name as name_local,image,cat_path
    from v_shopee_category_tree_df) ct on st.catid=ct.catid
group by ct.catid,st.catid,ranking,cat_name_local,cat_name_en,day,country
order by country,catid,ranking,day
""").toPandas()

                                                                                

In [11]:
import pandas as pd
from IPython.display import Image, HTML


def path_to_image_html(path):
    """Render an image key as a 128x128 lazy-loaded ``<img>`` thumbnail tag.

    Parameters
    ----------
    path : str
        Image key appended to ``https://cf.shopee.sg/file/`` (with a ``_tn``
        suffix).

    Returns
    -------
    str
        The ``<img>`` markup, or an empty string when ``path`` is missing
        (None, NaN, any non-string, or an empty string). A missing value used
        to raise ``TypeError`` during string concatenation and abort the
        whole export.
    """
    if not isinstance(path, str) or not path:
        return ''
    return '<img style="width:128px;height:128px" loading="lazy" src="https://cf.shopee.sg/file/'+ path + '_tn"/>'

def cate_id_to_link_html(catid):
    """Render a category id as a link to ``https://shopee.sg/shopee-cat.<catid>``.

    Parameters
    ----------
    catid : int, float or str
        Category id. A pandas column that contains missing values holds
        floats, so an integral float such as ``24657.0`` is converted to
        ``24657`` before it is placed in the URL and the link text.

    Returns
    -------
    str
        The ``<a>`` markup, or an empty string when ``catid`` is None or NaN
        (which previously produced a link to ``shopee-cat.nan``).
    """
    if catid is None:
        return ''
    if isinstance(catid, float):
        if catid != catid:  # NaN is the only float that differs from itself
            return ''
        if catid.is_integer():
            catid = int(catid)
    return '<a target="_blank" href="https://shopee.sg/shopee-cat.{catid}">{catid}</a>'.format(catid=catid)

def cny_html(cny):
    """Wrap an amount in a ``<span>`` prefixed with the yuan sign."""
    amount_text = format(cny)
    return '<span>¥' + amount_text + '</span>'

def item_url_to_link_html(url):
    """Render an item URL as a link that opens in a 600x800 popup window.

    The URL is used as the link target, as the argument of the ``onclick``
    handler and as the visible link text.
    """
    # Whitespace between the two attributes is written out explicitly.
    anchor_open = '<a target="popup" href="{url}" \n'
    click_handler = (
        "        onclick=\"window.open('{url}','popup',"
        "'left=100,top=100,width=600,height=800,popup=1'); return false;\">"
    )
    link_template = anchor_open + click_handler + '{name}</a>'
    return link_template.format(url=url, name=url)

# Never truncate cell contents, so the generated HTML snippets stay complete.
pd.options.display.max_colwidth = None

In [12]:
import pandas as pd
import pandas.io.formats.style

def write_to_html_file(df, title='', filename='out.html'):
    '''
    Write an entire dataframe to an HTML file with nice formatting.

    Parameters
    ----------
    df : pandas.DataFrame or pandas Styler
        A Styler is rendered with its own HTML output. Anything else goes
        through ``to_html`` with ``escape=False`` and the notebook's column
        formatters for ``image``, ``catid``, ``item_url``, ``gmv_14d``,
        ``avg_price_cny`` and ``historical_gmv``.
    title : str
        Used for both the page ``<title>`` and the ``<h2>`` heading.
    filename : str
        Destination path; an existing file is overwritten.

    The page declares ``charset="utf-8"``, so the file is always written as
    UTF-8 instead of the platform's default encoding.
    '''

    result = '''
<html>
<head>
'''
    result += '<title> %s </title>' % title
    result += '''
<meta charset="utf-8">
<style>

    h2 {
        text-align: center;
        font-family: Helvetica, Arial, sans-serif;
    }
    table { 
        margin-left: auto;
        margin-right: auto;
    }
    table, th, td {
        border: 1px solid black;
        border-collapse: collapse;
    }
    th, td {
        padding: 5px;
        text-align: center;
        font-family: Helvetica, Arial, sans-serif;
        font-size: 90%;
    }
    table tbody tr:hover {
        background-color: #dddddd;
    }
    .wide {
        width: 90%; 
    }
    table thead{
        background: #888;
        position: sticky;
        top: 0;
        color: #fff;
        font-size: 1.2em;
    }

</style>
</head>
<body>
    '''
    result += '<h2> %s </h2>\n' % title

    # The Styler class only exists once pandas.io.formats.style has been
    # imported; if it has not been, `df` cannot be a Styler.
    style_module = getattr(pd.io.formats, 'style', None)
    if style_module is not None and isinstance(df, style_module.Styler):
        # Styler.render() is deprecated in favour of Styler.to_html(); fall
        # back to render() only on pandas versions that lack to_html().
        render_styler = getattr(df, 'to_html', None) or df.render
        result += render_styler()
    else:
        # escape=False keeps the markup produced by the formatters intact.
        result += df.to_html(
            classes='wide',
            escape=False,
            formatters=dict(
                image=path_to_image_html,
                catid=cate_id_to_link_html,
                item_url=item_url_to_link_html,
                gmv_14d=cny_html,
                avg_price_cny=cny_html,
                historical_gmv=cny_html,
            ),
        )
    result += '''
</body>
</html>
'''
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(result)

In [13]:
from IPython.display import FileLink, FileLinks
import datetime

# Label the exports with the same `today` string that selected the data for
# v_items_stats instead of reading the clock a second time: a run that crosses
# midnight would otherwise stamp the files with a different date than the data.
day = today

# Per market: dump the category statistics as CSV and as a formatted HTML report.
for country in ['MY', 'SG', 'TH', 'BR']:
    country_cat_df = cat_stats[cat_stats['country'] == country]
    country_cat_df.to_csv(
        '~/work/export/cat_stats-{country}-{day}.csv'.format(day=day, country=country),
        index=False,
    )
    write_to_html_file(
        country_cat_df,
        '{day} {country}各类目分析'.format(day=day, country=country),
        './export/{country}-{day}-各类目分析.html'.format(day=day, country=country),
    )

# NOTE(review): the CSV files go to ~/work/export while the HTML files go to
# ./export; these are the same directory only when the notebook runs from ~/work.
FileLinks('./export/')

## 单品粒度分析

In [14]:
# 1.6 is the estimated number of units per order.
# Back-calculating from moniso's order data, sold and historical_sold are both
# order counts.

# Item-level statistics as a pandas frame: every v_items_stats row with its
# item page URL (built from country, shopid and itemid), category names from
# the category tree, GMV estimates (scaled by 1.6) and `rnk`, the item's rank
# by sold*price_cny within its (catid, day) partition.
# NOTE(review): the rank partition does not include country -- confirm catids
# do not repeat across countries.
item_stats_all=spark.sql("""
select 
    case when country="ID" then 'https://shopee.co.id/item-i.'
         when country="SG" then 'https://shopee.sg/item-i.'
         when country="MY" then 'https://shopee.com.my/item-i.'
         when country="TH" then 'https://shopee.co.th/item-i.'
         when country="BR" then 'https://shopee.com.br/item-i.'
        end||shopid||'.'||itemid as item_url
    ,name as item_name
    ,ct.catid
    ,image
    ,day
    ,country
    ,shop_location
    ,ct.cat_path as cat_name_en
    ,ct.name_local as cat_name_local
    ,cast(position as int) as ranking
    ,cast(sold as int) solds_14d
    ,cast(historical_sold as int) historical_solds
    ,round(1.6*(sold*price_cny)) gmv_14d
    ,round(1.6*(historical_sold*price_cny)) historical_gmv
    ,round(price_cny,2) avg_price_cny
    ,row_number() over (partition by ct.catid,day order by (sold*price_cny) desc) as rnk
from v_items_stats st
left join(select catid,name as name_en,display_name as name_local,image as cat_image,cat_path
    from v_shopee_category_tree_df) ct on st.catid=ct.catid
""").toPandas()

# Keep the 40 best-ranked items of every (catid, day) partition.
item_stats=item_stats_all[item_stats_all["rnk"]<=40]
# display(item_stats)

                                                                                

In [15]:
# One HTML report per market with the best-ranked items of every category.
# NOTE(review): the title and file name say "top 20", while item_stats keeps
# rnk <= 40 -- confirm which of the two is intended.
for country in ("SG", "MY", "TH", "BR"):
    country_items = item_stats[item_stats["country"] == country]
    report_title = '{day} {country}各类目 14d GMV前20item'.format(day=day, country=country)
    report_path = './export/{country}-{day}-top_20_items.html'.format(day=day, country=country)
    write_to_html_file(country_items, report_title, report_path)

## 中国卖家的数据

In [16]:
# 1.6 is the estimated number of units per order.
# Back-calculating from moniso's order data, sold and historical_sold are both
# order counts.

# Same item-level statistics as item_stats_all, restricted to rows whose
# shop_location is 'Mainland China', 'Overseas' or the Thai literal that the
# query relabels as 'Overseas'. `rnk` is computed over the rows that pass that
# filter. Unlike item_stats_all, ranking / solds_14d / historical_solds are
# not cast to int here.
china_item_stats_all=spark.sql("""
select 
    case when country="ID" then 'https://shopee.co.id/item-i.'
         when country="SG" then 'https://shopee.sg/item-i.'
         when country="MY" then 'https://shopee.com.my/item-i.'
         when country="TH" then 'https://shopee.co.th/item-i.'
         when country="BR" then 'https://shopee.com.br/item-i.'
        end||shopid||'.'||itemid as item_url
    ,name as item_name
    ,ct.catid
    ,image
    ,day
    ,country
    ,case when shop_location='ต่างประเทศ' then 'Overseas' else shop_location end as shop_location
    ,ct.cat_path as cat_name_en
    ,ct.name_local as cat_name_local
    ,position as ranking
    ,sold solds_14d
    ,historical_sold historical_solds
    ,round(1.6*(sold*price_cny)) gmv_14d
    ,round(1.6*(historical_sold*price_cny)) historical_gmv
    ,round(price_cny,2) avg_price_cny
    ,row_number() over (partition by ct.catid,day order by (sold*price_cny) desc) as rnk
from v_items_stats st
left join(select catid,name as name_en,display_name as name_local,image as cat_image,cat_path
    from v_shopee_category_tree_df) ct on st.catid=ct.catid
where shop_location in('Mainland China',"ต่างประเทศ",'Overseas')
""").toPandas()

# Keep the 40 best-ranked items of every (catid, day) partition.
china_item_stats=china_item_stats_all[china_item_stats_all["rnk"]<=40]
# display(item_stats)

                                                                                

In [17]:
# One HTML report per market for the China / overseas seller subset.
# NOTE(review): the title and file name say "top 20", while china_item_stats
# keeps rnk <= 40 -- confirm which of the two is intended.
for country in ("SG", "MY", "TH", "BR"):
    country_items = china_item_stats[china_item_stats["country"] == country]
    report_title = '{day} {country}各类目 14d GMV前20item（大陆卖家）'.format(day=day, country=country)
    report_path = './export/{country}-{day}-top_20_items_cn_seller.html'.format(day=day, country=country)
    write_to_html_file(country_items, report_title, report_path)

from IPython.display import FileLink, FileLinks

# List everything in the export directory as clickable links.
FileLinks('./export/')

# 这里将数据写入Postgres DB，进行superset可视化

In [1]:
import os

import psycopg2

# Connection settings are taken from the standard libpq environment variables
# when they are set; the fallbacks are the values this notebook used before, so
# behaviour is unchanged when none of them is defined.
# NOTE(review): none of the cells in view uses this module-level `conn`
# (insert_batch opens and closes its own connection) and it is never closed --
# confirm whether it is still needed.
conn = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "warehouse"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres-local"),
    host=os.environ.get("PGHOST", "db-postgres"),
    port=os.environ.get("PGPORT", "5432"),
)

In [19]:
# Multi-row INSERT statement for the item_stats table; `{values}` is replaced
# with a comma-separated list of rendered value tuples. Rows that collide on
# (item_url, day) are skipped.
insert_sql_tpl="""
insert into item_stats(item_url,item_name,catid,image,day,country,shop_location,cat_name_en,cat_name_local,ranking,solds_14d,historical_solds,gmv_14d,historical_gmv,avg_price_cny,rnk)
values {values}
ON CONFLICT (item_url,day) DO NOTHING;
"""

# One value tuple; placeholders are named after the DataFrame columns and every
# value is wrapped in single quotes.
# NOTE(review): values are spliced into the SQL text with str.format, so the
# code that fills this template must escape single quotes itself.
value_tpl="""('{item_url}','{item_name}','{catid}','{image}','{day}','{country}','{shop_location}','{cat_name_en}','{cat_name_local}','{ranking}','{solds_14d}','{historical_solds}','{gmv_14d}','{historical_gmv}','{avg_price_cny}','{rnk}')"""


In [20]:
def insert_batch(insert_sql):
    """Execute one SQL statement against the warehouse database and commit it.

    A new connection is opened for every call and is always closed again,
    including when ``execute`` or ``commit`` raises (previously a failing
    statement left the connection open).

    Parameters
    ----------
    insert_sql : str
        Complete SQL statement to execute.

    Raises
    ------
    Whatever ``psycopg2`` raises while connecting, executing or committing;
    errors are not swallowed.
    """
    import os  # local import: only needed for the optional connection overrides

    # Standard libpq environment variables override the notebook's defaults.
    conn = psycopg2.connect(
        database=os.environ.get("PGDATABASE", "warehouse"),
        user=os.environ.get("PGUSER", "postgres"),
        password=os.environ.get("PGPASSWORD", "postgres-local"),
        host=os.environ.get("PGHOST", "db-postgres"),
        port=os.environ.get("PGPORT", "5432"),
    )
    try:
        cursor = conn.cursor()
        cursor.execute(insert_sql)
        conn.commit()
    finally:
        # Closing without a commit discards whatever the failed call had pending.
        conn.close()

    
def insert_for_df(item_stats_all,insert_sql_tpl=insert_sql_tpl,value_tpl=value_tpl):
    """Insert every row of a pandas DataFrame into Postgres in batches of 2000.

    Each row is rendered through ``value_tpl`` (placeholders named after the
    DataFrame columns); the rendered tuples are joined with commas, substituted
    for ``{values}`` in ``insert_sql_tpl`` and executed via ``insert_batch``.

    Value handling before rendering:

    * every string value has its single quotes doubled, so text in any column
      yields a valid SQL string literal (previously only ``item_name``,
      ``cat_name_en`` and ``cat_name_local`` were escaped);
    * ``None`` and float NaN are written as SQL ``NULL``; before, a missing
      ``cat_name_en`` / ``cat_name_local`` / ``catid`` raised and other missing
      values were stored as the text ``'None'`` / ``'nan'``;
    * a present ``catid`` is converted with ``int()`` as before.

    Parameters
    ----------
    item_stats_all : pandas.DataFrame
        Rows to insert; its columns must cover the placeholders of ``value_tpl``.
    insert_sql_tpl : str
        Statement template containing ``{values}``.
    value_tpl : str
        Template for one value tuple.

    NOTE(review): the statement is still assembled as text. Quote doubling
    assumes the server treats backslashes literally
    (``standard_conforming_strings`` on) -- confirm, or switch to
    parameterized execution.
    """
    batch_size = 2000
    # Stand-in for missing values; swapped for NULL after the row is rendered.
    # NUL characters cannot occur in Postgres text, so it cannot clash with data.
    null_marker = "\x00SQL_NULL\x00"

    def _prepare(column, value):
        """Return the value to splice into the template for one cell."""
        if value is None or (isinstance(value, float) and value != value):
            return null_marker
        if column == 'catid':
            return int(value)
        if isinstance(value, str):
            return value.replace("'", "''")
        return value

    def _flush(rendered_rows):
        insert_batch(insert_sql_tpl.format(values=",".join(rendered_rows)))

    values_list = []
    for _, row in item_stats_all.iterrows():
        dic = {column: _prepare(column, value) for column, value in row.to_dict().items()}
        rendered = value_tpl.format(**dic)
        # Replace the quoted marker first so the quotes disappear with it.
        rendered = rendered.replace("'" + null_marker + "'", "NULL").replace(null_marker, "NULL")
        values_list.append(rendered)

        if len(values_list) >= batch_size:
            print("about to insert 2000 rows...")
            _flush(values_list)
            values_list = []

    if values_list:
        print("about to insert final batch rows:", len(values_list))
        _flush(values_list)

    print("done")

In [21]:
def create_item_stats_view_for_day(day):
    """Create or replace the temp view ``v_items_stats`` for a single day.

    The view aggregates the rows of ``v_shopee_category_search_list_df`` whose
    ``day`` equals ``day`` and whose ``params.scenario`` is not
    "PAGE_COLLECTION_SEARCH" into one row per
    (itemid, shopid, params.match_id, day, country, shop_location).

    ``day`` is interpolated into the SQL text with ``str.format``.
    """
    # The *_cny columns multiply by a hard-coded per-country factor and, like
    # the *_local columns, divide by 100000.
    # NOTE(review): presumably those factors are exchange rates to CNY and
    # 100000 is the price scale of the source data -- confirm both.
    # The trailing .show() is applied to the CREATE VIEW result, so it prints
    # an empty table.
    spark.sql("""
    create or replace temp view v_items_stats as
    select itemid
        ,shopid
        ,params.match_id as catid
        ,day
        ,country
        ,shop_location
        ,avg(historical_sold) as historical_sold
        ,avg(sold) as sold
        ,avg(price)/100000 as price_local
        ,avg(price*(case when country = 'ID' then 0.00044 
                         when country = 'SG' then 4.65 
                         when country = 'MY' then 1.5055 
                         when country = 'TH' then 0.190647 
                         when country = 'BR' then 1.13 end))/100000 as price_cny
        ,avg(price_before_discount)/100000 as price_before_discount_local
        ,avg(price_before_discount*(case when country = 'ID' then 0.00044 
                         when country = 'SG' then 4.65 
                         when country = 'MY' then 1.5055
                         when country = 'TH' then 0.190647 
                         when country = 'BR' then 1.13 end))/100000 as price_before_discount_cny
        ,round(avg(liked_count)) as liked_count
        ,round(avg(view_count)) as view_count
        ,round(avg(pos+params.newest)) as position
        ,max(image) as image
        ,max(collection_id) as collection_id
        ,max(name) as name
        ,max(url) as url_appears
    from v_shopee_category_search_list_df
    where day='{day}' 
        and params.scenario!="PAGE_COLLECTION_SEARCH"
    group by itemid,shopid,params.match_id,day,country,shop_location
    """.format(day=day)).show()
    
def insert_for_day(day):
    """Insert the per-item statistics currently held in ``v_items_stats`` into Postgres.

    Joins ``v_items_stats`` with ``v_shopee_category_tree_df`` on catid (the
    tree's catid is cast to int here), prints the resulting schema, converts
    the result to pandas and hands it to ``insert_for_df`` with its default
    templates.

    NOTE(review): ``day`` is only used in the final log line; the query does
    not filter on it, so ``v_items_stats`` must already have been built for
    that day (e.g. via ``create_item_stats_view_for_day``).
    """
    item_stats_all=spark.sql("""
    select 
        case when country="ID" then 'https://shopee.co.id/item-i.'
             when country="SG" then 'https://shopee.sg/item-i.'
             when country="MY" then 'https://shopee.com.my/item-i.'
             when country="TH" then 'https://shopee.co.th/item-i.'
             when country="BR" then 'https://shopee.com.br/item-i.'
            end||shopid||'.'||itemid as item_url
        ,name as item_name
        ,ct.catid
        ,image
        ,day
        ,country
        ,shop_location
        ,ct.cat_path as cat_name_en
        ,ct.name_local as cat_name_local
        ,cast(position as int) as ranking
        ,cast(sold as int) solds_14d
        ,cast(historical_sold as int) historical_solds
        ,round(1.6*(sold*price_cny)) gmv_14d
        ,round(1.6*(historical_sold*price_cny)) historical_gmv
        ,round(price_cny,2) avg_price_cny
        ,row_number() over (partition by ct.catid,day order by (sold*price_cny) desc) as rnk
    from v_items_stats st
    left join(select cast(catid as int) as catid,name as name_en,display_name as name_local,image as cat_image,cat_path
        from v_shopee_category_tree_df) ct on st.catid=ct.catid
    """)
    item_stats_all.printSchema()
    # Collect the whole result on the driver before inserting it row by row.
    item_stats_all=item_stats_all.toPandas()
    
    insert_for_df(item_stats_all)
    print("finished with day:",day)

In [22]:
# Multi-row INSERT statement for the cat_stats_v2 table; `{values}` is replaced
# with a comma-separated list of rendered value tuples. Rows that collide on
# (cat_name_en, day, ranking) are skipped.
cat_insert_sql_tpl="""insert into cat_stats_v2(catid,cat_image,day,country,shop_location,cat_name_en,cat_name_local,ranking,solds_14d,
historical_solds,shops,items,gmv_14d,historical_gmv,avg_price_cny,price_25percent_cny,price_50percent_cny,price_75percent_cny,
gmv14d_25percent_cny,gmv14d_50percent_cny,gmv14d_75percent_cny)
values {values}
ON CONFLICT (cat_name_en, day, ranking) DO NOTHING;"""

# One value tuple; placeholders are named after the DataFrame columns and every
# value is wrapped in single quotes.
# NOTE(review): values are spliced into the SQL text with str.format, so the
# code that fills this template must escape single quotes itself.
cat_values_tpl="""('{catid}','{cat_image}','{day}','{country}','{shop_location}','{cat_name_en}','{cat_name_local}','{ranking}','{solds_14d}',
'{historical_solds}','{shops}','{items}','{gmv_14d}','{historical_gmv}','{avg_price_cny}','{price_25percent_cny}','{price_50percent_cny}',
'{price_75percent_cny}','{gmv14d_25percent_cny}','{gmv14d_50percent_cny}','{gmv14d_75percent_cny}')"""

def insert_cat_stats_for_day(day):
    """Insert category-level statistics from ``v_items_stats`` into ``cat_stats_v2``.

    Groups ``v_items_stats`` (joined to ``v_shopee_category_tree_df`` on catid)
    by category, day, country, shop location and position bucket, then passes
    the pandas result to ``insert_for_df`` together with
    ``cat_insert_sql_tpl`` and ``cat_values_tpl``.

    Position buckets: ``position`` is built from the 0-based index produced by
    posexplode plus the ``newest`` offset, so the first 20 results are
    positions 0..19. The buckets therefore use ``<=19 / <=39 / <=59``, matching
    the notebook's ``cat_stats`` query; the previous ``<=20 / <=40 / <=60``
    bounds put one extra position into every bucket.

    NOTE(review): ``day`` is only used in the final log line; the query does
    not filter on it, so ``v_items_stats`` must already have been built for
    that day.
    """
    cat_stats_all=spark.sql("""
    select
        ct.catid
        ,cat_image
        ,day
        ,country
        ,shop_location
        ,ct.cat_path as cat_name_en
        ,ct.name_local as cat_name_local
        ,case when position<=19 then 'less_20'
              when position<=39 then 'less_40'
              when position<=59 then 'less_60' else 'over_60' end as ranking
        ,cast(sum(sold) as int) solds_14d
        ,cast(sum(historical_sold) as int) historical_solds
        ,count(distinct shopid) as shops
        ,count(distinct shopid,itemid) as items 
        ,round(sum(1.6*(sold*price_cny))) gmv_14d
        ,round(sum(1.6*(historical_sold*price_cny))) historical_gmv
        ,round(avg(price_cny),2) avg_price_cny
        ,percentile(price_cny, 0.25) as price_25percent_cny
        ,percentile(price_cny, 0.50) as price_50percent_cny
        ,percentile(price_cny, 0.75) as price_75percent_cny
        ,percentile(1.6*price_cny*sold, 0.25) as gmv14d_25percent_cny
        ,percentile(1.6*price_cny*sold, 0.50) as gmv14d_50percent_cny
        ,percentile(1.6*price_cny*sold, 0.75) as gmv14d_75percent_cny
    from v_items_stats st
    left join(select catid,name as name_en,display_name as name_local,image as cat_image,cat_path
        from v_shopee_category_tree_df) ct on st.catid=ct.catid
    group by ct.catid,day,country,shop_location,cat_name_en,cat_name_local,ranking,ct.cat_image
    """).toPandas()

    insert_for_df(cat_stats_all,cat_insert_sql_tpl,cat_values_tpl)
    print("finished with day:",day)

In [23]:
# Publish the statistics for each listed day: build the per-item view first,
# then load the item-level and the category-level rows into Postgres.
days_to_publish = (today,)
publish_steps = (create_item_stats_view_for_day, insert_for_day, insert_cat_stats_for_day)

for day in days_to_publish:
    for publish_step in publish_steps:
        publish_step(day)

++
||
++
++

root
 |-- item_url: string (nullable = true)
 |-- item_name: string (nullable = true)
 |-- catid: integer (nullable = true)
 |-- image: string (nullable = true)
 |-- day: date (nullable = true)
 |-- country: string (nullable = false)
 |-- shop_location: string (nullable = true)
 |-- cat_name_en: string (nullable = true)
 |-- cat_name_local: string (nullable = true)
 |-- ranking: integer (nullable = true)
 |-- solds_14d: integer (nullable = true)
 |-- historical_solds: integer (nullable = true)
 |-- gmv_14d: double (nullable = true)
 |-- historical_gmv: double (nullable = true)
 |-- avg_price_cny: decimal(38,2) (nullable = true)
 |-- rnk: integer (nullable = false)



                                                                                

about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to insert 2000 rows...
about to inser

                                                                                

about to insert 2000 rows...
about to insert final batch rows: 100
done
finished with day: 2021-12-31
