### Mount Colab to Google Drive

In [None]:
from google.colab import drive
from pathlib import Path
# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#@markdown To access data saved on Shared Folder, add them to "My drive" as shortcut first
root_path="/content/drive/My Drive/" #@param {type:"string"}
# pathlib version of root_path; data and database paths in this notebook are joined onto it.
_root = Path(root_path)
#!ls "$root_path"

### Import libraries

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import sqlite3 as lite

In [None]:
import io
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.parquet as pq

### Create/Connect to a sqlite3 database

In [None]:
# Database file path, relative to `_root` (it is joined onto `_root` when connecting).
db_path = "Main.db" #@param {type:"string"}

In [None]:
#@markdown Connect to sqlite3 database (which reside on Google Drive)
# Create parent path to the DB first if not exist
_db_path = _root.joinpath(db_path)
_db_path.parent.mkdir(parents=True, exist_ok=True)

try:
    conn = lite.connect(_db_path)
except lite.Error as e:
    # Report the failure, then re-raise: every later cell needs `conn`, so
    # continuing would only surface as a confusing NameError further down.
    print("Error: ", e)
    raise
else:
    # Report the SQLite library version (`lite.version` is the deprecated
    # version number of the Python wrapper module, not of SQLite itself).
    print(f"sqlite3 {lite.sqlite_version} has connected to database successfully")

sqlite3 2.6.0 has connected to database successfully


### Helper Function

In [None]:
def ArrowSchemaToPyDict(dt):
  """Describe an Arrow schema or data type with plain Python containers.

  - Schema / struct type -> dict mapping field name to the converted field type.
  - List type            -> one-element list holding the converted value type.
  - Any other Arrow type -> its ``str()`` representation.

  Anything that is not an Arrow schema or data type yields ``None``.
  """
  # Containers with named fields: recurse into every field.
  if isinstance(dt, (pa.lib.Schema, pa.lib.StructType)):
    return {child.name: ArrowSchemaToPyDict(child.type) for child in dt}

  # Lists: wrap the converted item type in a single-element list.
  if isinstance(dt, pa.lib.ListType):
    item_type = dt.value_field.type
    return [ArrowSchemaToPyDict(item_type)]

  # Remaining Arrow types are represented by their string name.
  if isinstance(dt, pa.lib.DataType):
    return str(dt)

  return None

In [None]:
def PyDictToArrowSchema(dt, level=0):
  """Build an Arrow schema/type from a nested dict / list / str description.

  Args:
    dt: At ``level == 0`` a dict mapping column name to a type description.
      At deeper levels: a dict (struct), a one-element list (list of that
      element type) or a string naming a primitive Arrow type.
    level: Recursion depth; callers leave it at 0 so the top level becomes a
      ``pa.schema`` rather than a ``pa.struct``.

  Returns:
    ``pa.Schema`` at level 0, otherwise a ``pa.DataType``.

  Raises:
    KeyError: a type name is neither in the explicit table nor a known
      pyarrow type alias.
    TypeError: a nested description is not a dict, list or str.
  """
  if level == 0:
    return pa.schema([
        (name, PyDictToArrowSchema(spec, level=level+1)) for name, spec in dt.items()
    ])

  if isinstance(dt, dict):
    return pa.struct([
        (name, PyDictToArrowSchema(spec, level=level+1)) for name, spec in dt.items()
    ])

  if isinstance(dt, list):
    return pa.list_(PyDictToArrowSchema(dt[0], level=level+1))

  if isinstance(dt, str):
    # Named `known_types` (not `map`) so the builtin is not shadowed.
    known_types = {
        'string': pa.string(),
        'int64': pa.int64(),
        'bool': pa.bool_(),
        'null': pa.null(),
        'double': pa.float64()
    }
    type_name = str(dt)
    if type_name in known_types:
      return known_types[type_name]
    # Fall back to pyarrow's own alias table so other simple type names
    # (e.g. 'int32', 'float') resolve too. KeyError is kept as the error type
    # for unknown names, as before.
    try:
      return pa.type_for_alias(type_name)
    except (ValueError, KeyError):
      raise KeyError(type_name) from None

  # Previously this fell through and returned None, which only failed later
  # inside pa.struct / pa.list_ with a less helpful message.
  raise TypeError(
      f"Unsupported schema description of type {type(dt).__name__}: {dt!r}")

In [None]:
def ArrowSchemaToDefinition(dt):
  """Render an Arrow schema/type as Python source text that rebuilds it.

  Args:
    dt: ``pa.Schema`` or ``pa.DataType`` (struct, list, or one of the
      primitive types string / int64 / bool / null / double).

  Returns:
    A string such as ``pa.schema([... pa.field("a", pa.int64()) ...])``.

  Raises:
    KeyError: a primitive type is not one of the supported names.
    TypeError: ``dt`` is not an Arrow schema or data type (this used to
      surface as an UnboundLocalError).
  """
  import json  # local import: only needed to quote field names safely

  def _fields_source(fields):
    # One `pa.field("<name>", <type>)` entry per field, comma separated.
    # json.dumps yields a double-quoted literal with quotes/backslashes/control
    # characters escaped, so unusual field names still give valid source.
    return ','.join(
        'pa.field(' + json.dumps(field.name, ensure_ascii=False) + ', '
        + ArrowSchemaToDefinition(field.type) + ')'
        for field in fields
    )

  # Schema and struct share the same layout; only the constructor differs.
  if isinstance(dt, pa.lib.Schema):
    return "pa.schema([\n      " + _fields_source(dt) + "\n    ])"
  if isinstance(dt, pa.lib.StructType):
    return "pa.struct([\n      " + _fields_source(dt) + "\n    ])"

  # List: wrap the definition of the value type.
  if isinstance(dt, pa.lib.ListType):
    return "pa.list_(" + ArrowSchemaToDefinition(dt.value_field.type) + ")"

  # Primitive Arrow types: look up the constructor call by type name.
  if isinstance(dt, pa.lib.DataType):
    primitive_sources = {
        'string': 'pa.string()',
        'int64': 'pa.int64()',
        'bool': 'pa.bool_()',
        'null': 'pa.null()',
        'double': 'pa.float64()'
    }
    return primitive_sources[str(dt)]

  raise TypeError(
      f"Expected a pyarrow Schema or DataType, got {type(dt).__name__}")

In [None]:
# Adapted (and extended) from https://stackoverflow.com/questions/71035754/pyarrow-drop-a-column-in-a-nested-structure/71039389#71039389?newreg=0e5baf2fd7184da8adad65c8e1789db3
def ArrowDropNull(array):
    """Recursively strip null-typed columns/fields out of Arrow data.

    Accepts a ``pa.Table``, or an array / chunked array. A table always comes
    back as a table (possibly with fewer columns). For arrays the result is
    the pruned array, or ``None`` when the array is null-typed or when a
    struct/list has nothing left after pruning.

    NOTE(review): structs and lists are rebuilt with ``from_arrays`` without a
    validity mask - confirm whether null struct/list entries must be kept.
    """
    # Tables first: rebuild from the columns that survive pruning.
    if isinstance(array, pa.Table):
        kept_names, kept_columns = [], []
        for column_field in array.schema:
            pruned = ArrowDropNull(array[column_field.name])
            if pruned is None:
                continue
            kept_names.append(column_field.name)
            kept_columns.append(pruned)
        return pa.Table.from_arrays(kept_columns, kept_names)

    dtype = array.type

    # Null-typed data is what gets dropped.
    if pa.types.is_null(dtype):
        return None

    if pa.types.is_struct(dtype):
        # A chunked array must be combined into a single StructArray first.
        struct_arr = array.combine_chunks() if isinstance(array, pa.ChunkedArray) else array
        kept_names, kept_children = [], []
        for position, child_field in enumerate(struct_arr.type):
            pruned = ArrowDropNull(struct_arr.field(position))
            if pruned is None:
                continue
            kept_names.append(child_field.name)
            kept_children.append(pruned)
        if not kept_children:
            return None
        return pa.StructArray.from_arrays(kept_children, kept_names)

    if pa.types.is_list(dtype):
        list_arr = array.combine_chunks() if isinstance(array, pa.ChunkedArray) else array
        list_offsets = list_arr.offsets
        pruned_values = ArrowDropNull(list_arr.values)
        if pruned_values is None:
            return None
        return pa.ListArray.from_arrays(list_offsets, pruned_values)

    # Every other type is returned untouched.
    return array

In [None]:
def ArrowTableExplode(df, col):
  """
  Explode a list column of a Table into one row per list element.

  The remaining columns are repeated via the parent index of every element,
  and the flattened values are inserted at the original column's position.
  Returns a new table.

  NOTE(review): rows whose list is empty or null presumably disappear because
  they contribute no parent index - verify if such rows matter.
  """
  list_column = df.column(col).combine_chunks()
  parent_rows = list_column.value_parent_indices()
  flat_values = list_column.flatten()

  position = df.column_names.index(col)
  other_columns = df.drop([col])
  repeated = other_columns.take(parent_rows)
  return repeated.add_column(position, col, flat_values)

# Main

In [None]:
# List the names of all tables in the connected SQLite database.
pd.read_sql("SELECT name FROM sqlite_master WHERE type='table';", conn)

Unnamed: 0,name
0,DIM_ACTIVE_HOURS
1,DIM_SHOPEE_CATEGORY
2,DIM_SHOPEE_CUISINE
3,DIM_SHOPEE_ROOT_CATEGORY
4,DIM_FOODY_CUISINE
5,DIM_FOODY_CATEGORY
6,DIM_FOODY_AUDIENCE
7,RESTAURANT_bk
8,DIM_SHOPEE_BRAND
9,RESTAURANT


## Shopee

### Extract image information to database

In [None]:
# Read only the dish id and its pictures from the 2022.11.26 Foundation snapshot.
fnd_dishes_path = _root / "Data/2 - cleaned data/Foundation/2022.11.26.restaurant_dishes.parquet"
fnd_dishes = pq.read_table(
    fnd_dishes_path,
    columns=['dish_id', 'pictures'],
    memory_map=True,
)

In [None]:
# Load the whole DIM_SHOPEE_DISHES table from SQLite into a DataFrame.
db_dishes = pd.read_sql("SELECT * FROM DIM_SHOPEE_DISHES", conn)

In [None]:
# One row per picture, then flatten the picture struct into "pictures.*" columns.
fnd_dishes = ArrowTableExplode(fnd_dishes, 'pictures').flatten()

In [None]:
# Pick, for every dish, the URL of its largest picture (by width, then height).
# A hash "one" aggregation after sort_by does not guarantee which row of a
# group is returned, so the selection is done deterministically in pandas:
# sort descending, then keep the first row of each dish_id.
fnd_dish_pictures = (
    fnd_dishes
    .select(["dish_id", "pictures.url", "pictures.width", "pictures.height"])
    .to_pandas()
    .sort_values(["pictures.width", "pictures.height"], ascending=False)
    .drop_duplicates(subset="dish_id", keep="first")
    # Keep the column name and order the group_by/aggregate version produced.
    .rename(columns={"pictures.url": "pictures.url_one"})
    [["pictures.url_one", "dish_id"]]
    .reset_index(drop=True)
)

In [None]:
# Show the column names of the per-dish picture frame.
fnd_dish_pictures.columns

Index(['pictures.url_one', 'dish_id'], dtype='object')

In [None]:
# Rename by name rather than by position: the order of key/aggregate columns
# returned by group_by is not a stable contract, and a positional assignment
# would silently swap the 'url' and 'dish_id' labels if it changed.
fnd_dish_pictures = fnd_dish_pictures.rename(columns={'pictures.url_one': 'url'})

In [None]:
# Attach the picture URL to every dish already stored in the database.
# - Drop any existing 'url' column first: once this notebook has written the
#   table back, a re-run would otherwise produce 'url_x' / 'url_y'.
# - validate="many_to_one" fails loudly if a dish_id has more than one URL row,
#   which would duplicate dishes in the result.
t = (
    db_dishes
    .drop(columns=['url'], errors='ignore')
    .merge(fnd_dish_pictures, how='left', on="dish_id", validate="many_to_one")
)

In [None]:
# Replace the DIM_SHOPEE_DISHES table with the merged frame (DataFrame index not written).
t.to_sql('DIM_SHOPEE_DISHES',conn, if_exists='replace',index=False)

In [None]:
# Preview the first 10 rows of the rewritten table.
pd.read_sql("SELECT * FROM DIM_SHOPEE_DISHES LIMIT 10", conn)

Unnamed: 0,catalog_id,dish_total_order,catalog_name,catalog_rank,catalog_partner_catalog_id,catalog_description,dish_restaurant_id,dish_id,dish_name,dish_partner_dish_id,dish_listing_status,dish_description,dish_total_like,dish_rank,dish_picture_label,dish_is_hidden,dish_price,dish_is_group_discount_item,dishes_property_info,url
0,2635001,160.0,Món Hot,-11.0,,,1114911.0,46050027.0,Trà Đào,,1,Trà đào kèm miếng đào tươi,4.0,0.0,,0,24000.0,1,"{""has_alcohol"": false}",https://images.foody.vn/res/g112/1114911/s1242...
1,2635001,136.0,Món Hot,-11.0,,,1114911.0,28885958.0,Trà Sữa Nọng Xanh,,1,Trà Sữa Nọng Xanh là trà sữa vị thái xanh đã b...,55.0,0.0,,0,24000.0,1,"{""has_alcohol"": false}",https://images.foody.vn/res/g103/1025330/s1242...
2,2635001,126.0,Món Hot,-11.0,,,1114911.0,46050017.0,Trà Vải,,1,Trà vải tươi mát kèm trái vải,3.0,0.0,,0,24000.0,1,"{""has_alcohol"": false}",https://images.foody.vn/res/g112/1114911/s1242...
3,2635001,4.0,Món Hot,-11.0,,,1114911.0,121626885.0,Cà Phê Sữa Tươi Hạnh Nhân Phin Di,,1,,0.0,0.0,,0,29000.0,1,"{""has_alcohol"": false}",
4,2635001,4.0,Món Hot,-11.0,,,1114911.0,121637344.0,Sữa Tươi Trân Châu Đường Đen HQ,,1,,0.0,0.0,,0,35000.0,1,"{""has_alcohol"": false}",
5,2635001,2.0,Món Hot,-11.0,,,1114911.0,121638144.0,Cà Phê Sữa Tươi,,1,,0.0,0.0,,0,25000.0,1,"{""has_alcohol"": false}",
6,2635001,1.0,Món Hot,-11.0,,,1114911.0,118070637.0,Combo 110,,1,1 Trà sữa trân châu phô mai\n1 Trà sữa cục xì ...,0.0,0.0,,0,145000.0,1,"{""has_alcohol"": false}",https://images.foody.vn/res/g103/1025330/s1242...
7,2635001,1.0,Món Hot,-11.0,,,1114911.0,121636221.0,Ca Cao Dằm Cốt Dừa Tuổi Thơ,,1,,0.0,0.0,,0,29000.0,1,"{""has_alcohol"": false}",
8,2635001,0.0,Món Hot,-11.0,,,1114911.0,121627002.0,Cà Phê Sữa Tươi Sương Sáo,,1,,0.0,0.0,,0,29000.0,1,"{""has_alcohol"": false}",
9,2360173,27.0,NOWFOOD DAY,-10.0,,,1114911.0,31028098.0,Combo: 2 Ly Trà Sữa Lài Thạch Trứng,,1,,4.0,0.0,,0,58000.0,1,"{""has_alcohol"": false}",


In [None]:
# Remove the temporary table. IF EXISTS keeps the cell re-runnable: a plain
# DROP TABLE raises sqlite3.OperationalError when the table is already gone.
with conn:
    cur = conn.cursor()
    cur.execute("DROP TABLE IF EXISTS DIM_SHOPEE_DISHES_t")

### FND Shopee dish (2022/11/26 - 06)

In [None]:
# Snapshot date (YYYY.MM.DD) interpolated into the Staging/Foundation file names below.
data_date = "2022.12.06"
# Other snapshot dates (presumably processed earlier - swap one in to re-run it):
#"2022.12.03"
#"2022.11.29"
#"2022.11.26"

In [None]:
# Load the staging dishes snapshot for `data_date` and report Arrow's allocated memory.
stg_dishes_path = _root / f"Data/2 - cleaned data/Staging/{data_date}.dishes.parquet"
stg_dishes = pq.read_table(stg_dishes_path, memory_map=True)
allocated_mb = pa.total_allocated_bytes() / 1024 / 1024
print(allocated_mb, "MB")

5262.9560546875 MB


In [None]:
# Flatten the top-level struct, drop the 'msg' and 'code' columns, then expand
# the catalogs list to one row per catalog and flatten the catalog struct.
stg_dishes = ArrowTableExplode(
    stg_dishes.flatten().drop(['msg', 'code']),
    'data.catalogs',
).flatten()

#### Extract catalogs

In [None]:
# Output column name -> source column in the flattened staging table.
catalog_column_sources = {
    "restaurant_id": 'restaurant_id',
    "restaurant_id_rep": 'data.catalogs.restaurant_id',
    "catalog_id": 'data.catalogs.id',
    "catalog_name": 'data.catalogs.name',
    "partner_catalog_id": 'data.catalogs.partner_catalog_id',
    "rank": 'data.catalogs.rank',
    "sort_type": 'data.catalogs.sort_type',
    "is_group_discount": 'data.catalogs.is_group_discount',
}
fnd_catalog = pa.table(
    [stg_dishes[source] for source in catalog_column_sources.values()],
    names=list(catalog_column_sources),
)

Check key không khớp giữa 2 level

In [None]:
# Rows whose outer restaurant_id differs from the restaurant_id repeated inside the catalog record.
fnd_catalog.filter(pc.not_equal(fnd_catalog["restaurant_id"], fnd_catalog["restaurant_id_rep"])).to_pandas()

Unnamed: 0,restaurant_id,restaurant_id_rep,catalog_id,catalog_name,partner_catalog_id,rank,sort_type,is_group_discount
0,1361,813,9,Menu,,1,1,
1,1517,857,1,Breakfast,,1,1,
2,5523,13358,445,Sinh tố,,0,1,
3,5523,1350,291,Gỏi,,2,1,
4,5523,3244,179,Thịt gà (Gà thả vườn),,2,1,
5,5523,3599,459,Món Khác - Various Dishes,,10,1,
6,5523,5997,502,Cafe - Trà - Chocolate,,12,1,
7,5523,4335,368,Bò,,27,1,
8,5523,96,342,Bia - Beer,,29,1,


In [None]:
# Inspect a single catalog (id 342).
fnd_catalog.filter(pc.field("catalog_id") == 342).to_pandas()

Unnamed: 0,restaurant_id,restaurant_id_rep,catalog_id,catalog_name,partner_catalog_id,rank,sort_type,is_group_discount
0,5523,96,342,Bia - Beer,,29,1,


In [None]:
# Look up restaurant 5523 in the RESTAURANT table.
pd.read_sql_query("select * from restaurant where restaurant_id=5523", conn)

Unnamed: 0,restaurant_id,restaurant_url,restaurant_name,name_en,restaurant_short_description,address_detail,address_district,address_city,lat,lon,brand_id,delivery_avg_price,delivery_fees,delivery_has_contract,delivery_id,delivery_merchant_limit_distance,delivery_merchant_time,delivery_payment_methods,delivery_prepare_duration,delivery_service_by,delivery_setting_limit_distance,delivery_shipping_fee_minimum,delivery_shipping_fee_rate,limit_distance,asap_is_available,has_phone,is_city_alert,is_display_cutlery,is_quality_merchant,contract_type,position_verifie,foody_service_id,parent_category_id,min_order_value,price_range_max_price,price_range_min_price,total_like,total_order,rating_avg,rating_total_review,promotion_count,area_address,status,min_price,max_price,average_rating,position_rating,price_rating,quality_rating,service_rating,space_rating,checked_in_count,favourite_count,review_average_count,review_bad_count,review_count,review_excellent_count,review_good_count,view_count,wanted_count,picture_count
0,5523,five-oysters,Five Oysters - Authentic Vietnamese Cuisine,,,"234 - 236 Bùi Viện, P. Phạm Ngũ Lão",Quận 1,TP. HCM,10.765773,106.691027,-1,58750,[],1,925,-1,12,"[6, 1, 4, 8]",12,ShopeeFood,7000,16000.0,1.02,20000,0,1,0,1,0,1,1,1,2387,20000,110000.0,35000.0,0.0,249.0,2.3,3,5,Khu vực Phố Tây Phạm Ngũ Lão,Đang hoạt động,35000.0,110000.0,8.0,7.9,7.8,8.2,8.3,8.1,16,85,2.0,1.0,39.0,12.0,24.0,12661.0,144.0,306.0


In [None]:
# Look up restaurant 96 in the RESTAURANT table.
pd.read_sql_query("select * from restaurant where restaurant_id=96", conn)

Unnamed: 0,restaurant_id,restaurant_url,restaurant_name,name_en,restaurant_short_description,address_detail,address_district,address_city,lat,lon,brand_id,delivery_avg_price,delivery_fees,delivery_has_contract,delivery_id,delivery_merchant_limit_distance,delivery_merchant_time,delivery_payment_methods,delivery_prepare_duration,delivery_service_by,delivery_setting_limit_distance,delivery_shipping_fee_minimum,delivery_shipping_fee_rate,limit_distance,asap_is_available,has_phone,is_city_alert,is_display_cutlery,is_quality_merchant,contract_type,position_verifie,foody_service_id,parent_category_id,min_order_value,price_range_max_price,price_range_min_price,total_like,total_order,rating_avg,rating_total_review,promotion_count,area_address,status,min_price,max_price,average_rating,position_rating,price_rating,quality_rating,service_rating,space_rating,checked_in_count,favourite_count,review_average_count,review_bad_count,review_count,review_excellent_count,review_good_count,view_count,wanted_count,picture_count


Check catalog id duplicate

In [None]:
# Total rows minus distinct catalog_id values: how many rows repeat an already-seen catalog_id.
fnd_catalog.num_rows - pc.count_distinct(fnd_catalog["catalog_id"]).as_py()

3283

In [None]:
# catalog_id values that occur on more than one row, with their row counts.
fnd_catalog.group_by(["catalog_id"]).aggregate([
    ("catalog_id", "count")
]).filter(pc.field("catalog_id_count") > 1).to_pandas()

Unnamed: 0,catalog_id_count,catalog_id
0,2,179
1,3283,-1


In [None]:
# Inspect the rows that share catalog_id 179.
fnd_catalog.filter(pc.field("catalog_id") == 179).to_pandas()

Unnamed: 0,restaurant_id,restaurant_id_rep,catalog_id,catalog_name,partner_catalog_id,rank,sort_type,is_group_discount
0,3244,3244,179,Thịt gà (Gà thả vườn),,2,1,
1,5523,3244,179,Thịt gà (Gà thả vườn),,2,1,


Check xem nếu trùng id thì các field khác có trùng k

In [None]:
# Column names of the catalog table; the consistency check below iterates over them.
c = fnd_catalog.column_names
c

['restaurant_id',
 'restaurant_id_rep',
 'catalog_id',
 'catalog_name',
 'partner_catalog_id',
 'rank',
 'sort_type',
 'is_group_discount']

In [None]:
# For each column, count the catalog_ids that carry more than one distinct value.
for x in c:
  distinct_per_catalog = fnd_catalog.group_by(["catalog_id"]).aggregate([(x, "count_distinct")])
  conflicting = distinct_per_catalog.filter(pc.field(f"{x}_count_distinct") > 1)
  print("Checking ", x, ", number of invalidated catalogs: ", conflicting.num_rows)

Checking  restaurant_id , number of invalidated catalogs:  2
Checking  restaurant_id_rep , number of invalidated catalogs:  0
Checking  catalog_id , number of invalidated catalogs:  0
Checking  catalog_name , number of invalidated catalogs:  0
Checking  partner_catalog_id , number of invalidated catalogs:  0
Checking  rank , number of invalidated catalogs:  0
Checking  sort_type , number of invalidated catalogs:  0
Checking  is_group_discount , number of invalidated catalogs:  0


Drop dòng trùng

In [None]:
# Number of groups when grouping on every column, minus the row count: 0 means no fully identical rows.
fnd_catalog.group_by(fnd_catalog.column_names).aggregate([("catalog_id","count")]).num_rows - fnd_catalog.num_rows

0

In [None]:
# De-duplicate whole rows: group on every column, then discard the helper count.
all_catalog_columns = fnd_catalog.column_names
fnd_catalog = (
    fnd_catalog
    .group_by(all_catalog_columns)
    .aggregate([("catalog_id", "count")])
    .drop(["catalog_id_count"])
)

Save the table

In [None]:
# Write the catalog table to the Foundation folder as "<data_date>.restaurant_catalogs.parquet".
pq.write_table(fnd_catalog, _root.joinpath(f"Data/2 - cleaned data/Foundation/{data_date}.restaurant_catalogs.parquet"))

#### Extract dish information

In [None]:
# Output column name -> source column in the flattened staging table.
dish_column_sources = {
    "restaurant_id": 'restaurant_id',
    "catalog_id": 'data.catalogs.id',
    "dishes": "data.catalogs.dishes",
}
fnd_dish = pa.table(
    [stg_dishes[source] for source in dish_column_sources.values()],
    names=list(dish_column_sources),
)

Flatten và rename cột

In [None]:
# One row per dish, then flatten the dish struct into "dishes.*" columns.
fnd_dish = ArrowTableExplode(fnd_dish, "dishes").flatten()

In [None]:
def f(s):
  # "dishes.id" becomes "dish_id"; every other name just loses "dishes.".
  return "dish_id" if s == "dishes.id" else s.replace("dishes.", "")

new_c = list(map(f, fnd_dish.column_names))

# The first two columns are the outer (catalog-level) keys. They are renamed so
# they do not clash with the dish-level columns of the same name; any other
# layout means the input schema changed.
for position, expected, renamed in (
    (0, "restaurant_id", "primary_restaurant_id"),
    (1, "catalog_id", "primary_catalog_id"),
):
  if new_c[position] != expected:
    raise Exception("Schema have changed compared to schema of data collected in the date 2022/11/26, please recheck")
  new_c[position] = renamed

new_c

['primary_restaurant_id',
 'primary_catalog_id',
 'listing_status',
 'partner_dish_id',
 'description',
 'total_like',
 'restaurant_id',
 'rank',
 'dish_id',
 'property_info',
 'catalog_id',
 'stock_info',
 'is_hidden',
 'sale_time_info',
 'price',
 'is_group_discount_item',
 'name',
 'pictures',
 'discount_price',
 'is_searchable',
 'discount_remaining_quantity',
 'limit_type',
 'picture_label',
 'is_discount_topping',
 'discount_item_tag']

In [None]:
# Apply the cleaned column names computed in `new_c`.
fnd_dish = fnd_dish.rename_columns(new_c)

In [None]:
# Flatten the struct columns one more level (children are named "<parent>.<child>").
fnd_dish = fnd_dish.flatten()

In [None]:
def f(s):
  # Swap every "." separator for "_".
  return "_".join(s.split("."))

new_c = list(map(f, fnd_dish.column_names))
fnd_dish = fnd_dish.rename_columns(new_c)
new_c

['primary_restaurant_id',
 'primary_catalog_id',
 'listing_status',
 'partner_dish_id',
 'description',
 'total_like',
 'restaurant_id',
 'rank',
 'dish_id',
 'property_info_has_alcohol',
 'catalog_id',
 'stock_info_start_time',
 'stock_info_is_out_stocked',
 'stock_info_end_time',
 'is_hidden',
 'sale_time_info_loop_sale_days',
 'sale_time_info_is_in_sale_time',
 'sale_time_info_custom_sale_days',
 'price',
 'is_group_discount_item',
 'name',
 'pictures',
 'discount_price',
 'is_searchable',
 'discount_remaining_quantity',
 'limit_type',
 'picture_label_photos',
 'picture_label_label_position',
 'is_discount_topping',
 'discount_item_tag']

Xử lý catalog:
- Trong hierarchy cũ là catalog tới dish nên bị duplicate dish
- Trong phần này cần transform lại mỗi dòng là 1 unique dish của restaurant và danh sách các catalog mà dish này thuộc về

Kiểm tra xem cùng dish_id thì có cùng data k (integrity)

In [None]:
# For each column, count the dish_ids that carry more than one distinct value.
for x in fnd_dish.column_names:
  try:
    distinct_per_dish = fnd_dish.group_by(["dish_id"]).aggregate([(x, "count_distinct")])
    conflicting_dishes = distinct_per_dish.filter(pc.field(f"{x}_count_distinct") > 1).num_rows
  except pa.ArrowNotImplementedError:
    # Raised when Arrow has no count_distinct kernel for the column's type.
    print("Fail to check ", x, ", this could be because of this column is of type struct")
  else:
    print("Checking ", x, ", number of invalidated dishes: ", conflicting_dishes)

Checking  primary_restaurant_id , number of invalidated dishes:  0
Checking  primary_catalog_id , number of invalidated dishes:  16069
Checking  listing_status , number of invalidated dishes:  0
Checking  partner_dish_id , number of invalidated dishes:  0
Checking  description , number of invalidated dishes:  0
Checking  total_like , number of invalidated dishes:  0
Checking  restaurant_id , number of invalidated dishes:  0
Checking  rank , number of invalidated dishes:  0
Checking  dish_id , number of invalidated dishes:  0
Checking  property_info_has_alcohol , number of invalidated dishes:  0
Checking  catalog_id , number of invalidated dishes:  0
Checking  stock_info_start_time , number of invalidated dishes:  0
Checking  stock_info_is_out_stocked , number of invalidated dishes:  0
Checking  stock_info_end_time , number of invalidated dishes:  0
Checking  is_hidden , number of invalidated dishes:  0
Fail to check  sale_time_info_loop_sale_days , this could be because of this column 

Kiểm tra xem restaurant_id, catalog_id bên ngoài và bên trong có khớp k

In [None]:
# Rows where the dish-level restaurant_id differs from the outer (primary) restaurant_id.
fnd_dish.filter(
    pc.field("restaurant_id") != pc.field("primary_restaurant_id")
).to_pandas()

Unnamed: 0,primary_restaurant_id,primary_catalog_id,listing_status,partner_dish_id,description,total_like,restaurant_id,rank,dish_id,property_info_has_alcohol,catalog_id,stock_info_start_time,stock_info_is_out_stocked,stock_info_end_time,is_hidden,sale_time_info_loop_sale_days,sale_time_info_is_in_sale_time,sale_time_info_custom_sale_days,price,is_group_discount_item,name,pictures,discount_price,is_searchable,discount_remaining_quantity,limit_type,picture_label_photos,picture_label_label_position,is_discount_topping,discount_item_tag


In [None]:
# Rows where the dish-level catalog_id differs from the outer (primary) catalog_id.
fnd_dish.filter(
    pc.field("catalog_id") != pc.field("primary_catalog_id")
).to_pandas()

Unnamed: 0,primary_restaurant_id,primary_catalog_id,listing_status,partner_dish_id,description,total_like,restaurant_id,rank,dish_id,property_info_has_alcohol,catalog_id,stock_info_start_time,stock_info_is_out_stocked,stock_info_end_time,is_hidden,sale_time_info_loop_sale_days,sale_time_info_is_in_sale_time,sale_time_info_custom_sale_days,price,is_group_discount_item,name,pictures,discount_price,is_searchable,discount_remaining_quantity,limit_type,picture_label_photos,picture_label_label_position,is_discount_topping,discount_item_tag
0,1056007,-1,True,,Topping trân châu. Lượng đường cố định,6,1056007,0,27032187,False,2905616,0,False,0,False,"[{'time_for_sales': [{'start_time_sec': 0, 'en...",True,,28000.0,True,Trà Sữa Trân Châu,[{'url': 'https://images.foody.vn/res/g98/9786...,7000.0,False,1.0,2.0,,,,
1,1056007,-1,True,,Khách hàng vui lòng ghi chú lại Tên Vị muốn đặ...,0,1056007,0,23084533,False,2134639,0,False,0,False,"[{'time_for_sales': [{'start_time_sec': 0, 'en...",True,,169000.0,True,Combo 04 chai (gồm đủ 04 vị),[{'url': 'https://images.foody.vn/res/g98/9786...,112000.0,False,1.0,3.0,,,,
2,1076537,-1,True,,Topping trân châu. Lượng đường cố định,1,1076537,0,27032208,False,2905637,0,False,0,False,"[{'time_for_sales': [{'start_time_sec': 0, 'en...",True,,28000.0,True,Trà Sữa Trân Châu,[{'url': 'https://images.foody.vn/res/g98/9786...,12000.0,False,1.0,3.0,,,,
3,1080967,-1,True,,,0,1080967,0,87032839,False,3977684,0,False,0,False,"[{'time_for_sales': [{'start_time_sec': 0, 'en...",True,,130000.0,True,2 CƠM BA CHỈ CHIÊN GIÒN MẮM TỎI + 2 TRÀ CHANH,[{'url': 'https://images.foody.vn/res/g109/108...,112000.0,False,1.0,3.0,,,,
4,1084977,-1,True,,"4 viên sushi cá hồi - tôm thẻ, 6 viên sushi ma...",6,1084977,0,16782868,False,5977918,0,False,0,False,"[{'time_for_sales': [{'start_time_sec': 0, 'en...",True,,159000.0,True,Combo Sushi C5,[{'url': 'https://images.foody.vn/res/g109/108...,112000.0,False,1.0,3.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16064,1000033440,-1,True,,,0,1000033440,0,24993127,False,2205644,0,False,0,False,"[{'time_for_sales': [{'start_time_sec': 0, 'en...",True,,25000.0,True,Bánh tráng trộn,,12000.0,False,1.0,3.0,,,,
16065,1126084,-1,True,,,0,1126084,0,69462744,False,3435674,0,False,0,False,"[{'time_for_sales': [{'start_time_sec': 0, 'en...",True,,25000.0,True,Sữa Chua Trân Châu,,12000.0,False,1.0,1.0,,,,
16066,1126087,-1,True,,,0,1126087,0,69462747,False,3435677,0,False,0,False,"[{'time_for_sales': [{'start_time_sec': 0, 'en...",True,,25000.0,True,Sữa Chua Trân Châu,,12000.0,False,1.0,1.0,,,,
16067,1000032932,-1,True,,,1,1000032932,0,23802555,False,2164952,0,False,0,False,"[{'time_for_sales': [{'start_time_sec': 0, 'en...",True,,27000.0,True,Bánh mì thịt chả,,25000.0,False,,,,,,


Gom các catalog_id vô thành list, rồi distinct lại phần dish information

In [None]:
# Abandoned approach, kept commented out for reference (original note: "I was being dumb")
# t = fnd_dish\
#   .group_by(["primary_restaurant_id", "dish_id"])\
#   .aggregate([
#       ("primary_catalog_id", "count"),
#       ("catalog_id", "count"),
#   ])\
#   .sort_by([
#       ("primary_restaurant_id", "ascending"), 
#       ("dish_id", "ascending")
#   ])
# t = t.set_column(0, "primary_catalog_id_offset", pc.cumulative_sum(t["primary_catalog_id_count"], start=-1))
# t = t.set_column(1, "catalog_id_offset", pc.cumulative_sum(t["catalog_id_count"], start=-1))
# t

In [None]:
# Abandoned approach, kept commented out for reference (original note: "I was being dumb...")
# primary_catalog_id_arr = pa.ListArray.from_arrays(
#     offsets=t["primary_catalog_id_offset"].combine_chunks(),
#     values=fnd_dish.sort_by([
#       ("primary_restaurant_id", "ascending"), 
#       ("dish_id", "ascending")
#     ]).column("primary_catalog_id").combine_chunks()
# )
# catalog_id_arr = pa.ListArray.from_arrays(
#     offsets=t["catalog_id_offset"].combine_chunks(),
#     values=fnd_dish.sort_by([
#       ("primary_restaurant_id", "ascending"), 
#       ("dish_id", "ascending")
#     ]).column("catalog_id").combine_chunks()
# )

Lấy 1 giá trị cho mỗi dish_id (xử lý cho case cột kiểu struct)

In [None]:
# Tag every row with its position, then keep one row position per dish_id.
row_positions = pa.array(range(fnd_dish.num_rows))
dishes_first_rows = (
    fnd_dish
    .append_column("index", row_positions)
    .group_by(["dish_id"])
    .aggregate([("index", "one")])
)

In [None]:
# Display the chosen row position for each dish_id.
dishes_first_rows

pyarrow.Table
index_one: int64
dish_id: int64
----
index_one: [[0,1,2,3,4,...,1870579,1870580,1870581,1870582,1870583]]
dish_id: [[5326376,2408862,5326377,5325374,2449356,...,121625879,91725428,91725429,91725430,91725431]]

In [None]:
# Columns that are picked by row position (via dishes_first_rows) rather than
# through the group_by aggregation, plus dish_id to sort/align on.
special_col_names = [
    "dish_id",
    "sale_time_info_loop_sale_days",
    "sale_time_info_custom_sale_days",
    "pictures",
    "picture_label_photos",  # omitted in data 2022.12.03 (this version doesn't have this column)
]
# Keep only the columns this snapshot actually contains, so a missing optional
# column (see the note above) no longer raises KeyError.
special_cols = {
    k: fnd_dish[k].take(dishes_first_rows["index_one"])
    for k in special_col_names
    if k in fnd_dish.column_names
}
special_cols = pa.Table.from_pydict(special_cols).sort_by("dish_id")

In [None]:
# Columns that are not aggregated with "one": the grouping keys, the two
# catalog id columns (collected into lists) and the columns handled separately
# through special_cols.
not_single_valued = {
    "primary_restaurant_id",
    "dish_id",
    "primary_catalog_id",
    "catalog_id",
    "sale_time_info_loop_sale_days",
    "sale_time_info_custom_sale_days",
    "pictures",
    "picture_label_photos",
}
agg = [("primary_catalog_id", "list"), ("catalog_id", "list")]
agg.extend(
    (name, "one") for name in fnd_dish.column_names if name not in not_single_valued
)
agg

[('primary_catalog_id', 'list'),
 ('catalog_id', 'list'),
 ('listing_status', 'one'),
 ('partner_dish_id', 'one'),
 ('description', 'one'),
 ('total_like', 'one'),
 ('restaurant_id', 'one'),
 ('rank', 'one'),
 ('property_info_has_alcohol', 'one'),
 ('stock_info_start_time', 'one'),
 ('stock_info_is_out_stocked', 'one'),
 ('stock_info_end_time', 'one'),
 ('is_hidden', 'one'),
 ('sale_time_info_is_in_sale_time', 'one'),
 ('price', 'one'),
 ('is_group_discount_item', 'one'),
 ('name', 'one'),
 ('discount_price', 'one'),
 ('is_searchable', 'one'),
 ('discount_remaining_quantity', 'one'),
 ('limit_type', 'one'),
 ('picture_label_label_position', 'one'),
 ('is_discount_topping', 'one'),
 ('discount_item_tag', 'one')]

In [None]:
# Collapse to one row per (restaurant, dish) and order by dish_id.
dish_keys = ["primary_restaurant_id", "dish_id"]
fnd_dish = fnd_dish.group_by(dish_keys).aggregate(agg)
fnd_dish = fnd_dish.sort_by("dish_id")

In [None]:
# The special columns are attached purely by row position, so both tables must
# list exactly the same dish_ids in the same order. Check that explicitly
# instead of trusting the two independent sorts.
if not fnd_dish["dish_id"].equals(special_cols["dish_id"]):
  raise ValueError(
      "dish_id mismatch between fnd_dish and special_cols; cannot append columns by position")

for c in special_cols.column_names:
  # Skip dish_id (already present) and anything already appended, so that
  # re-running the cell does not create duplicate column names.
  if c != "dish_id" and c not in fnd_dish.column_names:
    fnd_dish = fnd_dish.append_column(c, special_cols[c])

In [None]:
# Inspect the final schema before writing.
fnd_dish.schema

primary_catalog_id_list: list<item: int64>
  child 0, item: int64
catalog_id_list: list<item: int64>
  child 0, item: int64
listing_status_one: bool
partner_dish_id_one: string
description_one: string
total_like_one: int64
restaurant_id_one: int64
rank_one: int64
property_info_has_alcohol_one: bool
stock_info_start_time_one: int64
stock_info_is_out_stocked_one: bool
stock_info_end_time_one: int64
is_hidden_one: bool
sale_time_info_is_in_sale_time_one: bool
price_one: double
is_group_discount_item_one: bool
name_one: string
discount_price_one: double
is_searchable_one: bool
discount_remaining_quantity_one: int64
limit_type_one: int64
picture_label_label_position_one: int64
is_discount_topping_one: bool
discount_item_tag_one: string
primary_restaurant_id: int64
dish_id: int64
sale_time_info_loop_sale_days: list<element: struct<time_for_sales: list<element: struct<start_time_sec: int64, end_time_sec: int64>>, weekday: int64>>
  child 0, element: struct<time_for_sales: list<element: struct

In [None]:
# Write the dish table to the Foundation folder as "<data_date>.restaurant_dishes.parquet".
pq.write_table(fnd_dish, _root.joinpath(f"Data/2 - cleaned data/Foundation/{data_date}.restaurant_dishes.parquet"))