In [None]:
import os
import concurrent.futures

import json
import requests
from datetime import date

import pandas as pd
import sqlite3 as lite
from tqdm import tqdm

import io
import json
import pyarrow as pa
import pyarrow.json as pj
import pyarrow.parquet as pq

# Mount Google Drive and connect to SQLite3

In [None]:
# Colab only: expose Google Drive under /content/drive, which is where the
# data folder used by the following cells lives.
from pathlib import Path

from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the crawl folder on Drive (dated dish dumps, database copy, helper files).
!ls drive/MyDrive/Data/'1 - raw data'/shopee/'shopee dish'

 20221126_dishes    dishes
 20221129_dishes    dishes21_11.zip
 20221203_dishes    dishes_restaurant_ids.xlsx
 20221206_dishes   'Shopee food - get detail & dishes.ipynb'
'Copy of Main.db'   test_schema


In [None]:
# Location of the crawl folder on the mounted Drive and of the SQLite file in it.
root_path="/content/drive/MyDrive/Data/1 - raw data/shopee/shopee dish"
db='Copy of Main.db'
_root = Path(root_path)
db_path=_root.joinpath(db)
#----------------------------------------
# sqlite3.connect() creates a new, empty database when the file does not exist,
# which would only surface later as a "no such table" error. Fail early instead
# (typical cause: Drive not mounted or the file was renamed).
if not db_path.is_file():
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

try:
    conn = lite.connect(db_path)
    # Report the version of the SQLite library itself; `lite.version` was only
    # the legacy adapter version and is deprecated in recent Python releases.
    print(f"sqlite3 {lite.sqlite_version} has connected to database successfully")
except lite.Error as e:
    print("Error: ", e)
    # Re-raise so that later cells do not fail with a confusing NameError on `conn`.
    raise

sqlite3 2.6.0 has connected to database successfully


# Get detail

## Test

In [None]:
# Endpoint queried by the detail requests in this section; the restaurant is
# selected through the query parameters, not through the URL path.
get_detail_url='https://gappapi.deliverynow.vn/api/delivery/get_detail'

In [None]:
# Headers sent with the sample get_detail request. The access token and the
# device fingerprint are deliberately left as None here.
header = {
    'accept': 'application/json, text/plain, */*',
    'x-foody-client-id': '525d4c83-bab2-4d1e-b17e-f867be8de94f',
    'x-foody-client-type': '4',
    'x-foody-app-type': '1004',
    'x-foody-client-version': '5.23.0',
    'x-foody-api-version': '1',
    'x-foody-client-language': 'en',
    'x-foody-access-token': None,
    'user-agent': 'NOW/5.13.2 (sdk_gphone64_x86_64; android 33; Scale/2.75)',
    'x-foody-client-rn-version': '5.13.2',
    'x-foody-device-fingerprint': None,
    'accept-encoding': 'gzip',
}

# Query parameters selecting one sample restaurant.
query = {'request_id': 920007, 'id_type': 1}

#### Benchmark thời gian request

In [None]:
%%timeit 10
response=requests.get(get_detail_url, params=query, headers=header)

1.6 s ± 23.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


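Như vậy estimate: 61k quán \* 0.5s/request = 8.5 hrs. (Lưu ý: benchmark ở trên đo được khoảng 1.6s/request; với con số đó thì 61k quán \* 1.6s/request ≈ 27 hrs.)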

#### Thử dùng multithread

In [None]:
def request_detail(restaurant_id, timeout=30):
  """Send one get_detail request for a restaurant.

  Uses the notebook-level `get_detail_url` and `header` defined in the cells above.

  Args:
    restaurant_id: Value sent as the `request_id` query parameter.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a stalled connection would block its worker thread forever.

  Returns:
    The unparsed `requests.Response`.

  Raises:
    requests.exceptions.RequestException: On connection errors or timeout.
  """
  return requests.get(
      get_detail_url,
      params={'request_id': restaurant_id, 'id_type': 1},
      headers=header,
      timeout=timeout,
  )

In [None]:
%%time
t = []
i=0

with concurrent.futures.ThreadPoolExecutor() as executor:
  future_samples = []
  counter = 0

  while i<100:
    future_samples.append(executor.submit(request_detail, 920007))
    counter += 1

    if counter == 100:
      for future in concurrent.futures.as_completed(future_samples):
          try:
              response = future.result()
          except Exception as exc:
              print('Generated an exception: %s' % (exc))
          else:
              i += 1
              #pbar.update(1)
              t.append(response.json())

              if i==100: break

      future_samples = []
      counter = 0

  #pbar.close()

CPU times: user 1.46 s, sys: 147 ms, total: 1.61 s
Wall time: 28 s


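Như vậy estimate: 61k quán / 100 (requests/batch) * 5s/batch = 0.8 hr. (Lưu ý: lần chạy ở trên mất 28s wall time cho batch 100 requests; với con số đó thì 61k quán / 100 * 28s/batch ≈ 4.8 hrs.)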

## Multithreaded request

In [None]:
def request_detail(restaurant_id, timeout=30):
  """Send one get_detail request for a restaurant (self-contained version).

  Unlike the benchmark helper defined earlier under the same name, this
  version carries its own URL and headers, so it does not depend on
  notebook-level variables.

  Args:
    restaurant_id: Value sent as the `request_id` query parameter.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a stalled connection would block its worker thread, and with it
      the whole batch that waits on that thread.

  Returns:
    The unparsed `requests.Response`.

  Raises:
    requests.exceptions.RequestException: On connection errors or timeout.
  """
  return requests.get(
      'https://gappapi.deliverynow.vn/api/delivery/get_detail',
      params={'request_id': restaurant_id, 'id_type': 1},
      headers={
        'accept': 'application/json, text/plain, */*'
        ,'x-foody-client-id': '525d4c83-bab2-4d1e-b17e-f867be8de94f'
        ,'x-foody-client-type': '4'
        ,'x-foody-app-type': '1004'
        ,'x-foody-client-version': '5.23.0'
        ,'x-foody-api-version': '1'
        ,'x-foody-client-language': 'en'
        # Headers whose value is None are left out of the request by `requests`.
        ,'x-foody-access-token': None
        ,'user-agent': 'NOW/5.13.2 (sdk_gphone64_x86_64; android 33; Scale/2.75)'
        ,'x-foody-client-rn-version': '5.13.2'
        ,'x-foody-device-fingerprint': None
        ,'accept-encoding': 'gzip'
      },
      timeout=timeout)

In [None]:
# Crawl targets: every distinct restaurant_id stored in the `restaurant` table.
restaurant_rows = pd.read_sql_query(
    "SELECT DISTINCT restaurant_id FROM restaurant",
    conn,
)
list_restaurant_id = restaurant_rows["restaurant_id"].to_list()

In [None]:
# Manually driven progress bar sized to the number of restaurants; the crawl
# loop below advances it with pbar.update(). Re-create it before re-running the
# crawl, otherwise the count carries over.
pbar = tqdm(total = len(list_restaurant_id))

  0%|          | 0/61581 [00:00<?, ?it/s]

In [None]:
# Crawl get_detail for every restaurant, in batches that are submitted to the
# thread pool and fully drained before the next batch is queued.
#
# Fixes compared with the previous version of this cell:
#   * each restaurant is requested once (the old loop indexed the list with a
#     counter that only advanced on completion, so one id was submitted 100
#     times per batch and most ids were never requested);
#   * the leftover benchmark limit `if i==100: break` is gone;
#   * the extracted payload is actually stored in `t`;
#   * the last, possibly shorter, batch is processed as well.
t = []                    # `delivery_detail` payloads of successful responses
failed_detail_ids = []    # ids whose request or JSON decoding raised
total_len = len(list_restaurant_id)
detail_batch_size = 100   # futures queued before waiting for them to finish

with concurrent.futures.ThreadPoolExecutor() as executor:
  for batch_start in range(0, total_len, detail_batch_size):
    batch_ids = list_restaurant_id[batch_start:batch_start + detail_batch_size]

    # Map each future back to its id so failures can be reported and retried.
    future_samples = {
        executor.submit(request_detail, restaurant_id): restaurant_id
        for restaurant_id in batch_ids
    }

    for future in concurrent.futures.as_completed(future_samples):
      try:
        response = future.result().json()
      except Exception as exc:
        print('Generated an exception: %s' % (exc))
        failed_detail_ids.append(future_samples[future])
      else:
        # NOTE(review): the commented-out "old method" in this notebook reads
        # `delivery_detail` from under a top-level `reply` key, while this cell
        # used to look at the top level. Both layouts are accepted here;
        # confirm against a live response.
        payload = response if isinstance(response, dict) else {}
        reply = payload.get('reply') or payload
        if isinstance(reply, dict) and 'delivery_detail' in reply:
          t.append(reply['delivery_detail'])

      # One tick per finished request, successful or not, so the bar ends at 100%.
      pbar.update(1)

## Old method

In [None]:
# def get_detail(url,header,param,restaurant_list):
#   #-----------------------------------------------------------------
#   count=0
#   for id in restaurant_list:
#     count+=1
#     print(count)
#     param['request_id']=id
#     respone=requests.get(url, params=param, headers=header)
#     respone_1=json.loads(respone.text)['reply']
#     if 'delivery_detail' in respone_1.keys():
#       data=respone_1['delivery_detail']
#     else:
#       print(respone_1)
#       continue
#  #------------------------------------------------------------------
#     restaurant_id.append(data['restaurant_id'])
#     has_phone.append(len(data['phones'])>0)
#     restaurant_short_description.append(data['short_description'])
#     if data['brand']:
#       brand_id.append(data['brand']['brand_id'])
#       brand_url.append(data['brand']['brand_url'])
#       brand_name.append(data['brand']['name'])
#       restaurant_count.append(data['brand']['restaurant_count'])
#     else:
#       brand_id.append(None)
#       brand_url.append(None)
#       brand_name.append(None)
#       restaurant_count.append(None)
#     limit_distance.append(data['limit_distance'])
#     categories.append('*#*'.join(data['categories']))
#     is_favorite.append(data['is_favorite'])
#     delivery_id.append(data['delivery_id'])
#     total_order.append(data['total_order'])
#     rating_total_review.append(data['rating']['total_review'])
#     rating_avg.append(data['rating']['avg'])
#     rating_display_total_review.append(data['rating']['display_total_review'])
#     rating_app_link.append(data['rating']['app_link'])
#     is_subscribe.append(data['is_subscribe'])
#     asap_is_available.append(data['asap_is_available'])
#     is_city_alert.append(data['is_city_alert'])
#     contract_type.append(data['contract_type'])
#     delivery_fees.append(json.dumps(data['delivery_fees']))
#     vat.append(data['vat'])
#     confirm_language.append(data['confirm_language'])
#     service_type.append(data['service_type'])
#     user_favorite_count.append(data['user_favorite_count'])
#     delivery_week_days.append(json.dumps(data['delivery']['time']['week_days']))
#     delivery_service_by.append(data['delivery']['service_by'])
#     delivery_service_fee.append(data['delivery']['service_fee']['value'])
#     delivery_merchant_limit_distance.append(data['delivery']['merchant_limit_distance'])
#     delivery_payment_methods.append(json.dumps(data['delivery']['payment_methods']))
#     delivery_has_contract.append(data['delivery']['has_contract'])
#     delivery_setting_limit_distance.append(data['delivery']['setting_limit_distance'])
#     delivery_merchant_time.append(data['delivery']['merchant_time'])
#     delivery_prepare_duration.append(data['delivery']['prepare_duration'])
#     delivery_ship_types.append(json.dumps(data['delivery']['ship_types']))
#     delivery_avg_price.append(data['delivery']['avg_price']['value'])
#     delivery_avg_price_unit.append(data['delivery']['avg_price']['unit'])
#     delivery_shipping_fee_value.append(data['delivery']['shipping_fee']['value'])
#     delivery_shipping_fee_minimum.append(data['delivery']['shipping_fee']['minimum_fee'])
#     delivery_shipping_fee_rate.append(data['delivery']['shipping_fee']['rate'])
#     delivery_min_charge.append(data['delivery']['min_charge'])
#     is_display_cutlery.append(data['is_display_cutlery'])
#     confirm_methods.append(json.dumps(data['confirm_methods']))
#     name_en.append(data['name_en'])
#     foody_service_id.append(data['foody_service_id'])
#     min_order_value.append(data['min_order_value']['value'])
#     root_category_ids.append(data['root_category_ids'])
#     promotion_count.append(len(data['promotions']))
#     price_range_min_price.append(data['price_range']['min_price'])
#     price_range_max_price.append(data['price_range']['max_price'])
#     parent_category_id.append(data['parent_category_id'])
#     position_verified.append(data['position']['is_verified'])
#   total_data={
#       'restaurant_id':restaurant_id,
#       'has_phone':has_phone,
#       'restaurant_short_description':restaurant_short_description,
#       'brand_id':brand_id,
#       'brand_url':brand_url,
#       'brand_name':brand_name,
#       'restaurant_count':restaurant_count,
#       'limit_distance':limit_distance,
#       'categories':categories,
#       'is_favorite':is_favorite,
#       'delivery_id':delivery_id,
#       'total_order':total_order,
#       'rating_total_review':rating_total_review,
#       'rating_avg':rating_avg,
#       'rating_display_total_review':rating_display_total_review,
#       'rating_app_link':rating_app_link,
#       'is_subscribe':is_subscribe,
#       'asap_is_available':asap_is_available,
#       'is_city_alert':is_city_alert,
#       'contract_type':contract_type,
#       'delivery_fees':delivery_fees,
#       'vat':vat,
#       'confirm_language':confirm_language,
#       'service_type':service_type,
#       'user_favorite_count':user_favorite_count,
#       'delivery_week_days':delivery_week_days,
#       'delivery_service_by':delivery_service_by,
#       'delivery_service_fee':delivery_service_fee,
#       'delivery_merchant_limit_distance':delivery_merchant_limit_distance,
#       'delivery_payment_methods':delivery_payment_methods,
#       'delivery_has_contract':delivery_has_contract,
#       'delivery_setting_limit_distance':delivery_setting_limit_distance,
#       'delivery_merchant_time':delivery_merchant_time,
#       'delivery_prepare_duration':delivery_prepare_duration,
#       'delivery_ship_types':delivery_ship_types,
#       'delivery_avg_price':delivery_avg_price,
#       'delivery_avg_price_unit':delivery_avg_price_unit,
#       'delivery_shipping_fee_value':delivery_shipping_fee_value,
#       'delivery_shipping_fee_minimum':delivery_shipping_fee_minimum,
#       'delivery_shipping_fee_rate':delivery_shipping_fee_rate,
#       'delivery_min_charge':delivery_min_charge,
#       'is_display_cutlery':is_display_cutlery,
#       'confirm_methods':confirm_methods,
#       'name_en':name_en,
#       'foody_service_id':foody_service_id,
#       'min_order_value':min_order_value,
#       'root_category_ids':root_category_ids,
#       'promotion_count':promotion_count,
#       'price_range_min_price':price_range_min_price,
#       'price_range_max_price':price_range_max_price,
#       'parent_category_id':parent_category_id,
#       'position_verifie':position_verified
#       }
#   return pd.DataFrame(total_data)

In [None]:
# district=pd.read_excel('restaurant_id_Quận 2.xlsx')
# list_restaurant_id=list(district['restaurant_id'])

In [None]:
# detail_df=get_detail(get_detail_url,header,query,list_restaurant_id)

In [None]:
# detail_df.to_excel('detail_Quận 2.xlsx')

# Get dishes

In [None]:
def request_dish_detail(restaurant_id, timeout=30):
  """Request the dish catalogue of one restaurant.

  Args:
    restaurant_id: Value sent as the `restaurant_id` query parameter.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout a stalled connection would block its worker thread forever.

  Returns:
    The unparsed `requests.Response`.

  Raises:
    requests.exceptions.RequestException: On connection errors or timeout.
  """
  # NOTE(security): an access token is committed in this notebook. Set the
  # FOODY_ACCESS_TOKEN environment variable to override it, and consider
  # rotating the token and removing the literal fallback below.
  # Observation from the original author: the token does not seem to change,
  # so the response may depend on restaurant_id only.
  access_token = os.environ.get('FOODY_ACCESS_TOKEN') or (
      '170f42e77caf17826dc30f7598aa2a56bac308fb8923a173a8fd8abfbcf53af79a5d25fe63d7deccf4b92c0384b0b718a5cf5b750b7e940bdb02507af7e38797'
  )
  return requests.get(
      'https://gappapi.deliverynow.vn/api/v5/buyer/store/dishes',
      params={'restaurant_id': restaurant_id},
      headers={
        'accept': 'application/json, text/plain, */*'
        ,'x-foody-client-id': '525d4c83-bab2-4d1e-b17e-f867be8de94f'
        ,'x-foody-client-type': '4'
        ,'x-foody-app-type': '1004'
        ,'x-foody-client-version': '5.23.0'
        ,'x-foody-api-version': '1'
        ,'x-foody-client-language': 'en'
        ,'x-foody-access-token': access_token
        ,'user-agent': 'NOW/5.13.2 (sdk_gphone64_x86_64; android 33; Scale/2.75)'
        ,'x-foody-client-rn-version': '5.13.2'
        # Headers whose value is None are left out of the request by `requests`.
        ,'x-foody-device-fingerprint': None
        ,'accept-encoding': 'gzip'
      },
      timeout=timeout)

In [None]:
#demo
respone=request_dish_detail(1137988)
df=json.loads(respone.text)

In [None]:
def get_dish_detail(restaurant_id):
    """Download one restaurant's dishes and parse them into a pyarrow table.

    Args:
        restaurant_id: Restaurant whose dishes are requested; also stored in
            an extra `restaurant_id` column of the result.

    Returns:
        The parsed response as a pyarrow table with a `restaurant_id` column
        appended.

    Raises:
        Whatever `request_dish_detail` or the pyarrow JSON reader raise
        (network errors, malformed JSON).
    """
    #-----------------------------------------------------------------------
    payload = request_dish_detail(restaurant_id).content

    # The reader splits its input into blocks (1 MiB by default) and fails with
    # "straddling object straddles two block boundaries" when a single JSON
    # object is larger than that, which happened for restaurants with very
    # large menus. Size the block so the whole body always fits in one.
    read_options = pj.ReadOptions(block_size=max(len(payload) + 1, 1 << 20))
    data = pj.read_json(io.BytesIO(payload), read_options=read_options)

    # Assumes the body is a single JSON object, i.e. a one-row table, so a
    # one-element column matches its length. TODO confirm for all responses.
    data = data.append_column('restaurant_id', [[restaurant_id]])
    #-----------------------------------------------------------------------
    return data

In [None]:
# Crawl targets for the dishes section: every distinct restaurant_id stored in
# the `restaurant` table.
restaurant_rows = pd.read_sql_query(
    "SELECT DISTINCT restaurant_id FROM restaurant",
    conn,
)
list_restaurant_id = restaurant_rows["restaurant_id"].to_list()

In [None]:
# Empty the list of result
dataset = []
err_samples = {}

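Có thể crawl toàn bộ trong một lần, hoặc chia ra chạy ngắt quãng nhiều lần (đặt `skip` bằng vị trí muốn chạy tiếp trong `list_restaurant_id`).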

In [None]:
# Crawl the dishes of every restaurant from index `skip` onwards. Requests are
# queued in batches of `batch_size` and each batch is fully drained before the
# next one starts. Results go to `dataset`, failures to `err_samples`.
total_len = len(list_restaurant_id)
skip = 0           # index in list_restaurant_id to start from (use it to resume)
batch_size = 200   # futures queued before waiting for them to finish

with concurrent.futures.ThreadPoolExecutor() as executor:
  future_samples = {}   # future -> restaurant_id, to trace failures back

  for i in tqdm(range(skip, total_len)):
    restaurant_id = list_restaurant_id[i]
    future_samples[executor.submit(get_dish_detail, restaurant_id)] = restaurant_id

    # Keep queueing until the batch is full; the last index flushes whatever is left.
    is_last = i == (total_len - 1)
    if len(future_samples) < batch_size and not is_last:
      continue

    for future in concurrent.futures.as_completed(future_samples):
      try:
        dataset.append(future.result())
      except Exception as exc:
        print('Generated an exception: %s' % (exc))
        # Do not name this `id`: at notebook level that would shadow the
        # builtin for the rest of the kernel session.
        failed_id = future_samples[future]
        # Keep the raw body for later inspection. The re-fetch can fail too
        # (e.g. when the original error was a network problem) and must not
        # abort a crawl that runs for hours.
        try:
          err_samples[failed_id] = request_dish_detail(failed_id).content
        except Exception as retry_exc:
          print('Could not re-fetch restaurant %s: %s' % (failed_id, retry_exc))
          err_samples[failed_id] = None

    # Reset batch
    future_samples = {}

 27%|██▋       | 16800/61581 [56:02<2:35:47,  4.79it/s]

Generated an exception: straddling object straddles two block boundaries (try to increase block size?)


100%|██████████| 61581/61581 [3:31:38<00:00,  4.85it/s]


In [None]:
# err_samples -> 1070345 --> Siêu thị Mekong Gourmet, rất nhiều dish
# request_dish_detail(1070345).text

Gom nhóm data theo các schema khác nhau và lưu xuống nhiều file

In [None]:
# Maps each distinct table schema to the list of tables sharing it; filled by
# the grouping loop in the next cell.
schemas = {}

In [None]:
# Bucket the crawled tables by schema: the first table with a new schema opens
# a bucket, later ones with the same schema are appended to it.
for sample in dataset:
  schemas.setdefault(sample.schema, []).append(sample)

In [None]:
# Output folder for today's run, <root_path>/<YYYYMMDD>_dishes, created if missing.
p = Path(root_path) / f"{date.today().strftime('%Y%m%d')}_dishes"
p.mkdir(parents=True, exist_ok=True)

# One parquet file per schema bucket, numbered in the order the buckets were created.
for i, tables in enumerate(schemas.values()):
  merged = pa.concat_tables(tables)
  pq.write_table(merged, p / f"dishes_schema{i:03d}.parquet")

## Old method

In [None]:
# def get_dishes(url, query, header,restaurant_list):
#   catalog_id=[]
#   catalog_name=[]
#   catalog_rank=[]
#   catalog_partner_catalog_id=[]
#   catalog_description=[]
#   dish_restaurant_id=[]
#   dish_id=[]
#   dish_name=[]
#   dish_partner_dish_id=[]
#   dish_listing_status=[]
#   dish_description=[]
#   dish_total_like=[]
#   dish_rank=[]
#   dish_picture_label=[]
#   dish_is_hidden=[]
#   dish_price=[]
#   dish_is_group_discount_item=[]
#   dishes_property_info=[]

#   for restaurant_id in restaurant_list:
#     query['restaurant_id']=restaurant_id
#     #-----------------------------------------------------------------------
#     respone=json.loads(requests.get(get_dishes_url, params=query, headers=header).text)
#     #-----------------------------------------------------------------------
#     if 'data' in respone:
#       if 'catalogs' in respone['data']:
#         for catalog in respone['data']['catalogs']:
#           for dishes in catalog['dishes']:
#             catalog_id.append(catalog['id'])
#             catalog_name.append(catalog['name'])
#             catalog_rank.append(catalog['rank'])
#             if 'partner_catalog_id' in catalog:
#               catalog_partner_catalog_id.append(catalog['partner_catalog_id'])
#             else:
#               catalog_partner_catalog_id.append(None)
#             if 'description' in catalog:
#               catalog_description.append(catalog['description'])
#             else:
#               catalog_description.append(None)
#             dish_restaurant_id.append(dishes['restaurant_id'])
#             dish_id.append(dishes['id'])
#             dish_name.append(dishes['name'])
#             dish_partner_dish_id.append(dishes['partner_dish_id'])
#             dish_listing_status.append(dishes['listing_status'])
#             dish_description.append(dishes['description'])
#             dish_total_like.append(dishes['total_like'])
#             dish_rank.append(dishes['rank'])
#             if 'picture_label' in dishes:
#               dish_picture_label.append(dishes['picture_label'])
#             else:
#               dish_picture_label.append(None)
#             dish_is_hidden.append(dishes['is_hidden'])
#             dish_price.append(dishes['price'])
#             dish_is_group_discount_item.append(dishes['is_group_discount_item'])
#             dishes_property_info.append(json.dumps(dishes['property_info']))
#       else:
#         continue

#   data_dict={
#         'catalog_id':catalog_id
#         ,'catalog_name':catalog_name
#         ,'catalog_rank':catalog_rank
#         ,'catalog_partner_catalog_id':catalog_partner_catalog_id
#         ,'catalog_description':catalog_description
#         ,'dish_restaurant_id':dish_restaurant_id
#         ,'dish_id':dish_id
#         ,'dish_name':dish_name
#         ,'dish_partner_dish_id':dish_partner_dish_id
#         ,'dish_listing_status':dish_listing_status
#         ,'dish_description':dish_description
#         ,'dish_total_like':dish_total_like
#         ,'dish_rank':dish_rank
#         ,'dish_picture_label':dish_picture_label
#         ,'dish_is_hidden':dish_is_hidden
#         ,'dish_price':dish_price
#         ,'dish_is_group_discount_item':dish_is_group_discount_item
#         ,'dishes_property_info':dishes_property_info
#   }
#   return pd.DataFrame(data_dict)

In [None]:
# district=pd.read_excel('restaurant_id_Quận 2.xlsx')
# list_restaurant_id=list(district['restaurant_id'])

In [None]:
# df=get_dishes(get_dishes_url, query, header,list_restaurant_id)

In [None]:
# df.to_excel('dishes_Quận 2.xlsx')

In [None]:
# !pwd