In [85]:
import glob
import hashlib

import pandas as pd

## Helpers

In [87]:
def read_all(path_glob: str) -> pd.DataFrame:
    """Load every JSON Lines file matching ``path_glob`` into one DataFrame.

    Files are read in sorted path order. ``glob.glob`` returns matches in
    arbitrary order, so without sorting the row order of the result (and
    therefore which row a later "keep first" de-duplication retains) could
    differ between runs or machines.

    Args:
        path_glob: Glob pattern selecting the ``.jsonl`` files to load.

    Returns:
        The rows of all matched files stacked together with a fresh
        ``0..n-1`` index.

    Raises:
        ValueError: If no file matches ``path_glob``.
    """
    paths = sorted(glob.glob(path_glob))
    if not paths:
        # pd.concat([]) would raise a bare "No objects to concatenate";
        # keep the exception type but say which pattern came up empty.
        raise ValueError(f"No files matched pattern: {path_glob!r}")

    frames = [pd.read_json(path, lines=True) for path in paths]
    return pd.concat(frames, ignore_index=True)

In [88]:
def derive_hash(x: str) -> "str | None":
    """Return the hex-encoded SHA-256 digest of ``x`` (UTF-8 encoded).

    Args:
        x: Text to hash, typically a single cell value from a DataFrame column.

    Returns:
        The 64-character hexadecimal digest, or ``None`` when ``x`` is
        missing: ``None``, the empty string, or a pandas null marker
        (``NaN`` / ``pd.NA``).
    """
    # Test for pandas nulls before comparing with "": `pd.NA == ""` evaluates
    # to pd.NA, whose truth value raises TypeError, and NaN has no `.encode`.
    if x is None or (not isinstance(x, str) and pd.isna(x)):
        return None
    if x == "":
        return None

    return hashlib.sha256(x.encode("utf-8")).hexdigest()

## Tokopedia Dataset

In [89]:
# Load every Tokopedia scrape dump (JSON Lines) into a single frame.
df_tokopedia = read_all(path_glob="../data/scrape-api/tokopedia/*.jsonl")

# Preview the first two rows as a quick look at the available columns.
df_tokopedia.head(n=2)

Unnamed: 0,product_id,title,url,currency,delivery,final_price,initial_price,seller_name,description,availability,...,seller_image,rating_breakdown,vouchers,timestamp,input,discovery_input,error,error_code,warning,warning_code
0,13967750000.0,MSI THIN 15 B12UC RTX3050 I5-12450H 8GB 512GB ...,https://www.tokopedia.com/agresid/msi-thin-15-...,IDR,"[Dikirim dari Jakarta Utara, Ongkir Reguler 56...",9799000.0,14399000.0,AGRES ID,PROMO SPECIAL !!\nFREE SPEAKER HOME THEATER (S...,999.0,...,https://images.tokopedia.net/img/cache/215-squ...,"[{'rate': 5, 'count': 11}, {'rate': 4, 'count'...","[{'voucher_name': 'Computer Post 1212', 'vouch...",2024-12-17 08:37:44.292000+00:00,{'url': 'https://www.tokopedia.com/agresid/msi...,{'keyword': 'MSI 15'},,,,
1,11623880000.0,MSI KATANA 15 B13VEK i7-13620H 16GB 1TB SSD RT...,https://www.tokopedia.com/msi-official/msi-kat...,IDR,"[Dikirim dari Jakarta Pusat, Ongkir Reguler 48...",16999000.0,19999000.0,MSI Official Store,Katana 15 B13VEK-1851ID - INCLUDE OHS2021\n\nB...,593.0,...,https://images.tokopedia.net/img/cache/215-squ...,"[{'rate': 5, 'count': 85}, {'rate': 4, 'count'...","[{'voucher_name': 'Computer Post 1212', 'vouch...",2024-12-17 08:39:27.165000+00:00,{'url': 'https://www.tokopedia.com/msi-officia...,{'keyword': 'MSI 15'},,,,


In [90]:
# Keep only rows that have a URL, tag them with their marketplace, and derive
# the row ID by hashing that URL.
df_tokopedia = (
    df_tokopedia
    .dropna(subset=["url"])
    .assign(
        source="tokopedia",
        id=lambda frame: frame["url"].apply(derive_hash),
    )
)

## Lazada

In [91]:
# Load every Lazada scrape dump (JSON Lines) into a single frame.
df_lazada = read_all(path_glob="../data/scrape-api/lazada/*.jsonl")

# Preview the first two rows as a quick look at the available columns.
df_lazada.head(n=2)

Unnamed: 0,url,title,rating,reviews,initial_price,final_price,currency,stock,image,seller_name,...,seller_ratings,colors,color,seller_ship_on_time,seller_chat_response,discovery_input,warning,warning_code,error,error_code
0,https://www.lazada.co.id/products/axioo-hype-3...,AXIOO HYPE 3 I3 1005G1 8GB 256GB DOS 14.0FHD I...,0.0,0.0,0.0,4350000.0,IDR,1.0,[https://img.lazcdn.com/g/p/7322523ebf64e482de...,ATC- Aneka Technology Computer,...,,,,,,,,,,
1,https://www.lazada.co.id/products/laptop-axioo...,LAPTOP AXIOO MYBOOK HYPE 3 CORE I3 8GB SSD 256...,0.0,0.0,0.0,4180000.0,IDR,1.0,[https://img.lazcdn.com/g/p/249fb907291791435b...,bintang raya com,...,0.98,"[DOS BLUE, DOS GREY, WIN 11 BLUE, WIN 11 GREY]",DOS BLUE,,,,,,,


In [92]:
# Drop rows without a URL (the ID below is derived from it) and rename the
# description column so it matches the name used by the Tokopedia frame.
df_lazada = (
    df_lazada
    .dropna(subset=["url"])
    .rename(columns={"product_description": "description"})
)

# assign source -- this must target df_lazada; writing to df_tokopedia here
# would relabel every Tokopedia row as "lazada" and leave this frame untagged
df_lazada["source"] = "lazada"

# create ID from URL
df_lazada["id"] = df_lazada["url"].apply(derive_hash)

# merge product spec into text: a list of {"name": ..., "value": ...} entries
# becomes one "name: value" line per entry; anything that is not a list
# (e.g. a missing value) becomes None.
# NOTE: the column name "extra_descripton" (sic) is kept as-is because the
# merge cell and the exported file refer to it by this spelling.
df_lazada["extra_descripton"] = df_lazada["product_specifications"].apply(
    lambda specs: "\n".join(f"{spec['name']}: {spec['value']}" for spec in specs)
    if isinstance(specs, list)
    else None
)

## Merge All

In [93]:
# Stack both marketplaces on their shared product fields; only the Lazada
# frame contributes the flattened spec text, so that column is empty for
# Tokopedia rows.
df_all = pd.concat(
    [
        df_tokopedia.loc[:, ["id", "title", "initial_price", "final_price", "seller_name", "description", "url"]],
        df_lazada.loc[:, ["id", "title", "initial_price", "final_price", "seller_name", "description", "extra_descripton", "url"]],
    ],
    ignore_index=True,
)
df_all = df_all.rename(columns={"title": "product_name"})

# Rows with the same URL-derived id are duplicates; keep the first occurrence.
df_all = df_all.drop_duplicates(subset=["id"])
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12839 entries, 0 to 15703
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                12839 non-null  object 
 1   product_name      12839 non-null  object 
 2   initial_price     12839 non-null  float64
 3   final_price       12839 non-null  float64
 4   seller_name       12839 non-null  object 
 5   description       12837 non-null  object 
 6   url               12839 non-null  object 
 7   extra_descripton  8851 non-null   object 
dtypes: float64(2), object(6)
memory usage: 902.7+ KB


In [94]:
# Export the merged catalogue as JSON Lines (one record per line).
df_all.to_json(
    "../data/clean/ecom-products.jsonl",
    lines=True,
    orient="records",
)

# Show the first five exported rows.
df_all.head(n=5)

Unnamed: 0,id,product_name,initial_price,final_price,seller_name,description,url,extra_descripton
0,6f6d353462836c7b0a641413a78eec014dd9528bffb1d3...,MSI THIN 15 B12UC RTX3050 I5-12450H 8GB 512GB ...,14399000.0,9799000.0,AGRES ID,PROMO SPECIAL !!\nFREE SPEAKER HOME THEATER (S...,https://www.tokopedia.com/agresid/msi-thin-15-...,
1,2cf4400e5eb525758f9f4e4e31a9976ec20343299065bd...,MSI KATANA 15 B13VEK i7-13620H 16GB 1TB SSD RT...,19999000.0,16999000.0,MSI Official Store,Katana 15 B13VEK-1851ID - INCLUDE OHS2021\n\nB...,https://www.tokopedia.com/msi-official/msi-kat...,
2,d0c5989db9761d9b0b1eca7a0d8f3d1266cdef99a25755...,MSI KATANA 15 B13VFK i7-13620H 16GB 1TB SSD RT...,22599000.0,18699000.0,MSI Official Store,SKU\tKatana 15 B13VFK-1850D \n\nBundle office ...,https://www.tokopedia.com/msi-official/msi-kat...,
3,1f7a82c3880c3b54b98408ed062fea43e9aaa27ab15577...,MSI Thin 15 B12UC i7-12650H RTX 3050 4GB 512GB...,14999000.0,11199000.0,MSI Official Store,SKU\tThin 15 B12UC-2405ID\n\nSpesifikasi:\n\nD...,https://www.tokopedia.com/msi-official/msi-thi...,
4,a6e5daed20d7e527d85a2d32d37775cf68bdb07f137624...,MSI Thin 15 B13VE i7-13620H RTX4050 16GB 512GB...,17999000.0,15699000.0,MSI Official Store,SKU\tThin 15 B13VE-2406ID\n\nWarna\t:Cosmos Gr...,https://www.tokopedia.com/msi-official/msi-thi...,
