In [2]:
!pip -q install --upgrade selenium beautifulsoup4 pandas
# Optional (only if Selenium Manager has trouble on your machine):
# !pip -q install webdriver-manager

  You can safely remove it manually.


In [3]:
import re, time, random
from typing import List, Dict, Any, Optional

import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ---- Defaults you can change
SEARCH_KEYWORD     = "astaxanthin"
TARGET_MIN_ITEMS   = 80     # stop scrolling after we likely have this many items
MAX_SCROLLS        = 25     # hard cap so it won’t scroll forever
SCROLL_PAUSE_RANGE = (1.0, 2.0)  # seconds, random between these to be polite
HEADLESS           = False  # True if you want Chrome invisible (Colab: True recommended)

# Robust-ish selectors (DOM can change; these give us fallbacks)
PRODUCT_ANCHOR_CSS = [
    "a[href*='/products/']",
    "a[data-qa-locator='product-item']",
]
NAME_CANDIDATES = [
    "div.RfADt",                        # seen on Lazada today
    "a[title]",
    "div[data-qa-locator='product-title']",
]
PRICE_CANDIDATES = [
    "div.aBrP0",                        # price container
    "span[data-qa-locator='product-price']",
    "span[aria-label*='price']",
]
SOLD_CANDIDATES = [
    "div._6uN7R",                       # sold container
]
RATING_CANDIDATES = [
    "span[data-qa-locator='rating-score']",
    "span[aria-label*='rating']",
]
REVIEWS_CANDIDATES = [
    "span[data-qa-locator='rating-total']",
]


In [4]:
def clean_text(s: Optional[str]) -> str:
    if not s:
        return ""
    return " ".join(s.split())

def price_to_float(price_text: str) -> Optional[float]:
    if not price_text:
        return None
    # Keep digits and dot, drop commas
    t = price_text.replace(",", "")
    m = re.search(r"(\d+(?:\.\d+)?)", t)
    if not m:
        return None
    try:
        return float(m.group(1))
    except Exception:
        return None

def sold_to_number(sold_text: str) -> Optional[int]:
    """
    Convert 'ขายแล้ว 1.2พัน', 'Sold 1.2k', '10K+', '3 หมื่น', '2.5 ล้าน+' to an int.
    """
    if not sold_text:
        return None
    t = sold_text.lower().replace("+", "").strip()
    m = re.search(r"(\d+(?:\.\d+)?)", t)
    if not m:
        return None
    num = float(m.group(1))
    if any(k in t for k in ["k", "พัน"]):   return int(num * 1_000)
    if "หมื่น" in t:                        return int(num * 10_000)
    if "แสน" in t:                          return int(num * 100_000)
    if any(k in t for k in ["m", "ล้าน"]):  return int(num * 1_000_000)
    return int(num)


In [5]:
def make_driver(headless: bool = HEADLESS) -> webdriver.Chrome:
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1280,1600")
    opts.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0.0.0 Safari/537.36"
    )
    # Selenium Manager automatically provides the right driver for your Chrome
    service = Service()
    return webdriver.Chrome(service=service, options=opts)

def maybe_accept_cookies_or_popups(driver: webdriver.Chrome) -> None:
    candidates = [
        "//button[contains(., 'ยอมรับ') or contains(., 'ตกลง') or contains(., 'Accept') or contains(., 'Agree')]",
        "//a[contains(., 'ยอมรับ') or contains(., 'Accept')]",
        "//button[contains(@class, 'close') or contains(@aria-label, 'close')]",
    ]
    for xp in candidates:
        try:
            el = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.XPATH, xp)))
            el.click()
            time.sleep(0.3)
        except Exception:
            pass


In [6]:
def find_search_input(driver: webdriver.Chrome):
    tries = [
        (By.CSS_SELECTOR, "input[type='search']"),
        (By.CSS_SELECTOR, "input#q"),
        (By.XPATH, "//input[contains(@placeholder, 'ค้นหา')]"),
        (By.XPATH, "//input[contains(@placeholder, 'Search')]"),
        (By.XPATH, "//input[@type='text']"),
    ]
    for by, sel in tries:
        try:
            el = WebDriverWait(driver, 6).until(EC.visibility_of_element_located((by, sel)))
            return el
        except Exception:
            continue
    raise RuntimeError("Search input not found — Lazada DOM may have changed.")

def search_keyword(driver: webdriver.Chrome, keyword: str) -> None:
    driver.get("https://www.lazada.co.th/")
    maybe_accept_cookies_or_popups(driver)
    box = find_search_input(driver)
    box.click()
    box.clear()
    box.send_keys(keyword)
    box.send_keys(Keys.ENTER)

def wait_for_results(driver: webdriver.Chrome, timeout: int = 20) -> None:
    def any_selector_present(drv):
        for css in PRODUCT_ANCHOR_CSS + NAME_CANDIDATES:
            try:
                if drv.find_elements(By.CSS_SELECTOR, css):
                    return True
            except Exception:
                pass
        return False
    WebDriverWait(driver, timeout).until(lambda d: any_selector_present(d))

def lazy_scroll(driver: webdriver.Chrome,
                min_items: int = TARGET_MIN_ITEMS,
                max_scrolls: int = MAX_SCROLLS,
                pause_range: tuple = SCROLL_PAUSE_RANGE) -> None:
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls = 0
    while scrolls < max_scrolls:
        # If enough anchors are present, stop early
        anchors = []
        for css in PRODUCT_ANCHOR_CSS:
            anchors.extend(driver.find_elements(By.CSS_SELECTOR, css))
        if len(anchors) >= min_items:
            break

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(*pause_range))

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # little jiggle to trigger more content
            driver.execute_script("window.scrollBy(0, -400);")
            time.sleep(0.5)
            driver.execute_script("window.scrollBy(0, 800);")
            time.sleep(random.uniform(*pause_range))
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break

        last_height = new_height
        scrolls += 1


In [7]:
def soup_select_first_text(node: BeautifulSoup, css_list: List[str]) -> str:
    for css in css_list:
        try:
            found = node.select_one(css)
            if found:
                t = clean_text(found.get_text())
                if t:
                    return t
        except Exception:
            pass
    return ""

def parse_page_cards(page_html: str) -> List[Dict[str, Any]]:
    soup = BeautifulSoup(page_html, "html.parser")

    # Collect & dedupe product anchors by href
    anchors = []
    for css in PRODUCT_ANCHOR_CSS:
        anchors.extend(soup.select(css))

    seen, uniq = set(), []
    for a in anchors:
        href = a.get("href") or ""
        if not href:
            continue
        if href.startswith("//"): href = "https:" + href
        elif href.startswith("/"): href = "https://www.lazada.co.th" + href
        if href in seen:
            continue
        seen.add(href)
        uniq.append((a, href))

    rows = []
    for a, href in uniq:
        # climb up a few parents to get a reasonable container
        card = a
        for _ in range(3):
            if card.parent:
                card = card.parent
            else:
                break

        # name: anchor title or text → else from nearby candidates
        name = clean_text(a.get("title")) or clean_text(a.get_text())
        if not name:
            name = soup_select_first_text(card, NAME_CANDIDATES)

        price_text  = soup_select_first_text(card, PRICE_CANDIDATES)
        sold_text   = soup_select_first_text(card, SOLD_CANDIDATES)
        rating_text = soup_select_first_text(card, RATING_CANDIDATES)
        reviews_text= soup_select_first_text(card, REVIEWS_CANDIDATES)

        # global fallback if card-local fails
        if not price_text:
            price_text = soup_select_first_text(soup, PRICE_CANDIDATES)
        if not sold_text:
            sold_text  = soup_select_first_text(soup, SOLD_CANDIDATES)

        rows.append({
            "name": name,
            "price_text": price_text,
            "price": price_to_float(price_text),
            "sold_text": sold_text,
            "sold_est": sold_to_number(sold_text) if sold_text else None,
            "rating_text": rating_text,
            "reviews_text": reviews_text,
            "product_url": href,
        })

    # Keep meaningful rows only
    rows = [r for r in rows if r["name"] and r["product_url"]]
    return rows


In [8]:
def scrape_lazada(keyword: str,
                  min_items: int = TARGET_MIN_ITEMS,
                  max_scrolls: int = MAX_SCROLLS,
                  headless: bool = HEADLESS) -> pd.DataFrame:
    driver = make_driver(headless=headless)
    try:
        search_keyword(driver, keyword)
        wait_for_results(driver)
        lazy_scroll(driver, min_items=min_items, max_scrolls=max_scrolls)

        html = driver.page_source
        items = parse_page_cards(html)
        df = pd.DataFrame(items)

        if not df.empty:
            df = df.drop_duplicates(subset=["product_url"]).reset_index(drop=True)
        return df
    finally:
        driver.quit()


In [9]:
keyword = SEARCH_KEYWORD
df = scrape_lazada(keyword, min_items=TARGET_MIN_ITEMS, max_scrolls=MAX_SCROLLS, headless=HEADLESS)

print(f"Parsed rows: {len(df)}")
display(df.head(10))

# Save CSV next to your notebook
out_path = f"lazada_{keyword}.csv"
df.to_csv(out_path, index=False, encoding="utf-8-sig")
print("Saved to:", out_path)


Parsed rows: 43


Unnamed: 0,name,price_text,price,sold_text,sold_est,rating_text,reviews_text,product_url
0,Dr.Pong Astaxanthin 6 mg AstaREAL from Japan แ...,฿379.00,379.0,100.4K sold(29574)Pathum Thani,100400.0,,,https://www.lazada.co.th/products/pdp-i3640665...
1,Blackmores Astaxanthin 6Mg Plus 30C,฿469.00,469.0,4.2K sold(1139)Chachoengsao,4200.0,,,https://www.lazada.co.th/products/pdp-i3257811...
2,Dr.PONG Special Set Astaxanthin แอสตาแซนธิน 3 ...,"฿1,099.00",1099.0,3.5K sold(1581)Pathum Thani,3500.0,,,https://www.lazada.co.th/products/pdp-i5372715...
3,( Pack 2 ) VISTRA ASTAXANTHIN 6 MG PLUS VITAMI...,"฿1,232.00",1232.0,3.4K sold(1385)Nonthaburi,3400.0,,,https://www.lazada.co.th/products/pdp-i5014701...
4,FITWHEY ASTAXANTHIN 6MG + COQ10 (30 SOFTGELS) ...,฿199.00,199.0,7.4K sold(2370)Samut Prakan,7400.0,,,https://www.lazada.co.th/products/pdp-i5067987...
5,"Astaxanthin Capsules Support Skin, Nerve, and ...",฿313.65,313.65,Krabi,,,,https://www.lazada.co.th/products/pdp-i5851701...
6,(EXP: 23/11/2025) Astaxanthin 6 MG + CoQ10 แอส...,฿99.00,99.0,116 sold(60)Samut Prakan,116000.0,,,https://www.lazada.co.th/products/pdp-i4898321...
7,KIRKLAND 24mg Natural Astaxanthin Capsules สนั...,฿223.00,223.0,255 sold(79)Chiang Mai,255000000.0,,,https://www.lazada.co.th/products/pdp-i5688665...
8,Astaxanthin แบรนด์ Blacktra from Japan แอสตาแซ...,฿225.00,225.0,671 sold(266)Bangkok,671000.0,,,https://www.lazada.co.th/products/pdp-i5214138...
9,(ซื้อ 1 แถม 1) Dr.PONG แอสตาแซนธิน จากญี่ปุ่น ...,฿319.00,319.0,3.1K sold(1123)Bangkok,3100.0,,,https://www.lazada.co.th/products/pdp-i5555484...


Saved to: lazada_astaxanthin.csv
