In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import re


In [2]:

from selenium.common.exceptions import NoSuchElementException

def get_all_product_links(driver, base_url):
    """Collect the product-page URLs listed on a category page.

    Loads ``base_url`` and then alternates between two steps: harvesting the
    ``href`` of every ``.pj16-item-info h3 a`` anchor currently in the DOM, and
    clicking the first clickable ``#page-pager a`` element through JavaScript
    so the page can append more items.

    The loop ends when a harvest pass adds no new URL, or when the pager link
    does not become clickable within 10 seconds.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        base_url: URL of the listing page to start from.

    Returns:
        list: The unique hrefs found. They are accumulated in a set, so the
        order of the returned list is not meaningful.
    """
    driver.get(base_url)
    time.sleep(3)

    collected = set()
    previous_total = -1
    pager_locator = (By.CSS_SELECTOR, '#page-pager a')

    while True:
        # Harvest every product anchor currently rendered; empty hrefs are dropped.
        anchors = driver.find_elements(By.CSS_SELECTOR, '.pj16-item-info h3 a')
        hrefs = (anchor.get_attribute('href') for anchor in anchors)
        collected.update(href for href in hrefs if href)

        # No growth since the previous pass means there is nothing new to load.
        current_total = len(collected)
        if current_total == previous_total:
            print("Đã lấy hết link sản phẩm.")
            break
        previous_total = current_total

        try:
            pager_link = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(pager_locator)
            )
            # Scroll and click via JavaScript rather than a native click.
            driver.execute_script("arguments[0].scrollIntoView();", pager_link)
            time.sleep(1)
            driver.execute_script("arguments[0].click();", pager_link)
            time.sleep(3)
        except (NoSuchElementException, TimeoutException):
            print("Không tìm thấy hoặc không click được nút 'Xem thêm'.")
            break

    return list(collected)


In [None]:
from datetime import datetime

def crawl_page(driver):
    """Scrape the product page currently loaded in ``driver``.

    Reads the product name and price, then tries to open the specification
    popup and read every ``.specs-item`` block inside ``#popup-modal``.

    Any Selenium error raised while opening or reading the popup (element
    missing, click intercepted, stale element, ...) is reported and swallowed,
    so the name/price already read, plus whatever specs were collected before
    the error, are still returned instead of aborting the caller's crawl loop.

    Args:
        driver: Selenium WebDriver already navigated to a product page.

    Returns:
        dict: ``{"name": str, "price": str, "specs": dict, "crawl_time": str}``.
        ``name`` and ``price`` are ``""`` when their element is not found.
        ``specs`` maps each block title to a ``{label: value}`` dict, where the
        value is the ``<span>`` innerHTML with ``<br>`` replaced by newlines.
        ``crawl_time`` is ``datetime.now()`` formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    # Function-local import: WebDriverException is the base class of the
    # Selenium errors (NoSuchElementException included) and is not imported by
    # the notebook's import cells.
    from selenium.common.exceptions import WebDriverException

    time.sleep(2)

    try:
        name = driver.find_element(By.CSS_SELECTOR, '.box-header .header-name h1').text.strip()
    except NoSuchElementException:
        name = ""

    try:
        price = driver.find_element(By.CSS_SELECTOR, '.box-price .price').text.strip()
    except NoSuchElementException:
        price = ""

    specs = {}

    # Open the specification popup.
    try:
        xem_cau_hinh = driver.find_element(By.CSS_SELECTOR, '#see-specs-content a')
        driver.execute_script("arguments[0].scrollIntoView();", xem_cau_hinh)
        time.sleep(1)
        xem_cau_hinh.click()
        time.sleep(2)

        specs_blocks = driver.find_elements(By.CSS_SELECTOR, "#popup-modal .specs-item")
        for block in specs_blocks:
            try:
                category = block.find_element(By.CSS_SELECTOR, ".title").text.strip()
                items = block.find_elements(By.CSS_SELECTOR, "ul.specs-content > li")
                details = {}
                for item in items:
                    try:
                        key = item.find_element(By.TAG_NAME, "strong").text.strip()
                        value = item.find_element(By.TAG_NAME, "span").get_attribute("innerHTML").strip().replace("<br>", "\n")
                        details[key] = value
                    except NoSuchElementException:
                        # Row without a label/value pair: skip it.
                        continue
                specs[category] = details
            except NoSuchElementException:
                # Block without a title: skip it.
                continue

    except WebDriverException:
        # Covers both "button not found" and "button could not be clicked",
        # which is what the message below promises.
        print("Không tìm thấy hoặc click được nút xem cấu hình.")

    crawl_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"Crawl time: {crawl_time}")

    return {
        "name": name,
        "price": price,
        "specs": specs,
        "crawl_time": crawl_time
    }


In [None]:
import pandas as pd

def save_to_csv(data_list, filename="../raw_data/hoanghamobile.csv"):
    """Flatten crawled product records and write them to a CSV file.

    Each record becomes one row. The fixed columns are "Tên sản phẩm",
    "Giá bán" and "Thời gian crawl"; every entry of the nested ``specs`` dict
    adds a column named ``"<category> - <spec key>"``.

    Args:
        data_list: Iterable of product dicts with the keys ``name``, ``price``,
            ``crawl_time`` and ``specs`` (``{category: {key: value}}``).
            Items that are not dicts are reported and skipped.
        filename: Destination path of the CSV file (written as UTF-8, without
            the index column).

    Returns:
        None. When ``data_list`` is empty nothing is written.
    """
    if not data_list:
        print("Không có dữ liệu để lưu.")
        return

    flat_data_list = []

    for item in data_list:
        if not isinstance(item, dict):
            print("Bỏ qua sản phẩm không hợp lệ:", item)
            continue

        flat_item = {
            "Tên sản phẩm": item.get("name", ""),
            "Giá bán": item.get("price", ""),
            "Thời gian crawl": item.get("crawl_time", ""),
        }

        # `or {}` also covers a record whose "specs" value is None.
        specs = item.get("specs") or {}
        for category, spec_items in specs.items():
            for spec_key, spec_value in spec_items.items():
                column_name = f"{category} - {spec_key}"
                flat_item[column_name] = spec_value

        flat_data_list.append(flat_item)

    # Build the frame once from the list of row dicts; rows missing a spec
    # column get an empty cell in the CSV.
    df = pd.DataFrame(flat_data_list)
    df.to_csv(filename, index=False, encoding="utf-8")
    print(f"Đã lưu dữ liệu vào file '{filename}'.")


In [23]:
# Full crawl: collect every product link from the listing page, scrape each
# product page, then write the results to CSV.
# `links` and `all_products` are module-level, so they stay available to
# later cells after this one has run.
url = "https://hoanghamobile.com/laptop"
links = []
all_products = []
    
# Start a Chrome session (headless flag explicitly set to False).
options = webdriver.ChromeOptions()
options.headless = False  
driver = webdriver.Chrome(options=options)

try:
        print("Đang lấy danh sách link sản phẩm...")
        links = get_all_product_links(driver, url)

        print(f"\nĐã lấy được {len(links)} link sản phẩm.")
        # Visit the product pages one by one; `i` is 1-based for the progress message.
        for i, link in enumerate(links, 1):
            driver.get(link)
            print(f"\nĐang crawl sản phẩm {i}/{len(links)}: {link}")
            time.sleep(2)
            product = crawl_page(driver)
            all_products.append(product)
# Any exception ends the loop early; products collected so far are kept and
# still saved below.
except Exception as e:
    print(f"Đã xảy ra lỗi: {e}")

# Always close the browser, whether or not the crawl succeeded.
finally:
    driver.quit()

save_to_csv(all_products, "../raw_data/hoanghamobile.csv")

Đang lấy danh sách link sản phẩm...
Không tìm thấy hoặc không click được nút 'Xem thêm'.

Đã lấy được 384 link sản phẩm.

Đang crawl sản phẩm 1/384: https://hoanghamobile.com/laptop/laptop-asus-gaming-tuf-f15-fx506hf-hn078w-chinh-hang
Crawl time: 2025-05-23 02:54:49

Đang crawl sản phẩm 2/384: https://hoanghamobile.com/laptop/macbook-air-m3-15-inch-8gb-256gb-chinh-hang-apple-viet-nam
Crawl time: 2025-05-23 02:55:00

Đang crawl sản phẩm 3/384: https://hoanghamobile.com/laptop/laptop-asus-tuf-gaming-f15-fx507vv-lp157w-chinh-hang
Crawl time: 2025-05-23 02:55:12

Đang crawl sản phẩm 4/384: https://hoanghamobile.com/laptop/lenovo-ideapad-5-14ial7-82sd006pvn
Crawl time: 2025-05-23 02:55:24

Đang crawl sản phẩm 5/384: https://hoanghamobile.com/laptop/macbook-pro-m4-pro-16-inch-48gb-512gb
Crawl time: 2025-05-23 02:55:34

Đang crawl sản phẩm 6/384: https://hoanghamobile.com/laptop/may-tinh-xach-tay-asus-e210ma-gj353t
Crawl time: 2025-05-23 02:55:46

Đang crawl sản phẩm 7/384: https://hoanghamob

In [26]:
# Sample crawl: scrape only the first few links gathered by the link-collection
# step (`links` must already exist in the kernel) and save them to CSV.
sample_links = links[:10]  # 👉 Only the first 10 links

# Start from a fresh list. Appending to the `all_products` list left over from
# an earlier crawl would write those products to the CSV a second time.
all_products = []

options = webdriver.ChromeOptions()
options.headless = False
driver = webdriver.Chrome(options=options)

try:
    print(f"\nĐã lấy được {len(links)} link sản phẩm.")
    for i, link in enumerate(sample_links, 1):
        driver.get(link)
        # len(sample_links) equals min(10, len(links)), so the progress
        # message is unchanged.
        print(f"\nĐang crawl sản phẩm {i}/{len(sample_links)}: {link}")
        time.sleep(2)
        product = crawl_page(driver)
        all_products.append(product)

# Any exception ends the loop early; products collected so far are still saved.
except Exception as e:
    print(f"Đã xảy ra lỗi: {e}")

# Always close the browser, whether or not the crawl succeeded.
finally:
    driver.quit()

save_to_csv(all_products, "../raw_data/hoanghamobile.csv")


Đã lấy được 384 link sản phẩm.

Đang crawl sản phẩm 1/10: https://hoanghamobile.com/laptop/laptop-asus-gaming-tuf-f15-fx506hf-hn078w-chinh-hang
Crawl time: 2025-05-23 03:21:43

Đang crawl sản phẩm 2/10: https://hoanghamobile.com/laptop/macbook-air-m3-15-inch-8gb-256gb-chinh-hang-apple-viet-nam
Crawl time: 2025-05-23 03:21:53

Đang crawl sản phẩm 3/10: https://hoanghamobile.com/laptop/laptop-asus-tuf-gaming-f15-fx507vv-lp157w-chinh-hang
Crawl time: 2025-05-23 03:22:05

Đang crawl sản phẩm 4/10: https://hoanghamobile.com/laptop/lenovo-ideapad-5-14ial7-82sd006pvn
Crawl time: 2025-05-23 03:22:17

Đang crawl sản phẩm 5/10: https://hoanghamobile.com/laptop/macbook-pro-m4-pro-16-inch-48gb-512gb
Crawl time: 2025-05-23 03:22:28

Đang crawl sản phẩm 6/10: https://hoanghamobile.com/laptop/may-tinh-xach-tay-asus-e210ma-gj353t
Crawl time: 2025-05-23 03:22:41

Đang crawl sản phẩm 7/10: https://hoanghamobile.com/laptop/laptop-asus-zenbook-14-oled-ux3405ma-pp151w-chinh-hang
Crawl time: 2025-05-23 03: