In [1]:
import hashlib
import json
import os
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
def encode_filename(title):
    """Turn a product title into a filesystem-safe, fixed-length file stem.

    Every character that is not alphanumeric, a space, an underscore or a
    hyphen is dropped, and the remaining text is hashed with MD5.

    Args:
        title: Product title as scraped from the page.

    Returns:
        str: The 32-character hexadecimal MD5 digest of the filtered title.
    """
    allowed_punctuation = (" ", "_", "-")
    kept_chars = [ch for ch in title if ch.isalnum() or ch in allowed_punctuation]
    sanitized = "".join(kept_chars)
    return hashlib.md5(sanitized.encode()).hexdigest()


def extract_info(soup):
    """Extract the product title and author name from a product page.

    Args:
        soup: Parsed product page (a BeautifulSoup document, or any object
            with a compatible ``find`` method). ``None`` is tolerated.

    Returns:
        dict: ``{"title": str, "author": str | None}``. An empty dict is
        returned when ``soup`` is ``None`` or the title heading is missing,
        so callers must not assume the ``"title"`` key exists.
    """
    if soup is None:
        return {}

    title_tag = soup.find("h1", class_="Title__TitledStyled-sc-c64ni5-0 iXccQY")
    if title_tag is None:
        # Page not fully rendered or the markup changed: nothing usable.
        return {}

    # The author link is optional on the page; a missing one is not an error.
    author_tag = soup.find("a", {"data-view-id": "pdp_details_view_author"})
    author_name = author_tag.text.strip() if author_tag is not None else None

    return {"title": title_tag.text.strip(), "author": author_name}


def extract_rating(soup):
    """Extract the average rating and the number of ratings of a product.

    Args:
        soup: Parsed product page (a BeautifulSoup document, or any object
            with a compatible ``find`` method). ``None`` is tolerated.

    Returns:
        dict: ``{"total_rating": float, "number_of_rating": int}``, or an
        empty dict when the rating summary is absent or its text cannot be
        parsed as numbers.
    """
    if soup is None:
        return {}

    rating_summary = soup.find("div", class_="review-rating__inner")
    if rating_summary is None:
        return {}

    point_tag = rating_summary.find("div", class_="review-rating__point")
    total_tag = rating_summary.find("div", class_="review-rating__total")
    if point_tag is None or total_tag is None:
        return {}

    try:
        total_rating = float(point_tag.get_text(strip=True))
        # The counter is embedded in free text; keep only its digits.
        digits = "".join(filter(str.isdigit, total_tag.get_text(strip=True)))
        number_of_ratings = int(digits)
    except ValueError:
        # Non-numeric score or a counter without any digit.
        return {}

    return {
        "total_rating": total_rating,
        "number_of_rating": number_of_ratings,
    }


def extract_reviews(soup):
    """Extract every customer review visible on the current review page.

    The star rating is not read from the markup; it is derived from the
    review's headline, with any unrecognised headline mapped to 1.

    Args:
        soup: Parsed product page (a BeautifulSoup document, or any object
            with a compatible ``find_all`` method). ``None`` is tolerated.

    Returns:
        list[dict]: One entry per review, shaped as
        ``{"reviewer": {"name", "date", "review_count", "liked_count"},
        "review": {"rating", "title", "content"}}``. Always a list (possibly
        empty): callers compare the result with ``[]`` to detect the end of
        pagination, so a non-list failure value would keep them looping.
        Review cards whose markup is incomplete are skipped individually
        instead of discarding the whole page.
    """
    rating_by_title = {
        "Cực kì hài lòng": 5,
        "Hài lòng": 4,
        "Bình thường": 3,
        "Không hài lòng": 2,
    }

    if soup is None:
        return []

    reviews = soup.find_all(
        "div", class_="style__StyledComment-sc-1y8vww-5 dpVjwc review-comment"
    )

    review_data = []
    for review in reviews:
        try:
            user_name = review.find(
                "div", class_="review-comment__user-name"
            ).get_text(strip=True)
            user_date = review.find(
                "div", class_="review-comment__user-date"
            ).get_text(strip=True)

            # First info block holds the review count, second the liked count.
            user_info_divs = review.find_all(
                "div", class_="review-comment__user-info"
            )
            review_count = user_info_divs[0].find("span").get_text(strip=True)
            liked_count = user_info_divs[1].find("span").get_text(strip=True)

            review_title = review.find(
                "div", class_="review-comment__title"
            ).get_text(strip=True)
            review_content = review.find(
                "div", class_="review-comment__content"
            ).get_text(strip=True)
        except (AttributeError, IndexError):
            # A sub-element is missing (find() returned None, or fewer than
            # two info blocks): skip this card only.
            continue

        review_data.append(
            {
                "reviewer": {
                    "name": user_name,
                    "date": user_date,
                    "review_count": review_count,
                    "liked_count": liked_count,
                },
                "review": {
                    "rating": rating_by_title.get(review_title, 1),
                    "title": review_title,
                    "content": review_content,
                },
            }
        )
    return review_data


def extract_details(soup):
    """Extract the label/value pairs of the product detail widget.

    Args:
        soup: Parsed product page (a BeautifulSoup document, or any object
            with a compatible ``find_all`` method). ``None`` is tolerated.

    Returns:
        dict[str, str]: Stripped label text mapped to stripped value text.
        Rows lacking either the label span or the value span are ignored;
        the dict is empty when no complete row is found.
    """
    if soup is None:
        return {}

    content_divs = soup.find_all(
        "div", class_="WidgetTitle__WidgetContentStyled-sc-12sadap-2 hNNYbU"
    )

    details = {}
    for div in content_divs:
        # The label span is matched by its inline style, the value span by
        # its class.
        label_span = div.find(
            "span", style="max-width: 300px; color: rgb(128, 128, 137);"
        )
        value_span = div.find(
            "span", class_="styles__ProductAttributeValueStyled-sc-vjutbk-0 chhHdv"
        )
        if label_span is None or value_span is None:
            continue
        details[label_span.text.strip()] = value_span.text.strip()

    return details

In [3]:
# Tiki category listing pages to crawl; the second-to-last path segment of
# each URL is used as the category name.
categories = [
    "https://tiki.vn/tam-ly-gioi-tinh/c868",
    "https://tiki.vn/sach-ky-nang-song/c870",
]

In [4]:
def _collect_all_reviews(driver):
    """Scrape every review page of the product currently loaded in ``driver``.

    Pages are read one after another by clicking the pagination "next" link.
    Collection stops when a page yields no reviews, when a page is identical
    to the previous one (the click no longer advances), or when the link
    cannot be found or clicked.

    Args:
        driver: Selenium WebDriver positioned on a product page.

    Returns:
        list[dict]: Reviews from all visited pages, in page order.
    """
    next_button_selector = "#customer-review-widget-id > div > div.style__StyledCustomerReviews-sc-1y8vww-0.gCaHEu.customer-reviews > div > div.customer-reviews__pagination > ul > li:nth-child(7) > a"

    collected = []
    previous_page = None
    while True:
        page = extract_reviews(BeautifulSoup(driver.page_source, "html.parser"))
        # A falsy result means "no reviews / extraction failed"; an unchanged
        # page means the last page was already reached.
        if not page or page == previous_page:
            break
        collected.extend(page)
        previous_page = page

        try:
            # Look the link up again on every page: the element found on a
            # previous page may have gone stale after the click.
            btn_next = driver.find_element(By.CSS_SELECTOR, next_button_selector)
            ActionChains(driver).move_to_element(btn_next).click(btn_next).perform()
        except WebDriverException:
            # No pagination (single page) or the link is not clickable.
            break
        time.sleep(3)
    return collected


def extract_books_from_category(base_dir, url):
    """Crawl one Tiki category page and save one JSON file per product.

    The product links present on the category page are visited in turn. For
    each product the info, details, rating summary and all reviews are
    extracted and written to
    ``<base_dir>/<category>/<encode_filename(title)>.json``. Products whose
    file already exists, or whose title cannot be extracted, are skipped.

    Args:
        base_dir: Directory under which the category folder is created.
        url: Category URL; its second-to-last path segment is used as the
            category (folder) name.

    Returns:
        None. Results are written to disk.

    Raises:
        TimeoutException: If no product link shows up within 15 seconds.
    """
    category_name = url.split("/")[-2]
    category_dir = os.path.join(base_dir, category_name)
    os.makedirs(category_dir, exist_ok=True)

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        ActionChains(driver).scroll_by_amount(0, 800).perform()

        # Wait until at least one product link is present.
        wait = WebDriverWait(driver, 15)
        a_tags = wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'a[data-view-id="product_list_item"]')
            )
        )
        # Read every href before navigating away: the elements become stale
        # as soon as another page is loaded.
        product_urls = [a_tag.get_attribute("href") for a_tag in a_tags]

        for product_url in product_urls:
            if not product_url:
                continue

            driver.get(product_url)
            time.sleep(3)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            info_data = extract_info(soup)

            title = info_data.get("title")
            if not title:
                # extract_info returns {} when the title is missing; without
                # a title there is no filename to write to.
                continue

            filename = os.path.join(category_dir, f"{encode_filename(title)}.json")
            if os.path.exists(filename):
                # Already crawled in a previous run.
                continue
            time.sleep(3)

            # Scroll so the details section gets rendered.
            ActionChains(driver).scroll_by_amount(0, 2000).perform()
            time.sleep(5)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            details = extract_details(soup)

            # Scroll further so the rating/review section gets rendered.
            ActionChains(driver).scroll_by_amount(0, 1300).perform()
            time.sleep(5)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            rating = extract_rating(soup)

            review_data = _collect_all_reviews(driver)

            data = {
                "info": info_data,
                "rating": rating,
                "details": details,
                "reviews": review_data,
            }
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=4)
    finally:
        # Always close the browser, even when the crawl fails midway.
        driver.quit()

In [6]:
# Ensure the base directory exists
base_dir = "crawl-data-tiki"
os.makedirs(base_dir, exist_ok=True)

for category in categories:
    try:
        extract_books_from_category(base_dir=base_dir, url=category)
    except Exception as exc:
        # Best-effort crawl: report the failure and move on to the next
        # category instead of dropping it silently. KeyboardInterrupt is not
        # an Exception subclass, so interrupting the kernel still stops the run.
        print(f"Skipping category {category}: {exc!r}")

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Exploratory run: crawl a single category step by step at module level.
# NOTE: `driver`, `urls` and `category_dir` are intentionally left in the
# kernel namespace (and the browser left open) because the single-product
# cell that follows reuses them.
driver = webdriver.Chrome()

url = categories[1]

# Extract the category name from the URL
category_name = url.split("/")[-2]

# Ensure the base directory and the category directory exist
base_dir = "tiki"
category_dir = os.path.join(base_dir, category_name)
os.makedirs(category_dir, exist_ok=True)

driver.get(url)

# Wait until the presence of at least one element with data-view-id="product_list_item"
wait = WebDriverWait(driver, 15)
a_tags = wait.until(
    EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, 'a[data-view-id="product_list_item"]')
    )
)
print(f"Found {len(a_tags)} product links")

# Read every href before navigating away; the elements go stale afterwards.
urls = [a_tag.get_attribute("href") for a_tag in a_tags]

for product_url in urls:
    if not product_url:
        continue

    # Load the webpage
    driver.get(product_url)
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    info_data = extract_info(soup)

    title = info_data.get("title")
    if not title:
        # extract_info returns {} when the title is missing; nothing to save.
        continue

    time.sleep(3)

    # Scroll to the details section
    ActionChains(driver).scroll_by_amount(0, 2000).perform()
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    details = extract_details(soup)

    # Scroll to the rating/review section
    ActionChains(driver).scroll_by_amount(0, 1300).perform()
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    rating = extract_rating(soup)

    # Walk through the review pages. Stop on an empty page, on a page equal
    # to the previous one (the click no longer advances), or when the "next"
    # link is missing or not clickable.
    review_data = []
    previous_page = None
    while True:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        page_reviews = extract_reviews(soup)
        if not page_reviews or page_reviews == previous_page:
            break
        review_data.extend(page_reviews)
        previous_page = page_reviews
        try:
            # Re-locate the link each time; the previous handle may be stale.
            btn_next = driver.find_element(
                By.CSS_SELECTOR,
                "#customer-review-widget-id > div > div.style__StyledCustomerReviews-sc-1y8vww-0.gCaHEu.customer-reviews > div > div.customer-reviews__pagination > ul > li:nth-child(7) > a",
            )
            ActionChains(driver).move_to_element(btn_next).click(btn_next).perform()
        except WebDriverException:
            break
        time.sleep(3)

    # Hash the title for the filename: raw titles may contain characters
    # (such as "/" or ":") that are not valid in file names.
    filename = os.path.join(category_dir, f"{encode_filename(title)}.json")
    data = {
        "info": info_data,
        "rating": rating,
        "details": details,
        "reviews": review_data,
    }

    # Save the extracted data to a JSON file
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [59]:
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup

# Single-product debugging run. Relies on `driver`, `urls` and `category_dir`
# created by the exploratory category cell above; run that cell first.
driver.get(urls[0])

# Give the page time to render
time.sleep(5)
soup = BeautifulSoup(driver.page_source, "html.parser")
info_data = extract_info(soup)

time.sleep(3)

# Scroll to the details section
ActionChains(driver).scroll_by_amount(0, 2000).perform()
time.sleep(5)
soup = BeautifulSoup(driver.page_source, "html.parser")
details = extract_details(soup)

# Scroll to the rating/review section
ActionChains(driver).scroll_by_amount(0, 1300).perform()
time.sleep(5)
soup = BeautifulSoup(driver.page_source, "html.parser")
rating = extract_rating(soup)

# Walk through the review pages. Stop on an empty page, on a page equal to
# the previous one (the click no longer advances), or when the "next" link is
# missing or not clickable.
review_data = []
previous_page = None
while True:
    soup = BeautifulSoup(driver.page_source, "html.parser")
    page_reviews = extract_reviews(soup)
    if not page_reviews or page_reviews == previous_page:
        break
    review_data.extend(page_reviews)
    previous_page = page_reviews
    try:
        # Re-locate the link each time; the previous handle may be stale.
        btn_next = driver.find_element(
            By.CSS_SELECTOR,
            "#customer-review-widget-id > div > div.style__StyledCustomerReviews-sc-1y8vww-0.gCaHEu.customer-reviews > div > div.customer-reviews__pagination > ul > li:nth-child(7) > a",
        )
        ActionChains(driver).move_to_element(btn_next).click(btn_next).perform()
    except WebDriverException:
        break
    time.sleep(3)

data = {"info": info_data, "rating": rating, "details": details, "reviews": review_data}

# extract_info returns {} when the title is missing, so look it up defensively.
title = info_data.get("title")
if title:
    # Hash the title for the filename: raw titles may contain characters
    # (such as "/" or ":") that are not valid in file names.
    filename = os.path.join(category_dir, f"{encode_filename(title)}.json")

    # Save the extracted data to a JSON file
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
else:
    print("No title could be extracted; nothing was saved.")