In [None]:
%pip install selenium

In [None]:
%pip install bs4

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import os
import json
import time
import hashlib


def encode_filename(title):
    """Return a filesystem-safe name for ``title`` as an MD5 hex digest.

    Characters other than alphanumerics, spaces, underscores and hyphens are
    discarded before hashing, so two titles that differ only in such
    characters produce the same digest.
    """
    extra_allowed = {" ", "_", "-"}
    kept_chars = [ch for ch in title if ch.isalnum() or ch in extra_allowed]
    digest = hashlib.md5("".join(kept_chars).encode())
    return digest.hexdigest()


def _parse_price(text):
    """Convert a price string such as ``"120.000₫"`` to an ``int``.

    Returns ``None`` when ``text`` is ``None``. Raises ``ValueError`` when the
    text left after removing the currency sign and the ``.`` separators is
    not an integer.
    """
    if text is None:
        return None
    return int(text.replace("₫", "").replace(".", "").strip())


def _parse_sold_quantity(text):
    """Convert a sold-count string such as ``"1234"`` or ``"5k"`` to an ``int``.

    Returns ``None`` when ``text`` is ``None``. A trailing ``k`` multiplies the
    number by 1000.
    """
    if text is None:
        return None
    cleaned = text.strip().lower()
    if cleaned.endswith("k"):
        # NOTE(review): decimal forms such as "1.2k" are assumed to be
        # possible; a comma is accepted as the decimal mark as well.
        return round(float(cleaned[:-1].replace(",", ".")) * 1000)
    return int(cleaned)


def extract_info(soup):
    """Extract the headline product information from a product page.

    Args:
        soup: Parsed product page (a BeautifulSoup document).

    Returns:
        A dict with the keys ``title``, ``author``, ``imageUrl``,
        ``soldQuantity``, ``currentPrice`` and ``originalPrice``. ``author``,
        ``imageUrl``, ``soldQuantity`` and ``currentPrice`` are ``None`` when
        the corresponding element is absent. ``originalPrice`` falls back to
        ``currentPrice`` when no struck-through price is present. An empty
        dict is returned (after printing the error) when the title is missing
        or a number cannot be parsed.
    """
    try:
        # Title is the only mandatory element: a missing <h1> raises
        # AttributeError here and the whole record is rejected below.
        title = soup.find(
            "h1", class_="Title__TitledStyled-sc-c64ni5-0 iXccQY"
        ).text.strip()

        author_tag = soup.find("a", {"data-view-id": "pdp_details_view_author"})
        author_name = author_tag.text.strip() if author_tag is not None else None

        # The image URL is the first candidate of the <source srcset="...">.
        image_div = soup.find(
            "div", class_="style__ProductImagesStyle-sc-15sdfel-0 kXwtNH"
        )
        image_tag = image_div.find("source") if image_div is not None else None
        srcset = image_tag.attrs.get("srcset") if image_tag is not None else None
        if srcset is not None:
            # Take the first URL, dropping its " 1x" density descriptor.
            image_url = srcset.split(",")[0].strip().split(" ")[0]
        else:
            image_url = None

        sold_tag = soup.find("div", {"data-view-id": "pdp_quantity_sold"})
        sold_text = (
            sold_tag.text.strip().replace("Đã bán ", "")
            if sold_tag is not None
            else None
        )
        sold_quantity = _parse_sold_quantity(sold_text)

        price_tag = soup.find("div", class_="product-price__current-price")
        current_price = _parse_price(
            price_tag.text if price_tag is not None else None
        )

        # The original price lives in a <del> inside its own container; either
        # may be absent, in which case the current price is used for both.
        original_container = soup.find(
            "div", class_="product-price__original-price"
        )
        del_tag = (
            original_container.find("del")
            if original_container is not None
            else None
        )
        original_price = _parse_price(del_tag.text if del_tag is not None else None)
        if original_price is None:
            original_price = current_price

        return {
            "title": title,
            "author": author_name,
            "imageUrl": image_url,
            "soldQuantity": sold_quantity,
            "currentPrice": current_price,
            "originalPrice": original_price,
        }
    except Exception as e:
        print(f"Error extract info: {e}")  # Optional: print the error for debugging
        return {}


def extract_rating(soup):
    """Extract the aggregate rating of a product page.

    Args:
        soup: Parsed product page (a BeautifulSoup document).

    Returns:
        ``{"totalRating": float, "numberOfRating": int}`` when the rating
        summary is present. An empty dict when the page has no
        ``review-rating__inner`` block (returned silently, since that is not a
        failure) or when the block cannot be parsed (the error is printed).
    """
    try:
        rating_summary = soup.find("div", class_="review-rating__inner")
        if rating_summary is None:
            # No rating block on this page: nothing to extract and nothing to
            # report, so avoid going through the error path.
            return {}

        point_text = rating_summary.find(
            "div", class_="review-rating__point"
        ).get_text(strip=True)
        total_rating = float(point_text)

        number_of_ratings_text = rating_summary.find(
            "div", class_="review-rating__total"
        ).get_text(strip=True)
        # Keep only the digits so surrounding words and separators are ignored.
        number_of_ratings = int("".join(filter(str.isdigit, number_of_ratings_text)))

        return {"totalRating": total_rating, "numberOfRating": number_of_ratings}
    except Exception as e:
        print(f"Error extract rating: {e}")  # Optional: print the error for debugging
        return {}


# Star rating implied by the fixed review titles; any other title maps to 1.
_REVIEW_TITLE_RATINGS = {
    "Cực kì hài lòng": 5,
    "Hài lòng": 4,
    "Bình thường": 3,
    "Không hài lòng": 2,
}


def _parse_review(review):
    """Build the reviewer/review dict for a single review element.

    Raises ``AttributeError`` or ``IndexError`` when an expected child element
    is missing.
    """
    user_name = review.find("div", class_="review-comment__user-name").get_text(
        strip=True
    )
    user_date = review.find("div", class_="review-comment__user-date").get_text(
        strip=True
    )

    # The first info block holds the review count, the second the liked count.
    user_info_divs = review.find_all("div", class_="review-comment__user-info")
    review_count = user_info_divs[0].find("span").get_text(strip=True)
    liked_count = user_info_divs[1].find("span").get_text(strip=True)

    review_title = review.find("div", class_="review-comment__title").get_text(
        strip=True
    )
    rating = _REVIEW_TITLE_RATINGS.get(review_title, 1)

    review_content = review.find("div", class_="review-comment__content").get_text(
        strip=True
    )

    return {
        "reviewer": {
            "name": user_name,
            "date": user_date,
            "reviewCount": review_count,
            "likedCount": liked_count,
        },
        "review": {
            "rating": rating,
            "title": review_title,
            "content": review_content,
        },
    }


def extract_reviews(soup):
    """Extract every review visible on the current page.

    Args:
        soup: Parsed product page (a BeautifulSoup document).

    Returns:
        A list with one dict per review (``reviewer`` and ``review`` sub-dicts).
        The result is always a list: it is empty when the page has no reviews
        or the lookup fails. A review missing an expected element is skipped
        (and reported) without discarding the other reviews on the page.
    """
    try:
        reviews = soup.find_all(
            "div", class_="style__StyledComment-sc-1y8vww-5 dpVjwc review-comment"
        )
    except Exception as e:
        print(f"Error extract review: {e}")  # Optional: print the error for debugging
        return []

    review_data = []
    for review in reviews:
        try:
            review_data.append(_parse_review(review))
        except (AttributeError, IndexError) as e:
            print(f"Error extract review: {e}")
    return review_data


def extract_details(soup):
    """Collect the product attribute table as a ``{key: value}`` dict.

    Known Vietnamese labels are translated to English keys; any other label
    is used as the key unchanged. Rows lacking either the label span or the
    value span are ignored. On any error the message is printed and an empty
    dict is returned.
    """
    label_to_key = {
        "Công ty phát hành": "publisher",
        "Kích thước": "dimensions",
        "Dịch Giả": "translator",
        "Loại bìa": "coverType",
        "Số trang": "pageCount",
        "Nhà xuất bản": "publishingHouse",
        "Ngày xuất bản": "publishDate",
        "Phiên bản sách": "bookVersion",
    }
    label_style = "max-width: 300px; color: rgb(128, 128, 137);"
    value_class = "styles__ProductAttributeValueStyled-sc-vjutbk-0 chhHdv"

    try:
        collected = {}
        rows = soup.find_all(
            "div", class_="WidgetTitle__WidgetContentStyled-sc-12sadap-2 hNNYbU"
        )
        for row in rows:
            name_node = row.find("span", style=label_style)
            data_node = row.find("span", class_=value_class)
            if not (name_node and data_node):
                continue

            raw_label = name_node.text.strip()
            raw_value = data_node.text.strip()
            # Unmapped labels are kept under their original text.
            collected[label_to_key.get(raw_label, raw_label)] = raw_value
        return collected
    except Exception as e:
        print(f"Error extract details: {e}")  # Optional: print the error for debugging
        return {}


def extract_description_product(soup):
    """Extract the product description as plain text.

    Args:
        soup: Parsed product page (a BeautifulSoup document).

    Returns:
        The text of every ``<p>`` inside the description container, joined
        with newlines. An empty string when the container is absent or an
        error occurs (the error is printed), so the result is always a ``str``.
    """
    try:
        content_div = soup.find("div", class_="ToggleContent__View-sc-fbuwol-0 imwRtb")
        if content_div is None:
            return ""
        paragraphs = content_div.find_all("p")
        # Join all paragraphs as a single introduction text
        return "\n".join(p.get_text(strip=True) for p in paragraphs)
    except Exception as e:
        print(
            f"Error extract description: {e}"
        )  # Optional: print the error for debugging
        return ""


# Category listing pages to crawl; each entry is handed to
# extract_books_from_category as its ``url`` argument.
categories = ["https://tiki.vn/sach-kien-thuc-tong-hop/c873"]


def _collect_reviews(driver):
    """Gather the reviews from the review pages of the product currently loaded.

    Reads the reviews on the current page, then clicks the pagination link and
    repeats. Stops when a page yields no reviews, when a page repeats the
    previous one (the click did not advance), or when the pagination link is
    missing or cannot be clicked.
    """
    from selenium.common.exceptions import WebDriverException

    # NOTE(review): presumably the "next page" link of the review pagination;
    # the nth-child(7) position is taken as-is from the original selector.
    next_button_selector = "#customer-review-widget-id > div > div.style__StyledCustomerReviews-sc-1y8vww-0.gCaHEu.customer-reviews > div > div.customer-reviews__pagination > ul > li:nth-child(7) > a"

    collected = []
    previous_page = None
    try:
        while True:
            soup = BeautifulSoup(driver.page_source, "html.parser")
            page_reviews = extract_reviews(soup)
            # ``not page_reviews`` covers any empty result; comparing with the
            # previous page prevents looping forever on the last page.
            if not page_reviews or page_reviews == previous_page:
                break
            collected.extend(page_reviews)
            previous_page = page_reviews

            # Re-locate the link every time: the element found on an earlier
            # page may be stale once the review list has been re-rendered.
            btn_next = driver.find_element(By.CSS_SELECTOR, next_button_selector)
            ActionChains(driver).move_to_element(btn_next).click(btn_next).perform()
            time.sleep(3)
    except WebDriverException:
        # No pagination link (single page of reviews) or it is not clickable:
        # keep whatever has been collected so far.
        pass
    return collected


def _scrape_book(driver, book_url, category_dir):
    """Scrape one product page and save it as JSON unless already saved."""
    driver.get(book_url)
    # Fixed pauses are kept from the original flow, presumably to give
    # lazily loaded sections time to render after navigation/scrolling.
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    info_data = extract_info(soup)

    title = info_data.get("title")
    if title is None:
        print(f"Skipping {book_url}: could not extract the product title")
        return

    filename = os.path.join(category_dir, f"{encode_filename(title)}.json")
    if os.path.exists(filename):
        return
    time.sleep(3)

    # Scroll to the details section
    ActionChains(driver).scroll_by_amount(0, 2000).perform()
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    details = extract_details(soup)

    time.sleep(3)
    description = extract_description_product(soup)

    # Scroll to the rating/review section
    ActionChains(driver).scroll_by_amount(0, 1300).perform()
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    rating = extract_rating(soup)
    review_data = _collect_reviews(driver)

    data = {
        "info": info_data,
        "details": details,
        "description": description,
        "rating": rating,
        "reviews": review_data,
    }

    # Save the extracted data to a JSON file
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def extract_books_from_category(base_dir, url):
    """Scrape every product listed on a category page into JSON files.

    A sub-directory named after the second-to-last path segment of ``url`` is
    created under ``base_dir``; each product is written there as
    ``<md5 of title>.json``. Products whose file already exists are skipped.
    A failure on one product is printed and does not stop the others.

    Args:
        base_dir: Directory under which the category directory is created.
        url: URL of the category listing page.

    Raises:
        Exceptions raised while opening the category page or waiting for the
        product links propagate to the caller; the browser is closed first.
    """
    # Extract the category name from the URL
    category_name = url.split("/")[-2]

    # Create a directory for the category
    category_dir = os.path.join(base_dir, category_name)
    os.makedirs(category_dir, exist_ok=True)

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        ActionChains(driver).scroll_by_amount(0, 800).perform()

        # Wait until at least one element with data-view-id="product_list_item"
        # is present.
        wait = WebDriverWait(driver, 15)
        a_tags = wait.until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'a[data-view-id="product_list_item"]')
            )
        )

        # Read all hrefs before navigating away: the elements belong to the
        # listing page and cannot be queried once another page is loaded.
        book_urls = [a_tag.get_attribute("href") for a_tag in a_tags]

        for book_url in book_urls:
            try:
                _scrape_book(driver, book_url, category_dir)
            except Exception as e:
                # Best effort: report the failure and move on to the next book.
                print(f"Error scraping {book_url}: {e}")
    finally:
        # Always close the browser, even when the crawl fails part-way.
        driver.quit()


# Ensure the base directory exists
base_dir = "tiki"
os.makedirs(base_dir, exist_ok=True)

for category in categories:
    try:
        extract_books_from_category(base_dir=base_dir, url=category)
    except Exception as e:
        # Keep crawling the remaining categories, but report the failure.
        # KeyboardInterrupt is not caught, so the run can still be stopped.
        print(f"Error crawling category {category}: {e}")

Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract rating: 'NoneType' object has no attribute 'find'
Error extract info: 'NoneType' object ha