# In [None]:  (Jupyter notebook cell marker left over from export)
import json
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

def setup_driver():
    """Create a Chrome WebDriver whose window starts maximized.

    The chromedriver binary path is obtained from
    ``ChromeDriverManager().install()``.

    Returns:
        The newly created ``webdriver.Chrome`` instance.
    """
    options = Options()
    options.add_argument("--start-maximized")
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

def get_soup(url, timeout=30):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as real content.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def extract_links(soup):
    """Collect the href of the first anchor in every sidebar list item.

    The sidebar is located by its exact CSS class string; if the site markup
    changes and the list is not found, an empty list is returned instead of
    raising.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` API.

    Returns:
        A list of href strings in document order. List items without an
        anchor, and anchors without a non-empty ``href``, are skipped.
    """
    nav_list = soup.find('ul', class_='flex flex-1 flex-col gap-y-0.5')
    if nav_list is None:
        return []

    links = []
    for item in nav_list.find_all('li'):
        anchor = item.find('a')
        if anchor is None:
            continue
        href = anchor.get('href')
        if href:
            links.append(href)
    return links

def fetch_page_content(url, timeout=30):
    """Fetch a single content page and return it as parsed HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as real content.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def extract_text_and_images(driver, url):
    """Render *url* in the browser and flatten its main content to text.

    For every direct child of the main content container this appends the
    child's visible text, one ``URL: <src>`` line per ``<img>`` inside it,
    and, for ``<details>`` elements, the text of the ``<p>`` elements found
    after clicking the element.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the page to load.

    Returns:
        The collected text, or an empty string when the content container
        (or its children) cannot be located.
    """
    driver.get(url)
    try:
        # The lookup of the container itself belongs inside the guard: a page
        # without this exact layout should yield "" rather than abort the run.
        container = driver.find_element(By.XPATH, "/html/body/div/div/div/div/main/div[1]")
        child_elements = container.find_elements(By.XPATH, "./*")
    except Exception:
        return ""

    parts = []
    for child in child_elements:
        try:
            parts.append(child.text + "\n")
            for img in child.find_elements(By.TAG_NAME, "img"):
                img_src = img.get_attribute("src")
                parts.append(f"URL: {img_src}" + "\n")
            if child.tag_name == "details":
                child.click()
                # Give the page a moment to react to the click before reading.
                time.sleep(1)
                p_elements = child.find_elements(By.TAG_NAME, "p")
                if p_elements:
                    answer = ''.join(p.text + "\n" for p in p_elements)
                    # "Câu trả lời" is Vietnamese for "Answer".
                    parts.append("Câu trả lời: " + answer + "\n\n")
        except Exception:
            # Best effort: a child that fails mid-way keeps whatever was
            # already collected for it and the loop moves on to the next one.
            continue
    return "".join(parts)

def save_to_file(folder_name, endpoint, content):
    """Write *content* to ``<folder_name>/<last endpoint segment>.txt`` as UTF-8.

    Args:
        folder_name: Destination directory; created (with parents) if absent.
        endpoint: URL path of the page. Its last non-empty ``/``-separated
            segment becomes the file stem; ``"index"`` is used when there is
            none (for example ``"/"`` or ``""``).
        content: Text to write. An existing file of the same name is
            overwritten.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_name, exist_ok=True)
    # Strip trailing slashes so "/docs/page/" maps to "page.txt", not ".txt".
    stem = endpoint.rstrip('/').split('/')[-1] or "index"
    target = os.path.join(folder_name, f"{stem}.txt")
    with open(target, "w", encoding="utf-8") as file:
        file.write(content)

def _page_heading(page_soup):
    """Return the <h1> title and header <p> subtitle of a page, one per line.

    Any of ``<main>``, ``<header>``, ``<h1>`` or ``<p>`` may be absent; the
    missing pieces are simply left out of the result.
    """
    main_result = page_soup.find('main')
    header = main_result.find('header') if main_result is not None else None
    if header is None:
        return ""

    heading = ""
    title = header.find('h1')
    if title is not None:
        heading += title.text + "\n"
    sub_header = header.find('p')
    if sub_header is not None:
        heading += sub_header.text + "\n"
    return heading


def main():
    """Scrape every sidebar page of the configured sites into text files.

    For each site, the sidebar links are read from the landing page; each
    linked page is fetched once with requests (for its heading) and once with
    the browser (for body text and image URLs), and the result is saved under
    a folder named after the first label of the site's host name.

    Pages that cannot be downloaded are reported and skipped. The browser is
    always shut down, even when the crawl fails part-way.
    """
    driver = setup_driver()
    list_domain = [
        'https://smart-dragon.gitbook.io/',
        'https://vietdragon.gitbook.io/',
    ]
    try:
        for domain in list_domain:
            folder_name = domain.split('//')[1].split('.')[0]
            links = extract_links(get_soup(domain))

            for endpoint in links:
                # urljoin copes with leading slashes and absolute hrefs, which
                # plain concatenation turns into "//" or an invalid URL.
                url = urljoin(domain, endpoint)
                try:
                    page_soup = fetch_page_content(url)
                except requests.RequestException as exc:
                    print(f"Skipping {url}: {exc}")
                    continue
                all_text = _page_heading(page_soup)
                all_text += extract_text_and_images(driver, url)
                save_to_file(folder_name, endpoint, all_text)
    finally:
        # quit() ends the whole session and the chromedriver process;
        # close() would only close the current window.
        driver.quit()

# Start the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()