In [42]:
import csv
import os
import re
import time
from pathlib import Path
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from docx import Document
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [43]:
# Setup: plain bot-style crawling is blocked from viewing the answers, so a
# real Chrome browser is driven through Selenium instead.

# Output folders: DATA_DIR receives the scraped CSV files, DOC_DIR the
# generated DOCX files (see main()).
DATA_DIR = "data_crawler"
DOC_DIR = "docs"
# Create both folders up front; exist_ok=True keeps this cell safe to re-run.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(DOC_DIR, exist_ok=True)

# Chrome launch options used by init_driver() for every browser it starts.
options = Options()
# Presumably meant to make the automated browser harder for the site to detect.
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--start-maximized")
# Uncomment to run Chrome without a visible window.
# options.add_argument("--headless")

In [44]:
def slug_from_url(url):
    """Derive a short name for ``url`` that is used as an output file stem.

    The slug is the first segment of the URL path, e.g.
    ``https://host/docker-mcq/page`` -> ``docker-mcq``.

    When the path is empty (``https://host`` or ``https://host/``) the first
    segment would be an empty string, which would produce files named
    ``.csv`` / ``.docx`` and make every root URL collide. In that case the
    lower-cased host name (without port) is returned instead.

    Args:
        url: Absolute URL string.

    Returns:
        The slug as a string; empty only if the URL has neither a path nor a
        host.
    """
    parsed = urlparse(url)
    first_segment = parsed.path.strip("/").split("/")[0]
    # `hostname` is None for URLs without a network location; normalise to "".
    return first_segment or (parsed.hostname or "")

# Set up Chrome.
def init_driver():
    """Start a Chrome WebDriver session configured with the module-level ``options``.

    The chromedriver executable path is obtained from
    ``ChromeDriverManager().install()`` and wrapped in a ``Service``.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    driver_path = ChromeDriverManager().install()
    chrome_service = Service(driver_path)
    return webdriver.Chrome(service=chrome_service, options=options)

In [45]:
def _split_answer_block(block_text):
    """Pull ``(answer, explanation)`` out of the text of a revealed answer block.

    Only lines that start with the ``Answer:`` / ``Explanation:`` labels are
    used; if a label appears on several lines the last one wins. Missing
    labels yield empty strings.
    """
    answer = ""
    explanation = ""
    for line in block_text.split("\n"):
        if line.startswith("Answer:"):
            # Slice off only the leading label; str.replace would also delete
            # any later "Answer:" occurring inside the text itself.
            answer = line[len("Answer:"):].strip()
        elif line.startswith("Explanation:"):
            explanation = line[len("Explanation:"):].strip()
    return answer, explanation


def _parse_mcqs(html):
    """Parse page HTML into rows of seven fixed columns, one row per question."""
    soup = BeautifulSoup(html, "html.parser")
    rows = []

    for paragraph in soup.find_all("p"):
        text = paragraph.get_text("\n", strip=True)

        # Question paragraphs start with a number followed by a dot ("12.").
        if not re.match(r"^\d+\.", text):
            continue

        lines = text.split("\n")
        question = lines[0]

        # Place each option in the slot of its own letter so that a question
        # with a missing (or duplicated) option cannot shift the answer and
        # explanation into the wrong columns.
        choices = dict.fromkeys("abcd", "")
        for line in lines[1:]:
            found = re.match(r"^([a-d])\)", line.lower())
            if found and not choices[found.group(1)]:
                choices[found.group(1)] = line

        answer = ""
        explanation = ""
        answer_div = paragraph.find_next_sibling("div", class_="collapseomatic_content")
        if answer_div:
            answer, explanation = _split_answer_block(
                answer_div.get_text("\n", strip=True)
            )

        rows.append([
            question,
            choices["a"], choices["b"], choices["c"], choices["d"],
            answer, explanation,
        ])

    return rows


def crawl_one_url(driver, url, page_load_wait=3, click_wait=0.15):
    """Load ``url`` in the browser, reveal all answers and scrape the questions.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        url: Page to crawl.
        page_load_wait: Seconds to sleep after ``driver.get`` before touching
            the page.
        click_wait: Seconds to sleep after each "View Answer" click.

    Returns:
        A list of rows, each exactly
        ``[question, option_a, option_b, option_c, option_d, answer, explanation]``.
        Options that are not present on the page are empty strings, so every
        row lines up with the seven-column header written by ``save_csv``.
    """
    driver.get(url)
    time.sleep(page_load_wait)

    # Click every "View Answer" toggle via JavaScript before reading the page
    # source, so the answer blocks are expanded.
    toggles = driver.find_elements(By.XPATH, "//span[contains(text(),'View Answer')]")
    for toggle in toggles:
        driver.execute_script("arguments[0].click();", toggle)
        time.sleep(click_wait)

    return _parse_mcqs(driver.page_source)

In [46]:
def save_csv(mcqs, csv_path):
    """Write the scraped questions to ``csv_path`` as UTF-8 CSV.

    The file is overwritten. Its first line is the fixed seven-column header;
    every item of ``mcqs`` is then written as one row, unchanged.

    Args:
        mcqs: Iterable of rows (sequences of cell values).
        csv_path: Destination file path.
    """
    header = [
        "Question",
        "Option A", "Option B", "Option C", "Option D",
        "Correct Answer", "Explanation"
    ]
    # newline="" lets the csv module control line endings itself.
    with open(csv_path, "w", newline="", encoding="utf-8") as out_file:
        out = csv.writer(out_file)
        out.writerow(header)
        for record in mcqs:
            out.writerow(record)

In [47]:
def csv_to_docx(csv_file, docx_file):
    """Render a CSV produced by ``save_csv`` as a Word document.

    The document gets a level-1 heading derived from the CSV file name
    (dashes become spaces, title-cased). Each row becomes the question
    paragraph, one paragraph per option A-D with the correct option in bold,
    and an empty spacer paragraph.

    Args:
        csv_file: Path to the CSV file, as a ``str`` or ``pathlib.Path``.
        docx_file: Destination path passed to ``Document.save``.
    """
    # Accept plain strings as well as Path objects (``.stem`` needs a Path).
    csv_file = Path(csv_file)

    doc = Document()
    doc.add_heading(csv_file.stem.replace("-", " ").title(), level=1)

    # newline="" is required by the csv module so that quoted fields with
    # embedded line breaks are read back correctly.
    with open(csv_file, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            doc.add_paragraph(row["Question"])

            choices = {
                "a": row["Option A"],
                "b": row["Option B"],
                "c": row["Option C"],
                "d": row["Option D"],
            }

            # Normalise the answer letter so "B" or " b " still match; a short
            # row yields None for the missing column, treated as "no answer".
            correct = (row["Correct Answer"] or "").strip().lower()

            for letter, text in choices.items():
                run = doc.add_paragraph().add_run(text)
                if letter == correct:
                    run.bold = True

            # Blank paragraph as a visual separator between questions.
            doc.add_paragraph("")

    doc.save(docx_file)

In [48]:
def main(urls_file="urls.txt"):
    """Crawl every URL listed in ``urls_file`` and export each page as CSV and DOCX.

    For each non-blank line of the file the page is crawled with a single
    shared browser session, its questions are written to
    ``DATA_DIR/<slug>.csv`` and then converted to ``DOC_DIR/<slug>.docx``.

    Args:
        urls_file: Text file with one URL per line; blank lines are ignored.

    The browser is always closed, even when crawling or exporting raises.
    """
    # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise end up glued to the first URL.
    with open(urls_file, encoding="utf-8-sig") as f:
        urls = [line.strip() for line in f if line.strip()]

    driver = init_driver()
    try:
        for url in urls:
            print(f"Crawling {url}")
            slug = slug_from_url(url)

            mcqs = crawl_one_url(driver, url)

            csv_path = os.path.join(DATA_DIR, f"{slug}.csv")
            save_csv(mcqs, csv_path)

            docx_path = os.path.join(DOC_DIR, f"{slug}.docx")
            csv_to_docx(
                csv_file=Path(csv_path),
                docx_file=docx_path
            )

            print(f"{slug}: {len(mcqs)} câu")
    finally:
        # Release the Chrome process regardless of how the loop ended.
        driver.quit()

# Run the crawl when this file/cell is executed directly. `Path` is imported
# with the other imports at the top of the file, not here, so main() also
# works when it is called without going through this guard.
if __name__ == "__main__":
    main()

Crawling https://www.sanfoundry.com/docker-mcq-multiple-choice-questions/
docker-mcq-multiple-choice-questions: 46 câu
Crawling https://www.sanfoundry.com/1000-cloud-computing-questions-answers/
1000-cloud-computing-questions-answers: 41 câu
