In [127]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import csv
from pathlib import Path
from docx import Document
from urllib.parse import urlparse
import json


In [128]:
# --- Crawl configuration -----------------------------------------------------

# Text file that lists the topic base URLs, one per line.
URLS_FILE = "../urls/urls_mcq.txt"

# Inclusive range of page numbers attempted for every topic.
START_PAGE = 1
MAX_PAGE = 50

# Output roots: per-page CSV files and the Word documents generated from them.
DATA_DIR = Path("data_crawler")
DOC_DIR = Path("docs")

# Make sure both output roots exist; an already existing directory is fine.
for _output_root in (DATA_DIR, DOC_DIR):
    _output_root.mkdir(exist_ok=True)

In [129]:
def get_topic_name(url):
    """Return the last path segment of ``url``.

    Leading and trailing slashes of the URL path are dropped first, so
    ``https://host/mcq/docker/`` yields ``"docker"``.  Query strings and
    fragments are not part of the path and are therefore ignored.  A URL
    whose path is empty or only slashes yields an empty string.
    """
    trimmed_path = urlparse(url).path.strip("/")
    # rpartition keeps everything after the final "/" (or the whole string
    # when there is no "/" at all).
    _, _, last_segment = trimmed_path.rpartition("/")
    return last_segment

def build_page_url(base_url, page):
    """Return the URL of page number ``page`` for a topic.

    Page 1 is the base URL itself, returned unchanged.  Every other page is
    addressed as ``<base_url without trailing slashes>/page/<page>/``.
    """
    if page != 1:
        trimmed_base = base_url.rstrip("/")
        return f"{trimmed_base}/page/{page}/"
    return base_url

In [130]:
def init_driver():
    """Create and return a Chrome WebDriver instance for the crawler.

    Chrome is started with the ``--disable-blink-features=AutomationControlled``
    and ``--start-maximized`` command-line flags.  The chromedriver path
    handed to ``Service`` comes from ``ChromeDriverManager().install()``.

    The caller owns the returned driver and is responsible for quitting it.
    """
    chrome_options = Options()
    for flag in (
        "--disable-blink-features=AutomationControlled",
        "--start-maximized",
    ):
        chrome_options.add_argument(flag)
    # chrome_options.add_argument("--headless")  # enable to run in the background

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

In [131]:
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import time

def _option_slot(label_text):
    """Translate an option label into a 0-based slot, or -1 if unrecognised.

    The first alphanumeric character decides the slot, case-insensitively,
    so "A", "b", "C." and "(D)" map to 0, 1, 2 and 3.  Any other label,
    including an empty one, yields -1 instead of raising.
    """
    for char in label_text:
        if char.isalnum():
            return "ABCD".find(char.upper())
    return -1


def _is_marked_correct(class_tokens):
    """Return True when one of the CSS class tokens flags a correct option.

    A token counts when it contains "success", or contains "correct"
    without being an "incorrect" marker.  Matching is case-insensitive.
    """
    if isinstance(class_tokens, str):
        class_tokens = class_tokens.split()
    for token in class_tokens:
        lowered = token.lower()
        if "success" in lowered:
            return True
        if "correct" in lowered and "incorrect" not in lowered:
            return True
    return False


def crawl_one_page(driver, url, load_wait=3, reveal_wait=2):
    """Load ``url`` in ``driver`` and scrape its multiple-choice questions.

    Every button whose text contains "View Answer" is clicked first, then
    the rendered page source is parsed with BeautifulSoup.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the page to crawl.
        load_wait: Seconds to sleep after navigating to ``url``.
        reveal_wait: Seconds to sleep after clicking the answer buttons.

    Returns:
        A list with one 7-item list per ``div.questionContent`` element:
        ``[question, option A, option B, option C, option D,
        correct answer text, explanation]``.  Missing pieces are empty
        strings.  The list is empty when the page has no questions.
    """
    driver.get(url)
    time.sleep(load_wait)  # fixed pause before looking for the buttons
    try:
        view_answer_buttons = driver.find_elements(
            By.XPATH, "//button[contains(text(), 'View Answer')]"
        )

        # Each matched button is clicked through a JavaScript call.
        for btn in view_answer_buttons:
            driver.execute_script("arguments[0].click();", btn)

        time.sleep(reveal_wait)
    except Exception as e:
        # Best effort: a failed click is reported ("error while clicking the
        # View Answer button") and parsing continues with whatever rendered.
        print(f"Lỗi khi click nút View Answer: {e}")

    soup = BeautifulSoup(driver.page_source, "html.parser")
    mcqs = []

    for q in soup.select("div.questionContent"):
        q_text = q.select_one(".questionContentText p")
        question = q_text.get_text(strip=True) if q_text else ""

        options = ["", "", "", ""]
        correct_answer = ""
        explanation = ""

        for opt in q.select(".optionItem"):
            label = opt.select_one(".optionIndex")
            content = opt.select_one(".optionContent")

            if not label or not content:
                continue

            text = content.get_text(strip=True)

            # Labels that are not A-D are not stored as an option, but the
            # correct-answer check below still applies to them.
            slot = _option_slot(label.text)
            if slot >= 0:
                options[slot] = text

            # Token-based check so a class such as "incorrect" is not
            # mistaken for the correct-answer marker.
            if _is_marked_correct(opt.get("class", [])):
                correct_answer = text

        exp = q.select_one(".explanationContent p")
        if exp:
            explanation = exp.get_text(strip=True)

        mcqs.append([
            question,
            options[0],
            options[1],
            options[2],
            options[3],
            correct_answer,
            explanation
        ])

    return mcqs

In [132]:
def save_csv(mcqs, csv_path):
    """Write ``mcqs`` to ``csv_path`` as UTF-8 CSV, replacing any existing file.

    The first row is a fixed header; each following row is one item of
    ``mcqs``, expected in the same column order as the header.
    """
    header = (
        "Question",
        "Option A",
        "Option B",
        "Option C",
        "Option D",
        "Correct Answer",
        "Explanation",
    )
    # newline="" leaves line-ending handling to the csv module.
    with open(csv_path, "w", newline="", encoding="utf-8") as out_file:
        sheet = csv.writer(out_file)
        sheet.writerow(header)
        sheet.writerows(mcqs)

def csv_to_docx(csv_file, docx_file):
    """Render one crawled CSV page as a Word document.

    The document gets a level-1 heading derived from the CSV file name
    (underscores become spaces, title-cased), followed by every question as
    a numbered paragraph and its non-empty options as "A." to "D."
    paragraphs.  An option whose text equals the "Correct Answer" column is
    written in bold.  The "Explanation" column is not written.

    Args:
        csv_file: Path (``str`` or ``Path``) of a CSV with the columns
            "Question", "Option A" to "Option D" and "Correct Answer".
        docx_file: Destination passed to ``Document.save``.
    """
    csv_file = Path(csv_file)  # accept plain strings as well as Path objects

    doc = Document()
    doc.add_heading(csv_file.stem.replace("_", " ").title(), level=1)

    # newline="" hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are read back exactly as they were written.
    with open(csv_file, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)

        for index, row in enumerate(reader, start=1):
            doc.add_paragraph(f"{index}. {row['Question']}")
            options = [
                row["Option A"],
                row["Option B"],
                row["Option C"],
                row["Option D"],
            ]

            correct = row["Correct Answer"]

            for idx, text in enumerate(options):
                if not text:
                    continue  # missing option: keep the letter of later ones
                p = doc.add_paragraph()
                run = p.add_run(f"{chr(65+idx)}. {text}")
                # The correct answer is stored as text, so match by text.
                if text == correct:
                    run.bold = True

            # Blank paragraph separating consecutive questions.
            doc.add_paragraph("")

    doc.save(docx_file)

In [133]:
def main():
    """Crawl every topic listed in ``URLS_FILE`` and export each page.

    For each non-blank line of ``URLS_FILE`` the pages ``START_PAGE`` to
    ``MAX_PAGE`` are visited in order.  Each page is written to
    ``DATA_DIR/<topic>/page_<n>.csv`` and converted to
    ``DOC_DIR/<topic>/page_<n>.docx``.  A page whose .docx already exists is
    skipped, so an interrupted run can be resumed.  A topic stops at the
    first page that yields no questions or raises an error.

    The browser is always closed, even when the crawl is interrupted.
    """
    with open(URLS_FILE, encoding="utf-8") as f:
        base_urls = [line.strip() for line in f if line.strip()]

    driver = init_driver()

    # try/finally guarantees the browser process is shut down on any exit
    # path, including KeyboardInterrupt and errors outside the per-page try.
    try:
        for base_url in base_urls:
            topic = get_topic_name(base_url)
            print(f"\n Topic: {topic}")

            topic_data_dir = DATA_DIR / topic
            topic_doc_dir = DOC_DIR / topic

            # parents=True also recreates the output roots if they are missing.
            topic_data_dir.mkdir(parents=True, exist_ok=True)
            topic_doc_dir.mkdir(parents=True, exist_ok=True)

            for page in range(START_PAGE, MAX_PAGE + 1):
                page_url = build_page_url(base_url, page)

                csv_path = topic_data_dir / f"page_{page}.csv"
                docx_path = topic_doc_dir / f"page_{page}.docx"

                # The .docx is written last, so its presence marks a page
                # that was fully processed by an earlier run.
                if docx_path.exists():
                    print(f" Skip {topic} page {page} (already crawled)")
                    continue

                print(f"Crawling {topic} page {page}")

                try:
                    mcqs = crawl_one_page(driver, page_url)

                    # An empty page is treated as the end of the topic.
                    if not mcqs:
                        print(f"No data, stop at page {page}")
                        break

                    save_csv(mcqs, csv_path)
                    csv_to_docx(csv_path, docx_path)

                    print(f"{topic} page {page}: {len(mcqs)} questions")

                except Exception as e:
                    # Give up on this topic but continue with the next one.
                    print(f"Error {topic} page {page}: {e}")
                    break
    finally:
        driver.quit()

    print("\n ALL DONE")

# Entry point: start the crawl when this code is executed as __main__.
if __name__ == "__main__":
    main()


 Topic: docker
 Skip docker page 1 (already crawled)
 Skip docker page 2 (already crawled)
 Skip docker page 3 (already crawled)
 Skip docker page 4 (already crawled)
 Skip docker page 5 (already crawled)
Crawling docker page 6
No data, stop at page 6

 Topic: cloud-computing
 Skip cloud-computing page 1 (already crawled)
 Skip cloud-computing page 2 (already crawled)
 Skip cloud-computing page 3 (already crawled)
 Skip cloud-computing page 4 (already crawled)
 Skip cloud-computing page 5 (already crawled)
Crawling cloud-computing page 6
No data, stop at page 6

 Topic: linux
 Skip linux page 1 (already crawled)
 Skip linux page 2 (already crawled)
 Skip linux page 3 (already crawled)
 Skip linux page 4 (already crawled)
 Skip linux page 5 (already crawled)
Crawling linux page 6
No data, stop at page 6

 ALL DONE
