In [1]:
import re
import requests
import html2text
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import csv
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup


def download_images_from_markdown(markdown_content):
    """Download every remote image referenced in a Markdown string.

    Image references of the form ``![alt](url)`` or ``![alt](url "title")``
    are located, each distinct ``http``/``https`` URL is fetched once, and the
    response body is written into the local ``img`` folder.

    Args:
        markdown_content: Markdown text to scan for image references.

    Returns:
        A dict mapping each successfully downloaded image URL to the local
        path it was saved under. Links that are not absolute HTTP(S) URLs,
        that fail to download, or that answer with a non-200 status are
        left out of the mapping.
    """
    image_folder = "img"
    saved_images = {}

    for raw_target in re.findall(r"!\[.*?\]\((.*?)\)", markdown_content):
        # The target may carry an optional title: ![alt](url "title").
        target_parts = raw_target.split()
        image_link = target_parts[0] if target_parts else ""

        # Relative paths and data: URIs cannot be fetched over HTTP.
        if not image_link.startswith(("http://", "https://")):
            continue
        # The same image may be referenced several times; fetch it once.
        if image_link in saved_images:
            continue

        # Flatten host + path segments into one file name, then replace
        # characters that are not allowed in file names on common platforms
        # (for example the "?" of a query string).
        image_name = "-".join(image_link.split("/")[2:])
        image_name = re.sub(r'[\\:*?"<>|]', "_", image_name)

        try:
            # A timeout keeps one stalled server from hanging the whole run.
            response = requests.get(image_link, timeout=30)
        except requests.RequestException as error:
            print(f"Failed to download image '{image_link}': {error}")
            continue
        if response.status_code != 200:
            continue

        os.makedirs(image_folder, exist_ok=True)
        image_path = os.path.join(image_folder, image_name)
        with open(image_path, "wb") as f:
            f.write(response.content)

        saved_images[image_link] = image_path

    return saved_images


def download_banner_image(image_url):
    """Download the banner image and store it in the local ``img`` folder.

    Args:
        image_url: Absolute HTTP(S) URL of the banner image. May be empty
            or ``None`` when the page has no banner.

    Returns:
        The local path of the saved file, or an empty string when there is
        no URL, the URL is not an HTTP(S) URL, the request fails, or the
        server answers with a non-200 status.
    """
    # Nothing to do for a missing URL; relative paths and data: URIs cannot
    # be fetched over HTTP either.
    if not isinstance(image_url, str) or not image_url.startswith(
        ("http://", "https://")
    ):
        return ""

    try:
        # A timeout keeps a stalled server from hanging the script.
        response = requests.get(image_url, timeout=30)
    except requests.RequestException as error:
        print(f"Failed to download banner image '{image_url}': {error}")
        return ""
    if response.status_code != 200:
        return ""

    image_folder = "img"
    os.makedirs(image_folder, exist_ok=True)

    # Name the file after the last two URL segments, replacing characters
    # that are not allowed in file names on common platforms.
    image_name = "-".join(image_url.split("/")[-2:])
    image_name = re.sub(r'[\\:*?"<>|]', "_", image_name)
    image_path = os.path.join(image_folder, image_name)

    with open(image_path, "wb") as f:
        f.write(response.content)
    return image_path


def save_markdown_content(extracted_data):
    """Write the extracted article to a Markdown file named after its title.

    The file is created in the current working directory and contains a
    level-1 heading with the title, an optional banner image, an optional
    "Keywords" section with every keyword wrapped in backticks, a horizontal
    rule, and finally the article body.

    Args:
        extracted_data: dict with the optional keys ``title`` (str),
            ``banner_image`` (path or URL string), ``keywords`` (list of
            strings, or a single placeholder string) and ``content``
            (Markdown string).
    """
    title = extracted_data.get("title", "Untitled")
    # Spaces become hyphens; path separators and other characters that are
    # illegal in file names are replaced as well, so a title such as "a/b"
    # cannot point into another directory or make open() fail.
    filename = re.sub(r'[\\/:*?"<>|]', "-", title.replace(" ", "-")) + ".md"

    sections = [f"# {extracted_data.get('title', 'Title not found')}\n\n"]

    banner_image = extracted_data.get("banner_image")
    if banner_image:
        sections.append(f"![Banner Image]({banner_image})\n\n")

    keywords = extracted_data.get("keywords")
    # A bare string (e.g. a "not found" placeholder) is one entry; iterating
    # it directly would emit every single character as its own keyword.
    if isinstance(keywords, str):
        keywords = [keywords]
    if keywords:
        # Lay the keywords out on one line, each highlighted with backticks.
        keywords_formatted = " ".join(f"`{keyword}`" for keyword in keywords)
        sections.append(f"## Keywords\n\n{keywords_formatted}\n\n")

    sections.append("---\n\n")  # horizontal rule before the article body
    sections.append(extracted_data.get("content", ""))

    with open(filename, "w", encoding="utf-8") as file:
        file.write("".join(sections))

    print(f"Markdown content saved to '{filename}'")


# Example usage:
# save_markdown_content(extracted_data)


def get_html_data(html_path="detail.html"):
    """Extract the article fields from a saved detail page.

    The page is parsed with BeautifulSoup, the title, date, keyword links,
    banner image and body are located through fixed CSS selectors, the body
    is converted to Markdown, and every image that could be downloaded is
    re-pointed at its local copy.

    Args:
        html_path: Path of the saved HTML page. Defaults to ``detail.html``
            in the current working directory.

    Returns:
        A dict with the keys ``title``, ``date``, ``keywords``,
        ``banner_image`` and ``content``. ``title`` and ``date`` fall back
        to a "... not found" placeholder string; ``keywords`` is a list of
        strings, or the placeholder string "Keywords not found" when no
        keyword link matched; ``banner_image`` is the local path of the
        downloaded banner or an empty string; ``content`` is the article
        body as Markdown.
    """
    with open(html_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    soup_detail = BeautifulSoup(html_content, "html.parser")

    # All selectors are anchored at the second child <div> of #root; the
    # header block is its third <div>, the article body its fourth.
    page = "#root > div:nth-of-type(2)"
    header = f"{page} > div:nth-of-type(3)"

    title = soup_detail.select_one(f"{header} > div > h1")
    date = soup_detail.select_one(
        f"{header} > div > div:nth-of-type(1) > div:nth-of-type(1) > span:nth-of-type(3)"
    )
    keywords = soup_detail.select(f"{header} > div > div:nth-of-type(2) > a")
    banner_image = soup_detail.select_one(f"{header} > img")
    content = soup_detail.select_one(f"{page} > div:nth-of-type(4) > div > div")

    # Convert the article body to Markdown, keeping hyperlinks.
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    markdown_content = converter.handle(str(content)) if content else ""

    # Download the inline images and point the Markdown at the local copies.
    downloaded_images = download_images_from_markdown(markdown_content)
    for original_link, saved_path in downloaded_images.items():
        markdown_content = markdown_content.replace(original_link, saved_path)

    # The page may have no banner <img>, or one without a src attribute;
    # download_banner_image returns "" for a missing URL.
    banner_src = banner_image.get("src") if banner_image else None
    banner_image_path = download_banner_image(banner_src)

    extracted_data = {
        "title": title.get_text().strip() if title else "Title not found",
        "date": date.get_text().strip() if date else "Date not found",
        "keywords": [keyword.get_text().strip() for keyword in keywords]
        if keywords
        else "Keywords not found",
        "banner_image": banner_image_path,
        "content": markdown_content,  # full article body as Markdown
    }
    return extracted_data


# Run the extraction, show the key fields, then write the Markdown file.
extracted_data = get_html_data()
for key in ("title", "date", "keywords", "banner_image"):
    print(extracted_data[key])
save_markdown_content(extracted_data)

23년12월 Stable Diffusion 시작
2023년 12월 18일
['AI 이미지', 'Stable Diffusion', 'chilloutmix', 'realistic vision', 'webui', '설치', '스테이블 디퓨전', '이미지생성']
img/ee3abf75-776e-40fc-ad97-61e4c519b022-image.png
Markdown content saved to '23년12월-Stable-Diffusion-시작.md'
