In [2]:
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re

# Settings: destination folder for downloaded posters (created if it does not exist).
download_folder = "G:/내 드라이브/project_poster/genre_code_movie_posters"
os.makedirs(download_folder, exist_ok=True)

# Selenium driver setup: start a Chrome session with default options.
driver = webdriver.Chrome()

# 1. Visit the URL of the image-search page.
page_url = 'https://www.kmdb.or.kr/db/have/detailSearch/imageSearch'
driver.get(page_url)
time.sleep(1)  # fixed one-second pause after navigation (presumably to let the page settle)

# Definition of genre_mapping (example).
# Every genre name maps to a fixed-width one-hot bit string: the single "1"
# sits at the genre's position in _GENRE_NAMES, all other characters are "0".
_GENRE_CODE_WIDTH = 43
_GENRE_NAMES = (
    "드라마",
    "첩보",
    "옴니버스",
    "뮤직",
    "동성애",
    "로드무비",
    "아동",
    "하이틴(고교)",
    "전기",
    "청춘영화",
    "재난",
    "문예",
    "연쇄극",
    "신파",
    "활극",
    "반공/분단",
    "군사",
    "계몽",
    "사회물(경향)",
    "스포츠",
    "합작(번안물)",
    "종교",
    "무협",
    "미스터리",
    "SF",
    "코메디",
    "느와르",
    "액션",
    "범죄",
    "어드벤처",
    "가족",
    "에로",
    "멜로/로맨스",
    "공포",
    "뮤지컬",
    "시대극/사극",
)
genre_mapping = {
    name: "0" * position + "1" + "0" * (_GENRE_CODE_WIDTH - position - 1)
    for position, name in enumerate(_GENRE_NAMES)
}

def get_genre_labels(genre_texts):
    """Return the bit-string labels for the given genre names as one string.

    Each name in ``genre_texts`` is looked up in ``genre_mapping``; a name
    that has no entry contributes an empty string. The per-genre labels are
    joined with ``' | '`` in input order.
    """
    return ' | '.join(genre_mapping.get(name, '') for name in genre_texts)

# 2. Click the first button (as needed)
try:
    # Wait up to 10 seconds for the link at this absolute XPath to become clickable.
    first_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "/html/body/div[2]/div[5]/div[2]/section/div/div[3]/div[3]/span/a"))
    )
    first_button.click()
    # After the click, wait until a <span> whose inline style mentions background-image is present.
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//span[contains(@style, 'background-image')]")))
    print("첫 번째 버튼 클릭 성공")
except Exception as e:
    # Any failure is only reported; the script carries on afterwards.
    print(f"첫 번째 버튼 클릭 실패: {e}")

# Set used to prevent duplicates (image URLs that were already handled)
selected_images = set()
sleep_time=1  # seconds slept by scrape_images after clicking a poster

# Page traversal and image download
def _poster_url_from_style(style):
    """Return the URL inside a CSS ``url(...)`` value, or None when there is none."""
    found = re.search(r'url\((.*?)\)', style or '')
    if found is None:
        return None
    return found.group(1).strip("'\"")


def _combine_genre_codes(joined_labels):
    """Bitwise-OR the bit strings contained in a ``' | '``-joined label string.

    Empty entries (genres without a mapping) are ignored. Returns '' when no
    bit string is present at all.
    """
    codes = [code for code in joined_labels.split(' | ') if code]
    if not codes:
        return ''
    width = max(len(code) for code in codes)
    padded = [code.ljust(width, '0') for code in codes]
    return ''.join('1' if '1' in column else '0' for column in zip(*padded))


def _poster_filename(genre_code, img_url):
    """Build a unique, filesystem-safe file name: ``<genre code>_<poster id>.jpg``."""
    from urllib.parse import urlsplit

    stem = os.path.splitext(os.path.basename(urlsplit(img_url).path))[0]
    if not stem:
        # Fall back to a running number when the URL has no usable file name.
        stem = str(len(selected_images))
    return f"{genre_code or 'unlabeled'}_{stem}.jpg"


def _download_image(img_url, destination, timeout=30):
    """Fetch ``img_url`` over HTTP and write the raw bytes to ``destination``."""
    import urllib.request

    with urllib.request.urlopen(img_url, timeout=timeout) as response:
        payload = response.read()
    with open(destination, 'wb') as file:
        file.write(payload)


def _return_to_list(max_back_attempts=10):
    """Click the detail page's back link until the list URL is showing again.

    Returns True once the URL contains 'imageSearch', False when every
    attempt failed.
    """
    for _ in range(max_back_attempts):
        try:
            back_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "/html/body/div[2]/div[5]/div[2]/div[1]/div/div/a"))
            )
            back_button.click()
            time.sleep(2)  # give the page time to load after the click

            # Confirm we are back on the list page before continuing.
            WebDriverWait(driver, 10).until(EC.url_contains('imageSearch'))
            print("페이지가 성공적으로 돌아왔습니다.")
            return True
        except Exception as e:
            print(f"뒤로 가기 실패: {e}")
            time.sleep(1)  # short pause before retrying
    return False


def _process_poster(img_url):
    """Open one poster's detail page, read its genres, save the image, go back."""
    # Look the element up again right before clicking: elements found before
    # an earlier navigation would be stale, and BeautifulSoup tags cannot be
    # clicked at all.
    quote = "'" if '"' in img_url else '"'
    thumbnail = driver.find_element(
        By.XPATH, f"//span[contains(@style, {quote}{img_url}{quote})]"
    )
    thumbnail.click()
    time.sleep(sleep_time)  # wait for the page to react

    # Click the close button
    driver.find_element(By.XPATH, "/html/body/div[4]/div[2]/div[1]/a").click()
    time.sleep(1)

    # Click "basic information"
    driver.find_element(
        By.XPATH,
        "/html/body/div[2]/div[5]/div[2]/form/section/div[2]/div[1]/div[1]/div/div[2]/a",
    ).click()
    time.sleep(1)

    # Collect genres. By.CLASS_NAME rejects compound class names, so the two
    # classes are matched with a CSS selector instead.
    genre_elements = driver.find_elements(By.CSS_SELECTOR, '.mTag1.gray')
    genres = [genre_elem.text for genre_elem in genre_elements]
    # For films with several genres the one-hot codes are OR-ed into one code.
    genre_labels = _combine_genre_codes(get_genre_labels(genres))
    print(f"장르: {genres}, 라벨: {genre_labels}")

    # Save the actual image bytes under a name that is valid on Windows and
    # does not collide with other posters of the same genre.
    destination = os.path.join(download_folder, _poster_filename(genre_labels, img_url))
    _download_image(img_url, destination)

    if not _return_to_list():
        print(f"목록 페이지로 돌아가지 못했습니다: {img_url}")


def scrape_images(batch_size=100):
    """Download every poster shown on the list page together with its genre code.

    The current page is parsed up to 12 times. Each pass collects the poster
    URLs from ``<span style="background-image: url(...)">`` elements, skips
    URLs already present in ``selected_images``, and for every new poster
    opens its detail page, reads the genres, stores the image in
    ``download_folder`` and returns to the list.

    Args:
        batch_size: Currently unused; kept so existing callers keep working.

    NOTE(review): nothing in this function advances to another page or loads
    more results, so passes after the first only find new posters if the page
    content changed in the meantime.
    """
    poster_prefixes = (
        'http://file.koreafilm.or.kr/poster/',
        'https://file.koreafilm.or.kr/poster/',
    )

    for _ in range(12):  # repeat 12 times
        try:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            posters = soup.find_all('span', style=re.compile(r'background-image'))
        except Exception as e:
            print(f"이미지를 찾는 중 오류 발생: {e}")
            return

        if not posters:
            print("No posters found on this page.")
            continue

        for span in posters:
            img_url = _poster_url_from_style(span.get('style'))

            # Skip spans without a URL and URLs that were already handled.
            if not img_url or img_url in selected_images:
                continue
            # Only poster images hosted on the film archive are of interest.
            if not img_url.startswith(poster_prefixes):
                continue

            print(f"Found image URL: {img_url}")
            selected_images.add(img_url)  # remember it to prevent duplicates

            try:
                _process_poster(img_url)
            except Exception as e:
                print(f"포스터 처리 실패 ({img_url}): {e}")
                # Only navigate back when the failure left us off the list
                # page; going back from the list itself would leave the site.
                if 'imageSearch' not in driver.current_url:
                    driver.back()

# Run the scraper. The browser is shut down in a finally block so that an
# exception raised while scraping does not leave a Chrome session running.
try:
    scrape_images()
finally:
    # Quit the driver
    driver.quit()


첫 번째 버튼 클릭 성공


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF79969B125+29573]
	(No symbol) [0x00007FF79960FF50]
	(No symbol) [0x00007FF7994CB6EA]
	(No symbol) [0x00007FF79951F815]
	(No symbol) [0x00007FF79951FA6C]
	(No symbol) [0x00007FF79956B917]
	(No symbol) [0x00007FF79954733F]
	(No symbol) [0x00007FF7995686BC]
	(No symbol) [0x00007FF7995470A3]
	(No symbol) [0x00007FF7995112DF]
	(No symbol) [0x00007FF799512441]
	GetHandleVerifier [0x00007FF7999CC76D+3377613]
	GetHandleVerifier [0x00007FF799A17B67+3685831]
	GetHandleVerifier [0x00007FF799A0CF8B+3641835]
	GetHandleVerifier [0x00007FF79975B2A6+816390]
	(No symbol) [0x00007FF79961B25F]
	(No symbol) [0x00007FF799617084]
	(No symbol) [0x00007FF799617220]
	(No symbol) [0x00007FF79960607F]
	BaseThreadInitThunk [0x00007FFCFA88257D+29]
	RtlUserThreadStart [0x00007FFCFBA4AF28+40]
