In [55]:
!pip install selenium
!pip install webdriver_manager

Defaulting to user installation because normal site-packages is not writeable
Collecting selenium
  Downloading selenium-4.26.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting websocket-client~=1.8 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting PySocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]<3,>=1.26->selenium)
  Downloading PySocks-1.7.1-py3-none-any.wh

In [157]:
from pathlib import Path

import pandas as pd

# Empty template that fixes the column layout of the dictionary file:
# word / pinyin / meaning.
df = pd.DataFrame(columns = ['단어', '병음', '의미'])

# Create the CSV only when it does not exist yet. Writing it unconditionally
# would wipe every previously collected word each time this cell is re-run.
if not Path('./chinese_dic.csv').exists():
    df.to_csv('./chinese_dic.csv', encoding = 'utf-8-sig', header=True, index=False)

In [163]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager  # 자동으로 chromedriver 설치 및 관리
import time
import sys
# Redirect stdin to input.txt so the words to look up are read from that file.
# The encoding is given explicitly: the platform default is locale dependent
# (e.g. cp949 on Korean Windows) and would mis-decode UTF-8 Chinese text, and
# "utf-8-sig" also drops a leading BOM that would otherwise stick to the first word.
sys.stdin = open('./input.txt', 'r', encoding='utf-8-sig')
# NOTE: this shadows the built-in input(); each call returns one raw line
# (including the trailing newline) and '' at end of file.
input = sys.stdin.readline

# Selenium configuration
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # run without opening a browser window
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# webdriver_manager's ChromeDriverManager().install() supplies the chromedriver path
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

class Crawler():
    """Look up Chinese words on the Naver dictionary and collect them in a CSV file.

    For every word the crawler stores the word itself, its pinyin and its
    meaning in ``self.df`` (columns '단어', '병음', '의미') and writes the frame
    back to ``CSV_PATH`` at the end of :meth:`run`.

    The class relies on the module-level ``driver`` (a Selenium WebDriver) and,
    when no explicit word list is given, on the module-level ``input`` callable.
    """

    # Location of the dictionary CSV that is read and rewritten.
    CSV_PATH = './chinese_dic.csv'
    # Seconds to wait after navigating so the page can render its results.
    # Adjust according to network conditions.
    PAGE_LOAD_WAIT_SEC = 2

    def __init__(self, words=None):
        """Collect the words to look up.

        Args:
            words: optional iterable of words. When omitted, words are read one
                per line through ``input()`` until an empty line or end of input.
        """
        # Filled by load_original_file(); loaded lazily if the caller forgets.
        self.df = None
        if words is not None:
            self.word_list = [w.strip() for w in words if w and w.strip()]
            return

        self.word_list = []
        while True:
            try:
                word = input().strip()
            except EOFError:
                # The built-in input() signals end of input with an exception.
                break
            if not word:
                break
            self.word_list.append(word)

    def load_original_file(self):
        """Load the existing dictionary CSV into ``self.df``.

        Every column is read as text so that numeric-looking words still match
        the duplicate check. A missing file yields an empty dictionary.
        """
        try:
            self.df = pd.read_csv(self.CSV_PATH, encoding='utf-8-sig', dtype=str)
        except FileNotFoundError:
            self.df = pd.DataFrame(columns=['단어', '병음', '의미'])

    def _ensure_loaded(self):
        """Load the dictionary on first use if load_original_file() was not called."""
        if getattr(self, 'df', None) is None:
            self.load_original_file()

    def run(self):
        """Look up every collected word, then write the dictionary back to disk.

        The CSV is written in a ``finally`` block so that entries gathered
        before an unexpected error (e.g. a browser failure) are not lost.
        """
        self._ensure_loaded()
        try:
            for word in self.word_list:
                self.get_word_info(word)
        finally:
            self.df.to_csv(self.CSV_PATH, header=True, encoding = 'utf-8-sig', index=False)

    def get_word_info(self, word):
        """Fetch pinyin and meaning of ``word`` and append them to ``self.df``.

        Words already present in the dictionary are skipped without a lookup.
        Words for which the page shows no pinyin or meaning element are skipped
        with a message instead of raising, so one bad word cannot abort the run.
        """
        self._ensure_loaded()
        if self.df['단어'].isin([word]).any():
            return

        url = f"https://zh.dict.naver.com/#/search?query={word}"  # Naver dictionary search URL
        driver.get(url)

        # Fixed wait for the page to finish rendering. Only the URL fragment
        # changes between words, so waiting for the element to merely exist
        # could pick up the previous word's result.
        time.sleep(self.PAGE_LOAD_WAIT_SEC)

        # find_elements returns an empty list (instead of raising) when nothing matches.
        pinyin_nodes = driver.find_elements(By.CSS_SELECTOR, 'span.pronounce')
        meaning_nodes = driver.find_elements(By.CSS_SELECTOR, 'p.mean')
        if not pinyin_nodes or not meaning_nodes:
            print(f"[skip] no dictionary entry found for '{word}'")
            return

        # Pinyin is shown wrapped in square brackets; keep only the reading.
        pinyin = pinyin_nodes[0].text.replace('[', '').replace(']', '').strip()

        # Full text of the meaning paragraph, including any label spans.
        meaning_element = meaning_nodes[0]
        meaning_text = meaning_element.text

        # Strip the label spans (e.g. an idiom mark or the word class) from the
        # meaning. Only the first occurrence is removed so that the same text
        # appearing later inside the meaning itself is preserved.
        for selector in ('span.mark', 'span.word_class'):
            labels = meaning_element.find_elements(By.CSS_SELECTOR, selector)
            if labels:
                meaning_text = meaning_text.replace(labels[0].text, "", 1).strip()

        new_row = pd.DataFrame({'단어': [word], "병음": [pinyin], '의미': [meaning_text]})
        self.df = pd.concat([self.df, new_row], ignore_index = True)

In [164]:
# Build the crawler (its constructor reads the words to look up via input()),
# load the existing dictionary CSV, then look up each word and write the CSV back.
instance = Crawler()
instance.load_original_file()
instance.run()

In [165]:
# Shut down Selenium: quit the WebDriver created in the setup cell
driver.quit()