In [62]:
import warnings
import time
import pandas as pd
import re
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup

# 크롬 드라이버 버전이 수시로 업데이트되는 관계로 드라이버 설치하여 사용
driver = webdriver.Chrome('../chromedriver.exe')
driver.maximize_window()

# Youtube URL List(확장자: .csv) 읽어오기 
input_data_youtube = pd.read_csv('../Input-data/Youtube_Content_URL_list.csv')
url_youtube = input_data_youtube['URL']
url_list_youtube = url_youtube.values.tolist()

# Naver Blog URL List(확장자: .csv) 읽어오기
input_data_naver_blog = pd.read_csv('../Input-data/Naver_Blog_Content_URL_list.csv')
url_naver_blog = input_data_naver_blog['URL']
url_list_naver_blog = url_naver_blog.values.tolist()

# Brunch URL List(확장자: .csv) 읽어오기
input_data_brunch = pd.read_csv('../Input-data/Brunch_Content_URL_list.csv')
url_brunch = input_data_brunch['URL']
url_list_brunch = url_brunch.values.tolist()

# 구독자 수 한글 표기 없이 숫자로만 표현 가능하게끔 하는 함수1
def convert_korean_to_number(korean_str):
    units = {'천': 1000, '만': 10000}
    number = float(re.search(r'[\d.]+', korean_str).group())
    unit_char = re.search(r'[천만]', korean_str)
    if unit_char:
        unit = units[unit_char.group()]
        number *= unit
    return int(number)

# 구독자 수 한글 표기 없이 숫자로만 표현 가능하게끔 하는 함수2
def extract_subscriber_count(input_str):
    pattern = r'구독자\s+([\d.천만]+)명'
    match = re.search(pattern, input_str)
    if match:
        subscriber_count = match.group(1)
        return convert_korean_to_number(subscriber_count)
    return None

# 필요한 데이터 리스트 초기화
id_list = []   # ID
platform_list = []   # 플랫폼 - 유튜브, 네이버 블로그, 브런치 구별
title_list = []   # 제목
publisher_list = []   # 게시자
subscriber_list = []   # 구독자 수
date_list = []   # 게시일
like_list = []   # 좋아요 수
comment_list = []   # 댓글 수
view_list = []   # 조회수

id = 1

# 유튜브 크롤링
for url in url_list_youtube[:]:
    driver.get(url)
    # 페이지가 로드될 때까지 최소 시간 대기
    time.sleep(2)
    # 현재 페이지의 HTML 가져오기
    page_source = driver.page_source
    # BeautifulSoup를 사용하여 HTML 파싱
    soup = BeautifulSoup(page_source, 'html.parser')
    
    # 플랫폼
    platform_list.append('youtube')
    
    # 제목
    try:
        title = soup.find('h1', class_ = 'style-scope ytd-watch-metadata')
        title_text = title.find('yt-formatted-string', class_ = 'style-scope ytd-watch-metadata').text
        title_list.append(title_text)
    except:
        title_list.append("No title found")
        
    # 게시자
    try:
        publisher = soup.find('ytd-channel-name', id = 'channel-name', class_ = 'style-scope ytd-video-owner-renderer')
        publisher_text = publisher.find('yt-formatted-string', class_ = 'style-scope ytd-channel-name complex-string').a.text
        publisher_list.append(publisher_text)
    except:
        publisher_list.append("No publisher found")

    # 구독자 수
    try:
        subscriber = soup.find('yt-formatted-string', id = 'owner-sub-count', class_ = 'style-scope ytd-video-owner-renderer')
        subscriber_int = extract_subscriber_count(subscriber.text)
        subscriber_list.append(subscriber_int)
    except:
        subscriber_list.append("No subscriber count found")
    
    # 좋아요 수
    try:
        like = soup.select('button.yt-spec-button-shape-next--icon-leading')[0]
        like_text = like.get('aria-label')
        like_int = re.sub(r'[^0-9]', '', like_text)
        like_list.append(like_int)
    except:
        like_list.append("No like count found")
    
    # 댓글 수
    driver.find_element_by_tag_name('html').send_keys(Keys.PAGE_DOWN)
    # 페이지가 로드될 때까지 최소 시간 대기
    time.sleep(1.5)
    try:
        comment = driver.find_element(By.CLASS_NAME, 'count-text.style-scope.ytd-comments-header-renderer')
        comment_int = comment.find_elements(By.TAG_NAME, 'span')[1].text
        comment_list.append(comment_int)
    # 댓글 사용 중지인 동영상의 경우 댓글 수 '-'로 처리
    except:
        comment_list.append('-')
        
    # 조회수
    try:
        view = soup.find('span', class_ = 'view-count style-scope ytd-video-view-count-renderer').text.strip()
        view_int = re.sub(r'[^0-9]', '', view)
        view_list.append(view_int)
    except:
        view_list.append("No view found")
    
    # '더보기' 클릭
    driver.find_element(By.ID, 'expand').click()

    # 페이지 새로고침
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    # 게시일
    try:
        date = soup.select('yt-formatted-string#info > span:nth-of-type(3)')[0]
        date_text = date.text.strip()
        date_list.append(date_text)
    except:
        date_list.append("No date found")

    # ID
    id_list.append(id)
    id += 1
    
# 네이버 블로그 크롤링
for url in url_list_naver_blog[:]:
    driver.get(url)
    driver.switch_to.frame('mainFrame')
    
    # 플랫폼
    platform_list.append('naver blog')
    
    # 제목
    try:
        title = driver.find_element_by_css_selector('.se-module.se-module-text.se-title-text')
    except:
        title = driver.find_element_by_css_selector('.pcol1')
    title_list.append(title.text)
    
    # 게시자
    publisher = driver.find_element_by_css_selector('.nick')
    publisher_list.append(publisher.text)
    
    # 게시일
    try:
        date = driver.find_element_by_css_selector('.se_publishDate.pcol2')
    except:
        date = driver.find_element_by_css_selector('.date.fil5.pcol2._postAddDate')
    month_dict = {'1.': '01.', '2.': '02.', '3.': '03.', '4.': '04.', '5.': '05.', '6.': '06.', '7.': '07.', '8.': '08.', '9.': '09.', '10.': '10.', '11.': '11.', '12.': '12.'}
    month = month_dict[date.text.split()[1]]
    date_text = f"{date.text.split()[0]} {month} {date.text.split()[2]}"  # 시간 정보 제거
    date_list.append(date_text)

    # 좋아요 수
    try:
        like = driver.find_element_by_css_selector('#floating_bottom > div > div > div.area_sympathy > a > div > span > em.u_cnt._count')
        if like.text == '':
            like_list.append('0')
        else:
            like_list.append(like.text)
    except:
        like_list.append('-')
    
    # 댓글 수
    try:
        comment = driver.find_element_by_css_selector('#commentCount')
        if comment.text == '':
            comment_list.append('0')
        else:
            comment_list.append(comment.text)
    except:
        comment_list.append('-')
    
    # 구독자 수(이웃 수)
    try:
        subscriber = driver.find_element_by_css_selector('#widget-stat > div > ul > li:nth-child(1) > em')
        subscriber_list.append(subscriber.text)
    except:
        try:
            driver.switch_to.frame('BuddyConnectIframe')
            driver.find_element_by_xpath('//*[@id="nc_frame1"]/ul/li[2]/a').click()
            subscriber = driver.find_element_by_css_selector('#nc_frame1 > div.wrap.bg_main > div.content.bg_main.tab2 > div.buddy_cnt > p > strong')
            subscriber_list.append(neighbor.text)
        except:
            subscriber_list.append('-')
    
    # 조회수
    view_list.append('-')
    
    # ID
    id_list.append(id)
    id += 1

# 브런치 크롤링
for url in url_list_brunch[:]:
    driver.get(url)
    
    # 플랫폼
    platform_list.append('brunch')
    
    # 제목
    try:
        title = driver.find_element_by_css_selector('body > div.service_contents.article_contents.\#post_view > div.wrap_view_article.wrap_article.article_view_disable_selection > div.wrap_cover > div > div.cover_cell.cover_direction_left > h1')
    except:
        try:
            title = driver.find_element_by_css_selector('body > div.service_contents.article_contents.\#post_view > div.wrap_view_article.wrap_article.article_view_disable_selection > div.wrap_cover.cover_type_text > div > div.cover_cell.cover_direction_center > h1')
        except:
            title = driver.find_element_by_css_selector('body > div.service_contents.article_contents.\#post_view > div.wrap_view_article.wrap_article.article_view_disable_selection > div.wrap_cover > div > div.cover_cell.cover_direction_center > h1')
    title_list.append(title.text)
    
    # 게시자
    publisher = driver.find_element_by_css_selector('#wrapArticleInfo > span.f_l.text_author.\#author > a')
    publisher_list.append(publisher.text)
    
    # 게시일
    date = driver.find_element_by_css_selector('#wrapArticleInfo > span.f_l.date')
    time.sleep(1)
    month_dict = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06', 'July': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
    month = month_dict[date.text.split()[0]]
    yymmdd = f"{date.text.split()[2]}. {month}. {date.text.split()[1]}"
    date_list.append(yymmdd)
    
    # 좋아요 수
    like = driver.find_element_by_css_selector('body > div.service_header.article.\#post_toolbar > div.wrap_inner > div.f_r > div > div.default_action_wrap.f_r > a.default_action.headerLikeBtn.img_ico_wrap.\#likeit > span.f_l.text_like_count.text_default.text_with_img_ico.ico_likeit_like.\#like')
    like_list.append(like.text)
    
    # 댓글 수
    try:
        comment = driver.find_element_by_css_selector('body > div.service_header.article.\#post_toolbar > div.wrap_inner > div.f_r > div > div.default_action_wrap.f_r > a.default_action.img_ico_wrap.comment.\#comment > span.f_l.text_comment_count.text_default.text_with_img_ico')
        if comment.text == '':
            comment_list.append('0')
        else:
            comment_list.append(comment.text)
    except:
        comment_list.append('-')
    
    # 구독자 수
    driver.find_element_by_xpath('//*[@id="wrapArticleInfo"]/span[2]/a').click()
    time.sleep(1)
    subscriber = driver.find_element_by_css_selector('#wrapHome > header > div.wrap_profile > div.wrap_profile_desc > dl > dd:nth-child(2) > a > span')
    subscriber_list.append(subscriber.text)
    
    # 조회수
    view_list.append('-')
    
    # ID
    id_list.append(id)
    id += 1

# 데이터프레임 생성
df = {'id': id_list, '플랫폼': platform_list, '제목': title_list, 'URL': url_list_youtube + url_list_naver_blog + url_list_brunch, '게시자': publisher_list, '구독자 수': subscriber_list, '게시일': date_list, '좋아요 수': like_list, '댓글 수': comment_list, '조회수': view_list}
output_data = pd.DataFrame(df)

today = "20231108"

# 추출된 데이터 .csv 파일로 저장
output_data.to_csv("../Output-data/merged_data/merged_data_"+ today +".csv", index = False, encoding = 'utf-8-sig')

  driver = webdriver.Chrome('../chromedriver.exe')
  driver.find_element_by_tag_name('html').send_keys(Keys.PAGE_DOWN)


NameError: name 'url_list_naver_blog' is not defined