In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs
import pandas as pd
import os
import time

In [2]:
# Notebook-wide pandas display settings: show up to 1000 rows/columns before
# truncating, and render floats with two decimal places.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.set_option("display.float_format", "{:.2f}".format)

In [3]:
# Let webdriver_manager install (or reuse) a chromedriver binary, then wrap the
# returned path in a Selenium Service that later cells pass to webdriver.Chrome.
chromedriver_path = ChromeDriverManager().install()
service = Service(executable_path=chromedriver_path)

[WDM] - Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████| 6.29M/6.29M [00:20<00:00, 10.4MB/s]

In [26]:
def get_comments(service, df, lang='kr', productNo=3400123, max_pages=1000):
    """Scrape reader comments for one Naver Series novel into a DataFrame.

    Opens the product's detail page in Chrome, switches off the clean-bot
    comment filter, selects the "all comments" tab and then walks the comment
    pagination, collecting the date and text of every comment on each page.

    Args:
        service: Selenium Chrome ``Service`` used to start the browser.
        df: DataFrame the scraped rows are appended to; expected to have the
            columns ``productNo``, ``Date`` and ``Review``. It is not modified.
        lang: Site selector. Only ``'kr'`` (series.naver.com) is supported.
        productNo: Product number placed in the detail-page URL.
        max_pages: Maximum number of comment pages to visit; ``None`` removes
            the cap. Defaults to 1000, the limit that used to be hard-coded.

    Returns:
        A new DataFrame made of ``df`` followed by one row per scraped
        comment. ``Date`` holds the text exactly as shown on the page (recent
        comments are relative, e.g. "16분 전"), ``Review`` the comment text.

    Raises:
        ValueError: If ``lang`` is not ``'kr'``.
    """
    TIMEOUT = 1  # seconds Selenium keeps retrying element lookups

    # Fail before launching a browser; previously an unsupported language
    # surfaced as a NameError on the undefined URL with Chrome already open.
    if lang != 'kr':
        raise ValueError("Unsupported lang {!r}; only 'kr' is implemented.".format(lang))

    base_url = 'https://series.naver.com/novel/detail.series?'
    episode_url = base_url + 'productNo=' + str(productNo)

    driver = webdriver.Chrome(service=service)
    try:
        print(episode_url)
        driver.get(episode_url)
        driver.implicitly_wait(TIMEOUT)

        # Turn off the clean-bot comment filter to make collection easier:
        # it removes the "(클린봇이 부적절한 표현을 감지한 댓글입니다.)"
        # placeholder that otherwise stands in for flagged comments.
        for class_name in ('u_cbox_cleanbot_setbutton',
                           'u_cbox_layer_cleanbot2_checkbox',
                           'u_cbox_layer_cleanbot2_extrabtn'):
            driver.find_element(By.CLASS_NAME, class_name).click()
            time.sleep(0.4)

        # Click the "all comments" sort tab.
        driver.find_element(
            By.CSS_SELECTOR,
            '#cbox_module_wai_u_cbox_sort_option_tab2 > span.u_cbox_sort_label'
        ).click()
        time.sleep(1)

        num_reviews = driver.find_element(
            By.CSS_SELECTOR, '#cbox_module > div > div.u_cbox_head > span')
        print("num_reviews : ", num_reviews.text)

        print("Start crawling.")

        # The header shows the number of comments, not pages, so this is only
        # a safe upper bound on the page count. The crawl normally stops
        # earlier, via max_pages or when no next-page link is left.
        num_pages = int(num_reviews.text.replace(',', '')) + 1
        page_limit = num_pages if max_pages is None else min(max_pages, num_pages)

        # Collect per-page frames and concatenate once at the end; calling
        # pd.concat inside the loop re-copies every earlier row on each page.
        page_frames = []
        for page in range(page_limit):
            soup = bs(driver.page_source, "html.parser")

            # Extract dates
            dates = [tag.get_text() for tag in
                     soup.find_all("span", {"class": ["u_cbox_date"]})]

            # Extract reviews
            reviews = [tag.get_text() for tag in
                       soup.find_all("span", {"class": ["u_cbox_contents"]})]

            # Raises ValueError if the page yielded a different number of
            # dates and reviews, as the original column assignment did.
            df_page = pd.DataFrame({'Date': dates, 'Review': reviews})
            df_page.insert(0, 'productNo', int(productNo))
            page_frames.append(df_page)

            # Report progress every 100 pages (the old `% 100 == 100` test
            # could never be true, so nothing was ever printed).
            if (page + 1) % 100 == 0:
                print(page + 1, " page Done.")

            if page + 1 >= page_limit:
                break

            # Move to the next page: the link right after the highlighted
            # current-page marker. find_elements returns an empty list rather
            # than raising when there is no such link (last page reached).
            next_links = driver.find_elements(
                By.CSS_SELECTOR,
                "#cbox_module > div > div.u_cbox_paginate > div > strong + a")
            if not next_links:
                break
            next_links[0].click()
            time.sleep(0.2)

        return pd.concat([df, *page_frames])
    finally:
        # Always shut the browser and its chromedriver process down, including
        # on early exit or error; close() alone only closes the window.
        driver.quit()

# Create the empty frame that the scraped comments are appended to
df = pd.DataFrame(data=[], columns=['productNo','Date','Review'])

# Omniscient Reader's Viewpoint - ebook
# language setting
lang = 'kr'

# productNo setting
if lang == 'kr':
    productNo = 3400123
else:
    # Without this, an unsupported language left productNo undefined and the
    # failure only showed up later as a NameError.
    raise ValueError("No productNo configured for lang {!r}".format(lang))

# get all comments
df = get_comments(service, df, lang, productNo)

# Save dataframe
DATA_PATH = './data/'
# exist_ok avoids the separate exists() check and the race that comes with it
os.makedirs(DATA_PATH, exist_ok=True)
# Build the path from DATA_PATH so the directory created above is always the
# one written to (the literal './data/' used to be repeated here).
file_path = os.path.join(
    DATA_PATH,
    'omniscient-reader_Ebook_{lang}_{productNo}.csv'.format(lang=lang, productNo=productNo)
)
df.to_csv(file_path,index = False)
print("Save Done.")
df

https://series.naver.com/novel/detail.series?productNo=3400123
num_reviews :  696,416
Start crawling.
Save Done.


Unnamed: 0,productNo,Date,Review
0,3400123,16분 전,📌유중혁 엉덩이
1,3400123,28분 전,📌우리엘 경악 3번
2,3400123,48분 전,너무 슬프다 이번 화
3,3400123,1시간 전,주인공...간혹 고구마 먹이는거 짜증이네...
4,3400123,1시간 전,유중혁 욕 하는 거 왤케 섹시함
...,...,...,...
10,3400123,2022-08-03 12:55,중혁아 생일 축하한다아아악!!!!!!
11,3400123,2022-08-03 12:54,중혁아 생일축하해
12,3400123,2022-08-03 12:54,중혀가!!!!쌩일 축하한답!!!!!!!!
13,3400123,2022-08-03 12:53,중혁아 생일축하해
