# 카카오 주간연재 웹툰 스크레이퍼

## 라이브러리

In [104]:
import matplotlib.pyplot as plt
import os
import pandas as pd
from PIL import Image
import pyderman
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
import time
from tqdm import tqdm
import urllib.request

## 카카오 주간연재 웹툰 정보 긁어오기
- https://webtoon.kakao.com/

In [44]:
def kakao_webtoon_scraper():
    """Collect the URL and webtoon number of every listed Kakao webtoon.

    Installs chromedriver through pyderman, opens the ``original-webtoon``
    and ``original-novel`` listing pages of webtoon.kakao.com (``tab=mon``),
    scrolls each page to the bottom once and gathers every anchor whose
    target starts with ``https://webtoon.kakao.com/content/``.

    Args:
        None.

    Returns:
        A tuple ``(ret, driver_path)``: ``ret`` is a list of
        ``(url, webtoon_number)`` string pairs, where the number is the last
        path segment of the URL, and ``driver_path`` is the driver path
        returned by ``pyderman.install``.
    """

    base_url = 'https://webtoon.kakao.com/'
    content_prefix = f'{base_url}content/'
    original_types = {'webtoon': '웹툰원작',
                      'novel': '소설원작'}

    driver_path = pyderman.install(browser=pyderman.chrome)
    # NOTE: the message says "geckodriver" although the Chrome driver is
    # installed; the text is kept as-is so it matches the recorded output.
    print(f'Installed geckodriver driver to path: {driver_path}')

    wd = webdriver.Chrome(driver_path)
    ret = []
    try:
        wd.implicitly_wait(10)
        time.sleep(1)

        for key in original_types:
            url = f'{base_url}original-{key}?tab=mon'
            wd.get(url)
            wd.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)

            for a_tag in wd.find_elements(By.CSS_SELECTOR, 'a'):
                href = a_tag.get_attribute('href')
                # Anchors without an href attribute yield None.
                if not href or not href.startswith(content_prefix):
                    continue
                # Take the whole last path segment instead of the last four
                # characters so IDs of any length are kept intact.
                number = href.rstrip('/').rsplit('/', 1)[-1]
                ret.append((href, number))
    finally:
        # This session is only needed for listing; close the browser even
        # when scraping fails so no Chrome process is left behind.
        wd.quit()
    return ret, driver_path

episode_url_with_num, driver_path = kakao_webtoon_scraper()

chromedriver is already installed.
Installed geckodriver driver to path: c:\Users\after\code\likelion-ai7\final-project\lib\chromedriver_108.0.5359.71.exe


In [None]:
def wheel_element(element, deltaY=120, offsetX=0, offsetY=0):
    """Send synthetic mouseover, mousemove and wheel events to an element.

    The injected JavaScript picks a point inside the element's bounding box
    (its centre along an axis when the corresponding offset is falsy), looks
    up the node at that point and, if that node is the element or one of its
    descendants, dispatches the three events on it.

    Args:
        element: Selenium web element; the script runs on ``element._parent``.
        deltaY: ``deltaY`` value of the dispatched wheel event.
        offsetX: Horizontal offset from the left edge of the bounding box.
        offsetY: Vertical offset from the top edge of the bounding box.

    Returns:
        None.

    Raises:
        WebDriverException: If the script reports that the node at the
            computed point does not belong to ``element``.
    """

    wheel_script = """
        var element = arguments[0];
        var deltaY = arguments[1];
        var box = element.getBoundingClientRect();
        var clientX = box.left + (arguments[2] || box.width / 2);
        var clientY = box.top + (arguments[3] || box.height / 2);
        var target = element.ownerDocument.elementFromPoint(clientX, clientY);

        for (var e = target; e; e = e.parentElement) {
        if (e === element) {
            target.dispatchEvent(new MouseEvent('mouseover', {view: window, bubbles: true, cancelable: true, clientX: clientX, clientY: clientY}));
            target.dispatchEvent(new MouseEvent('mousemove', {view: window, bubbles: true, cancelable: true, clientX: clientX, clientY: clientY}));
            target.dispatchEvent(new WheelEvent('wheel',     {view: window, bubbles: true, cancelable: true, clientX: clientX, clientY: clientY, deltaY: deltaY}));
            return;
        }
        }    
        return "Element is not interactable";
        """
    browser = element._parent
    failure = browser.execute_script(
        wheel_script, element, deltaY, offsetX, offsetY)
    # The script returns nothing on success and a message string on failure.
    if not failure:
        return None
    raise WebDriverException(failure)

def _is_episode_thumbnail(src, number):
    """Return True if ``src`` is a JPG URL under the webtoon's CDN prefix."""
    if not src:
        # <img> tags without a src attribute yield None.
        return False
    prefix = f'https://kr-a.kakaopagecdn.com/P/EO/{number}'
    return src.startswith(prefix) and src.endswith('jpg')


def get_thumbnail_url(url, number, driver):
    """Build a DataFrame describing every episode thumbnail of one webtoon.

    Args:
        url: Webtoon page URL.
        number: Webtoon number; used to recognise the webtoon's thumbnail
            URLs (``https://kr-a.kakaopagecdn.com/P/EO/{number}...jpg``).
        driver: Selenium WebDriver used to load and interact with the page.

    Returns:
        A DataFrame with the columns ``episode`` (the image ``alt`` text),
        ``url`` (the image ``src``), ``title`` and
        ``additional information`` (the non-empty info paragraphs joined
        with ``' - '``; in the notebook's sample output these are genre,
        view count and like count). An empty DataFrame is returned when the
        title element is missing or the page cannot be scrolled.
    """

    driver.get(url)

    try:
        selector = ('div.mx-20.flex.justify-between.relative.z-1'
                    '.pointer-events-auto.pt-12 > div > div > div > p')
        elements = driver.find_elements(By.CSS_SELECTOR, selector)
        # Read each text once; every access is a round trip to the browser.
        texts = [elem.text for elem in elements]
        addtl_info = [text for text in texts if text]

        # "\\!" keeps the literal backslash that escapes "!" in the CSS
        # class name without relying on an invalid Python escape sequence.
        selector = ('p.whitespace-pre-wrap.break-all.break-words'
                    '.support-break-word.overflow-hidden.text-ellipsis.\\!'
                    'whitespace-nowrap.s22-semibold-white.leading-33.mb-1')
        title = driver.find_element(By.CSS_SELECTOR, selector).text
    except NoSuchElementException:
        return pd.DataFrame()

    time.sleep(1)

    element = driver.find_element(By.CSS_SELECTOR, 'main')

    time.sleep(1)

    # Webtoons that are hard to access without logging in (e.g. adult-only
    # titles) cannot be scrolled; return an empty DataFrame for them.
    try:
        wheel_element(element, 120)
    except WebDriverException:
        return pd.DataFrame()

    time.sleep(1)

    element = driver.find_element(By.CSS_SELECTOR, 'body')

    # Presumably tabbing through the page moves focus down the episode list
    # so that more thumbnails get loaded - TODO confirm.
    for _ in range(500):
        element.send_keys(Keys.TAB)
        time.sleep(0.005)

    img_tags = driver.find_elements(By.CSS_SELECTOR, 'img')

    # Episode label and thumbnail URL of every image on the page.
    info = [[img_tag.get_attribute('alt'),
             img_tag.get_attribute('src')] for img_tag in img_tags]

    df = pd.DataFrame(info, columns=['episode', 'url'])
    df = df[df['episode'].astype(bool)]
    # Keep only this webtoon's JPG thumbnails; the explicit bool cast keeps
    # the mask a boolean indexer even when no rows are left.
    is_thumbnail = df['url'].map(
        lambda src: _is_episode_thumbnail(src, number)
    ).astype(bool)
    df = df[is_thumbnail]
    df['title'] = title
    df['additional information'] = ' - '.join(addtl_info)
    return df

In [1]:
# Scrape every collected webtoon with a single browser session and
# accumulate the per-webtoon frames into one DataFrame.
driver = webdriver.Chrome(driver_path)
df = pd.DataFrame()
try:
    for url, number in tqdm(episode_url_with_num):
        df_to_concat = get_thumbnail_url(url, number, driver)
        df = pd.concat([df, df_to_concat], axis=0)
        # Rewrite the CSV after every webtoon so progress survives a crash.
        df.to_csv('./kakao-webtoon.csv', index=False)
        print(f'DataFrame shape: {df.shape}')
finally:
    # Close the browser even when scraping is interrupted.
    driver.quit()

## 카카오 웹툰의 각 에피소드 썸네일 폴더 내 저장

In [129]:
# Reload the scraped episode table from disk and preview both ends of it.
df = pd.read_csv('./kakao-webtoon.csv')

print(df.shape)
for preview in (df.head(), df.tail()):
    display(preview)

(57285, 4)


Unnamed: 0,episode,url,title,additional information
0,62화,https://kr-a.kakaopagecdn.com/P/EO/2589/188649...,대사형 선유,"액션/무협 - 1,273.9만 - 39.1만"
1,61화,https://kr-a.kakaopagecdn.com/P/EO/2589/187214...,대사형 선유,"액션/무협 - 1,273.9만 - 39.1만"
2,60화,https://kr-a.kakaopagecdn.com/P/EO/2589/185864...,대사형 선유,"액션/무협 - 1,273.9만 - 39.1만"
3,59화,https://kr-a.kakaopagecdn.com/P/EO/2589/184741...,대사형 선유,"액션/무협 - 1,273.9만 - 39.1만"
4,58화,https://kr-a.kakaopagecdn.com/P/EO/2589/183720...,대사형 선유,"액션/무협 - 1,273.9만 - 39.1만"


Unnamed: 0,episode,url,title,additional information
57280,4화,https://kr-a.kakaopagecdn.com/P/EO/2629/147650...,미연시는 1회차로 족하다,3일마다 무료 - 로맨스 판타지 - 74.7만 - 1.4만
57281,3화,https://kr-a.kakaopagecdn.com/P/EO/2629/147649...,미연시는 1회차로 족하다,3일마다 무료 - 로맨스 판타지 - 74.7만 - 1.4만
57282,2화,https://kr-a.kakaopagecdn.com/P/EO/2629/147648...,미연시는 1회차로 족하다,3일마다 무료 - 로맨스 판타지 - 74.7만 - 1.4만
57283,1화,https://kr-a.kakaopagecdn.com/P/EO/2629/147647...,미연시는 1회차로 족하다,3일마다 무료 - 로맨스 판타지 - 74.7만 - 1.4만
57284,프롤로그,https://kr-a.kakaopagecdn.com/P/EO/2629/147597...,미연시는 1회차로 족하다,3일마다 무료 - 로맨스 판타지 - 74.7만 - 1.4만


In [None]:
def prune_special_characters(x):
    """Remove special characters from a string.

    Only digits, Hangul syllables, ASCII letters, spaces and parentheses are
    kept; leading and trailing whitespace is stripped afterwards.
    """
    disallowed = re.compile(r'[^0-9가-힣a-zA-Z ()]')
    cleaned = disallowed.sub('', x)
    return cleaned.strip()

def make_webtoon_folders(titles, base_path='./kakao_webtoon'):
    """Create one folder per webtoon title underneath ``base_path``.

    Folders that already exist are left untouched. ``base_path`` itself is
    created even when ``titles`` is empty.

    Args:
        titles: Iterable of webtoon titles (e.g. a list or a pd.Series).
        base_path: Directory that receives the per-title folders.

    Returns:
        None.

    Raises:
        FileExistsError: If a target path exists but is not a directory.
    """

    # exist_ok=True tolerates existing directories without also hiding the
    # case where a regular file occupies the path.
    os.makedirs(base_path, exist_ok=True)

    for title in titles:
        os.makedirs(f'{base_path}/{title}', exist_ok=True)
    return None

def save_thumbnail_file(row, base_path='./kakao_webtoon'):
    """Download every thumbnail listed in the Kakao webtoon table.

    Each image is saved as ``{base_path}/{title}/{episode}.jpg``; the title
    folders must already exist.

    Args:
        row: DataFrame with the columns ``url``, ``title`` and ``episode``
            (despite its name, the whole table is passed in by the caller).
        base_path: Directory that contains the per-title folders.

    Returns:
        None.
    """
    # Read from the argument rather than a module-level DataFrame so the
    # function works on whatever table the caller passes in.
    img_urls = row['url']
    titles = row['title']
    episodes = row['episode']

    file_names = base_path + '/' + titles + '/' + episodes + '.jpg'

    # zip() has no length, so give tqdm the total for a real progress bar.
    for img_url, file_name in tqdm(zip(img_urls, file_names),
                                   total=len(img_urls)):
        urllib.request.urlretrieve(img_url, file_name)
    return None

In [141]:
# Strip special characters from the columns that become folder and file
# names before touching the file system.
for column in ('title', 'episode'):
    df[column] = df[column].map(prune_special_characters)

# One folder per distinct title, then download every thumbnail into it.
titles = df['title'].unique().tolist()
make_webtoon_folders(titles, base_path='./kakao_webtoon')
save_thumbnail_file(df)

55399it [1:18:46, 14.83it/s]