<a href="https://colab.research.google.com/github/cksdlakstp12/chegyedan-computational-hanmadang/blob/main/Dalle4Wak_DataCrawling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import cv2
import numpy as np

import datetime
import time
import zipfile
import os
import glob
from threading import Thread

from selenium import webdriver
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import urllib.request

# from google.colab import drive
# drive.mount("/content/drive")

In [None]:
def str2float(text):
    """Convert a Korean-formatted count string into a float.

    Thousands separators are dropped and the unit suffixes "만" (x10,000)
    and "천" (x1,000) are expanded, e.g. ``"1,234"`` -> ``1234.0``,
    ``"1.5만"`` -> ``15000.0`` and ``"3천"`` -> ``3000.0``.

    Args:
        text: Raw text of a count cell.

    Returns:
        The parsed count as a float, or ``0.0`` when the text does not hold
        a plain non-negative number (empty cell, dash, unit without digits).
    """
    cleaned = text.strip().replace(",", "")

    # "만" is tested before "천"; the first unit found decides the scale.
    multiplier = 1
    for unit, factor in (("만", 10000), ("천", 1000)):
        if unit in cleaned:
            cleaned = cleaned.replace(unit, "").strip()
            multiplier = factor
            break

    # Accept digits with at most one decimal point. Anything else (including
    # "nan"/"inf", which float() would happily parse) counts as zero.
    if not cleaned.replace(".", "", 1).isdecimal():
        return 0.0
    return float(cleaned) * multiplier

In [None]:
class NoHeadDriver():
    """Base class holding a headless Chrome driver plus shared crawl helpers.

    Attributes set by ``__init__``:
        driver: the Selenium Chrome driver returned by ``init_driver``.
        css_selectors: dict of selector strings supplied by the subclass.
        name_to_url: dict mapping a channel nickname to the URL to crawl.
    """

    def __init__(self, css_selectors, name_to_url):
        self.driver = self.init_driver()

        self.css_selectors = css_selectors
        self.name_to_url = name_to_url

    def init_driver(self):
        """Create and return a single headless Chrome driver."""
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')

        # Chrome content-setting preference for notifications
        # (value 1 is presumably "allow" - confirm against Chrome prefs).
        prefs = {"profile.default_content_setting_values.notifications": 1}
        chrome_options.add_experimental_option("prefs", prefs)

        # Build exactly one driver and keep the options attached to it.
        # `options=` is used instead of the positional executable path and
        # `chrome_options=`, which newer Selenium releases no longer accept;
        # chromedriver is then looked up on PATH.
        return webdriver.Chrome(options=chrome_options)

    def scrolling_to_bottom(self):
        """Press END on the page body until ``window.scrollY`` stops changing."""
        print("Start scrolling...", end="\t")
        scroll_start_time = time.time()
        before_h = self.driver.execute_script("return window.scrollY")
        while True:
            # Jump to the bottom of the page.
            self.driver.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.END)
            time.sleep(1)  # give the page time to load more content
            after_h = self.driver.execute_script("return window.scrollY")
            if after_h == before_h:
                break
            before_h = after_h
        print("Scrolling end in :", time.time() - scroll_start_time)

    def move_to_page(self, page_url):
        """Load ``page_url`` and, if an "iframe" selector exists, enter that frame."""
        self.driver.get(page_url)
        self.driver.implicitly_wait(1)
        # Without switching into the iframe, lookups inside it raise
        # NoSuchElementException.
        if "iframe" in self.css_selectors:
            frame = self.driver.find_element(By.CSS_SELECTOR, self.css_selectors["iframe"])
            self.driver.switch_to.frame(frame)

    def file_compressing(self, dir):
        """Zip every entry directly inside ``dir`` into ``<last component>.zip``.

        The archive is written to the current working directory and entries
        are stored under their bare file names.

        Args:
            dir: Path of the directory to compress; a trailing separator is
                tolerated.
        """
        print(f"Start {dir} files compressing...", end="")
        download_start_time = time.time()

        # normpath drops a trailing separator so "a/b/" still yields "b.zip"
        # instead of an archive literally named ".zip".
        archive_name = f"{os.path.basename(os.path.normpath(dir))}.zip"
        with zipfile.ZipFile(archive_name, 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
            for file_name in os.listdir(dir):
                new_zip.write(os.path.join(dir, file_name), arcname=file_name)

        print(f" : done in {time.time() - download_start_time}")

In [None]:
class WakZooFanArtCrawler(NoHeadDriver):
    """Crawl fan-art images and their post titles from board list pages."""

    def __init__(self, channel_name_to_url, crawl_date_diff=None, visit_thres=0, likes_thres=0):
        '''
        channel_name_to_url -> dict
            - Maps a nickname (key) to the board URL (value).
            - The URL must contain the placeholder "changable_index" where
              the page number goes.
            ex)    {"gomem_fan_art":"https://www.-----", ---}

        crawl_date_diff -> int
            - How many days back from today to crawl.
            - When left at the default, every post is crawled.
            ex) 3 crawls posts dated up to 3 days before today.

        visit_thres, likes_thres -> int
            - Only posts whose view / like counts are at least these values
              are crawled.

        method run_crawler -> void
            - Pass a path as `dir` to store the data in that directory.

        '''
        css_selectors = {
            "date":"#main-area > div:nth-child(4) > table > tbody > tr:nth-child(changable_index) > td.td_date",
            "title":'#main-area > div:nth-child(4) > table > tbody > tr:nth-child(changable_index) > td.td_article > div.board-list > div > a.article',
            "article_inner_title":"#app > div > div > div.ArticleContentBox > div.article_header > div.ArticleTitle > div > h3",
            "visit":'#main-area > div:nth-child(4) > table > tbody > tr:nth-child(changable_index) > td.td_view',
            "likes":'#main-area > div:nth-child(4) > table > tbody > tr:nth-child(changable_index) > td.td_likes',
            "image":".se-image-resource",
            "iframe":'#cafe_main',
            "page_bar":"#main-area > div.prev-next"
        }
        super().__init__(css_selectors, channel_name_to_url)
        self.visit_thres = visit_thres
        self.likes_thres = likes_thres
        # Kept as an absolute cutoff datetime; None means "no date limit".
        self.crawl_date_diff = (
            None
            if crawl_date_diff is None
            else datetime.datetime.today() - datetime.timedelta(days=crawl_date_diff)
        )

    def run_crawler(self, dir=None):
        """Crawl every configured board, zip each output folder, release the driver.

        Args:
            dir: Optional output directory. When given it replaces the
                channel nickname as the folder name for every board.
        """
        try:
            for channel_name, channel_url in self.name_to_url.items():
                if dir is not None:
                    channel_name = dir

                os.makedirs(channel_name, exist_ok=True)
                try:
                    self.crawling(channel_name, channel_url)
                except NoSuchElementException:
                    # A missing row/page element ends this board's crawl.
                    print("페이지 수가 1000을 넘지 않습니다.")
                self.file_compressing(channel_name)

                print("number of files : ", len(os.listdir(f"{channel_name}")))
                print("="*50, end="\n\n")
        finally:
            # Always end the browser session, even if a board fails midway.
            self.driver.quit()

    def download_image_and_caption(self, folder_name, file_name):
        """Save each image of the open post with a caption file holding its title.

        Files are written as ``<folder_name>/<file_name>_img<k>.png`` and
        ``.txt``. Failures are printed and skipped (best effort per image).
        """
        images = self.driver.find_elements(By.CSS_SELECTOR, self.css_selectors["image"])
        if not images:
            return

        # The title is the same for every image of the post: read it once.
        try:
            title_text = self.driver.find_element(
                By.CSS_SELECTOR, self.css_selectors["article_inner_title"]
            ).text
        except NoSuchElementException as e:
            print(e)
            return

        for img_idx, img in enumerate(images):
            time.sleep(1)
            file_full_name = f"{file_name}_img{img_idx}"
            try:
                img_url = img.get_attribute("src")
                urllib.request.urlretrieve(img_url, f"{folder_name}/{file_full_name}.png")
                # Caption is written only after the image was stored, so a
                # failed download leaves no orphan caption; UTF-8 keeps
                # non-ASCII titles intact regardless of the platform default.
                with open(f"{folder_name}/{file_full_name}.txt", "w", encoding="utf-8") as f:
                    f.write(title_text)
            except Exception as e:  # deliberate best effort: report and go on
                print(e)

    def _row_element(self, key, table_idx):
        """Return the element for selector ``key`` in list row ``table_idx``."""
        selector = self.css_selectors[key].replace("changable_index", str(table_idx))
        return self.driver.find_element(By.CSS_SELECTOR, selector)

    @staticmethod
    def _parse_list_date(date_text):
        """Turn the date cell text of a list row into a datetime.

        Text of six characters or fewer is treated as "posted today"
        (presumably a time-of-day label - confirm); otherwise the text is
        parsed as ``YYYY.MM.DD.``.
        """
        if len(date_text) <= 6:
            return datetime.datetime.today()
        year, month, day = map(int, date_text[:-1].split("."))
        return datetime.datetime(year, month, day)

    def crawling(self, channel_name, page_url):
        """Walk the list pages of one board and download qualifying posts.

        Stops at the first post older than the configured cutoff, or after
        page 1000. A missing row raises NoSuchElementException to the caller.

        Args:
            channel_name: Output folder for the downloaded files.
            page_url: Board URL containing the "changable_index" placeholder.
        """
        print(f"Start {channel_name} image crawling...", end="")
        start_time = time.time()
        for page_idx in range(1, 1000 + 1):
            current_url = page_url.replace("changable_index", str(page_idx))
            self.move_to_page(current_url)  # open the list page to crawl

            reached_cutoff = False
            for table_idx in range(1, 15 + 1):  # list pages show 15 posts per table
                # Basic information of the row.
                date_text = self._row_element("date", table_idx).text
                title = self._row_element("title", table_idx)
                visit = str2float(self._row_element("visit", table_idx).text)
                likes = str2float(self._row_element("likes", table_idx).text)
                art_date = self._parse_list_date(date_text)

                if self.crawl_date_diff is not None and art_date < self.crawl_date_diff:
                    reached_cutoff = True
                    break

                # Filter by view and like counts.
                if visit < self.visit_thres or likes < self.likes_thres:
                    continue

                self.move_to_page(title.get_attribute('href'))  # enter the post
                # The row index is part of the name so that two posts on the
                # same page with the same date do not overwrite each other.
                self.download_image_and_caption(
                    folder_name=channel_name,
                    file_name=f"{art_date.date().isoformat()}_p{page_idx}_r{table_idx}",
                )
                self.move_to_page(current_url)  # back to the list page

            if reached_cutoff:
                break
        print(f" : done in {time.time() - start_time}")

In [None]:
class YouTubeThumbNailCrawler(NoHeadDriver):
    """Crawl video thumbnails and titles from YouTube channel video grids."""

    def __init__(self, channel_name_to_url, crawl_date_diff=None, visit_thres=0):
        '''
        channel_name_to_url -> dict
            - Maps a nickname (key) to the channel videos URL (value).
            ex)    {"gomem_fan_art":"https://www.-----", ---}

        crawl_date_diff -> int
            - How many days back from today to crawl.
            - When left at the default, every video is crawled.
            ex) 3 crawls videos published up to 3 days before today.
            Note) YouTube only shows coarse relative dates, so the cutoff
                  can be inexact.

        visit_thres -> number
            - Videos with fewer views than this are skipped.

        method run_crawler -> void
            - Pass a path as `dir` to store the data in that directory.

        '''
        # NOTE: despite the attribute name, these entries are XPath expressions.
        css_selectors = {
            "thumbnail":"/html/body/ytd-app/div[1]/ytd-page-manager/ytd-browse/ytd-two-column-browse-results-renderer/div[1]/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-grid-renderer/div[1]/ytd-grid-video-renderer[changable_index]/div[1]/ytd-thumbnail/a",
            "title":"/html/body/ytd-app/div[1]/ytd-page-manager/ytd-browse/ytd-two-column-browse-results-renderer/div[1]/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-grid-renderer/div[1]/ytd-grid-video-renderer[changable_index]/div[1]/div[1]/div[1]/h3/a",
            "visit":"/html/body/ytd-app/div[1]/ytd-page-manager/ytd-browse/ytd-two-column-browse-results-renderer/div[1]/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-grid-renderer/div[1]/ytd-grid-video-renderer[changable_index]/div[1]/div[1]/div[1]/div/div[1]/div[2]/span[1]",
            "elapsed_date":"/html/body/ytd-app/div[1]/ytd-page-manager/ytd-browse/ytd-two-column-browse-results-renderer/div[1]/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-grid-renderer/div[1]/ytd-grid-video-renderer[changable_index]/div[1]/div[1]/div[1]/div/div[1]/div[2]/span[2]"
        }
        super().__init__(css_selectors, channel_name_to_url)
        # Kept as an absolute cutoff datetime; None means "no date limit".
        self.crawl_date_diff = (
            None
            if crawl_date_diff is None
            else datetime.datetime.today() - datetime.timedelta(days=crawl_date_diff)
        )
        self.visit_thres = float(visit_thres)

    def run_crawler(self, dir=None):
        """Crawl every configured channel, zip each output folder, release the driver.

        Args:
            dir: Optional output directory. When given it replaces the
                channel nickname as the folder name for every channel.
        """
        try:
            for channel_name, channel_url in self.name_to_url.items():
                if dir is not None:
                    channel_name = dir

                print(f"Start {channel_name} image crawling...")
                start_time = time.time()

                os.makedirs(channel_name, exist_ok=True)
                self.move_to_page(channel_url)
                self.scrolling_to_bottom()
                self.crawling(channel_name)
                self.file_compressing(channel_name)

                print(f" : done in {time.time() - start_time}")
                print("number of files : ", len(os.listdir(f"{channel_name}")))
                print("="*50, end="\n\n")
        finally:
            # Always end the browser session, even if a channel fails midway.
            self.driver.quit()

    def _grid_element(self, key, idx):
        """Return the element for XPath ``key`` of the ``idx``-th grid video."""
        xpath = self.css_selectors[key].replace("changable_index", str(idx))
        return self.driver.find_element(By.XPATH, xpath)

    def crawling(self, channel_name):
        """Download thumbnail + title of each grid video on the loaded page.

        Iterates grid items from index 1 until one cannot be found. Videos
        tagged as Shorts or below the view threshold are skipped; the loop
        ends at the first video older than the configured cutoff.

        Args:
            channel_name: Output folder for the downloaded files.
        """
        idx = 0
        while True:
            idx += 1
            try:
                title = self._grid_element("title", idx).text
                image = self._grid_element("thumbnail", idx)
                visit = self._grid_element("visit", idx).text
                elapsed_date = self._grid_element("elapsed_date", idx).text
            except NoSuchElementException as e:
                print("no such element exception:", e)
                break

            if "#Shorts" in title or "#shorts" in title:
                continue
            # Use the threshold given to this instance, not a module global.
            if self.visit_thres > self.calc_visit_count_to_row(visit):
                continue
            if self.crawl_date_diff is not None:
                if self.calc_date_by_elapsed_date(elapsed_date) < self.crawl_date_diff:
                    break

            href = image.get_attribute("href")
            if not href:
                continue
            # Drop any extra query parameters that follow the video id.
            video_id = href.replace("https://www.youtube.com/watch?v=", "").split("&")[0]
            image_url = "https://i.ytimg.com/vi/" + video_id + "/hqdefault.jpg"

            file_name = datetime.date.today().isoformat()
            time.sleep(1)
            # NOTE(review): the source is a .jpg but is stored with a .png
            # suffix; kept as-is so existing consumers of the files still match.
            urllib.request.urlretrieve(image_url, f"{channel_name}/{file_name}_img{idx}.png")
            # UTF-8 keeps non-ASCII titles intact regardless of platform default.
            with open(f"{channel_name}/{file_name}_img{idx}.txt", "w", encoding="utf-8") as f:
                f.write(title)

    def calc_date_by_elapsed_date(self, elapsed_date):
        """Estimate a publish datetime from a relative label such as "3일 전".

        Units map to days as: 초/분/시간 -> 0, 일 -> 1, 주 -> 7, 개월 -> 30,
        년 -> 365.

        Args:
            elapsed_date: Relative date text containing a number directly
                followed by one of the units above.

        Returns:
            ``datetime.datetime.today()`` minus the estimated number of days.

        Raises:
            ValueError: If no "<number><unit>" pair is found in the text.
        """
        import re

        unit_to_dict = {
            "초":0,
            "분":0,
            "시간":0,
            "일":1,
            "주":7,
            "개월":30,
            "년":365
        }

        # Anchor the unit to the digits in front of it, so unrelated words in
        # the label cannot be mistaken for the unit.
        match = re.search(r"(\d+)\s*(초|분|시간|일|주|개월|년)", elapsed_date)
        if match is None:
            raise ValueError(f"unknown elapsed date format: {elapsed_date!r}")

        days = int(match.group(1)) * unit_to_dict[match.group(2)]
        return datetime.datetime.today() - datetime.timedelta(days=days)

    def calc_visit_count_to_row(self, visit):
        """Convert a view-count label such as "조회수 1.2만회" into a number.

        Args:
            visit: Label text; a leading "조회수" and a trailing "회" are
                removed when present, as are thousands separators.

        Returns:
            The view count as a float; ``0.0`` for the "없음" (none) label.

        Raises:
            ValueError: If the remaining text is not a number.
        """
        text = visit.strip()
        if text.startswith("조회수"):
            text = text[len("조회수"):]
        if text.endswith("회"):
            text = text[:-1]
        text = text.replace(",", "").strip()

        if "없음" in text:
            return 0.0

        unit_to_dict = {
            "천":1000,
            "만":10000,
            "억":100000000
        }
        for unit, count in unit_to_dict.items():
            if unit in text:
                return float(text.replace(unit, "")) * count
        return float(text)

In [None]:
# Minimum view / like counts used when filtering crawled items.
visit_thres, likes_thres = 1000, 100

# Board list URL; only the menu id differs between the fan-art boards.
# "changable_index" is the page-number placeholder filled in by the crawler.
_WAKZOO_BOARD_URL = (
    "https://cafe.naver.com/steamindiegame?iframe_url=/ArticleList.nhn"
    "%3Fsearch.clubid=27842958%26search.menuid={menu_id}%26search.boardtype=L"
    "%26search.totalCount=151%26search.cafeId=27842958%26search.page=changable_index"
)

# nickname -> board URL (insertion order is the crawl order)
wakzoo_name_to_url = {
    name: _WAKZOO_BOARD_URL.format(menu_id=menu_id)
    for name, menu_id in (
        ("isedol_fan_art", 344),
        ("gomem_fan_art", 299),
        ("hyung_fan_art", 59),
        ("geumson", 551),
    )
}

# nickname -> channel "videos" page URL (insertion order is the crawl order)
youtube_name_to_url = {
    name: f"https://www.youtube.com/{channel_path}/videos"
    for name, channel_path in (
        ("gosegu_youtube", "channel/UCV9WL7sW6_KjanYkUUaIDfQ"),
        ("lilpa_youtube", "channel/UC-oCJP9t47v7-DmsnmXV38Q"),
        ("viichan_youtube", "channel/UCs6EwgxKLY9GG4QNUrP5hoQ"),
        ("ine_youtube", "channel/UCroM00J2ahCN6k-0-oAiDxg"),
        ("jururu_youtube", "c/%EC%A3%BC%EB%A5%B4%EB%A5%B4"),
        ("jingberger_youtube", "c/%EC%A7%95%EB%B2%84%EA%B1%B0"),
        ("wakgoob_youtube", "user/woowakgood"),
        ("waktaverse", "c/welshcorgimessi"),
        ("gyeleug", "channel/UChCqDNXQddSr0ncjs_78duA"),
    )
}


In [None]:
# Thresholds are passed by keyword: the second positional parameter of
# WakZooFanArtCrawler is crawl_date_diff, so passing them positionally would
# turn the view threshold into a day count and the like threshold into the
# view threshold.
fan_art_crawler = WakZooFanArtCrawler(
    wakzoo_name_to_url,
    visit_thres=visit_thres,
    likes_thres=likes_thres,
)
fan_art_crawler.run_crawler()

# The view-count threshold is handed to the YouTube crawler explicitly.
youtube_thumbnail_crawler = YouTubeThumbNailCrawler(
    youtube_name_to_url,
    visit_thres=visit_thres,
)
youtube_thumbnail_crawler.run_crawler()