In [2]:
import os
from datetime import datetime
import time

from bs4 import BeautifulSoup
import pandas as pd
import requests
import re


# Crawling
# Naver News section name -> value of the `sid1` query parameter.
_SECTION_IDS = {
    "Political": "100",
    "Economic": "101",
    "Society": "102",
    "Culture": "103",
    "International": "104",
    "Science": "105",
}
_NAVER_LIST_URL = (
    "https://news.naver.com/main/list.nhn?mode=LSD&mid=sec&listType=title"
    "&sid1={sid}&date={date}&page={page}"
)
_REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
_REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection blocks the crawl forever
_NEWS_COLUMNS = ["News", "Url", "Press"]
_DEFAULT_BASE_DIR = "C:\\Users\\cjy89\\Python_Study\\NLP\\Project_news_crawling\\Naver\\"


def _parse_yyyymmdd(value):
    """Return a ``datetime`` for an 8-digit YYYYMMDD value, or ``None`` if invalid."""
    text = str(value)
    # strptime alone would accept "2021011" (single-digit day), so the
    # length is checked explicitly.
    if len(text) != 8 or not text.isdigit():
        return None
    try:
        return datetime.strptime(text, "%Y%m%d")
    except ValueError:
        return None


def _crawl_section(section_id, date_text):
    """Download every list page of one section for one day.

    Returns a DataFrame with the columns News / Url / Press.
    """
    frames = []
    previous_first_url = None
    page = 0
    while True:
        page += 1
        url = _NAVER_LIST_URL.format(sid=section_id, date=date_text, page=page)
        resp = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        page_frames = []
        for box in soup.find_all(class_="type02"):
            links = box.find_all(class_="nclicks(fls.list)")
            if not links:
                continue
            page_frames.append(pd.DataFrame({
                "News": [link.get_text() for link in links],
                "Url": [link["href"] for link in links],
                "Press": [tag.get_text() for tag in box.find_all(class_="writing")],
            }))

        # A page without any article means there is nothing (more) to read;
        # stopping here also prevents an endless loop on an empty response.
        if not page_frames:
            break

        # A page that starts with the same article as the previous one is
        # treated as a repeat of the last page, i.e. the end of the listing.
        first_url = page_frames[0]["Url"].iloc[0]
        if first_url == previous_first_url:
            break
        previous_first_url = first_url
        frames.extend(page_frames)

    if not frames:
        return pd.DataFrame(columns=_NEWS_COLUMNS)
    # Concatenate once at the end instead of growing the frame page by page.
    return pd.concat(frames, axis=0, ignore_index=True).dropna(axis=0)


def crawling(start_date, end_date, base_dir=_DEFAULT_BASE_DIR):
    """Crawl Naver News headlines for every day in an inclusive date range.

    For each day and each section in ``_SECTION_IDS`` the title list pages are
    downloaded and saved as ``<base_dir>/<section>/<YYYYMMDD>.csv`` (UTF-8 with
    BOM, columns News / Url / Press). Timing information is printed per
    section and per day.

    Parameters
    ----------
    start_date, end_date : int or str
        First and last day to crawl, written as YYYYMMDD (e.g. ``20210111``).
        Real calendar dates are iterated, so ranges may span month or year
        boundaries. If ``start_date`` is after ``end_date`` nothing is crawled.
    base_dir : str, optional
        Directory under which one sub-directory per section is created.

    Returns
    -------
    None
        Prints "잘못된 날짜 입력" and returns without crawling when either
        argument is not a valid 8-digit calendar date.

    Raises
    ------
    requests.RequestException
        If a page cannot be downloaded (timeout, connection or HTTP error).

    Notes
    -----
    The process working directory is left unchanged; files are written through
    their full path.
    """
    first_day = _parse_yyyymmdd(start_date)
    last_day = _parse_yyyymmdd(end_date)
    if first_day is None or last_day is None:
        print("잘못된 날짜 입력")
        return

    # Crawl day by day.
    for day in pd.date_range(first_day, last_day, freq="D"):
        date = day.strftime("%Y%m%d")

        # Total crawling time for this day.
        total_crawling_time = 0
        print("Start Crawling :", date)

        # Crawl section by section.
        for section, section_id in _SECTION_IDS.items():
            # Output location for this section.
            directory = os.path.join(base_dir, section)
            os.makedirs(directory, exist_ok=True)

            # Measure the crawling time of this section.
            start = time.time()
            news_frame = _crawl_section(section_id, date)
            elapsed = time.time() - start
            total_crawling_time += elapsed
            print("Crawling", date, section + " News :", elapsed)

            # Save the collected rows as a CSV file.
            csv_path = os.path.join(directory, date + ".csv")
            news_frame.to_csv(csv_path, sep=",", encoding='utf-8-sig', index=False)

        print(date, "total time :", total_crawling_time)
        print(date, "average Time :", total_crawling_time / len(_SECTION_IDS))
        print("────────────────────────────")

In [3]:
# Crawl all six news sections for every day from 2021-01-11 through 2021-01-31
# (inclusive); crawling() writes one CSV per section per day and prints timings.
crawling(20210111, 20210131)

Start Crawling : 20210111
Crawling 20210111 Political News : 18.50643539428711
Crawling 20210111 Economic News : 33.10069179534912
Crawling 20210111 Society News : 54.16489219665527
Crawling 20210111 Culture News : 5.837535381317139
Crawling 20210111 International News : 18.766250610351562
Crawling 20210111 Science News : 5.2187159061431885
20210111 total time : 135.5945212841034
20210111 average Time : 22.5990868806839
────────────────────────────
Start Crawling : 20210112
Crawling 20210112 Political News : 15.833301067352295
Crawling 20210112 Economic News : 29.632349252700806
Crawling 20210112 Society News : 63.20691704750061
Crawling 20210112 Culture News : 7.904859781265259
Crawling 20210112 International News : 19.491421699523926
Crawling 20210112 Science News : 5.303873062133789
20210112 total time : 141.37272191047668
20210112 average Time : 23.56212031841278
────────────────────────────
Start Crawling : 20210113
Crawling 20210113 Political News : 17.089777946472168
Crawling 20

Crawling 20210129 Political News : 13.33326506614685
Crawling 20210129 Economic News : 29.11468744277954
Crawling 20210129 Society News : 41.922343492507935
Crawling 20210129 Culture News : 5.979896545410156
Crawling 20210129 International News : 19.765498638153076
Crawling 20210129 Science News : 4.9744672775268555
20210129 total time : 115.09015846252441
20210129 average Time : 19.181693077087402
────────────────────────────
Start Crawling : 20210130
Crawling 20210130 Political News : 3.2925662994384766
Crawling 20210130 Economic News : 3.38906192779541
Crawling 20210130 Society News : 9.136410474777222
Crawling 20210130 Culture News : 1.9827728271484375
Crawling 20210130 International News : 13.167180061340332
Crawling 20210130 Science News : 0.5834236145019531
20210130 total time : 31.55141520500183
20210130 average Time : 5.258569200833638
────────────────────────────
Start Crawling : 20210131
Crawling 20210131 Political News : 9.092550277709961
Crawling 20210131 Economic News : 1