In [59]:
import os
from datetime import datetime
import time

from bs4 import BeautifulSoup
import pandas as pd
import requests
import re


# 크롤링
# Naver news section names mapped to the value of the `sid1` query parameter.
_SECTION_IDS = {
    "Political": "100",
    "Economic": "101",
    "Society": "102",
    "Culture": "103",
    "International": "104",
    "Science": "105",
}

# Title-only list page for one section / date / page number.
_LIST_URL = (
    "https://news.naver.com/main/list.nhn?mode=LSD&mid=sec&listType=title"
    "&sid1={sid}&date={date}&page={page}"
)

_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}

# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 10

# Default output root; one sub-directory per crawled date is created below it.
_DEFAULT_BASE_DIR = "C:\\Users\\cjy89\\NLP\\Project_news_crawling\\Naver\\"


def _parse_yyyymmdd(value):
    """Return `value` (int or str in YYYYMMDD form) as a datetime, or None if invalid."""
    text = str(value)
    # strptime tolerates un-padded fields, so enforce the 8-digit shape first.
    if len(text) != 8 or not text.isdigit():
        return None
    try:
        return datetime.strptime(text, "%Y%m%d")
    except ValueError:
        return None


def _crawl_section(section_id, date):
    """Fetch every list page of one section for one date and return a DataFrame.

    Parameters
    ----------
    section_id : str
        Value for the `sid1` query parameter.
    date : str
        Date in YYYYMMDD form.

    Returns
    -------
    pandas.DataFrame
        Columns "News" (headline text), "Url" (article link) and "Press".

    Raises
    ------
    requests.RequestException
        On timeout, connection failure or a non-2xx HTTP status.
    ValueError
        If a list box holds a different number of headlines and press names.
    """
    titles, links, presses = [], [], []
    prev_first_url = None
    page = 0

    while True:
        page += 1
        url = _LIST_URL.format(sid=section_id, date=date, page=page)
        resp = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        page_titles, page_links, page_presses = [], [], []
        for box in soup.find_all(class_="type02"):
            anchors = box.find_all(class_="nclicks(fls.list)")
            writers = box.find_all(class_="writing")
            if len(anchors) != len(writers):
                raise ValueError(
                    "headline/press count mismatch on " + url
                    + ": " + str(len(anchors)) + " vs " + str(len(writers))
                )
            page_titles.extend(a.get_text() for a in anchors)
            page_links.extend(a["href"] for a in anchors)
            page_presses.extend(w.get_text() for w in writers)

        # A page without any article would otherwise never trigger the
        # repeat check below and the loop would run forever.
        if not page_links:
            break

        # Stop when this page starts with the same article as the previous
        # one (presumably the site serves the last page again once `page`
        # runs past the end -- this mirrors the original stop condition).
        if page_links[0] == prev_first_url:
            break
        prev_first_url = page_links[0]

        titles.extend(page_titles)
        links.extend(page_links)
        presses.extend(page_presses)

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame({"News": titles, "Url": links, "Press": presses}).dropna(axis=0)


# Crawling
def crawling(start_date, end_date, base_dir=_DEFAULT_BASE_DIR):
    """Crawl Naver news headline lists for every calendar day in a date range.

    For each day from `start_date` to `end_date` (inclusive) and each section
    in `_SECTION_IDS`, the headline list is downloaded and written to
    `<base_dir>/<YYYYMMDD>/<Section>.csv` (UTF-8 with BOM, no index column).
    Per-section, total and average crawl times are printed.

    Parameters
    ----------
    start_date, end_date : int or str
        Dates in YYYYMMDD form, e.g. 20210128. If either one is not a valid
        8-digit calendar date, a message is printed and nothing is crawled.
        If `end_date` precedes `start_date`, nothing is crawled.
    base_dir : str, optional
        Root output directory. Defaults to the originally hard-coded path.

    Returns
    -------
    None

    Notes
    -----
    The process working directory is changed to each date directory, as in
    the original implementation, so code that relies on the working directory
    after this call keeps working.
    """
    first_day = _parse_yyyymmdd(start_date)
    last_day = _parse_yyyymmdd(end_date)
    if first_day is None or last_day is None:
        print("잘못된 날짜 입력")
        return

    # Resolve once: the chdir below must not shift a relative base_dir.
    base_dir = os.path.abspath(base_dir)

    # Iterate real calendar days so ranges may cross month/year boundaries.
    for date in pd.date_range(first_day, last_day).strftime("%Y%m%d"):

        # Accumulated crawl time for this date.
        total_crawling_time = 0
        print("Start Crawling :", date)

        # Output directory for this date.
        directory = os.path.join(base_dir, date)
        os.makedirs(directory, exist_ok=True)
        os.chdir(directory)

        for section, section_id in _SECTION_IDS.items():
            started = time.perf_counter()
            news_df = _crawl_section(section_id, date)
            elapsed = time.perf_counter() - started

            total_crawling_time += elapsed
            print("Crawling", date, section + " News :", elapsed)

            # Save the section as CSV using an explicit path.
            csv_path = os.path.join(directory, section + ".csv")
            news_df.to_csv(csv_path, sep=",", encoding='utf-8-sig', index=False)

        print(date, "total time :", total_crawling_time)
        print(date, "average Time :", total_crawling_time / len(_SECTION_IDS))
        print("────────────────────────────")

In [60]:
# Crawl all sections for the dates 20210128 through 20210131 (YYYYMMDD integers);
# per-section and per-date timings are printed below.
crawling(20210128, 20210131)

Start Crawling : 20210128
Crawling 20210128 Political News : 21.105687618255615
Crawling 20210128 Economic News : 50.68620252609253
Crawling 20210128 Society News : 103.61551976203918
Crawling 20210128 Culture News : 14.651144742965698
Crawling 20210128 International News : 51.60530757904053
Crawling 20210128 Science News : 9.947850227355957
20210128 total time : 251.6117124557495
20210128 average Time : 41.93528540929159
────────────────────────────
Start Crawling : 20210129
Crawling 20210129 Political News : 20.130454540252686
Crawling 20210129 Economic News : 59.637165546417236
Crawling 20210129 Society News : 63.29189658164978
Crawling 20210129 Culture News : 8.922179460525513
Crawling 20210129 International News : 26.052513599395752
Crawling 20210129 Science News : 6.4480140209198
20210129 total time : 184.48222374916077
20210129 average Time : 30.747037291526794
────────────────────────────
Start Crawling : 20210130
Crawling 20210130 Political News : 5.311671257019043
Crawling 20

In [61]:
# Crawl all sections for the dates 20210201 through 20210207 (YYYYMMDD integers);
# per-section and per-date timings are printed below.
crawling(20210201, 20210207)

Start Crawling : 20210201
Crawling 20210201 Political News : 33.56440854072571
Crawling 20210201 Economic News : 64.84268426895142
Crawling 20210201 Society News : 84.10801148414612
Crawling 20210201 Culture News : 10.280738592147827
Crawling 20210201 International News : 30.116729497909546
Crawling 20210201 Science News : 5.727240324020386
20210201 total time : 228.639812707901
20210201 average Time : 38.10663545131683
────────────────────────────
Start Crawling : 20210202
Crawling 20210202 Political News : 20.053532123565674
Crawling 20210202 Economic News : 40.57343244552612
Crawling 20210202 Society News : 68.28786706924438
Crawling 20210202 Culture News : 7.335300445556641
Crawling 20210202 International News : 23.737211227416992
Crawling 20210202 Science News : 5.618411540985107
20210202 total time : 165.60575485229492
20210202 average Time : 27.600959142049152
────────────────────────────
Start Crawling : 20210203
Crawling 20210203 Political News : 20.91770911216736
Crawling 202