In [6]:
# Implemented by Daeyong Jin
# Naver environment news crawling and scraping
# Indexing and saving the contents to files

from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import os 
import time
from datetime import date, timedelta

class NaverNewsCrawler:
    """Index Naver news list pages and download the indexed articles.

    The crawler works in two steps:

    1. :meth:`indexing` walks the daily list pages of one Naver news section
       (``sid1=102``, ``sid2=252``; the file header calls it "environment
       news") and records, for every article, its date, title, the text of
       the matching ``span.writing`` tag, its URL and the local file it
       will be saved to.  The index is kept in ``self.idx_result_df`` and
       written to ``./result/indexing.txt``.
    2. :meth:`crawling_contents` downloads every indexed article that has
       not been saved yet and writes it to its file under ``./result/``.
    """

    # Column layout of ``idx_result_df`` and of the index file (no header row).
    INDEX_COLUMNS = ['date', 'title', 'news', 'page_url_list', 'file_list']
    RESULT_DIR = './result/'
    INDEX_FILE = './result/indexing.txt'
    LIST_URL = "http://news.naver.com/main/list.nhn?mode=LS2D&sid2=252&mid=shm&sid1=102"
    # Seconds to wait for an HTTP response before giving up, so a stalled
    # connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Create the output directory and an empty index."""
        self.idx_result_df = pd.DataFrame(columns=self.INDEX_COLUMNS)

        # Create the output directory if it does not exist yet.
        os.makedirs(self.RESULT_DIR, exist_ok=True)
        pd.set_option("display.max_colwidth", 1000)

    def _load_previous_index(self):
        """Return the saved index as a DataFrame, or None if there is none.

        A missing or empty index file is treated as "no previous index".
        All values are read as strings so that dates stay in ``YYYYMMDD``
        form and compare equal to freshly collected rows.
        """
        try:
            prev = pd.read_csv(self.INDEX_FILE, header=None, dtype=str,
                               keep_default_na=False)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            return None

        # Earlier versions of indexing() wrote the DataFrame index as an
        # extra leading column; drop it so those files can still be resumed.
        if prev.shape[1] == len(self.INDEX_COLUMNS) + 1:
            prev = prev.iloc[:, 1:]
        prev.columns = self.INDEX_COLUMNS
        return prev

    def indexing(self, start_date=None, end_date=None, append=True):
        """Collect article metadata for a date range and save the index.

        Args:
            start_date: First date to index (any format accepted by
                ``pandas.date_range``, e.g. ``'20050101'``).  When omitted,
                indexing resumes from the last date of the saved index
                (``append=True``) or starts at ``'20050101'``.
            end_date: Last date to index.  Defaults to yesterday.
            append: When true, rows already stored in the index file are
                kept and the new rows are added after them.

        The combined index is stored in ``self.idx_result_df`` and written
        to ``./result/indexing.txt`` without header or index column.
        """
        prev_idx_result_df = None

        # Read the existing index file when appending.
        if append:
            prev_idx_result_df = self._load_previous_index()
            # Resume from the last indexed date if no start date was given.
            if (start_date is None and prev_idx_result_df is not None
                    and not prev_idx_result_df.empty):
                start_date = str(prev_idx_result_df.iloc[-1, 0])

        # Default start date.
        if start_date is None:
            start_date = '20050101'

        # Default end date: collect up to yesterday.
        if end_date is None:
            end_date = str(date.today() - timedelta(1))

        dt_index = pd.date_range(start=start_date, end=end_date)
        dt_list = dt_index.strftime("%Y%m%d").tolist()

        max_page = 100  # pages 1 .. max_page - 1 are requested per date
        prev_html = ""
        rows = []

        for d in dt_list:
            print(d)
            article_num = 0
            for page in range(1, max_page):
                params = {'date': d, 'page': page}
                cur_html = requests.get(self.LIST_URL, params=params,
                                        timeout=self.REQUEST_TIMEOUT).text

                # Stop paging for this date once the page content no longer
                # changes compared with the previous request.
                if cur_html == prev_html:
                    break
                prev_html = cur_html

                soup = BeautifulSoup(cur_html, 'lxml')
                press_tags = soup.select('span.writing')

                # Position in press_tags of the next kept article.
                idx = 0
                for tag in soup.select('dt > a'):
                    title = tag.text.strip()
                    # Skip links without a title and video-article markers.
                    if title == "" or title == "동영상기사":
                        continue

                    link = tag['href']
                    file_path = './result/' + str(d) + "_" + str(article_num) + ".txt"
                    # Guard against a page with fewer span.writing tags than
                    # titles instead of aborting the whole run.
                    news = press_tags[idx].text if idx < len(press_tags) else ""
                    idx += 1
                    article_num += 1
                    rows.append([d, title, news, link, file_path])

        # Build a fresh frame so repeated calls never assign columns of a
        # different length onto an existing DataFrame.
        new_df = pd.DataFrame(rows, columns=self.INDEX_COLUMNS)

        frames = [frame for frame in (prev_idx_result_df, new_df)
                  if frame is not None and not frame.empty]
        combined = pd.concat(frames, axis=0, ignore_index=True) if frames else new_df

        # Resuming re-reads the last indexed date; keep only the first row
        # for each (date, URL) pair so that date is not stored twice.
        combined = combined.drop_duplicates(subset=['date', 'page_url_list'])
        # drop=True: the old index must not become a data column, otherwise
        # the saved file gains a column and cannot be re-read for appending.
        self.idx_result_df = combined.reset_index(drop=True)
        self.idx_result_df.to_csv(self.INDEX_FILE, header=False, index=False)

    @staticmethod
    def _parse_article(soup):
        """Return ``[title, date, content]`` extracted from an article page.

        Raises:
            IndexError: if the title, date or body element is not found.
        """
        title = soup.select('#articleTitle')[0].text.strip()
        published = soup.select('span.t11')[0].text.strip()
        content = soup.select('div #articleBodyContents')[0].text.strip()

        # The body may start with the text of an embedded <script>; remove it
        # only when it really is a prefix, so article text is never cut.
        # NOTE(review): whether script text appears in the body text may
        # depend on the Beautiful Soup version - confirm.
        scripts = soup.select('div #articleBodyContents script')
        if scripts:
            script_text = scripts[0].text.strip()
            if script_text and content.startswith(script_text):
                content = content[len(script_text):].strip()

        return [title, published, content]

    def crawling_contents(self):
        """Download every indexed article that has not been saved yet.

        Each article is written to its ``file_list`` path as a one-column
        CSV holding title, date and content.  When the page cannot be
        parsed, the indexed title and date are written with empty content.
        When the request itself fails, nothing is written so the article is
        retried on the next run.
        """
        url_list = self.idx_result_df['page_url_list'].tolist()
        file_path_list = self.idx_result_df['file_list'].tolist()
        title_list = self.idx_result_df['title'].tolist()
        date_list = self.idx_result_df['date'].tolist()

        sec = 10
        for i, (url, file_path) in enumerate(zip(url_list, file_path_list)):
            print(i)
            # Skip articles whose file already exists and is not empty.
            if os.path.isfile(file_path) and os.path.getsize(file_path) != 0:
                print("통과")
                continue

            # Pause on every 20th index to throttle the requests.
            if i % 20 == 0:
                time.sleep(sec)

            try:
                cur_html = requests.get(url, timeout=self.REQUEST_TIMEOUT).text.strip()
            except requests.RequestException as err:
                print("request failed: " + str(url) + " (" + str(err) + ")")
                continue

            soup = BeautifulSoup(cur_html, 'lxml')
            try:
                record = self._parse_article(soup)
            except IndexError:
                # Expected elements are missing: keep the indexed metadata.
                record = [title_list[i], date_list[i], ""]

            pd.DataFrame(record).to_csv(file_path, header=False, index=False)


In [None]:
#from NaverNewsCrawler import NaverNewsCrawler

# Build the index for 2005-01-01 through 2017-12-31 from scratch
# (append=False), then download every indexed article.
nc = NaverNewsCrawler()
nc.indexing(start_date='20050101', end_date='20171231', append=False)
nc.crawling_contents()

20050101
20050102
20050103
20050104
20050105
20050106
20050107
20050108
20050109
20050110
20050111
20050112
20050113
20050114
20050115
20050116
20050117
20050118
20050119
20050120
20050121
20050122
20050123
20050124
20050125
20050126
20050127
20050128
20050129
20050130
20050131
20050201
20050202
20050203
20050204
20050205
20050206
20050207
20050208
20050209
20050210
20050211
20050212
20050213
20050214
20050215
20050216
20050217
20050218
20050219
20050220
20050221
20050222
20050223
20050224
20050225
20050226
20050227
20050228
20050301
20050302
20050303
20050304
20050305
20050306
20050307
20050308
20050309
20050310
20050311
20050312
20050313
20050314
20050315
20050316
20050317
20050318
20050319
20050320
20050321
20050322
20050323
20050324
20050325
20050326
20050327
20050328
20050329
20050330
20050331
20050401
20050402
20050403
20050404
20050405
20050406
20050407
20050408
20050409
20050410
20050411
20050412
20050413
20050414
20050415
20050416
20050417
20050418
20050419
20050420
20050421
2

20070702
20070703
20070704
20070705
20070706
20070707
20070708
20070709
20070710
20070711
20070712
20070713
20070714
20070715
20070716
20070717
20070718
20070719
20070720
20070721
20070722
20070723
20070724
20070725
20070726
20070727
20070728
20070729
20070730
20070731
20070801
20070802
20070803
20070804
20070805
20070806
20070807
20070808
20070809
20070810
20070811
20070812
20070813
20070814
20070815
20070816
20070817
20070818
20070819
20070820
20070821
20070822
20070823
20070824
20070825
20070826
20070827
20070828
20070829
20070830
20070831
20070901
20070902
20070903
20070904
20070905
20070906
20070907
20070908
20070909
20070910
20070911
20070912
20070913
20070914
20070915
20070916
20070917
20070918
20070919
20070920
20070921
20070922
20070923
20070924
20070925
20070926
20070927
20070928
20070929
20070930
20071001
20071002
20071003
20071004
20071005
20071006
20071007
20071008
20071009
20071010
20071011
20071012
20071013
20071014
20071015
20071016
20071017
20071018
20071019
20071020
2

In [170]:
# Show the index DataFrame built by nc.indexing() as the cell's output.
nc.idx_result_df