In [4]:
import newspaper
from newspaper import Article, Config
from newsplease import NewsPlease
from IPython.core.debugger import set_trace
import extraction
import requests
import json
import os
import time
import hashlib
import glob
import pandas as pd
import news_publishers
from langdetect import detect
from pubtime_extractor import extractArticlePublishedDate
import asyncio
from functools import partial

# Suppress UnknownTimezoneWarning
import warnings
from dateutil.parser import UnknownTimezoneWarning
warnings.filterwarnings('ignore', category=UnknownTimezoneWarning)

In [None]:
def _config():
    """Build the newspaper ``Config`` shared by every crawl in this notebook.

    Returns:
        A fresh ``Config`` with image fetching and article memoization turned
        off, ``request_timeout`` set to 10 (presumably seconds -- confirm
        against the newspaper docs) and the language fixed to English.
    """
    overrides = {
        'fetch_images': False,
        'memoize_articles': False,
        'request_timeout': 10,
        'language': 'en',
    }

    cfg = Config()
    for option, value in overrides.items():
        setattr(cfg, option, value)
    return cfg

In [None]:
def clean_url(pub, url):
    """Normalize an article URL so one article always maps to one string.

    The rules are applied in this order: force the https scheme, drop the
    ``www.`` host prefix, drop a trailing slash, then apply the
    publisher-specific query-string clean-ups.

    Args:
        pub: Publisher key (e.g. 'zdnet', 'thinkprogress', 'reuters',
            'marketwatch', 'wsj'); selects the publisher-specific rules.
        url: Raw article URL as a string.

    Returns:
        The normalized URL. An empty string is returned unchanged.
    """
    def _cut_at(text, marker):
        # Drop `marker` and everything after it; no-op when it is absent.
        pos = text.find(marker)
        return text if pos < 0 else text[:pos]

    if url == '':
        # Nothing to normalize; also avoids indexing into an empty string.
        return url

    url = url.replace('http://', 'https://')

    if pub != 'zdnet':
        # zdnet apparently only works with the www prefix (2019.08.30)
        url = url.replace('https://www.', 'https://')

    if pub != 'thinkprogress' and url.endswith('/'):
        # thinkprogress apparently needs the trailing slash (2019.09.04)
        url = url[:-1]

    if pub == 'reuters':
        # reuters URLs very often carry a meaningless ?il=0 suffix;
        # its purpose is unknown (2019.09.04)
        url = _cut_at(url, '?il=0')

    if pub == 'marketwatch':
        url = _cut_at(url, '?mod=')

    if pub == 'wsj':
        # Work around the wsj paywall: replace any ?mod= query with ?mod=rsswn
        url = _cut_at(url, '?mod=')
        url += '?mod=rsswn'

    return url

In [None]:
def collect_urls(src):
    """Collect the current article URLs of every publisher in `src` concurrently.

    Each publisher's site is crawled with ``newspaper.build`` in the event
    loop's default executor, and every discovered article URL is normalized
    with ``clean_url``. Progress and the total elapsed time are printed.

    Args:
        src: Mapping of publisher key -> dict containing at least a 'domain'
            entry (the URL handed to ``newspaper.build``).

    Returns:
        dict mapping publisher key -> set of cleaned article URLs. A publisher
        whose crawl raised is reported on stdout and mapped to an empty set,
        so one bad site no longer discards everybody else's URLs.
        None if the event loop itself could not be run (the error is printed).
    """
    started = time.time()
    n = len(src)
    n_done = 0
    status = '\r{pct}% collected: {pub:<20}'
    build_source = partial(newspaper.build, config=_config())

    # Private loop: it is never installed as the thread's current loop, so no
    # closed loop is left behind for later asyncio users in this kernel.
    loop = asyncio.new_event_loop()

    def progress(pub):
        nonlocal n_done
        n_done += 1
        pct = '%.2f' % (n_done / n * 100)
        print(status.format(pub=pub, pct=pct), end='')

    async def geturls(pub, domain):
        try:
            resp = await loop.run_in_executor(None, build_source, domain)
            urls = {clean_url(pub, article.url) for article in resp.articles}
        except Exception as ex:
            # Best effort per publisher: report and keep going.
            print('\n{pub} failed: {ex}'.format(pub=pub, ex=ex))
            urls = set()

        progress(pub)
        return pub, urls

    async def main():
        tasks = [loop.create_task(geturls(pub, val['domain'])) for pub, val in src.items()]
        return await asyncio.gather(*tasks)

    result = None

    try:
        # Running this inside Jupyter requires downgrading tornado:
        # pip install tornado==4.5.3
        result = dict(loop.run_until_complete(main()))

    except Exception as ex:
        print(ex)

    finally:
        loop.close()

    print('\nURLs collected: {0:.2f} seconds'.format(time.time() - started))
    return result

In [5]:
def get_publish_time(article):
    """Return the article's publication time as a UTC ``pd.Timestamp``.

    Sources are tried in order until one yields a value: the date already
    present on ``article.publish_date``, then ``extractArticlePublishedDate``
    on the article URL, then ``NewsPlease.from_url(url).date_publish``.

    Args:
        article: Object exposing ``publish_date`` and ``url`` attributes.

    Returns:
        A timezone-aware ``pd.Timestamp`` in UTC (a naive value is localized
        as UTC, an aware one is converted), or None when no source produced
        a date or the value could not be converted.
    """
    # Note: the prints inside the date extractor used to be silenced from
    # here; the extractor's own code was changed instead (2019.09.05).

    def _from_extractor(url):
        try:
            return extractArticlePublishedDate(url)
        except Exception:
            # Best-effort fallback. `Exception` (not a bare except) so that
            # Ctrl-C / kernel interrupt during the fetch still propagates.
            return None

    def _from_newsplease(url):
        try:
            return NewsPlease.from_url(url).date_publish
        except Exception:
            # Same best-effort policy as _from_extractor.
            return None

    def _timize(value):
        if value is None:
            return None

        try:
            stamp = pd.Timestamp(value)
            if stamp.tz is None:
                return stamp.tz_localize('utc')
            return stamp.tz_convert('utc')
        except Exception:
            # Unparseable or unconvertible value: treat as "no date".
            return None

    pubtime = article.publish_date
    url = article.url

    if pubtime is None:
        pubtime = _from_extractor(url)

    if pubtime is None:
        pubtime = _from_newsplease(url)

    return _timize(pubtime)

In [None]:
def select(urls):
    """Keep only the URLs that have no stored JSON file yet.

    A URL counts as already handled when a file named ``<sha1 of url>.json``
    exists in any of these places under ``<cwd>/newsdata``:
    ``saved/<first hex char>/``, ``downloaded/`` or ``trash/<first hex char>/``.

    Args:
        urls: Mapping of publisher key -> iterable of URL strings.

    Returns:
        dict mapping each publisher key -> set of its URLs that are not
        stored in any of the three locations.
    """
    root = os.path.join(os.getcwd(), 'newsdata')
    suffix = '.json'

    def _is_stored(link):
        digest = hashlib.sha1(link.encode('utf-8')).hexdigest()
        filename = digest + suffix
        candidates = (
            os.path.join(root, 'saved', digest[0], filename),
            os.path.join(root, 'downloaded', filename),
            os.path.join(root, 'trash', digest[0], filename),
        )
        return any(os.path.isfile(path) for path in candidates)

    fresh = {}

    # NOTE(review): tqdm is not imported in the visible import cell --
    # confirm it is imported elsewhere in the notebook.
    for pub, links in tqdm(urls.items()):
        fresh[pub] = {link for link in links if not _is_stored(link)}

    return fresh