In [29]:
import newspaper
from newspaper import Article, Config
from newsplease import NewsPlease
from IPython.core.debugger import set_trace
from IPython.display import display
import extraction
import requests
import json
import os
import time
import hashlib
import glob
import pandas as pd
import news_publishers
from langdetect import detect
from pubtime_extractor import extractArticlePublishedDate
import asyncio
from functools import partial

# Suppress UnknownTimezoneWarning
import warnings
from dateutil.parser import UnknownTimezoneWarning
warnings.filterwarnings('ignore', category=UnknownTimezoneWarning)

In [30]:
def _config():
    """Create the newspaper ``Config`` shared by the crawl helpers.

    Image fetching and article memoization are disabled, ``request_timeout``
    is set to 10 and the language is fixed to English.

    Returns:
        Config: a freshly constructed configuration object.
    """
    overrides = (
        ('fetch_images', False),
        ('memoize_articles', False),
        ('request_timeout', 10),
        ('language', 'en'),
    )

    cfg = Config()
    for option, value in overrides:
        setattr(cfg, option, value)

    return cfg

In [23]:
def clean_url(pub, url):
    """Normalize an article URL so the same article always yields one string.

    The steps are applied in this order:
      1. ``http://`` is rewritten to ``https://``.
      2. The fragment (``#...``) and anything after a newline are dropped.
      3. ``https://www.`` is shortened to ``https://`` (except for zdnet).
      4. One trailing ``/`` is removed (except for thinkprogress).
      5. Publisher-specific query suffixes are stripped.

    Args:
        pub (str): publisher key, e.g. ``'reuters'`` or ``'wsj'``.
        url (str): raw URL; may be empty.

    Returns:
        str: the normalized URL. Empty input (or input consisting only of a
        fragment) yields an empty string instead of raising ``IndexError``.
    """
    url = url.replace('http://', 'https://')

    # Keep only the part before the fragment / before an embedded newline.
    # split(sep, 1)[0] returns the whole string when sep is absent.
    url = url.split('#', 1)[0]
    url = url.split('\n', 1)[0]

    if pub != 'zdnet':
        # zdnet seems to require the www. prefix (2019.08.30)
        url = url.replace('https://www.', 'https://')

    # endswith() is safe on an empty string, unlike url[-1].
    if pub != 'thinkprogress' and url.endswith('/'):
        # thinkprogress seems to require the trailing slash (2019.09.04)
        url = url[:-1]

    if pub == 'reuters':
        # reuters often appends a meaningless ?il=0; purpose unknown (2019.09.04)
        url = url.split('?il=0', 1)[0]

    if pub in ('marketwatch', 'wsj'):
        url = url.split('?mod=', 1)[0]

    if pub == 'wsj':
        # wsj paywall workaround: always request the RSS-referrer variant.
        url += '?mod=rsswn'

    return url

In [24]:
class Progressor:
    """Single-line progress printer that rewrites itself via a leading ``\\r``.

    Each call to :meth:`stamp` counts one finished unit of work and prints
    the completed percentage together with the seconds elapsed since the
    instance was created.
    """

    def __init__(self, ntotal, formater_suffix=None):
        """
        Args:
            ntotal (int): number of ``stamp`` calls that corresponds to 100%.
            formater_suffix (str, optional): extra ``str.format`` template
                appended after ``': '``; its fields are filled from the
                keyword arguments passed to ``stamp``.
        """
        self.start = time.time()
        self.n_total = ntotal
        self.n_progressed = 0

        base_template = '\r{pct:.2f}% ({timestamp:.2f} seconds)'
        if formater_suffix:
            self.formater = base_template + ': ' + formater_suffix
        else:
            self.formater = base_template

    def stamp(self, **fields):
        """Count one more finished unit and reprint the progress line.

        Args:
            **fields: values for the placeholders of ``formater_suffix``.
        """
        self.n_progressed += 1
        elapsed = time.time() - self.start
        percent = self.n_progressed / self.n_total * 100
        line = self.formater.format(pct=percent, timestamp=elapsed, **fields)
        print(line, end='')

In [25]:
def collect_urls(src):
    """Collect candidate article URLs for every publisher in ``src``.

    ``newspaper.build`` is blocking, so each publisher is built in the event
    loop's default executor and the builds run concurrently.

    Args:
        src (dict): maps publisher key -> dict that has a ``'domain'`` entry.

    Returns:
        dict | None: maps publisher key -> set of cleaned URLs. A publisher
        whose build fails gets an empty set (the error is printed) rather
        than aborting the whole run. ``None`` is returned only when the
        event loop itself cannot be run.
    """
    prg = Progressor(len(src), formater_suffix='URLs collecting... {pub:<20}')
    build_source = partial(newspaper.build, config=_config())

    async def geturls(pub, domain):
        try:
            source = await loop.run_in_executor(None, build_source, domain)
            urls = {clean_url(pub, article.url) for article in source.articles}

        except Exception as ex:
            # Isolate the failure: without this, gather() re-raises and the
            # URLs of every other publisher are thrown away.
            print('\n{pub}: URL collection failed ({ex!r})'.format(pub=pub, ex=ex))
            urls = set()

        prg.stamp(pub=pub)
        return pub, urls

    async def main():
        fts = [asyncio.ensure_future(geturls(pub, val['domain'])) for pub, val in src.items()]
        return await asyncio.gather(*fts)

    result = None
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        # To run this inside Jupyter, tornado has to be downgraded:
        # pip install tornado==4.5.3
        result = dict(loop.run_until_complete(main()))

    except Exception as ex:
        print(ex)

    finally:
        loop.close()

    return result

In [26]:
def get_publish_time(article):
    """Return the article's publication time as a UTC ``pd.Timestamp``.

    Sources are tried in order and the first non-None value is used:
      1. ``article.publish_date``
      2. ``extractArticlePublishedDate(article.url)``
      3. ``NewsPlease.from_url(article.url).date_publish``

    Args:
        article: object exposing ``publish_date`` and ``url`` attributes.

    Returns:
        pd.Timestamp | None: timezone-aware timestamp in UTC. A naive value
        is localized as UTC; an aware value is converted to UTC. ``None``
        when no source yields a date or the value cannot be converted.
    """
    # NOTE: the print() calls inside the date extractor were silenced by
    # editing that package's code directly (2019.09.05), so no stdout
    # redirection is needed here.

    def _from_extractor(url):
        # Best effort: any extractor failure counts as "no date".
        try:
            return extractArticlePublishedDate(url)
        except Exception:
            return None

    def _from_newsplease(url):
        # Best effort: any news-please failure counts as "no date".
        try:
            return NewsPlease.from_url(url).date_publish
        except Exception:
            return None

    def _timize(value):
        # Coerce whatever the source returned into a UTC Timestamp.
        if value is None:
            return None

        try:
            stamp = pd.Timestamp(value)
            if stamp.tz is None:
                return stamp.tz_localize('utc')
            return stamp.tz_convert('utc')

        except Exception:
            return None

    pubtime = article.publish_date
    url = article.url

    for fallback in (_from_extractor, _from_newsplease):
        if pubtime is not None:
            break
        pubtime = fallback(url)

    return _timize(pubtime)

In [7]:
def select_urls(urls):
    """Keep only the URLs that have no JSON file on disk yet.

    A URL counts as already handled when ``<sha1(url)>.json`` exists in any
    of ``newsdata/saved/<first hex char>/``, ``newsdata/downloaded/`` or
    ``newsdata/trashed/<first hex char>/`` under the current working
    directory.

    Args:
        urls (dict): maps publisher key -> iterable of URL strings.

    Returns:
        dict: maps publisher key -> set of URLs not found on disk.
    """
    prg = Progressor(len(urls), formater_suffix='URLs selecting... {pub:<20}')
    basedir = os.path.join(os.getcwd(), 'newsdata')
    ext = '.json'

    def _already_stored(url):
        # True when any of the three storage locations holds this URL's file.
        digest = hashlib.sha1(url.encode('utf-8')).hexdigest()
        filename = digest + ext
        candidates = (
            os.path.join(basedir, 'saved', digest[0], filename),
            os.path.join(basedir, 'downloaded', filename),
            os.path.join(basedir, 'trashed', digest[0], filename),
        )
        return any(os.path.isfile(path) for path in candidates)

    selected = {}
    for pub, pub_urls in urls.items():
        selected[pub] = {url for url in pub_urls if not _already_stored(url)}
        prg.stamp(pub=pub)

    return selected

In [44]:
def download(urls):
    """Download, parse and store every URL as a JSON file.

    Publishers are processed concurrently (one coroutine each); the URLs of
    a single publisher are processed one after another. Blocking network
    calls run in the event loop's default executor.

    Each URL produces exactly one file named ``<sha1(url)>.json``:
      * ``newsdata/downloaded/`` when the article has text, a publish time,
        is long enough and is English;
      * ``newsdata/trashed/<first hex char>/`` otherwise, including when any
        step raises.

    Args:
        urls (dict): maps publisher key -> iterable of URL strings.

    Returns:
        dict | None: maps publisher key ->
        ``{'downloaded': set_of_urls, 'trashed': set_of_urls}``. ``None``
        when the event loop run itself fails (the error is printed).
    """
    n_total = sum(len(pub_urls) for pub_urls in urls.values())
    prg = Progressor(n_total, formater_suffix='downloading... {pub:<20}')
    basedir = os.path.join(os.getcwd(), 'newsdata')
    ext = '.json'
    newspaper_config = _config()
    # Titles that mean "no usable title" ('': cbc, '-': townhall).
    placeholder_titles = ('', '-', None)

    def makedir_if_not_exists(file):
        # Create the parent directory of ``file`` when it is missing.
        os.makedirs(os.path.dirname(file), exist_ok=True)

    def detect_lang(article):
        # Prefer the page's declared language; otherwise guess from the text.
        lang = article.meta_lang

        if lang == '':
            return detect(article.text)

        return lang

    def get_article(url):
        # Blocking fetch + parse; meant to be run in the executor.
        article = Article(url, config=newspaper_config)
        article.download()
        article.parse()
        return article

    def get_title(article, pub):
        # Title fallback chain: parsed title -> title extracted from a fresh
        # fetch of the page -> meta description -> publisher key.
        if article.title not in placeholder_titles:
            return article.title

        # Reuse the crawler's timeout so a stalled server cannot hang a worker.
        html = requests.get(article.url, timeout=newspaper_config.request_timeout).text
        extracted_title = extraction.Extractor().extract(html, source_url=article.url).title

        if extracted_title not in placeholder_titles:
            return extracted_title

        # The previous code read article.description / article.pub here; this
        # block reads the description as ``meta_description`` elsewhere and the
        # publisher key lives in ``pub``, so that fallback always raised and
        # the article ended up trashed.
        return article.meta_description or pub

    async def _download(pub, _urls):
        out = {'downloaded': set(), 'trashed': set()}

        for url in _urls:
            hash_url = hashlib.sha1(url.encode('utf-8')).hexdigest()
            downloaded_at = pd.Timestamp.utcnow()
            trash_file = os.path.join(basedir, 'trashed', hash_url[0], hash_url + ext)

            content = {
                'pub': pub,
                'url': url,
                'downloaded_at': str(downloaded_at)
            }

            try:
                article = await loop.run_in_executor(None, get_article, url)

                text = article.text
                language = detect_lang(article)
                published_at = await loop.run_in_executor(None, get_publish_time, article)
                is_too_short = (not article.is_valid_body()) and (len(article.text) < 500)

                content['title'] = await loop.run_in_executor(None, get_title, article, pub)
                content['language'] = language

                if text == '' or published_at is None or is_too_short or language != 'en':
                    file = trash_file
                    out['trashed'].add(url)

                else:
                    file = os.path.join(basedir, 'downloaded', hash_url + ext)
                    content['text'] = text
                    content['description'] = article.meta_description
                    content['authors'] = article.authors
                    # A favicon is not a usable top image.
                    content['top_image'] = article.top_image if article.top_image.split('.')[-1] != 'ico' else ''
                    # A publish time in the future is clamped to the download date.
                    content['published_at'] = str(published_at.date()) if published_at <= downloaded_at else str(downloaded_at.date())
                    out['downloaded'].add(url)

            except Exception:
                # Best effort: any failure sends the URL to the trash so it is
                # not retried by the next select step.
                file = trash_file
                out['trashed'].add(url)

            makedir_if_not_exists(file)
            with open(file, 'w') as f:
                json.dump(content, f)

            # Progress used to exceed 100% at times, apparently because the
            # total was computed from a set union; counting per publisher
            # seems to have fixed it (2019.09.27).
            prg.stamp(pub=pub)

        return pub, out

    async def main():
        fts = [asyncio.ensure_future(_download(pub, _urls)) for pub, _urls in urls.items()]
        return await asyncio.gather(*fts)

    result = None
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        result = dict(loop.run_until_complete(main()))

    except Exception as ex:
        # Report instead of swallowing: the caller receives None and would
        # otherwise fail later with no hint of the cause.
        print('\ndownload aborted: {!r}'.format(ex))

    finally:
        loop.close()

    return result

In [57]:
class NewsCrawler:
    """Runs the three crawl stages (collect -> select -> download) in order
    and turns each stage's result into pandas summaries."""

    def __init__(self):
        self.src = news_publishers.src

    def collect(self):
        '''
        URLs can overlap between publishers while collecting (e.g. foxnews and
        foxbusiness), so the total number of URLs and the size of their union
        may differ. The same holds for the select stage (2019.09.27)
        '''
        self.urls_collected = collect_urls(self.src)
        return self._results_sub(self.urls_collected)

    def select(self):
        '''Drop the collected URLs that are already stored on disk.'''
        self.urls_selected = select_urls(self.urls_collected)
        return self._results_sub(self.urls_selected)

    def download(self):
        '''
        URLs may overlap between publishers after collect/select, but each
        downloaded file is named after the hash of its URL, so every file
        holds exactly one unique URL. During the async download, content that
        arrives later overwrites the earlier file (2019.09.27)
        '''
        self.urls_final = download(self.urls_selected)
        return self._results_final(self.urls_final)

    def _summary_by_pubs(self, urls):
        '''Series: publisher -> number of URLs.'''
        counts = {}
        for pub, pub_urls in urls.items():
            counts[pub] = len(pub_urls)
        return pd.Series(counts)

    def _uniquenese(self, urls):
        '''Series with the total URL count and the count of distinct URLs.'''
        flat = [url for pub_urls in urls.values() for url in pub_urls]
        return pd.Series({'n_total': len(flat), 'n_unique': len(set(flat))})

    def _duplicates(self, urls):
        '''DataFrame indexed by URL listing the publishers that share it.'''
        # One column per publisher, one row per URL; a 1 marks membership.
        membership = pd.DataFrame.from_dict(
            {pub: dict.fromkeys(pub_urls, 1) for pub, pub_urls in urls.items()},
            orient='columns',
        )
        shared = membership[membership.sum(axis=1) != 1]
        pub_names = shared.columns

        pubs_by_url = {
            url: ', '.join(pub_names[flags == 1])
            for url, flags in shared.iterrows()
        }
        return pd.DataFrame.from_dict(pubs_by_url, orient='index', columns=['pubs'])

    def _results_sub(self, urls):
        '''Bundle the three summaries shown after collect() and select().'''
        uniqueness = self._uniquenese(urls)
        summary_by_pubs = self._summary_by_pubs(urls)
        duplicates = self._duplicates(urls)
        return uniqueness, summary_by_pubs, duplicates

    def _results_final(self, urls_final):
        '''DataFrame: one row per publisher, one column per download state.'''
        counts = {}
        for pub, by_state in urls_final.items():
            counts[pub] = {state: len(state_urls) for state, state_urls in by_state.items()}
        return pd.DataFrame.from_dict(counts, orient='index')

In [58]:
# Create the crawler; it takes its publisher table from news_publishers.src.
crawler = NewsCrawler()

In [59]:
# Stage 1: collect URLs per publisher, then show the total/unique counts,
# the per-publisher counts and the URLs shared by several publishers.
collect_results = crawler.collect(); display(*collect_results)

100.00% (54.15 seconds): URLs collecting... reuters             

n_total     10678
n_unique    10657
dtype: int64

huffpost              63
cnn                  825
investingcom         925
politico             227
time                  14
cnbc                 159
foxnews              198
foxbusiness          163
bbc                  276
businessinsider      616
morningstar           96
wsj                  107
nyt                  102
guardian             155
reuters              963
washingtontimes      461
washingtonpost       148
cbs                  406
marketwatch          140
atlantic             100
vice                  20
npr                  837
newrepublic           20
yahoo                235
independent          739
heritage             281
zdnet                416
townhall             620
abcnews               99
hotair                85
cbc                  296
nymag                246
thestreet            145
thinkprogress         82
dailybeast            42
realclearpolitics    371
dtype: int64

Unnamed: 0,pubs
https://bearingarms.com/cam-e/2019/09/25/former-cop-tells-congress-will-not-comply-gun-ban,"townhall, hotair"
https://foxbusiness.com/business-leaders/rockstar-energy-drink-creator-selling-florida-homes-for-over-70m,"foxnews, foxbusiness"
https://foxbusiness.com/economy/millennials-continue-to-flee-big-cities-for-the-suburbs,"foxnews, foxbusiness"
https://foxbusiness.com/economy/the-us-has-the-most-multi-millionaires-in-the-world,"foxnews, foxbusiness"
https://foxbusiness.com/media/how-does-disney-ceo-bob-igers-new-book-compare-to-other-business-tomes-here-are-10-best-sellers,"foxnews, foxbusiness"
https://foxbusiness.com/real-estate/wework-halts-new-leases-in-bid-to-cut-losses,"foxnews, foxbusiness"
https://foxnews.com/opinion/marc-thiessen-the-rough-transcript-makes-it-clear-that-democrats-got-ahead-of-the-evidence,"foxnews, realclearpolitics"
https://foxnews.com/opinion/newt-gingrich-speaker-pelosi-investigation-congress,"foxnews, realclearpolitics"
https://hotair.com/archives/john-s-2/2019/09/26/ny-times-whistleblower-cia-officer-spent-time-white-house,"townhall, hotair"
https://hotair.com/archives/karen-townsend/2019/09/26/brewing-company-honors-carson-king-puts-critics-shame,"townhall, hotair"


In [60]:
# Stage 2: keep only URLs with no file on disk yet; same three summaries.
select_results = crawler.select(); display(*select_results)

100.00% (1.15 seconds): URLs selecting... realclearpolitics   

n_total     235
n_unique    235
dtype: int64

huffpost              1
cnn                   8
investingcom         41
politico              1
time                  0
cnbc                  3
foxnews               4
foxbusiness           1
bbc                   5
businessinsider       8
morningstar           3
wsj                   3
nyt                   8
guardian              0
reuters              61
washingtontimes       4
washingtonpost        8
cbs                   3
marketwatch           4
atlantic              0
vice                  0
npr                   0
newrepublic           0
yahoo                51
independent           5
heritage              0
zdnet                 2
townhall              1
abcnews               1
hotair                1
cbc                   2
nymag                 0
thestreet             4
thinkprogress         0
dailybeast            2
realclearpolitics     0
dtype: int64

Unnamed: 0,pubs


In [61]:
# Stage 3: download the selected URLs; the table counts downloaded vs.
# trashed articles per publisher.
download_results = crawler.download(); download_results

100.00% (163.52 seconds): downloading... yahoo               

Unnamed: 0,downloaded,trashed
abcnews,1,0
atlantic,0,0
bbc,4,1
businessinsider,3,5
cbc,2,0
cbs,0,3
cnbc,3,0
cnn,3,5
dailybeast,0,2
foxbusiness,0,1


In [None]:
arynews.tv/en
afr.com
axios.com: ?utm_source=linkedin&utm_medium=lisocialshare&utm_campaign=organic
us.blastingnews.com
breitbart.com
dailymail.co.uk
business.financialpost.com
metro.co.uk
msnbc.com
nationalreview.com
news24.com
techcrunch.com



arstechnica.com: ?comments=1
euronews.com
mirror.co.uk
nbcnews.com
news.com.au
nextbigfuture.com
rt.com
theamericanconservative.com: ?print=1
thehill.com
thenextweb.com
telegraph.co.uk
timesofindia.indiatimes.com: ?utm_source=TOInewHP_TILwidget&utm_medium=ABtest&utm_campaign=TOInewHP
theverge.com
usatoday.com
global.chinadaily.com.cn
scmp.com
the-japan-news.com
japantoday.com
english.chosun.com
koreajoongangdaily.joins.com
arirang.com

ccn.com
cointelegraph.com
cryptocoin.news
cryptonews.com

In [141]:
# Scratch check: build a newspaper source for a single candidate domain.
articles = newspaper.build('https://cryptonews.com', config=_config())

In [142]:
# List the article URLs newspaper found for that domain.
articles.article_urls()

['https://cryptonews.com/news/bitcoin-news/',
 'https://cryptonews.com/news/ethereum-news/',
 'https://cryptonews.com/news/altcoin-news/',
 'https://cryptonews.com/news/blockchain-news/',
 'https://cryptonews.com/news/press-releases/',
 'https://cryptonews.com/news/sponsored/',
 'https://cryptonews.com/news/ico-news/',
 'https://cryptonews.com/news/surprise-state-owned-bank-says-bitcoin-price-is-too-low-4776.htm',
 'https://cryptonews.com/news/japanese-regulator-set-to-sideline-crypto-trusts-4775.htm',
 'https://cryptonews.com/news/bitcoin-price-and-altcoins-struggle-to-clear-key-resistances-4779.htm',
 'https://cryptonews.com/news/blame-these-traders-for-bitcoin-price-crash-4772.htm',
 'https://cryptonews.com/news/new-bitcoin-payment-solution-to-emerge-in-germany-after-bitp-4770.htm',
 'https://cryptonews.com/news/another-settlement-ico-that-raised-usd-120k-to-pay-sec-usd-2-4774.htm',
 'https://cryptonews.com/exclusives/7-biggest-misconceptions-about-bitcoin-picked-by-6-crypto-ex-4740

In [8]:
# Scratch check: fetch and parse one article with newspaper's default config.
aaa = Article('https://heritage.org/courts/report/overview-the-supreme-courts-2019-2020-term')
aaa.download()
aaa.parse()

In [9]:
# Inspect the extracted body text of the test article.
aaa.text

'The Supreme Court’s recently concluded 2018–2019 term will more likely be remembered for Justice Brett Kavanaugh’s confirmation hearings than any particular case the Court decided. It seems the justices wanted a low-profile term following the bruising confirmation, and they put off or denied review in many cases raising hot-button issues. The decisions that produced the most media attention and scrutiny—the political gerrymandering cases on direct appeal and the census case that was on a tight deadline—were ones that the Court could not ignore (either by statutory command or as a practical matter).\n\nIt is still too early to make sweeping statements about the impact of President Donald Trump’s nominees to the Court, though the rapid destruction of America their opponents foresaw has yet to occur. Justices Kavanaugh and Neil Gorsuch have, however, lived up to the chief justice’s declaration last fall that we do not have “Obama judges or Trump judges, Bush judges or Clinton judges.”REF

In [12]:
# Check which publish time get_publish_time resolves for the test article.
get_publish_time(aaa)