In [2]:
import newspaper
from newspaper import Article, Config
from newsplease import NewsPlease
from IPython.core.debugger import set_trace
import requests
import json
import time
import hashlib
import pandas as pd
import articleDateExtractor
from tqdm.autonotebook import tqdm
tqdm.pandas()

import os
import sys
parentdir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parentdir)

# import django
# django.setup()

# from data.models import Newsdata

In [3]:
config = Config()
config.fetch_images = False
config.memoize_articles = False
config.language = 'en'

In [4]:
src = {
    'huffpost':        {'domain':'https://huffpost.com',          'pubname':'HuffPost', 'datesrc':'newspaper'}, #.tz_convert('utc')
    'cnn':             {'domain':'https://cnn.com',               'pubname':'CNN', 'datesrc':'extractor'}, #.tz_convert('utc')
    'investingcom':    {'domain':'https://investing.com',         'pubname':'Investing.com', 'datesrc':'newsplease'}, 
    'politico':        {'domain':'https://politico.com',          'pubname':'POLITICO'}, 
    'time':            {'domain':'https://time.com',              'pubname':'TIME'}, 
    'cnbc':            {'domain':'https://cnbc.com',              'pubname':'CNBC'}, 
    'foxnews':         {'domain':'https://foxnews.com',           'pubname':'FOX News'}, 
    'foxbusiness':     {'domain':'https://foxbusiness.com',       'pubname':'FOX Business'}, 
    'bbc':             {'domain':'https://bbc.com',               'pubname':'BBC'}, 
    'businessinsider': {'domain':'https://businessinsider.com',   'pubname':'Business Insider'}, 
    'morningstar':     {'domain':'https://morningstar.com',       'pubname':'Morningstar'}, 
    'wsj':             {'domain':'https://wsj.com',               'pubname':'Wall Street Journal'}, # ?mod=rsswn
    
    'nyt':             {'domain':'https://nytimes.com',           'pubname':'NewYork Times'}, 
    'guardian':        {'domain':'https://theguardian.com',       'pubname':'Guardian'}, 
    'reuters':         {'domain':'https://reuters.com',           'pubname':'Reuters'}, 
    'washingtontimes': {'domain':'https://washingtontimes.com',   'pubname':'Washington Times'}, 
    'washingtonpost':  {'domain':'https://washingtonpost.com',    'pubname':'Washington Post'}, 
    'cbs':             {'domain':'https://cbsnews.com',           'pubname':'CBS'}, 
    'marketwatch':     {'domain':'https://marketwatch.com',       'pubname':'MarketWatch'}, 
    'atlantic':        {'domain':'https://theatlantic.com',       'pubname':'Atlantic'}, 
    'vice':            {'domain':'https://vice.com',              'pubname':'VICE'}, 
    'npr':             {'domain':'https://npr.org',               'pubname':'npr'}, 
    'newrepublic':     {'domain':'https://newrepublic.com',       'pubname':'NEW REPUBLIC'}, 
    'yahoo':           {'domain':'https://yahoo.com',             'pubname':'yahoo'}, 
    'independent':     {'domain':'https://independent.co.uk',     'pubname':'INDEPENDENT'}, 
    'heritage':        {'domain':'https://heritage.org',          'pubname':'Heritage'}, 
    'zdnet':           {'domain':'https://www.zdnet.com',         'pubname':'ZDNet'}, # 반드시 www가 붙어야함
    'townhall':        {'domain':'https://townhall.com',          'pubname':'Townhall'}, 
    'abcnews':         {'domain':'https://abcnews.go.com',        'pubname':'ABC News'}, 
    'hotair':          {'domain':'https://hotair.com',            'pubname':'HOT AIR'}, 
    'cbc':             {'domain':'https://cbc.ca',                'pubname':'CBC'}, 
    'nymag':           {'domain':'https://nymag.com',             'pubname':'NewYork Magazine'}, 
    'thestreet':       {'domain':'https://thestreet.com',         'pubname':'TheStreet'}, 
    'thinkprogress':   {'domain':'https://thinkprogress.org',     'pubname':'ThinkProgress'}, 
    'dailybeast':      {'domain':'https://thedailybeast.com',     'pubname':'DAILY BEAST'}, 
    'realclearpolitcs':{'domain':'https://realclearpolitics.com', 'pubname':'RealClear Politics'}, 
    
    #'forbes':          'https://forbes.com', 
    #'hbr':             'https://hbr.org', 
    #'ft':              'https://ft.com', 
    #'economist':       'https://economist.com', 
}

In [5]:
def clean_url(pub, url):
    url = url.replace('http://', 'https://')
    
    if pub!='zdnet':
    # zdnet은 반드시 www가 붙어야되는 듯 (2019.08.30)
        url = url.replace('https://www.', 'https://')
    
    if url[-1]=='/':
        url = url[:-1]
    
    if pub=='wsj':
        try: url = url[:url.index('?mod=')]
        except: pass
        
        url += '?mod=rsswn'
        
    return url

In [6]:
import asyncio
from functools import partial

def collect_urls(src):
    s = time.time()
    n = len(src)
    n_done = [0]
    status = '\r{pct}% completed: {pub:<20}'
    #status = '\r{pct}% completed: {pub}' + ' '*20
    
    def progress(pub):
        n_done[0] += 1  # 그냥 n_done으로는 외부에서 변수값을 바꿀수 없으므로
        pct = '%.2f' % (n_done[0] / n * 100)
        print(status.format(pub=pub, pct=pct), end='')
        
    
    async def geturls(pub, domain):
        newspaper_config = partial(newspaper.build, config=config)
        resp = await loop.run_in_executor(None, newspaper_config, domain)
        articles = resp.articles
        urls = {clean_url(pub, article.url) for article in articles}
        progress(pub)
        return pub, urls


    async def main():
        fts = [asyncio.ensure_future(geturls(pub, val['domain'])) for pub, val in src.items()]
        return await asyncio.gather(*fts)


    result = None
    asyncio.set_event_loop(asyncio.new_event_loop())
    loop = asyncio.get_event_loop()

    try:
        # 다음 코드를 주피터에서 돌리려면, tornado를 downgrade 해야함
        # pip install tornado==4.5.3
        result = loop.run_until_complete(main())
        result = dict(result) #set.union(*result)

    except Exception as ex:
        print(ex)

    finally:
        loop.close()

    print('\nDONE: {0:.2f} seconds'.format(time.time() - s))
    return result

In [86]:
urls = collect_urls(dict(list(src.items())[5:6])); 
print(len(urls[list(urls.keys())[0]]))
urls

100.00% completed: cnbc                
DONE: 11.29 seconds
162


{'cnbc': {'https://buffett.cnbc.com/2018/04/01/warren-buffetts-bumpy-ride-with-the-airline-industry.html',
  'https://buffett.cnbc.com/2018/06/26/buffett-a-z-bitcoinand-cryptocurrencies.html',
  'https://buffett.cnbc.com/2018/07/02/buffett-on-america.html',
  'https://buffett.cnbc.com/2018/08/15/warren-watch.html',
  'https://buffett.cnbc.com/video/2018/03/29/buffett-how-kids-can-make-money.html',
  'https://buffett.cnbc.com/video/2018/05/07/warren-buffett-investor-teacher-icon-documentary.html',
  'https://buffett.cnbc.com/video/2018/09/10/buffett-remembers-the-2008-financial-crisis.html',
  'https://buffett.cnbc.com/video/2019/05/06/buffetts-politics-im-a-card-carrying-capitalist.html',
  'https://buffett.cnbc.com/video/2019/05/06/warren-buffett-dont-go-overboard-on-delayed-gratification.html',
  'https://cnbc.com/2018/02/27/who-is-eve-jobs-steve-jobs-youngest-daughter.html',
  'https://cnbc.com/2018/04/12/regulatory-scrutiny-could-end-up-helping-facebook-analyst.html',
  'https://cn

In [78]:
def get_publish_date(article):
    pubtime = article.publish_date
    url = article.url
    datesrc = 'newspaper'
    
    #_from_extractor = lambda url: articleDateExtractor.extractArticlePublishedDate(url)
    #_from_newsplease = lambda url: NewsPlease.from_url(url).date_publish
    
    def _from_extractor(url):
        try:
            return articleDateExtractor.extractArticlePublishedDate(url)
        
        except:
            return None
    
    
    def _from_newsplease(url):
        try:
            return NewsPlease.from_url(url).date_publish
        
        except:
            return None
    
    
    def _datize(time):
        if time is None:
            return ''
        
        time = pd.Timestamp(time)

        if time.tz is None:
            return str(time.date())

        else:
            return str(time.tz_convert('utc').date())
            
            
    if pubtime is None:
        datesrc = 'extractor'
        pubtime = _from_extractor(url)
        
        if pubtime is None:
            datesrc = 'newsplease'
            pubtime = _from_newsplease(url)
            
            if pubtime is None:
                datesrc = 'fail'
                pubtime = None
                    
    return _datize(pubtime), datesrc

In [89]:
url = 'https://huffpost.com/entry/13-amazing-photos-you-missed-this-week_n_5d6988e1e4b09bbc9eeefb8f'
url = 'https://cnn.com/2019/08/26/australia/australia-china-yang-hengjun-intl-hnk/index.html'
url = 'https://cnn.com/style/article/sa-designer-partners-with-hm/index.html'
url = 'https://bleacherreport.com/articles/2799967-lesean-mccoy-reportedly-released-by-bills-after-4-seasons-with-team?utm_source=cnn.com&utm_campaign=editorial&utm_medium=referral'
url = 'https://investing.com/news/stock-market-news/market-fragility-on-show-as-trade-war-china-data-curb-optimism-1967972'
url = 'https://politico.com/agenda/story/2019/08/29/appalachian-trail-dominion-energy-000943'
url = 'https://politico.eu/article/a-no-matteo-salvini-alliance-is-italy-best-hope-league-5star-movement-democratic-party'
url = 'https://politico.eu/sponsored-content/whats-needed-to-implement-the-paris-agreement-shells-ceo-shares-his-views'
url = 'https://arte.tv/en/videos/083967-024-A/re-brexit-chaos-in-the-caribbean' #
url = 'https://labs.time.com/2015/08/07/the-one-word-each-republican-candidate-wants-you-to-remember'
url = 'https://labs.time.com/story/is-it-fall-yet' #
url = 'https://labs.time.com/story/watch-how-the-drug-overdose-epidemic-spread-in-america' #
url = 'https://time.com/5622094/what-is-niksen'
url = 'https://time.com/magazine/us/5658416/september-2nd-2019-vol-194-no-8-u-s'
url = 'https://buffett.cnbc.com/2018/04/01/warren-buffetts-bumpy-ride-with-the-airline-industry.html'
url = 'https://cnbc.com/2018/04/12/regulatory-scrutiny-could-end-up-helping-facebook-analyst.html'
url = 'https://cnbc.com/definitive-guide-to-buying-your-first-home'
article = Article(url)
article.download()
article.parse()
get_publish_date(article)

('2017-07-19', 'newspaper')

In [90]:
url = 'https://cnbc.com/definitive-guide-to-buying-your-first-home'
article = Article(url)
article.download()
article.parse()
article.publish_date

datetime.datetime(2017, 7, 19, 19, 13, 17, tzinfo=tzutc())

In [92]:
article.text

'How much you need to earn to buy a home on the West Coast'

In [85]:
np = NewsPlease.from_url('https://labs.time.com/story/watch-how-the-drug-overdose-epidemic-spread-in-america')

RemoteDisconnected: Remote end closed connection without response

In [41]:
np.date_publish

datetime.datetime(2019, 8, 31, 9, 53, 30)

In [69]:
%%time
articleDateExtractor.extractArticlePublishedDate('https://labs.time.com/story/watch-how-the-drug-overdose-epidemic-spread-in-america')

Extracting date from https://labs.time.com/story/watch-how-the-drug-overdose-epidemic-spread-in-america
Wall time: 2.31 s


In [83]:
title = "Watch How the Drug Overdose Epidemic Spread in America"
url = 'https://newsapi.org/v2/everything?qInTitle="{title}"&apiKey=bfb917bca91c435d9c7efd247d95227b&pageSize=100'.format(title=title)
resp = requests.get(url)
resp.json()

{'status': 'ok', 'totalResults': 0, 'articles': []}

In [84]:
headers = {"Authorization":"sBBqsGXiYgF0Db5OV5tAw-akap7JBapGCe67Tuuz0lhquXd0k1oQC4B5B91GiHcVn2pHZrSf1gT2PUujH1YaQA"}
endpoint = 'https://api.newsriver.io/v2/search?'
title = "Watch How the Drug Overdose Epidemic Spread in America"
qry = 'query=title:"{title}"&sortOrder=DESC&limit=15'.format(title=title)
# qry = 'query=language:en AND website.domainName:"wsj.com"&sortOrder=DESC&limit=15'
url = endpoint + qry
response = requests.get(url, headers=headers)
out = response.json();out

[]

In [62]:
qry

'query=title:"How a game of rock, paper, scissors decided a $20 million auction consignment"&sortOrder=DESC&limit=15'

In [79]:
def get_download_urls(urls):
    download_urls = {}
    basedir = os.path.join(os.getcwd(), 'newsdata')
    ext = '.json'

    for pub, _urls in tqdm(urls.items()):
        download_urls[pub] = {}
        pubdir = os.path.join(basedir, pub)

        if not os.path.isdir(pubdir):
            os.makedirs(pubdir)

        for _url in _urls:
            hash_url = hashlib.sha1(_url.encode('utf-8')).hexdigest()
            file = os.path.join(pubdir, hash_url + ext)

            if not os.path.isfile(file):
                download_urls[pub][file] = _url
                
    return download_urls

In [80]:
download_urls = get_download_urls(urls)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




In [81]:
download_urls

{'huffpost': {'C:\\Users\\Sejin\\Documents\\GitHub\\protobed\\etc\\newsdata\\huffpost\\29b48cfa282143b5b56fc5494e9ae1aa02d35f5b.json': 'https://huffpost.com/entry/labor-day-2019-sales-on-tvs-furniture-and-appliances_l_5d66b90de4b022fbceb515f6',
  'C:\\Users\\Sejin\\Documents\\GitHub\\protobed\\etc\\newsdata\\huffpost\\0607064aca7aa47f99df3eb85588d1923068da77.json': 'https://huffpost.com/entry/nfl-ryan-russell-comes-out-bisexual_n_5d6838a7e4b02bc6bb36bc37',
  'C:\\Users\\Sejin\\Documents\\GitHub\\protobed\\etc\\newsdata\\huffpost\\00373a444ad611e2c52b4f8d3f08b31c3259fe5f.json': 'https://huffpost.com/entry/meet-andrea-londo-who-has-gone-from-border-child-to-inspiring-actress_n_5d236a6ce4b0f3125687f3ac',
  'C:\\Users\\Sejin\\Documents\\GitHub\\protobed\\etc\\newsdata\\huffpost\\cab5cb45b5efa210bedf673c20497c2773245d55.json': 'https://huffpost.com/entry/meet-clara-pablo-a-music-executive-helping-latinas-stand-up-to-breast-cancer_n_5d236cdde4b0f3125687fb21',
  'C:\\Users\\Sejin\\Documents\\

In [91]:
def download(what):
    s = time.time()
    n = len(src)
    n_done = [0]
    status = '\r{pct}% completed: {pub:<20}'
    #status = '\r{pct}% completed: {pub}' + ' '*20
    
    def progress(pub):
        n_done[0] += 1  # 그냥 n_done으로는 외부에서 변수값을 바꿀수 없으므로
        pct = '%.2f' % (n_done[0] / n * 100)
        print(status.format(pub=pub, pct=pct), end='')
        
    
    async def _download(urls):
        for file, url in urls.items():
            article = await loop.run_in_executor(None, Article, url)
            await loop.run_in_executor(None, article.download)
            await loop.run_in_executor(None, article.parse)
            
            content = {
                'title': article.title, 
                'pub_date': str(article.publish_date) ,
                'text': article.text, 
                'img': article.top_image, 
                'url': url,
            }
            
            with open(file, 'w') as f:
                json.dump(content, f)
        
        #progress(pub)
        #return pub, urls


    async def main():
        fts = [asyncio.ensure_future(_download(urls)) for pub, urls in what.items()]
        await asyncio.gather(*fts)


    result = None
    asyncio.set_event_loop(asyncio.new_event_loop())
    loop = asyncio.get_event_loop()

    try:
        # 다음 코드를 주피터에서 돌리려면, tornado를 downgrade 해야함
        # pip install tornado==4.5.3
        result = loop.run_until_complete(main())
        result = dict(result) #set.union(*result)

    except Exception as ex:
        print(ex)

    finally:
        loop.close()

    print('\nDONE: {0:.2f} seconds'.format(time.time() - s))
    #return result

In [92]:
download(download_urls)

'NoneType' object is not iterable

DONE: 176.10 seconds


In [122]:
# from urllib.parse import quote_plus
import re

# / ? & % : * " < > | \
delimiters = '/|\?|\&|\%|:|\*|\"|\<|\>|\||\\\\'
basedir = os.path.join(os.getcwd(), 'newsdata')
ext = '.json'

for pub, _urls in tqdm(urls.items()):
    #pubdir = os.path.join(os.getcwd(), 'newsdata', pub)
    
    #if not os.path.isdir(pubdir):
    #    os.makedirs(pubdir)
        
    for _url in _urls:
        fname = _url + ext
        file = os.path.join(basedir, pub, *re.split(delimiters, fname))
        fulldir = os.path.dirname(file)
        #file = os.path.join(pubdir, *re.split(delimiters, fname))
        #file = os.path.join(pubdir, quote_plus(_url) + '.json')
        
        if not os.path.isdir(fulldir):
            os.makedirs(fulldir)
        
        if not os.path.isfile(file):
            with open(file, 'w') as f:
                pass
                #json.dump(list(newsdata), f)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




FileNotFoundError: [WinError 3] 지정된 경로를 찾을 수 없습니다: 'C:\\Users\\infomax\\Documents\\GitHub\\protobed\\etc\\newsdata\\cnn\\https\\comparecards.com\\guide\\credit-cards-to-have-you-flying-in-no-time\\esourceid=6317676\\utm_source=cnn\\utm_medium=native\\pla=cnn.com\\bdst=rv\\acqs=prospecting\\utm_campaign=sectionfront\\grp=travel-no-time'

In [48]:
headers = {"Authorization":"sBBqsGXiYgF0Db5OV5tAw-akap7JBapGCe67Tuuz0lhquXd0k1oQC4B5B91GiHcVn2pHZrSf1gT2PUujH1YaQA"}
endpoint = 'https://api.newsriver.io/v2/search?'
qry = 'query=title:"Young Chinese Spend Like Americans—and Take on Worrisome Debt"&sortOrder=DESC&limit=15'
# qry = 'query=language:en AND website.domainName:"wsj.com"&sortOrder=DESC&limit=15'
url = endpoint + qry
response = requests.get(url, headers=headers)
out = response.json();out

[{'id': 'n8Gm8IoHnnJ18Ps04A8Zk2-luqiA81d11wuX99gxVHC9UOid9dv2ESmfHWRkI8OlMmfKoJCXoj0cQNPyar1nXQ',
  'discoverDate': '2019-08-29T16:15:47.964+0000',
  'title': 'Young Chinese Spend Like Americans -- And Take on Worrisome Debt',
  'language': 'en',
  'text': 'By Stella Yifan Xie, Shan Li and Julie Wernau | Photographs by Noah Sheldon for The Wall Street Journal Western economists have long said that China needed a base of American-style consumers to bring the country sustained economic growth. Now China has one: Its young people. While previous generations were frugal savers -- a product of their years growing up in a turbulent economy with a weak social safety net -- the 335 million people under 30 born in China between 1990 and 2009 behave much more like Americans, spending avidly on gadgets, entertainment and travel. The freewheeling consumption is helping China diversify its economy at a crucial time. Beijing has relied on exports and infrastructure-building to drive growth for decad

In [49]:
ar = Article('https://wsj.com/articles/young-chinese-spend-like-americansand-take-on-worrisome-debt-11567093953?mod=rsswn')

In [56]:
%%time
ar = Article('https://huffpost.com/entry/ad-joe-biden-elizabeth-warren-delaware-chancery-court_n_5d67d14ae4b01fcc6910188b')
ar.download()
ar.parse()
ar.text;

Wall time: 2.04 s


In [52]:
ar.top_image

'Woman’s $500,000 Ad Buy Against Joe Biden Uses Misleading Video Footage'

In [90]:
{'title':ar.title, 'pub_date':str(ar.publish_date) ,'text':ar.text, 'img':ar.top_image}

{'title': 'Woman’s $500,000 Ad Buy Against Joe Biden Uses Misleading Video Footage',
 'pub_date': '2019-08-29 18:09:12-04:00',
 'text': "An investor recently embroiled in a years-long legal battle has reportedly purchased $500,000 worth of ads against former Vice President Joe Biden that attempts to tie the 2020 candidate to allegations of corruption in a Delaware court.\n\nThe newly released video ad targeting Biden and Delaware’s Chancery Court is the largest third-party attack ad purchased this primary season, Bloomberg News reported.\n\nThough it’s reportedly scheduled to air in Delaware and the early nominating states of Iowa and New Hampshire, Shirley Shawe, a registered Republican who purchased the ad and had dealings with the Chancery Court, said its focus is less on the presidential race.\n\nYouTube The ad video shows edited footage of then-Sen. Joe Biden discussing bankruptcy reform with then-Harvard Professor Elizabeth Warren during a Senate Judiciary Committee hearing in 20

In [58]:
2*10000/3600

5.555555555555555