In [23]:
import newspaper
from newspaper import Article, Config
from IPython.core.debugger import set_trace
import requests
import json
import time
import hashlib
import pandas as pd
from tqdm.autonotebook import tqdm
tqdm.pandas()

import os
import sys
parentdir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parentdir)

# import django
# django.setup()

# from data.models import Newsdata

In [24]:
config = Config()
config.fetch_images = False
config.memoize_articles = False
config.language = 'en'

In [25]:
src = {
    'huffpost':        {'domain':'https://huffpost.com',          'pubname':'HuffPost'}, 
    'cnn':             {'domain':'https://cnn.com',               'pubname':'CNN'}, 
    'investingcom':    {'domain':'https://investing.com',         'pubname':'Investing.com'}, 
    'politico':        {'domain':'https://politico.com',          'pubname':'POLITICO'}, 
    'time':            {'domain':'https://time.com',              'pubname':'TIME'}, 
    'cnbc':            {'domain':'https://cnbc.com',              'pubname':'CNBC'}, 
    'foxnews':         {'domain':'https://foxnews.com',           'pubname':'FOX News'}, 
    'foxbusiness':     {'domain':'https://foxbusiness.com',       'pubname':'FOX Business'}, 
    'bbc':             {'domain':'https://bbc.com',               'pubname':'BBC'}, 
    'businessinsider': {'domain':'https://businessinsider.com',   'pubname':'Business Insider'}, 
    'morningstar':     {'domain':'https://morningstar.com',       'pubname':'Morningstar'}, 
    
    'wsj':             {'domain':'https://wsj.com',               'pubname':'Wall Street Journal'}, # ?mod=rsswn
    'nyt':             {'domain':'https://nytimes.com',           'pubname':'NewYork Times'}, 
    'guardian':        {'domain':'https://theguardian.com',       'pubname':'Guardian'}, 
    'reuters':         {'domain':'https://reuters.com',           'pubname':'Reuters'}, 
    'washingtontimes': {'domain':'https://washingtontimes.com',   'pubname':'Washington Times'}, 
    'washingtonpost':  {'domain':'https://washingtonpost.com',    'pubname':'Washington Post'}, 
    'cbs':             {'domain':'https://cbsnews.com',           'pubname':'CBS'}, 
    'marketwatch':     {'domain':'https://marketwatch.com',       'pubname':'MarketWatch'}, 
    'atlantic':        {'domain':'https://theatlantic.com',       'pubname':'Atlantic'}, 
    'vice':            {'domain':'https://vice.com',              'pubname':'VICE'}, 
    'npr':             {'domain':'https://npr.org',               'pubname':'npr'}, 
    'newrepublic':     {'domain':'https://newrepublic.com',       'pubname':'NEW REPUBLIC'}, 
    'yahoo':           {'domain':'https://yahoo.com',             'pubname':'yahoo'}, 
    'independent':     {'domain':'https://independent.co.uk',     'pubname':'INDEPENDENT'}, 
    'heritage':        {'domain':'https://heritage.org',          'pubname':'Heritage'}, 
    'zdnet':           {'domain':'https://www.zdnet.com',         'pubname':'ZDNet'}, # 반드시 www가 붙어야함
    'townhall':        {'domain':'https://townhall.com',          'pubname':'Townhall'}, 
    'abcnews':         {'domain':'https://abcnews.go.com',        'pubname':'ABC News'}, 
    'hotair':          {'domain':'https://hotair.com',            'pubname':'HOT AIR'}, 
    'cbc':             {'domain':'https://cbc.ca',                'pubname':'CBC'}, 
    'nymag':           {'domain':'https://nymag.com',             'pubname':'NewYork Magazine'}, 
    'thestreet':       {'domain':'https://thestreet.com',         'pubname':'TheStreet'}, 
    'thinkprogress':   {'domain':'https://thinkprogress.org',     'pubname':'ThinkProgress'}, 
    'dailybeast':      {'domain':'https://thedailybeast.com',     'pubname':'DAILY BEAST'}, 
    'realclearpolitcs':{'domain':'https://realclearpolitics.com', 'pubname':'RealClear Politics'}, 
    
    #'forbes':          'https://forbes.com', 
    #'hbr':             'https://hbr.org', 
    #'ft':              'https://ft.com', 
    #'economist':       'https://economist.com', 
}

In [26]:
def clean_url(pub, url):
    url = url.replace('http://', 'https://')
    
    if pub!='zdnet':
    # zdnet은 반드시 www가 붙어야되는 듯 (2019.08.30)
        url = url.replace('https://www.', 'https://')
    
    if url[-1]=='/':
        url = url[:-1]
    
    if pub=='wsj':
        try: url = url[:url.index('?mod=')]
        except: pass
        
        url += '?mod=rsswn'
        
    return url

In [27]:
import asyncio
from functools import partial

def collect_urls(src):
    s = time.time()
    n = len(src)
    n_done = [0]
    status = '\r{pct}% completed: {pub:<20}'
    #status = '\r{pct}% completed: {pub}' + ' '*20
    
    def progress(pub):
        n_done[0] += 1  # 그냥 n_done으로는 외부에서 변수값을 바꿀수 없으므로
        pct = '%.2f' % (n_done[0] / n * 100)
        print(status.format(pub=pub, pct=pct), end='')
        
    
    async def geturls(pub, domain):
        newspaper_config = partial(newspaper.build, config=config)
        resp = await loop.run_in_executor(None, newspaper_config, domain)
        articles = resp.articles
        urls = {clean_url(pub, article.url) for article in articles}
        progress(pub)
        return pub, urls


    async def main():
        fts = [asyncio.ensure_future(geturls(pub, val['domain'])) for pub, val in src.items()]
        return await asyncio.gather(*fts)


    result = None
    asyncio.set_event_loop(asyncio.new_event_loop())
    loop = asyncio.get_event_loop()

    try:
        # 다음 코드를 주피터에서 돌리려면, tornado를 downgrade 해야함
        # pip install tornado==4.5.3
        result = loop.run_until_complete(main())
        result = dict(result) #set.union(*result)

    except Exception as ex:
        print(ex)

    finally:
        loop.close()

    print('\nDONE: {0:.2f} seconds'.format(time.time() - s))
    return result

In [28]:
urls = collect_urls(dict(list(src.items())[:11])); urls

100.00% completed: investingcom        
DONE: 22.39 seconds


{'huffpost': {'https://huffingtonpost.co.uk/p/contributor-terms-of-use-and-agreement',
  'https://huffpost.com/entry/ad-joe-biden-elizabeth-warren-delaware-chancery-court_n_5d67d14ae4b01fcc6910188b',
  'https://huffpost.com/entry/alexandria-ocasio-cortez-loves-gop-attack-ad_n_5d68917ae4b06beb649bbf66',
  'https://huffpost.com/entry/alexandria-ocasio-cortez-trump-puerto-rico_n_5d673db2e4b063c341fa8ad6',
  'https://huffpost.com/entry/american-families-trump-china-tariffs-jpmorgan_n_5d5ade7ce4b04e1e14de8a9e',
  'https://huffpost.com/entry/best-home-equity-loan-ways-to-use_l_5d5af341e4b036065b6abf17',
  'https://huffpost.com/entry/black-womens-equal-pay-day-2019_n_5d5d4bc6e4b09e2b9fe4e415',
  'https://huffpost.com/entry/bruce-lees-daughter-quentin-tarantino-could-shut-up-about-his-depiction-of-her-dad_n_5d55a322e4b0d8840ff01357',
  'https://huffpost.com/entry/california-students-two-years-tuition-free-community-college_n_5d6736dce4b01fcc690eb36f',
  'https://huffpost.com/entry/california-t

In [33]:
basedir = os.path.join(os.getcwd(), 'newsdata')
ext = '.json'

for pub, _urls in tqdm(urls.items()):
    pubdir = os.path.join(basedir, pub)
    
    if not os.path.isdir(pubdir):
        os.makedirs(pubdir)
        
    for _url in _urls:
        hash_url = hashlib.sha1(_url.encode('utf-8')).hexdigest()
        file = os.path.join(pubdir, hash_url + ext)
        
        if not os.path.isfile(file):
            with open(file, 'w') as f:
                pass
                #json.dump(list(newsdata), f)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




In [34]:
hashlib.sha1('https://huffpost.com/entry/bruce-lees-daughter-quentin-tarantino-could-shut-up-about-his-depiction-of-her-dad_n_5d55a322e4b0d8840ff01357'.encode('utf-8')).hexdigest()

'd4776ee6fa1cf8a12ff3cb0b0b5edae4607c1631'

In [35]:
hashlib.sha1('https://huffpost.com/entry/bruce-lees-daughter-quentin-tarantino-could-shut-up-about-his-depiction-of-her-dad_n_5d55a322e4b0d8840ff01357'.encode('utf-8')).hexdigest()

'd4776ee6fa1cf8a12ff3cb0b0b5edae4607c1631'

In [122]:
# from urllib.parse import quote_plus
import re

# / ? & % : * " < > | \
delimiters = '/|\?|\&|\%|:|\*|\"|\<|\>|\||\\\\'
basedir = os.path.join(os.getcwd(), 'newsdata')
ext = '.json'

for pub, _urls in tqdm(urls.items()):
    #pubdir = os.path.join(os.getcwd(), 'newsdata', pub)
    
    #if not os.path.isdir(pubdir):
    #    os.makedirs(pubdir)
        
    for _url in _urls:
        fname = _url + ext
        file = os.path.join(basedir, pub, *re.split(delimiters, fname))
        fulldir = os.path.dirname(file)
        #file = os.path.join(pubdir, *re.split(delimiters, fname))
        #file = os.path.join(pubdir, quote_plus(_url) + '.json')
        
        if not os.path.isdir(fulldir):
            os.makedirs(fulldir)
        
        if not os.path.isfile(file):
            with open(file, 'w') as f:
                pass
                #json.dump(list(newsdata), f)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




FileNotFoundError: [WinError 3] 지정된 경로를 찾을 수 없습니다: 'C:\\Users\\infomax\\Documents\\GitHub\\protobed\\etc\\newsdata\\cnn\\https\\comparecards.com\\guide\\credit-cards-to-have-you-flying-in-no-time\\esourceid=6317676\\utm_source=cnn\\utm_medium=native\\pla=cnn.com\\bdst=rv\\acqs=prospecting\\utm_campaign=sectionfront\\grp=travel-no-time'

In [18]:
ar = Article('https://www.politico.com/story/2019/08/29/republicans-trump-economy-anxiety-1476780')
ar.download()
ar.parse()

In [20]:
ar.title

'Republicans grow anxious about the Trump economy'

In [21]:
title = "Republicans grow anxious about the Trump economy"
url = 'https://newsapi.org/v2/everything?qInTitle="{title}"&apiKey=bfb917bca91c435d9c7efd247d95227b&pageSize=100'.format(title=title)
resp = requests.get(url)
resp.json()

{'status': 'ok',
 'totalResults': 4,
 'articles': [{'source': {'id': 'politico', 'name': 'Politico'},
   'author': 'Burgess Everett',
   'title': 'Republicans grow anxious about the Trump economy',
   'description': "Trump's trade war with China could undermine GOP chances of holding the White House and Senate in 2020.",
   'url': 'https://www.politico.com/story/2019/08/29/republicans-trump-economy-anxiety-1476780',
   'urlToImage': 'https://static.politico.com/16/30/e63e94e8488a90d8218dc7bb34c8/190828-toomey-ap-773.jpg',
   'publishedAt': '2019-08-29T09:09:31Z',
   'content': 'Republican Sen. Patrick Toomey fears that trade uncertainty is contributing to an economic slowdown. | Jacquelyn Martin/AP Photo\r\nRepublicans have sat patiently with President Donald Trump on his tariff roller-coaster ride with China. Now theyre starting to f… [+7865 chars]'},
  {'source': {'id': None, 'name': 'Yahoo.com'},
   'author': 'By Burgess Everett',
   'title': 'Republicans grow anxious about the Trum

In [26]:
headers = {"Authorization":"sBBqsGXiYgF0Db5OV5tAw-akap7JBapGCe67Tuuz0lhquXd0k1oQC4B5B91GiHcVn2pHZrSf1gT2PUujH1YaQA"}
endpoint = 'https://api.newsriver.io/v2/search?'
qry = 'query=title:"Republicans grow anxious about the Trump economy"&sortOrder=DESC&limit=15'
# qry = 'query=language:en AND website.domainName:"wsj.com"&sortOrder=DESC&limit=15'
url = endpoint + qry
response = requests.get(url, headers=headers)
out = response.json();out

[{'id': 'plePuRZae336w7aAe8Vx7rWkiAf5VYVAD-S0zKDqnsDnZ5LXmnnTsF1QcrtXlj6L6rUDtlYwddpj8rtuO1EtEQ',
  'publishDate': '2019-08-29T00:00:00',
  'discoverDate': '2019-08-29T09:32:48.176+0000',
  'title': 'Republicans grow anxious about the Trump economy',
  'language': 'en',
  'text': "Republicans have sat patiently with President Donald Trump on his tariff roller-coaster ride with China. Now they’re starting to feel queasy. Trump argues his escalating trade war will force China to the table for a deal. But his ever-rising tariffs — and his market-rattling tweets — are increasingly alarming the GOP. Story Continued Below “There’s no question that trade uncertainty is contributing to the slowdown,” said Sen. Pat Toomey (R-Pa.), a leading free-trader. “We’re in a very good place. The danger is: Where are we going to be a year from now if concerns about trade continue to be an irritant to growth?” Particularly as the global economy cools, key Republicans say new levies on almost all Chinese go

In [35]:
ar = Article('https://politico.eu/article/italian-pm-conte-starts-work-to-form-new-government')

In [36]:
ar.download()
ar.parse()
ar.text

'Italian Prime Minister Giuseppe Conte | Sean Gallup/Getty Images Italian PM Conte starts work to form new government Matteo Salvini’s League will end up in opposition if the 5Stars/Democratic Party alliance succeeds.\n\nROME — Italian Prime Minister Giuseppe Conte has begun trying to form a new Cabinet after accepting President Sergio Mattarella\'s offer to lead the government.\n\nConte will meet with delegations from all political groups, including the far-right League and Brothers of Italy, starting on Thursday afternoon. The meetings will conclude on Friday. The prime minister is tasked with looking for an alternative parliamentary majority after Matteo Salvini\'s League decided to pull the plug on the previous government earlier this month.\n\nThe populist 5Star Movement, the League\'s former coalition partner, and the center-left Democratic Party told Mattarella on Wednesday that they would like to form an alliance. However, they don\'t have a majority in the Senate so they will 