In [13]:
import newspaper
from newspaper import Article, Config
from newsplease import NewsPlease
from IPython.core.debugger import set_trace
import requests
import json
import time
import hashlib
import pandas as pd
from langdetect import detect
import articleDateExtractor
from tqdm.autonotebook import tqdm
tqdm.pandas()

import os
import sys
parentdir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parentdir)

# import django
# django.setup()

# from data.models import Newsdata

In [64]:
src = {
    'huffpost':         {'domain':'https://huffpost.com',              'pubname':'HuffPost'}, 
    'cnn':              {'domain':'https://cnn.com',                   'pubname':'CNN'}, 
    'investingcom':     {'domain':'https://investing.com',             'pubname':'Investing.com'}, 
    'politico':         {'domain':'https://politico.com',              'pubname':'POLITICO'}, 
    'time':             {'domain':'https://time.com',                  'pubname':'TIME'}, 
    'cnbc':             {'domain':'https://cnbc.com',                  'pubname':'CNBC'}, 
    'foxnews':          {'domain':'https://foxnews.com',               'pubname':'FOX News'}, 
    'foxbusiness':      {'domain':'https://foxbusiness.com',           'pubname':'FOX Business'}, 
    'bbc':              {'domain':'https://bbc.com',                   'pubname':'BBC'}, 
    'businessinsider':  {'domain':'https://businessinsider.com',       'pubname':'Business Insider'}, 
    'morningstar':      {'domain':'https://morningstar.com',           'pubname':'Morningstar'}, 
    'wsj':              {'domain':'https://wsj.com',                   'pubname':'Wall Street Journal'}, # ?mod=rsswn
    
    'nyt':              {'domain':'https://nytimes.com',               'pubname':'NewYork Times'}, 
    'guardian':         {'domain':'https://theguardian.com',           'pubname':'Guardian'}, 
    'reuters':          {'domain':'https://reuters.com',               'pubname':'Reuters'}, 
    'washingtontimes':  {'domain':'https://washingtontimes.com',       'pubname':'Washington Times'}, 
    'washingtonpost':   {'domain':'https://washingtonpost.com',        'pubname':'Washington Post'}, 
    'cbs':              {'domain':'https://cbsnews.com',               'pubname':'CBS'}, 
    'marketwatch':      {'domain':'https://marketwatch.com',           'pubname':'MarketWatch'}, 
    'atlantic':         {'domain':'https://theatlantic.com',           'pubname':'Atlantic'}, 
    'vice':             {'domain':'https://vice.com',                  'pubname':'VICE'}, 
    'npr':              {'domain':'https://npr.org',                   'pubname':'npr'}, 
    'newrepublic':      {'domain':'https://newrepublic.com',           'pubname':'NEW REPUBLIC'}, 
    'yahoo':            {'domain':'https://yahoo.com',                 'pubname':'yahoo'}, 
    'independent':      {'domain':'https://independent.co.uk',         'pubname':'INDEPENDENT'}, 
    'heritage':         {'domain':'https://heritage.org',              'pubname':'Heritage'}, 
    'zdnet':            {'domain':'https://www.zdnet.com',             'pubname':'ZDNet'}, # 반드시 www가 붙어야함
    'townhall':         {'domain':'https://townhall.com',              'pubname':'Townhall'}, 
    'abcnews':          {'domain':'https://abcnews.go.com',            'pubname':'ABC News'}, 
    'hotair':           {'domain':'https://hotair.com',                'pubname':'HOT AIR'}, 
    'cbc':              {'domain':'https://cbc.ca',                    'pubname':'CBC'}, 
    'nymag':            {'domain':'https://nymag.com',                 'pubname':'NewYork Magazine'}, 
    'thestreet':        {'domain':'https://www.thestreet.com',         'pubname':'TheStreet'}, # 반드시 www가 붙어야함 
    'thinkprogress':    {'domain':'https://thinkprogress.org',         'pubname':'ThinkProgress'}, 
    'dailybeast':       {'domain':'https://thedailybeast.com',         'pubname':'DAILY BEAST'}, 
    'realclearpolitics':{'domain':'https://www.realclearpolitics.com', 'pubname':'RealClear Politics'}, # 반드시 www가 붙어야함
    
    #'forbes':          'https://forbes.com', 
    #'hbr':             'https://hbr.org', 
    #'ft':              'https://ft.com', 
    #'economist':       'https://economist.com', 
}

In [65]:
def _config():
    config = Config()
    config.fetch_images = False
    config.memoize_articles = False
    config.language = 'en'
    return config

In [66]:
def clean_url(pub, url):
    url = url.replace('http://', 'https://')
    
    if pub!='zdnet':
    # zdnet은 반드시 www가 붙어야되는 듯 (2019.08.30)
        url = url.replace('https://www.', 'https://')
    
    if url[-1]=='/':
        url = url[:-1]
    
    if pub=='wsj':
        try: url = url[:url.index('?mod=')]
        except: pass
        
        url += '?mod=rsswn'
        
    return url

In [67]:
import asyncio
from functools import partial

def collect_urls(src):
    s = time.time()
    n = len(src)
    n_done = [0]
    status = '\r{pct}% completed: {pub:<20}'
    newspaper_config = partial(newspaper.build, config=_config())
    
    def progress(pub):
        n_done[0] += 1  # 그냥 n_done으로는 외부에서 변수값을 바꿀수 없으므로
        pct = '%.2f' % (n_done[0] / n * 100)
        print(status.format(pub=pub, pct=pct), end='')
        
    
    async def geturls(pub, domain):
        resp = await loop.run_in_executor(None, newspaper_config, domain)
        articles = resp.articles
        urls = {clean_url(pub, article.url) for article in articles}
        progress(pub)
        return pub, urls


    async def main():
        fts = [asyncio.ensure_future(geturls(pub, val['domain'])) for pub, val in src.items()]
        return await asyncio.gather(*fts)


    result = None
    asyncio.set_event_loop(asyncio.new_event_loop())
    loop = asyncio.get_event_loop()

    try:
        # 다음 코드를 주피터에서 돌리려면, tornado를 downgrade 해야함
        # pip install tornado==4.5.3
        result = loop.run_until_complete(main())
        result = dict(result) #set.union(*result)

    except Exception as ex:
        print(ex)

    finally:
        loop.close()

    print('\nDONE: {0:.2f} seconds'.format(time.time() - s))
    return result

In [68]:
# urls = collect_urls(dict(list(src.items())[5:6])); 
urls = collect_urls(src); 
# print(len(urls[list(urls.keys())[0]]))
urls;

100.00% completed: investingcom        
DONE: 38.92 seconds


In [69]:
len(set.union(*urls.values()))

10710

In [63]:
newspaper.build('https://thinkprogress.org').articles

[<newspaper.article.Article at 0x17bbee93668>,
 <newspaper.article.Article at 0x17bbee934a8>,
 <newspaper.article.Article at 0x17babc5bcc0>,
 <newspaper.article.Article at 0x17bb1be4048>,
 <newspaper.article.Article at 0x17bbee93470>,
 <newspaper.article.Article at 0x17bbee93f98>,
 <newspaper.article.Article at 0x17bbee93278>,
 <newspaper.article.Article at 0x17bbee93438>,
 <newspaper.article.Article at 0x17bbee93160>,
 <newspaper.article.Article at 0x17bbee93da0>,
 <newspaper.article.Article at 0x17bab672ba8>,
 <newspaper.article.Article at 0x17bb1c43080>,
 <newspaper.article.Article at 0x17bab5f2908>,
 <newspaper.article.Article at 0x17bab5f2ba8>,
 <newspaper.article.Article at 0x17bb0108e10>,
 <newspaper.article.Article at 0x17bab5f29b0>,
 <newspaper.article.Article at 0x17bbeecdfd0>,
 <newspaper.article.Article at 0x17bb0108a20>,
 <newspaper.article.Article at 0x17bb01089b0>,
 <newspaper.article.Article at 0x17baaa1b358>,
 <newspaper.article.Article at 0x17b8f514198>,
 <newspaper.a

In [70]:
def get_publish_date(article):
    pubtime = article.publish_date
    url = article.url
    datesrc = 'newspaper'
    
    def _from_extractor(url):
        try:
            return articleDateExtractor.extractArticlePublishedDate(url)
        
        except:
            return None
    
    
    def _from_newsplease(url):
        try:
            return NewsPlease.from_url(url).date_publish
        
        except:
            return None
    
    
    def _datize(time):
        if time is None:
            return ''

        try:
            time = pd.Timestamp(time)

            if time.tz is None:
                return str(time.date())

            else:
                return str(time.tz_convert('utc').date())
            
        except:
            return ''
            

    if pubtime is None:
        datesrc = 'extractor'
        pubtime = _from_extractor(url)
        
        if pubtime is None:
            datesrc = 'newsplease'
            pubtime = _from_newsplease(url)
            
            if pubtime is None:
                datesrc = 'fail'
                    
    return _datize(pubtime)#, datesrc

In [257]:
url = 'https://huffpost.com/entry/13-amazing-photos-you-missed-this-week_n_5d6988e1e4b09bbc9eeefb8f'
url = 'https://cnn.com/2019/08/26/australia/australia-china-yang-hengjun-intl-hnk/index.html'
url = 'https://cnn.com/style/article/sa-designer-partners-with-hm/index.html'
url = 'https://bleacherreport.com/articles/2799967-lesean-mccoy-reportedly-released-by-bills-after-4-seasons-with-team?utm_source=cnn.com&utm_campaign=editorial&utm_medium=referral'
url = 'https://investing.com/news/stock-market-news/market-fragility-on-show-as-trade-war-china-data-curb-optimism-1967972'
url = 'https://politico.com/agenda/story/2019/08/29/appalachian-trail-dominion-energy-000943'
url = 'https://politico.eu/article/a-no-matteo-salvini-alliance-is-italy-best-hope-league-5star-movement-democratic-party'
url = 'https://politico.eu/sponsored-content/whats-needed-to-implement-the-paris-agreement-shells-ceo-shares-his-views'
url = 'https://arte.tv/en/videos/083967-024-A/re-brexit-chaos-in-the-caribbean' #
url = 'https://labs.time.com/2015/08/07/the-one-word-each-republican-candidate-wants-you-to-remember'
url = 'https://labs.time.com/story/is-it-fall-yet' #
# url = 'https://labs.time.com/story/watch-how-the-drug-overdose-epidemic-spread-in-america' #
# url = 'https://time.com/5622094/what-is-niksen'
# url = 'https://time.com/magazine/us/5658416/september-2nd-2019-vol-194-no-8-u-s'
# url = 'https://buffett.cnbc.com/2018/04/01/warren-buffetts-bumpy-ride-with-the-airline-industry.html'
# url = 'https://cnbc.com/2018/04/12/regulatory-scrutiny-could-end-up-helping-facebook-analyst.html'
# url = 'https://cnbc.com/definitive-guide-to-buying-your-first-home'
url = 'https://marketwatch.com/video/opinion-hong-kong-police-crack-down-on-protest-leaders/F2B22E96-81D9-486A-AF3D-CCE3DA52E941.html'
article = Article(url)
article.download()
article.parse()
get_publish_date(article)

Extracting date from https://marketwatch.com/video/opinion-hong-kong-police-crack-down-on-protest-leaders/F2B22E96-81D9-486A-AF3D-CCE3DA52E941.html


'2019-08-30'

In [71]:
def select(urls):
    selected = {}
    basedir = os.path.join(os.getcwd(), 'newsdata')
    ext = '.json'

    for pub, _urls in tqdm(urls.items()):
        selected[pub] = set()

        for _url in _urls:
            hash_url = hashlib.sha1(_url.encode('utf-8')).hexdigest()
            
            file_in_saved = os.path.join(basedir, 'saved', hash_url[0], hash_url + ext)
            file_in_downloaded = os.path.join(basedir, 'downloaded', hash_url + ext)
            file_in_trash = os.path.join(basedir, 'trash', hash_url[0], hash_url + ext)

            if os.path.isfile(file_in_saved) or os.path.isfile(file_in_downloaded) or os.path.isfile(file_in_trash):
                continue
                
            else:
                selected[pub].add(_url)
                
    return selected

In [76]:
## 아무리 반복해도 늘 urls_selected가 큰데,,, try,except에서 처리 안된 아티클이 사라지기 때문인가?

urls_selected = select(urls); urls_selected;

HBox(children=(IntProgress(value=0, max=36), HTML(value='')))




In [77]:
len(set.union(*urls_selected.values()))

1406

In [74]:
def download(urls):
    s = time.time()
    n = len(urls)
    n_done = [0]
    status = '\r{pct}% completed: {pub:<20}'

    basedir = os.path.join(os.getcwd(), 'newsdata')
    ext = '.json'
    
    
    def progress(pub):
        n_done[0] += 1
        pct = '%.2f' % (n_done[0] / n * 100)
        print(status.format(pub=pub, pct=pct), end='')
        
        
    def makedir_if_not_exists(file):
        _dir = os.path.dirname(file)
        
        if not os.path.isdir(_dir):
            os.makedirs(_dir)
            
            
    def detect_lang(article):
        lang = article.meta_lang
        
        if lang=='':
            return detect(article)
        
        else:
            return lang
            
    
    def get_article(url):
        article = Article(url)
        article.download()
        article.parse()
        return article
        
    
    async def _download(pub, _urls):
        for url in _urls:
            try: 
                article = await loop.run_in_executor(None, get_article, url)
                
                text = article.text
                language = detect_lang(article) #article.meta_lang
                published_at = await loop.run_in_executor(None, get_publish_date, article)
                downloaded_at = str(pd.Timestamp.utcnow())

                hash_url = hashlib.sha1(url.encode('utf-8')).hexdigest()
                is_too_short = (not article.is_valid_body()) and (len(article.text)<500)

                content = {
                    'pub': pub, 
                    'title': article.title, 
                    'url': url, 
                    'language': language, 
                    'published_at': published_at, 
                    'downloaded_at': downloaded_at
                }

                if text=='' or published_at=='' or is_too_short or language!='en':
                    file = os.path.join(basedir, 'trash', hash_url[0], hash_url + ext)

                else:
                    file = os.path.join(basedir, 'downloaded', hash_url + ext)
                    content['text'] = text
                    content['description'] = article.meta_description
                    content['authors'] = article.authors
                    content['top_image'] = article.top_image

                makedir_if_not_exists(file)

                with open(file, 'w') as f:
                    json.dump(content, f)
            
            except:
                continue
#             article = Article(url)
            
#             try:
#                 await loop.run_in_executor(None, article.download)
#                 await loop.run_in_executor(None, article.parse)
                
#             except:
#                 #print('******************************************')
#                 continue
        
        progress(pub)


    async def main():
        fts = [asyncio.ensure_future(_download(pub, _urls)) for pub, _urls in urls.items()]
        await asyncio.gather(*fts)


    asyncio.set_event_loop(asyncio.new_event_loop())
    loop = asyncio.get_event_loop()

    try:
        loop.run_until_complete(main())

    except Exception as ex:
        pass
        #print(ex)

    finally:
        loop.close()

    print('\nDONE: {0:.2f} seconds'.format(time.time() - s))

In [75]:
download(urls_selected);

27.78% completed: marketwatch         Extracting date from https://kr.investing.com/news/economy/article-243278
30.56% completed: atlantic            Exception in extractArticlePublishedDate for https://kr.investing.com/news/economy/article-243278
()
Extracting date from https://foxnews.com/world/bahamas-hurricane-dorian-rescues-desperate-measures
33.33% completed: nyt                 Extracting date from https://hotair.com/headlines/archives/2019/09/travel-writer-part-climate-problem/#comments
Exception in extractArticlePublishedDate for https://hotair.com/headlines/archives/2019/09/travel-writer-part-climate-problem/#comments
()
36.11% completed: cbc                 Extracting date from https://abcnews.go.com/Entertainment/wireStory/wife-hart-fine-car-crash-65346953
Extracting date from https://bbc.co.uk/news/world-us-canada-49559563
47.22% completed: cnbc                Extracting date from https://hotair.com/headlines/archives/2019/09/travel-writer-part-climate-problem
Exception in



72.22% completed: wsj                 Extracting date from https://kr.investing.com/news/economy/3-things-under-the-radar-this-week-242261
Exception in extractArticlePublishedDate for https://kr.investing.com/news/economy/3-things-under-the-radar-this-week-242261
()
Extracting date from https://jp.investing.com/news/stock-market-news/article-256913
Exception in extractArticlePublishedDate for https://jp.investing.com/news/stock-market-news/article-256913
()
75.00% completed: zdnet               Extracting date from https://id.investing.com/news/economy/airlangga-dorong-industri-lebih-ramah-lingkungan-1915008
Exception in extractArticlePublishedDate for https://id.investing.com/news/economy/airlangga-dorong-industri-lebih-ramah-lingkungan-1915008
()
Extracting date from https://newslink.reuters.com/article/us-global-markets/stocks-hurt-by-trade-war-pound-hit-by-no-deal-brexit-fears-idUSKCN1VO01L
Extracting date from https://hk.investing.com/news/forex-news/article-75160
Exception in ext

In [122]:
# from urllib.parse import quote_plus
import re

# / ? & % : * " < > | \
delimiters = '/|\?|\&|\%|:|\*|\"|\<|\>|\||\\\\'
basedir = os.path.join(os.getcwd(), 'newsdata')
ext = '.json'

for pub, _urls in tqdm(urls.items()):
    #pubdir = os.path.join(os.getcwd(), 'newsdata', pub)
    
    #if not os.path.isdir(pubdir):
    #    os.makedirs(pubdir)
        
    for _url in _urls:
        fname = _url + ext
        file = os.path.join(basedir, pub, *re.split(delimiters, fname))
        fulldir = os.path.dirname(file)
        #file = os.path.join(pubdir, *re.split(delimiters, fname))
        #file = os.path.join(pubdir, quote_plus(_url) + '.json')
        
        if not os.path.isdir(fulldir):
            os.makedirs(fulldir)
        
        if not os.path.isfile(file):
            with open(file, 'w') as f:
                pass
                #json.dump(list(newsdata), f)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




FileNotFoundError: [WinError 3] 지정된 경로를 찾을 수 없습니다: 'C:\\Users\\infomax\\Documents\\GitHub\\protobed\\etc\\newsdata\\cnn\\https\\comparecards.com\\guide\\credit-cards-to-have-you-flying-in-no-time\\esourceid=6317676\\utm_source=cnn\\utm_medium=native\\pla=cnn.com\\bdst=rv\\acqs=prospecting\\utm_campaign=sectionfront\\grp=travel-no-time'

In [81]:
headers = {"Authorization":"sBBqsGXiYgF0Db5OV5tAw-akap7JBapGCe67Tuuz0lhquXd0k1oQC4B5B91GiHcVn2pHZrSf1gT2PUujH1YaQA"}
endpoint = 'https://api.newsriver.io/v2/search?'
qry = 'query=title:"Here are the biggest analyst calls of the day: Monster, JetBlue & more"&sortOrder=DESC&limit=15'
# qry = 'query=language:en AND website.domainName:"wsj.com"&sortOrder=DESC&limit=15'
url = endpoint + qry
response = requests.get(url, headers=headers)
out = response.json();out

[]

In [204]:
%%time
articleDateExtractor.extractArticlePublishedDate('https://labs.time.com/story/watch-how-the-drug-overdose-epidemic-spread-in-america')

Extracting date from https://labs.time.com/story/watch-how-the-drug-overdose-epidemic-spread-in-america
Wall time: 734 ms


In [184]:
title = "Chinese ride-hailing giant Didi plans to launch a robotaxi service in Shanghai"
url = 'https://newsapi.org/v2/everything?qInTitle="{title}"&apiKey=bfb917bca91c435d9c7efd247d95227b&pageSize=100'.format(title=title)
resp = requests.get(url)
resp.json()

{'status': 'ok',
 'totalResults': 1,
 'articles': [{'source': {'id': 'cnbc', 'name': 'CNBC'},
   'author': 'Arjun Kharpal',
   'title': 'Chinese ride-hailing giant Didi plans to launch a robotaxi service in Shanghai',
   'description': 'Ride-hailing giant Didi Chuxing joins a number of other Chinese companies in announcing robotaxi services, as China seeks to race ahead in driverless technology.',
   'url': 'https://www.cnbc.com/2019/08/30/china-uber-didi-plans-to-launch-a-robo-taxi-service-in-shanghai.html',
   'urlToImage': 'https://image.cnbcfm.com/api/v1/image/106104268-1567143051659didishowcasesitsautonomousdrivingfleetatthewaicinshanghai.jpg?v=1567143214',
   'publishedAt': '2019-08-30T06:17:00Z',
   'content': "Didi Chuxing, China's version of Uber, announced plans to launch a robotaxi service in Shanghai where users will be able to hail driverless cars from the app.\r\nThe ride-hailing company will deploy 30 different models of so-called level four autonomous vehicle… [+1165 ch

In [186]:
headers = {"Authorization":"sBBqsGXiYgF0Db5OV5tAw-akap7JBapGCe67Tuuz0lhquXd0k1oQC4B5B91GiHcVn2pHZrSf1gT2PUujH1YaQA"}
endpoint = 'https://api.newsriver.io/v2/search?'
title = "Millennials are making travel a priority more than previous generations—that's not a bad thing"
qry = 'query=title:"{title}"&sortOrder=DESC&limit=15'.format(title=title)
# qry = 'query=language:en AND website.domainName:"wsj.com"&sortOrder=DESC&limit=15'
url = endpoint + qry
response = requests.get(url, headers=headers)
out = response.json();out

[{'id': 'DfQ6U8fs2HIZffk3CxEdt6U1fD1TUdvlbXHYfsdTx8wOZQZmsl1EA8hfWaeO1JjQFTg7hl-dnHe9D2QyrQqT5w',
  'publishDate': '2019-08-30T12:30:34',
  'discoverDate': '2019-08-30T12:51:36.329+0000',
  'title': "Millennials are making travel a priority more than previous generations—that's not a bad thing",
  'language': 'en',
  'text': 'Your Instagram feed is probably littered with photos from your friends on vacation sharing breathtaking views of exotic beaches or snapshots of once-in-a-lifetime meals. And if you think that the sheer number of these vacation photos is growing, you\'re not wrong. That\'s because millennials, more than previous generations, are making travel a priority. In 2019, the average millennial (ages 21 to 37) plans on taking roughly five trips throughout the year, three of which are expected to be international, according to AARP\'s 2019 Travel Trends report. That\'s more international trips than Gen X (ages 38 to 53) and more overall trips than Baby Boomers (ages 54 to 72

In [160]:
0 not in [1,2,3]

True

In [174]:
set.union(*[{12,3}, {4,3}])

{3, 4, 12}

In [188]:
str(pd.Timestamp.utcnow())

'2019-09-02 05:47:14.689847+00:00'