In [2]:
import newspaper
from newspaper import Article, Config
from newsplease import NewsPlease
from IPython.core.debugger import set_trace
import requests
import json
import time
import hashlib
import pandas as pd
import articleDateExtractor
from tqdm.autonotebook import tqdm
tqdm.pandas()

import os
import sys
parentdir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parentdir)

# import django
# django.setup()

# from data.models import Newsdata

In [3]:
src = {
    'huffpost':        {'domain':'https://huffpost.com',          'pubname':'HuffPost'}, 
    'cnn':             {'domain':'https://cnn.com',               'pubname':'CNN'}, 
    'investingcom':    {'domain':'https://investing.com',         'pubname':'Investing.com'}, 
    'politico':        {'domain':'https://politico.com',          'pubname':'POLITICO'}, 
    'time':            {'domain':'https://time.com',              'pubname':'TIME'}, 
    'cnbc':            {'domain':'https://cnbc.com',              'pubname':'CNBC'}, 
    'foxnews':         {'domain':'https://foxnews.com',           'pubname':'FOX News'}, 
    'foxbusiness':     {'domain':'https://foxbusiness.com',       'pubname':'FOX Business'}, 
    'bbc':             {'domain':'https://bbc.com',               'pubname':'BBC'}, 
    'businessinsider': {'domain':'https://businessinsider.com',   'pubname':'Business Insider'}, 
    'morningstar':     {'domain':'https://morningstar.com',       'pubname':'Morningstar'}, 
    'wsj':             {'domain':'https://wsj.com',               'pubname':'Wall Street Journal'}, # ?mod=rsswn
    
    'nyt':             {'domain':'https://nytimes.com',           'pubname':'NewYork Times'}, 
    'guardian':        {'domain':'https://theguardian.com',       'pubname':'Guardian'}, 
    'reuters':         {'domain':'https://reuters.com',           'pubname':'Reuters'}, 
    'washingtontimes': {'domain':'https://washingtontimes.com',   'pubname':'Washington Times'}, 
    'washingtonpost':  {'domain':'https://washingtonpost.com',    'pubname':'Washington Post'}, 
    'cbs':             {'domain':'https://cbsnews.com',           'pubname':'CBS'}, 
    'marketwatch':     {'domain':'https://marketwatch.com',       'pubname':'MarketWatch'}, 
    'atlantic':        {'domain':'https://theatlantic.com',       'pubname':'Atlantic'}, 
    'vice':            {'domain':'https://vice.com',              'pubname':'VICE'}, 
    'npr':             {'domain':'https://npr.org',               'pubname':'npr'}, 
    'newrepublic':     {'domain':'https://newrepublic.com',       'pubname':'NEW REPUBLIC'}, 
    'yahoo':           {'domain':'https://yahoo.com',             'pubname':'yahoo'}, 
    'independent':     {'domain':'https://independent.co.uk',     'pubname':'INDEPENDENT'}, 
    'heritage':        {'domain':'https://heritage.org',          'pubname':'Heritage'}, 
    'zdnet':           {'domain':'https://www.zdnet.com',         'pubname':'ZDNet'}, # 반드시 www가 붙어야함
    'townhall':        {'domain':'https://townhall.com',          'pubname':'Townhall'}, 
    'abcnews':         {'domain':'https://abcnews.go.com',        'pubname':'ABC News'}, 
    'hotair':          {'domain':'https://hotair.com',            'pubname':'HOT AIR'}, 
    'cbc':             {'domain':'https://cbc.ca',                'pubname':'CBC'}, 
    'nymag':           {'domain':'https://nymag.com',             'pubname':'NewYork Magazine'}, 
    'thestreet':       {'domain':'https://thestreet.com',         'pubname':'TheStreet'}, 
    #'thinkprogress':   {'domain':'https://thinkprogress.org',     'pubname':'ThinkProgress'}, 
    'dailybeast':      {'domain':'https://thedailybeast.com',     'pubname':'DAILY BEAST'}, 
    'realclearpolitcs':{'domain':'https://realclearpolitics.com', 'pubname':'RealClear Politics'}, 
    
    #'forbes':          'https://forbes.com', 
    #'hbr':             'https://hbr.org', 
    #'ft':              'https://ft.com', 
    #'economist':       'https://economist.com', 
}

In [4]:
def _config():
    config = Config()
    config.fetch_images = False
    config.memoize_articles = False
    config.language = 'en'
    return config

In [5]:
def clean_url(pub, url):
    url = url.replace('http://', 'https://')
    
    if pub!='zdnet':
    # zdnet은 반드시 www가 붙어야되는 듯 (2019.08.30)
        url = url.replace('https://www.', 'https://')
    
    if url[-1]=='/':
        url = url[:-1]
    
    if pub=='wsj':
        try: url = url[:url.index('?mod=')]
        except: pass
        
        url += '?mod=rsswn'
        
    return url

In [6]:
import asyncio
from functools import partial

def collect_urls(src):
    s = time.time()
    n = len(src)
    n_done = [0]
    status = '\r{pct}% completed: {pub:<20}'
    newspaper_config = partial(newspaper.build, config=_config())
    
    def progress(pub):
        n_done[0] += 1  # 그냥 n_done으로는 외부에서 변수값을 바꿀수 없으므로
        pct = '%.2f' % (n_done[0] / n * 100)
        print(status.format(pub=pub, pct=pct), end='')
        
    
    async def geturls(pub, domain):
        resp = await loop.run_in_executor(None, newspaper_config, domain)
        articles = resp.articles
        urls = {clean_url(pub, article.url) for article in articles}
        progress(pub)
        return pub, urls


    async def main():
        fts = [asyncio.ensure_future(geturls(pub, val['domain'])) for pub, val in src.items()]
        return await asyncio.gather(*fts)


    result = None
    asyncio.set_event_loop(asyncio.new_event_loop())
    loop = asyncio.get_event_loop()

    try:
        # 다음 코드를 주피터에서 돌리려면, tornado를 downgrade 해야함
        # pip install tornado==4.5.3
        result = loop.run_until_complete(main())
        result = dict(result) #set.union(*result)

    except Exception as ex:
        print(ex)

    finally:
        loop.close()

    print('\nDONE: {0:.2f} seconds'.format(time.time() - s))
    return result

In [7]:
# urls = collect_urls(dict(list(src.items())[5:6])); 
urls = collect_urls(src); 
# print(len(urls[list(urls.keys())[0]]))
urls;

100.00% completed: investingcom        
DONE: 36.46 seconds


In [8]:
len(set.union(*urls.values()))

10241

In [9]:
def get_publish_date(article):
    pubtime = article.publish_date
    url = article.url
    datesrc = 'newspaper'
    
    def _from_extractor(url):
        try:
            return articleDateExtractor.extractArticlePublishedDate(url)
        
        except:
            return None
    
    
    def _from_newsplease(url):
        try:
            return NewsPlease.from_url(url).date_publish
        
        except:
            return None
    
    
    def _datize(time):
        if time is None:
            return ''

        try:
            time = pd.Timestamp(time)

            if time.tz is None:
                return str(time.date())

            else:
                return str(time.tz_convert('utc').date())
            
        except:
            return ''
            

    if pubtime is None:
        datesrc = 'extractor'
        pubtime = _from_extractor(url)
        
        if pubtime is None:
            datesrc = 'newsplease'
            pubtime = _from_newsplease(url)
            
            if pubtime is None:
                datesrc = 'fail'
                    
    return _datize(pubtime)#, datesrc

In [257]:
url = 'https://huffpost.com/entry/13-amazing-photos-you-missed-this-week_n_5d6988e1e4b09bbc9eeefb8f'
url = 'https://cnn.com/2019/08/26/australia/australia-china-yang-hengjun-intl-hnk/index.html'
url = 'https://cnn.com/style/article/sa-designer-partners-with-hm/index.html'
url = 'https://bleacherreport.com/articles/2799967-lesean-mccoy-reportedly-released-by-bills-after-4-seasons-with-team?utm_source=cnn.com&utm_campaign=editorial&utm_medium=referral'
url = 'https://investing.com/news/stock-market-news/market-fragility-on-show-as-trade-war-china-data-curb-optimism-1967972'
url = 'https://politico.com/agenda/story/2019/08/29/appalachian-trail-dominion-energy-000943'
url = 'https://politico.eu/article/a-no-matteo-salvini-alliance-is-italy-best-hope-league-5star-movement-democratic-party'
url = 'https://politico.eu/sponsored-content/whats-needed-to-implement-the-paris-agreement-shells-ceo-shares-his-views'
url = 'https://arte.tv/en/videos/083967-024-A/re-brexit-chaos-in-the-caribbean' #
url = 'https://labs.time.com/2015/08/07/the-one-word-each-republican-candidate-wants-you-to-remember'
url = 'https://labs.time.com/story/is-it-fall-yet' #
# url = 'https://labs.time.com/story/watch-how-the-drug-overdose-epidemic-spread-in-america' #
# url = 'https://time.com/5622094/what-is-niksen'
# url = 'https://time.com/magazine/us/5658416/september-2nd-2019-vol-194-no-8-u-s'
# url = 'https://buffett.cnbc.com/2018/04/01/warren-buffetts-bumpy-ride-with-the-airline-industry.html'
# url = 'https://cnbc.com/2018/04/12/regulatory-scrutiny-could-end-up-helping-facebook-analyst.html'
# url = 'https://cnbc.com/definitive-guide-to-buying-your-first-home'
url = 'https://marketwatch.com/video/opinion-hong-kong-police-crack-down-on-protest-leaders/F2B22E96-81D9-486A-AF3D-CCE3DA52E941.html'
article = Article(url)
article.download()
article.parse()
get_publish_date(article)

Extracting date from https://marketwatch.com/video/opinion-hong-kong-police-crack-down-on-protest-leaders/F2B22E96-81D9-486A-AF3D-CCE3DA52E941.html


'2019-08-30'

In [10]:
def select(urls):
    selected = {}
    basedir = os.path.join(os.getcwd(), 'newsdata')
    ext = '.json'

    for pub, _urls in tqdm(urls.items()):
        selected[pub] = set()

        for _url in _urls:
            hash_url = hashlib.sha1(_url.encode('utf-8')).hexdigest()
            
            file_in_saved = os.path.join(basedir, 'saved', hash_url[0], hash_url + ext)
            file_in_downloaded = os.path.join(basedir, 'downloaded', hash_url + ext)
            file_in_trash = os.path.join(basedir, 'trash', hash_url[0], hash_url + ext)

            if os.path.isfile(file_in_saved) or os.path.isfile(file_in_downloaded) or os.path.isfile(file_in_trash):
                continue
                
            else:
                selected[pub].add(_url)
                
    return selected

In [11]:
urls_selected = select(urls); urls_selected;

HBox(children=(IntProgress(value=0, max=35), HTML(value='')))




In [12]:
len(set.union(*urls_selected.values()))

1589

In [13]:
def download(urls):
    s = time.time()
    n = len(urls)
    n_done = [0]
    status = '\r{pct}% completed: {pub:<20}'

    basedir = os.path.join(os.getcwd(), 'newsdata')
    ext = '.json'
    
    
    def progress(pub):
        n_done[0] += 1
        pct = '%.2f' % (n_done[0] / n * 100)
        print(status.format(pub=pub, pct=pct), end='')
        
        
    def makedir_if_not_exists(file):
        _dir = os.path.dirname(file)
        
        if not os.path.isdir(_dir):
            os.makedirs(_dir)
            
    
    async def _download(pub, _urls):
        for url in _urls:
            article = Article(url)
            
            try:
                await loop.run_in_executor(None, article.download)
                await loop.run_in_executor(None, article.parse)
                
            except:
                continue
            
            text = article.text
            language = article.meta_lang
            published_at = await loop.run_in_executor(None, get_publish_date, article)
            downloaded_at = str(pd.Timestamp.utcnow())
            
            hash_url = hashlib.sha1(url.encode('utf-8')).hexdigest()
            is_too_short = (not article.is_valid_body()) and (len(article.text)<500)
            
            content = {
                'pub': pub, 
                'title': article.title, 
                'url': url, 
                'language': language, 
                'published_at': published_at, 
                'downloaded_at': downloaded_at
            }
            
            if text=='' or published_at=='' or is_too_short or (language not in ['en', '']):
                file = os.path.join(basedir, 'trash', hash_url[0], hash_url + ext)
            
            else:
                file = os.path.join(basedir, 'downloaded', hash_url + ext)
                content['text'] = text
                content['description'] = article.meta_description
                content['authors'] = article.authors
                content['top_image'] = article.top_image
                            
            makedir_if_not_exists(file)
            
            with open(file, 'w') as f:
                json.dump(content, f)
        
        progress(pub)


    async def main():
        fts = [asyncio.ensure_future(_download(pub, _urls)) for pub, _urls in urls.items()]
        await asyncio.gather(*fts)


    asyncio.set_event_loop(asyncio.new_event_loop())
    loop = asyncio.get_event_loop()

    try:
        loop.run_until_complete(main())

    except Exception as ex:
        print(ex)

    finally:
        loop.close()

    print('\nDONE: {0:.2f} seconds'.format(time.time() - s))

In [14]:
download(urls_selected);

17.14% completed: realclearpolitcs    

Building prefix dict from C:\Users\infomax\Anaconda3\lib\site-packages\jieba\dict.txt ...
Loading model from cache C:\Users\infomax\AppData\Local\Temp\jieba.cache


20.00% completed: foxbusiness         

Loading model cost 1.0778782367706299 seconds.
Prefix dict has been built succesfully.


22.86% completed: washingtonpost      Extracting date from https://video.vice.com/en_asia/video/olympic-skateboarding-coach-taji-dew-tour/5d07b1cebe407755a107ed21
Extracting date from https://cbc.ca/news/canada/ottawa/ottawa-overdose-peter-eady-son-2017-1.5266661
Extracting date from https://foxnews.com/world/disney-says-staff-is-safe-on-bahamas-castaway-cay-report
Extracting date from https://cio.zhiding.cn/cio/2019/0902/3120864.shtml
Extracting date from https://thedailybeast.com/hurricane-dorian-first-death-reported-as-storm-hits-the-bahamas
Exception in extractArticlePublishedDate for https://thedailybeast.com/hurricane-dorian-first-death-reported-as-storm-hits-the-bahamas
()
25.71% completed: hotair              Extracting date from https://abcnews.go.com/International/wireStory/double-cuddle-berlin-zoo-celebrates-birth-panda-cubs-65336045
Extracting date from https://it.investing.com/analysis/obiettivi-del-rialzo-raggiunti--analisi-settimanale-dei-mercati-200431692
Extracting dat



Extracting date from https://marketwatch.com/articles/best-stocks-labor-day-2019-51567182378
34.29% completed: huffpost            



Extracting date from https://foxnews.com/us/texas-shooting-victims-include-dad-gunned-down-in-front-of-wife-kids
40.00% completed: vice                Extracting date from https://abcnews.go.com/GMA/Family/kids-rough-day-school/story?id=65294525
Extracting date from https://cbc.ca/news/canada/ottawa/12-year-old-barrhaven-ballet-dancer-1.5267078
Extracting date from https://soft.zhiding.cn/software_zone/2019/0902/3120821.shtml
Exception in extractArticlePublishedDate for https://soft.zhiding.cn/software_zone/2019/0902/3120821.shtml
(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1056)'),)
Extracting date from https://thedailybeast.com/seth-ator-odessa-shooter-was-fired-hours-before-murder-spree-says-report
Exception in extractArticlePublishedDate for https://thedailybeast.com/seth-ator-odessa-shooter-was-fired-hours-before-murder-spree-says-report
()
42.86% completed: npr                 Extracting 



Extracting date from https://abcnews.go.com/GMA/Food/grilled-drumstick-baby-back-rib-recipes-labor-day/story?id=57418744
Extracting date from https://theatlantic.com/magazine/archive/2014/06/the-case-for-reparations/361631
Extracting date from https://watch.cbc.ca/media/media/short-docs/fourth-period-burnout/38e815a-00e0f7c4485
Extracting date from https://sports.yahoo.com/dodgers-pitcher-may-takes-line-drive-head-232548522--mlb.html
Extracting date from https://bbc.co.uk/news/av/newsbeat-49531364/central-park-five-raymond-santana-on-being-wrongfully-jailed
Extracting date from https://abcnews.go.com/International/wireStory/official-warns-iran-strong-step-deal-65336732
Extracting date from https://www.zdnet.com/article/cisco-releases-guides-for-incident-responders-handling-hacked-cisco-gear
Extracting date from https://newslink.reuters.com/article/us-eurozone-economy-pmi/euro-zone-manufacturing-slump-dragged-on-in-august-pmi-idUSKCN1VN0M6?il=0
Extracting date from https://uk.investing.

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


62.86% completed: atlantic            Extracting date from https://biz.zhiding.cn/2019/0902/3120861.shtml
Exception in extractArticlePublishedDate for https://biz.zhiding.cn/2019/0902/3120861.shtml
(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1056)'),)
Extracting date from https://abcnews.go.com/International/wireStory/japan-urges-nuke-plants-prepare-decommissioning-era-65335945
Extracting date from https://cbc.ca/news/technology/carbon-capture-faq-1.5250140
Extracting date from https://bbc.com/news/world-latin-america-49541485
65.71% completed: cbc                 Extracting date from https://businessinsider.co.za/the-iconic-milkybar-is-about-to-get-a-coconut-makeover-in-south-africa-and-believe-it-or-not-our-tasters-very-much-approved-2019-8
Exception in extractArticlePublishedDate for https://businessinsider.co.za/the-iconic-milkybar-is-about-to-get-a-coconut-makeover-in-south-africa-and-beli

Exception in extractArticlePublishedDate for https://independent.co.uk/news/world/australasiaException in extractArticlePublishedDate for https://ru.investing.com/news/forex-news/article-1911634

()()

Extracting date from https://biz.zhiding.cn/2019/0829/3120759.shtml
Exception in extractArticlePublishedDate for https://biz.zhiding.cn/2019/0829/3120759.shtml
(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1056)'),)
Extracting date from https://independent.co.uk/topic/american-flag-national-republican-congressional-committee
Extracting date from https://www.zdnet.fr/actualites/4g-pres-de-47-000-sites-deployes-en-france-39887119.htm
Exception in extractArticlePublishedDate for https://independent.co.uk/topic/american-flag-national-republican-congressional-committee
()
Extracting date from https://bbc.co.uk/news/entertainment-arts-49522172
Extracting date from https://pt.investing.com/news/economy/bo

Exception in extractArticlePublishedDate for https://pl.investing.com/news/stock-market-news/krakow-airport-mial-8234-tys-pasazerow-w-sierpniu-wzrost-o-32-rr-231745
()
Extracting date from https://newslink.reuters.com/article/us-mideast-iran-france-spokesman/iran-says-it-closes-gaps-with-france-in-talks-on-nuclear-deal-idUSKCN1VN0NJ?il=0
82.86% completed: cnbc                Extracting date from https://bbc.com/news/uk-40358825
Extracting date from https://investing.com/analysis/forget-zirp-top-10-big-yields-bdcs-cefs-and-reits-200460636
Exception in extractArticlePublishedDate for https://investing.com/analysis/forget-zirp-top-10-big-yields-bdcs-cefs-and-reits-200460636
()
Extracting date from https://newslink.reuters.com/article/us-spain-politics/spains-socialists-would-extended-lead-in-snap-election-poll-shows-idUSKCN1VN0JW
Extracting date from https://bbc.com/urdu/media/video
Extracting date from https://investing.com/news/technology-news/chinese-faceswapping-app-goes-viral-sparks-



Extracting date from https://businesstimes.com.sg/stocks/hong-kong-stocks-rocked-by-citys-latest-violent-protests 
Extracting date from https://independent.co.uk/topic/american-association-for-the-advancement-of-science
Exception in extractArticlePublishedDate for https://independent.co.uk/topic/american-association-for-the-advancement-of-science
()
Extracting date from https://bbc.com/news/world-latin-america-49544618
Extracting date from https://au.investing.com/news/stock-market-news/briefabacus-property-exchanged-contracts-for-acquisition-of-5-self-storage-properties-1948548
Exception in extractArticlePublishedDate for https://au.investing.com/news/stock-market-news/briefabacus-property-exchanged-contracts-for-acquisition-of-5-self-storage-properties-1948548
()
Extracting date from https://newslink.reuters.com/article/us-mtg-huya/mtg-eyes-expansion-into-china-as-huya-buys-stake-in-e-sports-firm-esl-idUSKCN1VN0NG
85.71% completed: independent         Extracting date from https://id.

Exception in extractArticlePublishedDate for https://investing.com/certificates/cb-call-102-volkswagen-preferred-st
()
Extracting date from https://investing.com/certificates/ing-call-2048.6233-usd-s-p-500
Exception in extractArticlePublishedDate for https://investing.com/certificates/ing-call-2048.6233-usd-s-p-500
()
Extracting date from https://bbc.com/persian/media/photogalleries
Extracting date from https://uk.investing.com/analysis/uk-manufacturing-activity-falls-to-7year-low-200432350
Exception in extractArticlePublishedDate for https://uk.investing.com/analysis/uk-manufacturing-activity-falls-to-7year-low-200432350
()
Extracting date from https://kr.investing.com/news/economy/article-243009
Exception in extractArticlePublishedDate for https://kr.investing.com/news/economy/article-243009
()
Extracting date from https://sa.investing.com/news/economy/article-1926662
Exception in extractArticlePublishedDate for https://sa.investing.com/news/economy/article-1926662
()
Extracting date

Extracting date from https://bbc.com/hausa/media/photogalleries
Exception in extractArticlePublishedDate for https://hk.investing.com/news/stock-market-news/article-75145
()
Extracting date from https://es.investing.com/news/commodities-news/oil-dips-as-uschina-trade-war-intensifiesfurther-1914678
Exception in extractArticlePublishedDate for https://es.investing.com/news/commodities-news/oil-dips-as-uschina-trade-war-intensifiesfurther-1914678
()
Extracting date from https://mx.investing.com/news/forex-news/alarma-en-el-euro-minimos-de-dos-anos-a-la-espera-de-draghi-y-la-fed-1909645
Extracting date from https://newslink.reuters.com/article/us-global-forex/euro-holds-below-1-10-as-economic-clouds-darken-idUSKCN1VM1PB?il=0
Exception in extractArticlePublishedDate for https://mx.investing.com/news/forex-news/alarma-en-el-euro-minimos-de-dos-anos-a-la-espera-de-draghi-y-la-fed-1909645
()
Extracting date from https://th.investing.com/news/cryptocurrency-news/crypto-mixed-security-issues-fou

Exception in extractArticlePublishedDate for https://fr.investing.com/news/stock-market-news/le-fonds-suedois-eqt-lance-son-projet-dipo-1909059
()
Extracting date from https://hk.investing.com/news/economy/article-75146
Exception in extractArticlePublishedDate for https://hk.investing.com/news/economy/article-75146
()
Extracting date from https://tr.investing.com/news/economic-indicators/turkiye-ekonomisi-2-ceyrekte-15-darald-1905529
Exception in extractArticlePublishedDate for https://tr.investing.com/news/economic-indicators/turkiye-ekonomisi-2-ceyrekte-15-darald-1905529
()
Extracting date from https://bbc.co.uk/news/extra/BoWIe4x0Lj/Ireland_hidden_survivors
Extracting date from https://in.investing.com/news/stock-market-news/update-1ftse-weathers-trade-fears-as-weaker-pound-boosts-exporters-1972421
Exception in extractArticlePublishedDate for https://in.investing.com/news/stock-market-news/update-1ftse-weathers-trade-fears-as-weaker-pound-boosts-exporters-1972421
()
Extracting date 

Exception in extractArticlePublishedDate for https://kr.investing.com/news/cryptocurrency-news/article-242988
()
Extracting date from https://bbc.com/news/uk-49545743
Extracting date from https://investing.com/news/economy/china-downplays-latest-trump-tariffs-with-path-to-talks-unclear-1968215
Exception in extractArticlePublishedDate for https://investing.com/news/economy/china-downplays-latest-trump-tariffs-with-path-to-talks-unclear-1968215
()
Extracting date from https://uk.investing.com/news/stock-market-news/ms-faces-relegation-from-ftse-100-for-first-time-shares-drop-1953689
Exception in extractArticlePublishedDate for https://uk.investing.com/news/stock-market-news/ms-faces-relegation-from-ftse-100-for-first-time-shares-drop-1953689
()
Extracting date from https://nl.investing.com/news/stock-market-news/beursblik-ubs-verlaagt-koersdoel-aegon-131197
Exception in extractArticlePublishedDate for https://nl.investing.com/news/stock-market-news/beursblik-ubs-verlaagt-koersdoel-aegon-

Extracting date from https://ms.investing.com/news/stock-market-news/pasaran-asia-tutup-lebih-rendah-nikkei-turun-041-365512
Exception in extractArticlePublishedDate for https://ms.investing.com/news/stock-market-news/pasaran-asia-tutup-lebih-rendah-nikkei-turun-041-365512
()
Extracting date from https://cn.investing.com/news/stock-market-news/article-1903468
Exception in extractArticlePublishedDate for https://cn.investing.com/news/stock-market-news/article-1903468
()
Extracting date from https://tr.investing.com/news/commodities-news/tekrarkasrga-dorian-bahamalar-vurdu-floridaya-ilerlemesi-bekleniyor-1905555
Exception in extractArticlePublishedDate for https://tr.investing.com/news/commodities-news/tekrarkasrga-dorian-bahamalar-vurdu-floridaya-ilerlemesi-bekleniyor-1905555
()
Extracting date from https://es.investing.com/news/economic-indicators/la-guerra-comercial-castiga-a-las-fabricas-asiaticas-y-ralentiza-la-demanda-1914693
Exception in extractArticlePublishedDate for https://es.

Extracting date from https://bbc.com/news/live/business-49524835
Extracting date from https://bbc.com/burmese/media/audio
Extracting date from https://bbc.co.uk/news/extra/pjfxZM72Gj/house-buyer-time-machine
Extracting date from https://bbc.com/uzbek/media/video
Extracting date from https://bbc.com/news/10628994
Extracting date from https://bbc.com/news/uk-politics-49494795
Extracting date from https://bbc.com/ukrainian/news
Extracting date from https://bbc.com/reel/video/p07lq60x/the-ancient-tombs-kept-under-lock-and-key
Extracting date from https://bbc.com/news/world-africa-49546617
Extracting date from https://bbc.co.uk/programmes/articles/3MlHBKcMc9Jvm4zSgwpwTRr/my-partner-earns-10k-more-but-we-split-everything-50-50
Extracting date from https://bbc.co.uk/news/entertainment-arts-49509496
Extracting date from https://bbc.co.uk/news/world-us-canada-49240582
Extracting date from https://bbc.com/nepali/media/photogalleries
Extracting date from https://bbc.co.uk/bengali/institutional/20

In [152]:
for fname in fnames:
    with open(fname, encoding='UTF-8-sig') as f:
        content = json.load(f)
        if not content['is_valid']:
            print(fname, len(content['text']))

newsdata/downloaded/0139c4c8befa8a22b9626fc8d02cb0e71178520e.json 1593
newsdata/downloaded/01547dabfe1d32eeb9152c76eaaffa752c3c66ff.json 526
newsdata/downloaded/0aad570c5c79b65f41b94a6cc6978e0379b32654.json 1401
newsdata/downloaded/1816db9b8a2b3dd2c46838c495b99a13225d8c34.json 819
newsdata/downloaded/1a65e35c01657b0a26c99c5242701b9de8c3bc8b.json 1160
newsdata/downloaded/203325d820773c707eeaa744d2fac965394c18a0.json 269
newsdata/downloaded/206d5c352b618fbbf944bc577bcb75c98cd0e4a0.json 1520
newsdata/downloaded/296c35673158d88c4120aba21f83e6464222ce94.json 269
newsdata/downloaded/2a5b847d6dbeab28397be48f5c03361243d26b57.json 350
newsdata/downloaded/311e3b633759a6ce09de1f54a576ba4979eade02.json 269
newsdata/downloaded/31ebe9181b19b70087d05b9f1c20528638316c32.json 970
newsdata/downloaded/333b6d16d260e67fb7c424b086a6bc569c6db340.json 269
newsdata/downloaded/3fee25663dc3824ce11a9f528374b29cc82319cf.json 1887
newsdata/downloaded/426169ac5200ed6274c39dcbf236f30d16a1cb9e.json 1665
newsdata/downl

In [122]:
# from urllib.parse import quote_plus
import re

# / ? & % : * " < > | \
delimiters = '/|\?|\&|\%|:|\*|\"|\<|\>|\||\\\\'
basedir = os.path.join(os.getcwd(), 'newsdata')
ext = '.json'

for pub, _urls in tqdm(urls.items()):
    #pubdir = os.path.join(os.getcwd(), 'newsdata', pub)
    
    #if not os.path.isdir(pubdir):
    #    os.makedirs(pubdir)
        
    for _url in _urls:
        fname = _url + ext
        file = os.path.join(basedir, pub, *re.split(delimiters, fname))
        fulldir = os.path.dirname(file)
        #file = os.path.join(pubdir, *re.split(delimiters, fname))
        #file = os.path.join(pubdir, quote_plus(_url) + '.json')
        
        if not os.path.isdir(fulldir):
            os.makedirs(fulldir)
        
        if not os.path.isfile(file):
            with open(file, 'w') as f:
                pass
                #json.dump(list(newsdata), f)

HBox(children=(IntProgress(value=0, max=11), HTML(value='')))




FileNotFoundError: [WinError 3] 지정된 경로를 찾을 수 없습니다: 'C:\\Users\\infomax\\Documents\\GitHub\\protobed\\etc\\newsdata\\cnn\\https\\comparecards.com\\guide\\credit-cards-to-have-you-flying-in-no-time\\esourceid=6317676\\utm_source=cnn\\utm_medium=native\\pla=cnn.com\\bdst=rv\\acqs=prospecting\\utm_campaign=sectionfront\\grp=travel-no-time'

In [81]:
headers = {"Authorization":"sBBqsGXiYgF0Db5OV5tAw-akap7JBapGCe67Tuuz0lhquXd0k1oQC4B5B91GiHcVn2pHZrSf1gT2PUujH1YaQA"}
endpoint = 'https://api.newsriver.io/v2/search?'
qry = 'query=title:"Here are the biggest analyst calls of the day: Monster, JetBlue & more"&sortOrder=DESC&limit=15'
# qry = 'query=language:en AND website.domainName:"wsj.com"&sortOrder=DESC&limit=15'
url = endpoint + qry
response = requests.get(url, headers=headers)
out = response.json();out

[]

In [204]:
%%time
articleDateExtractor.extractArticlePublishedDate('https://labs.time.com/story/watch-how-the-drug-overdose-epidemic-spread-in-america')

Extracting date from https://labs.time.com/story/watch-how-the-drug-overdose-epidemic-spread-in-america
Wall time: 734 ms


In [184]:
title = "Chinese ride-hailing giant Didi plans to launch a robotaxi service in Shanghai"
url = 'https://newsapi.org/v2/everything?qInTitle="{title}"&apiKey=bfb917bca91c435d9c7efd247d95227b&pageSize=100'.format(title=title)
resp = requests.get(url)
resp.json()

{'status': 'ok',
 'totalResults': 1,
 'articles': [{'source': {'id': 'cnbc', 'name': 'CNBC'},
   'author': 'Arjun Kharpal',
   'title': 'Chinese ride-hailing giant Didi plans to launch a robotaxi service in Shanghai',
   'description': 'Ride-hailing giant Didi Chuxing joins a number of other Chinese companies in announcing robotaxi services, as China seeks to race ahead in driverless technology.',
   'url': 'https://www.cnbc.com/2019/08/30/china-uber-didi-plans-to-launch-a-robo-taxi-service-in-shanghai.html',
   'urlToImage': 'https://image.cnbcfm.com/api/v1/image/106104268-1567143051659didishowcasesitsautonomousdrivingfleetatthewaicinshanghai.jpg?v=1567143214',
   'publishedAt': '2019-08-30T06:17:00Z',
   'content': "Didi Chuxing, China's version of Uber, announced plans to launch a robotaxi service in Shanghai where users will be able to hail driverless cars from the app.\r\nThe ride-hailing company will deploy 30 different models of so-called level four autonomous vehicle… [+1165 ch

In [186]:
headers = {"Authorization":"sBBqsGXiYgF0Db5OV5tAw-akap7JBapGCe67Tuuz0lhquXd0k1oQC4B5B91GiHcVn2pHZrSf1gT2PUujH1YaQA"}
endpoint = 'https://api.newsriver.io/v2/search?'
title = "Millennials are making travel a priority more than previous generations—that's not a bad thing"
qry = 'query=title:"{title}"&sortOrder=DESC&limit=15'.format(title=title)
# qry = 'query=language:en AND website.domainName:"wsj.com"&sortOrder=DESC&limit=15'
url = endpoint + qry
response = requests.get(url, headers=headers)
out = response.json();out

[{'id': 'DfQ6U8fs2HIZffk3CxEdt6U1fD1TUdvlbXHYfsdTx8wOZQZmsl1EA8hfWaeO1JjQFTg7hl-dnHe9D2QyrQqT5w',
  'publishDate': '2019-08-30T12:30:34',
  'discoverDate': '2019-08-30T12:51:36.329+0000',
  'title': "Millennials are making travel a priority more than previous generations—that's not a bad thing",
  'language': 'en',
  'text': 'Your Instagram feed is probably littered with photos from your friends on vacation sharing breathtaking views of exotic beaches or snapshots of once-in-a-lifetime meals. And if you think that the sheer number of these vacation photos is growing, you\'re not wrong. That\'s because millennials, more than previous generations, are making travel a priority. In 2019, the average millennial (ages 21 to 37) plans on taking roughly five trips throughout the year, three of which are expected to be international, according to AARP\'s 2019 Travel Trends report. That\'s more international trips than Gen X (ages 38 to 53) and more overall trips than Baby Boomers (ages 54 to 72

In [160]:
0 not in [1,2,3]

True

In [174]:
set.union(*[{12,3}, {4,3}])

{3, 4, 12}

In [188]:
str(pd.Timestamp.utcnow())

'2019-09-02 05:47:14.689847+00:00'