In [86]:
import newspaper
from newspaper import Article, Config
from newsplease import NewsPlease
from IPython.core.debugger import set_trace
import extraction
import requests
import json
import time
import hashlib
import glob
import pandas as pd
import asyncio
from functools import partial
from pathlib import Path
from langdetect import detect
from pubtime_extractor import extractArticlePublishedDate

# Suppress UnknownTimezoneWarning
import warnings
from dateutil.parser import UnknownTimezoneWarning
warnings.filterwarnings('ignore', category=UnknownTimezoneWarning)

from tqdm.auto import tqdm
tqdm.pandas()

import os
import sys
parentdir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parentdir)

In [87]:
# Publications to crawl: short key -> front-page URL ('domain') and display name ('pubname').
# Trailing comments record per-site URL quirks noted by the author.
src = {
    'huffpost':         {'domain':'https://huffpost.com',              'pubname':'HuffPost'}, 
    'cnn':              {'domain':'https://cnn.com',                   'pubname':'CNN'}, # ?=
    'investingcom':     {'domain':'https://investing.com',             'pubname':'Investing.com'}, 
    'politico':         {'domain':'https://politico.com',              'pubname':'POLITICO'}, 
    'time':             {'domain':'https://time.com',                  'pubname':'TIME'}, 
    'cnbc':             {'domain':'https://cnbc.com',                  'pubname':'CNBC'}, 
    'foxnews':          {'domain':'https://foxnews.com',               'pubname':'FOX News'}, 
    'foxbusiness':      {'domain':'https://foxbusiness.com',           'pubname':'FOX Business'}, 
    'bbc':              {'domain':'https://bbc.com',                   'pubname':'BBC'}, 
    'businessinsider':  {'domain':'https://businessinsider.com',       'pubname':'Business Insider'}, 
    'morningstar':      {'domain':'https://morningstar.com',           'pubname':'Morningstar'}, 
    'wsj':              {'domain':'https://wsj.com',                   'pubname':'Wall Street Journal'}, # ?mod=rsswn
    
    'nyt':              {'domain':'https://nytimes.com',               'pubname':'NewYork Times'}, 
    'guardian':         {'domain':'https://theguardian.com',           'pubname':'Guardian'}, 
    'reuters':          {'domain':'https://reuters.com',               'pubname':'Reuters'}, # '?=' query strings are unnecessary; severe
    'washingtontimes':  {'domain':'https://washingtontimes.com',       'pubname':'Washington Times'}, 
    'washingtonpost':   {'domain':'https://washingtonpost.com',        'pubname':'Washington Post'}, # ?=
    'cbs':              {'domain':'https://cbsnews.com',               'pubname':'CBS'}, 
    'marketwatch':      {'domain':'https://marketwatch.com',           'pubname':'MarketWatch'}, # '?=' query strings: severe
    'atlantic':         {'domain':'https://theatlantic.com',           'pubname':'Atlantic'}, 
    'vice':             {'domain':'https://vice.com',                  'pubname':'VICE'}, 
    'npr':              {'domain':'https://npr.org',                   'pubname':'npr'}, 
    'newrepublic':      {'domain':'https://newrepublic.com',           'pubname':'NEW REPUBLIC'}, 
    'yahoo':            {'domain':'https://yahoo.com',                 'pubname':'yahoo'}, 
    'independent':      {'domain':'https://independent.co.uk',         'pubname':'INDEPENDENT'}, 
    'heritage':         {'domain':'https://heritage.org',              'pubname':'Heritage'}, 
    'zdnet':            {'domain':'https://www.zdnet.com',             'pubname':'ZDNet'}, # the 'www' prefix is required
    'townhall':         {'domain':'https://townhall.com',              'pubname':'Townhall'}, 
    'abcnews':          {'domain':'https://abcnews.go.com',            'pubname':'ABC News'}, # many '?=' query strings, but they are required
    'hotair':           {'domain':'https://hotair.com',                'pubname':'HOT AIR'}, 
    'cbc':              {'domain':'https://cbc.ca',                    'pubname':'CBC'}, 
    'nymag':            {'domain':'https://nymag.com',                 'pubname':'NewYork Magazine'}, 
    'thestreet':        {'domain':'https://www.thestreet.com',         'pubname':'TheStreet'}, # the 'www' prefix is required
    'thinkprogress':    {'domain':'https://thinkprogress.org',         'pubname':'ThinkProgress'}, 
    'dailybeast':       {'domain':'https://thedailybeast.com',         'pubname':'DAILY BEAST'}, 
    'realclearpolitics':{'domain':'https://www.realclearpolitics.com', 'pubname':'RealClear Politics'}, # the 'www' prefix is required
    
    # Disabled sources (kept for reference):
    #'forbes':          'https://forbes.com', 
    #'hbr':             'https://hbr.org', 
    #'ft':              'https://ft.com', 
    #'economist':       'https://economist.com', 
}

In [88]:
def _config():
    """Build the newspaper ``Config`` used by the crawling helpers.

    Returns:
        A ``Config`` with image fetching and article memoization turned off,
        a 10 second request timeout, and the language set to 'en'.
    """
    settings = (
        ('fetch_images', False),
        ('memoize_articles', False),
        ('request_timeout', 10),
        ('language', 'en'),
    )

    cfg = Config()
    for attr, value in settings:
        setattr(cfg, attr, value)
    return cfg

In [89]:
def clean_url(pub, url):
    """Canonicalise an article URL so one article maps to one string.

    The result is what gets hashed to name the stored JSON file, so the
    transformation order below is kept stable.

    Args:
        pub: publication key (e.g. 'cnn', 'wsj'); selects site-specific rules.
        url: raw article URL.

    Returns:
        The cleaned URL. May be the empty string when the input is empty or
        consists only of a fragment (e.g. '#top').
    """
    url = url.replace('http://', 'https://')

    # Drop the fragment, then anything after an embedded newline.
    url = url.split('#', 1)[0]
    url = url.split('\n', 1)[0]

    if pub != 'zdnet':
        # zdnet appears to require the 'www.' prefix (2019.08.30)
        # NOTE(review): the `src` table also marks thestreet and
        # realclearpolitics as requiring 'www' -- confirm whether they
        # should be exempt here too (changing it alters stored URL hashes).
        url = url.replace('https://www.', 'https://')

    # thinkprogress appears to require the trailing '/' (2019.09.04).
    # endswith() is used instead of url[-1] so an empty URL cannot raise
    # IndexError.
    if pub != 'thinkprogress' and url.endswith('/'):
        url = url[:-1]

    if pub == 'reuters':
        # reuters URLs often carry a meaningless '?il=0' suffix; its purpose
        # is unknown (2019.09.04)
        url = url.split('?il=0', 1)[0]

    if pub == 'marketwatch':
        url = url.split('?mod=', 1)[0]

    if pub == 'wsj':
        # Replace any '?mod=...' with '?mod=rsswn' to get past the wsj paywall.
        url = url.split('?mod=', 1)[0]
        url += '?mod=rsswn'

    return url

In [90]:
class Progressor:
    """In-place progress line: percentage done and elapsed seconds.

    Each call to ``stamp`` counts one finished unit of work and reprints the
    status on the same console line (the text starts with a carriage return).
    """

    def __init__(self, ntotal, formater_suffix=None):
        """Start the clock.

        Args:
            ntotal: number of ``stamp`` calls that corresponds to 100%.
            formater_suffix: optional ``str.format`` template appended after
                ': '; its fields are filled from the keyword arguments that
                are passed to ``stamp``.
        """
        template = '\r{pct:.2f}% ({timestamp:.2f} seconds)'
        if formater_suffix:
            template = template + ': ' + formater_suffix

        self.formater = template
        self.n_total = ntotal
        self.n_progressed = 0
        self.start = time.time()

    def stamp(self, **vargs):
        """Record one completed unit and print the refreshed status line."""
        self.n_progressed += 1
        elapsed = time.time() - self.start
        pct = self.n_progressed / self.n_total * 100
        line = self.formater.format(pct=pct, timestamp=elapsed, **vargs)
        print(line, end='')

In [91]:
def collect_urls(src):
    """Crawl every publication's front page and collect cleaned article URLs.

    Each publication is built with ``newspaper.build`` in the default
    thread-pool executor, so the sites are crawled concurrently.

    Args:
        src: mapping of publication key -> dict containing a 'domain' URL.

    Returns:
        dict mapping publication key -> set of URLs passed through
        ``clean_url``. A publication whose crawl raises contributes an empty
        set, so one bad site no longer discards all the others. Returns None
        only if running the event loop itself fails.
    """
    prg = Progressor(len(src), formater_suffix='URLs collecting... {pub:<20}')
    build_source = partial(newspaper.build, config=_config())

    async def geturls(pub, domain):
        # `loop` is the event loop created further down; it is bound by the
        # time this coroutine actually runs.
        try:
            resp = await loop.run_in_executor(None, build_source, domain)
            urls = {clean_url(pub, article.url) for article in resp.articles}

        except Exception as ex:
            # Isolate the failure to this publication.
            print('\n{pub}: URL collection failed ({ex!r})'.format(pub=pub, ex=ex))
            urls = set()

        prg.stamp(pub=pub)
        return pub, urls

    async def main():
        fts = [asyncio.ensure_future(geturls(pub, val['domain'])) for pub, val in src.items()]
        return await asyncio.gather(*fts)

    result = None
    asyncio.set_event_loop(asyncio.new_event_loop())
    loop = asyncio.get_event_loop()

    try:
        # Running this inside Jupyter requires downgrading tornado:
        # pip install tornado==4.5.3
        result = loop.run_until_complete(main())
        result = dict(result)

    except Exception as ex:
        print(ex)

    finally:
        loop.close()

    return result

In [111]:
# Crawl all configured publications; `urls` maps publication key -> set of article URLs.
urls = collect_urls(src)

100.00% (43.12 seconds): URLs collecting... investingcom        

In [112]:
# One row per publication with its URL count; the single column is labelled
# with the number of distinct URLs across all publications.
#print(len(set.union(*urls.values())))
ddff = pd.DataFrame.from_dict({pub:len(_urls) for pub, _urls in urls.items()}, orient='index', columns=[len(set.union(*urls.values()))]); ddff

Unnamed: 0,10867
huffpost,62
cnn,828
investingcom,1021
politico,227
time,14
cnbc,163
foxnews,212
foxbusiness,164
bbc,272
businessinsider,553


In [113]:
def get_publish_time(article):
    """Return the article's publication time as a UTC ``pd.Timestamp``.

    Sources are tried in order, moving on only while the value is None:
    ``article.publish_date``, then ``extractArticlePublishedDate(url)``, then
    ``NewsPlease.from_url(url).date_publish``. The two fallbacks receive the
    article URL and are only called when needed.

    Args:
        article: object exposing ``publish_date`` and ``url``.

    Returns:
        A timezone-aware UTC ``pd.Timestamp`` (naive values are treated as
        already being UTC), or None when no source yields a date or the value
        cannot be converted.
    """

    def _from_extractor(url):
        # Best effort: any extractor failure simply means "no date".
        try:
            return extractArticlePublishedDate(url)
        except Exception:
            return None

    def _from_newsplease(url):
        # Best effort: any NewsPlease failure simply means "no date".
        try:
            return NewsPlease.from_url(url).date_publish
        except Exception:
            return None

    def _timize(value):
        # Normalise to UTC; unparseable values are reported as None.
        if value is None:
            return None

        try:
            stamp = pd.Timestamp(value)
            if stamp.tz is None:
                return stamp.tz_localize('utc')
            return stamp.tz_convert('utc')

        except Exception:
            return None

    pubtime = article.publish_date

    if pubtime is None:
        pubtime = _from_extractor(article.url)

    if pubtime is None:
        pubtime = _from_newsplease(article.url)

    return _timize(pubtime)

In [114]:
def select(urls):
    """Keep only the URLs that have no JSON file on disk yet.

    A URL counts as already handled when a file named after the SHA-1 hex
    digest of the URL exists under ``newsdata/saved/<first hex char>/``,
    ``newsdata/downloaded/`` or ``newsdata/trashed/<first hex char>/``
    (relative to the current working directory).

    Args:
        urls: mapping of publication key -> iterable of URLs.

    Returns:
        dict mapping every publication key -> set of URLs not found on disk.
    """
    prg = Progressor(len(urls), formater_suffix='URLs selecting... {pub:<20}')
    root = os.path.join(os.getcwd(), 'newsdata')
    suffix = '.json'

    def _already_stored(link):
        digest = hashlib.sha1(link.encode('utf-8')).hexdigest()
        name = digest + suffix
        candidates = (
            os.path.join(root, 'saved', digest[0], name),
            os.path.join(root, 'downloaded', name),
            os.path.join(root, 'trashed', digest[0], name),
        )
        return any(os.path.isfile(path) for path in candidates)

    fresh = {}
    for pub, links in urls.items():
        fresh[pub] = {link for link in links if not _already_stored(link)}
        prg.stamp(pub=pub)

    return fresh

In [115]:
# Drop URLs that already have a file under newsdata/; the trailing ';' suppresses the cell output.
urls_selected = select(urls); urls_selected;

100.00% (1.24 seconds): URLs selecting... realclearpolitics   

In [116]:
# Number of distinct URLs left to download across all publications.
len(set.union(*urls_selected.values()))

438

In [117]:
def download(urls):
    """Download, parse and store every article in ``urls`` as a JSON file.

    Publications are processed concurrently (one coroutine each); within a
    publication the URLs are handled one after another, with the blocking
    work pushed to the default thread-pool executor.

    Each URL produces exactly one file named ``<sha1(url)>.json``:

    * ``newsdata/downloaded/`` -- non-empty English text with a publication
      time that is not flagged as too short; holds the full record.
    * ``newsdata/trashed/<first hex char>/`` -- everything else, including
      URLs whose processing raised. Those records carry an extra 'error' key
      with the exception repr so the reason is not lost.

    Args:
        urls: mapping of publication key -> set of URLs to download.

    Returns:
        None.
    """
    # Work is done per (publication, url) pair, so count pairs. Counting the
    # union of all URLs undercounts when a URL appears under several
    # publications (progress went above 100%) and raises on an empty mapping.
    n_jobs = sum(len(_urls) for _urls in urls.values())
    prg = Progressor(n_jobs, formater_suffix='downloading... {pub:<20}')
    basedir = os.path.join(os.getcwd(), 'newsdata')
    ext = '.json'
    newspaper_config = _config()
    placeholder_titles = ('', '-', None)  # '': seen on cbc, '-': seen on townhall

    def trash_path(hash_url):
        return os.path.join(basedir, 'trashed', hash_url[0], hash_url + ext)

    def detect_lang(article):
        # Prefer the language declared by the page; otherwise detect it from the text.
        lang = article.meta_lang
        if lang == '':
            return detect(article.text)
        return lang

    def get_article(url):
        article = Article(url, config=newspaper_config)
        article.download()
        article.parse()
        return article

    def get_title(article, pub):
        # Fallback chain: parsed title -> title extracted from the raw HTML
        # -> meta description -> publication key.
        if article.title not in placeholder_titles:
            return article.title

        # Use the same timeout as newspaper so a stalled server cannot hang the run.
        html = requests.get(article.url, timeout=newspaper_config.request_timeout).text
        extracted_title = extraction.Extractor().extract(html, source_url=article.url).title

        if extracted_title not in placeholder_titles:
            return extracted_title

        # The original read `article.description` / `article.pub`; the
        # attribute used elsewhere in this function is `meta_description`,
        # and the publication key is only known to the caller.
        return article.meta_description or pub

    async def _download(pub, _urls):
        for url in _urls:
            hash_url = hashlib.sha1(url.encode('utf-8')).hexdigest()
            downloaded_at = pd.Timestamp.utcnow()

            content = {
                'pub': pub,
                'url': url,
                'downloaded_at': str(downloaded_at)
            }

            try:
                article = await loop.run_in_executor(None, get_article, url)

                text = article.text
                language = detect_lang(article)
                published_at = await loop.run_in_executor(None, get_publish_time, article)
                is_too_short = (not article.is_valid_body()) and (len(article.text) < 500)

                content['title'] = await loop.run_in_executor(None, get_title, article, pub)
                content['language'] = language

                if text == '' or published_at is None or is_too_short or language != 'en':
                    file = trash_path(hash_url)

                else:
                    file = os.path.join(basedir, 'downloaded', hash_url + ext)
                    content['text'] = text
                    content['description'] = article.meta_description
                    content['authors'] = article.authors
                    # A '.ico' top image is treated as "no image".
                    content['top_image'] = article.top_image if article.top_image.split('.')[-1] != 'ico' else ''
                    # A publication time in the future is clamped to the download date.
                    content['published_at'] = str(published_at.date()) if published_at <= downloaded_at else str(downloaded_at.date())

            except Exception as ex:
                # Best effort: a failing article is trashed, not fatal.
                # Note that `select` skips trashed URLs on later runs.
                content['error'] = repr(ex)
                file = trash_path(hash_url)

            os.makedirs(os.path.dirname(file), exist_ok=True)
            with open(file, 'w') as f:
                json.dump(content, f)

            prg.stamp(pub=pub)

    async def main():
        fts = [asyncio.ensure_future(_download(pub, _urls)) for pub, _urls in urls.items()]
        await asyncio.gather(*fts)

    asyncio.set_event_loop(asyncio.new_event_loop())
    loop = asyncio.get_event_loop()

    try:
        loop.run_until_complete(main())

    except Exception as ex:
        # Still non-fatal, but report it instead of discarding it silently.
        print('\ndownload aborted: {ex!r}'.format(ex=ex))

    finally:
        loop.close()

In [118]:
# Fetch and store every newly selected URL (writes files under newsdata/).
download(urls_selected)

100.00% (583.91 seconds): downloading... investingcom        

In [81]:
# Look up an article by exact title on the Newsriver search API.
# The API token is read from the environment so it is never stored in the
# notebook; set NEWSRIVER_API_TOKEN before running this cell.
headers = {"Authorization": os.environ['NEWSRIVER_API_TOKEN']}
endpoint = 'https://api.newsriver.io/v2/search?'
qry = 'query=title:"Here are the biggest analyst calls of the day: Monster, JetBlue & more"&sortOrder=DESC&limit=15'
# qry = 'query=language:en AND website.domainName:"wsj.com"&sortOrder=DESC&limit=15'
url = endpoint + qry
response = requests.get(url, headers=headers)
out = response.json()
out

[]

In [204]:
%%time
articleDateExtractor.extractArticlePublishedDate('https://labs.time.com/story/watch-how-the-drug-overdose-epidemic-spread-in-america')

Extracting date from https://labs.time.com/story/watch-how-the-drug-overdose-epidemic-spread-in-america
Wall time: 734 ms


In [184]:
# Look up an article by exact title on NewsAPI.
# The API key is read from the environment so it is never stored in the
# notebook; set NEWSAPI_KEY before running this cell.
title = "Chinese ride-hailing giant Didi plans to launch a robotaxi service in Shanghai"
url = 'https://newsapi.org/v2/everything?qInTitle="{title}"&apiKey={key}&pageSize=100'.format(title=title, key=os.environ['NEWSAPI_KEY'])
resp = requests.get(url)
resp.json()

{'status': 'ok',
 'totalResults': 1,
 'articles': [{'source': {'id': 'cnbc', 'name': 'CNBC'},
   'author': 'Arjun Kharpal',
   'title': 'Chinese ride-hailing giant Didi plans to launch a robotaxi service in Shanghai',
   'description': 'Ride-hailing giant Didi Chuxing joins a number of other Chinese companies in announcing robotaxi services, as China seeks to race ahead in driverless technology.',
   'url': 'https://www.cnbc.com/2019/08/30/china-uber-didi-plans-to-launch-a-robo-taxi-service-in-shanghai.html',
   'urlToImage': 'https://image.cnbcfm.com/api/v1/image/106104268-1567143051659didishowcasesitsautonomousdrivingfleetatthewaicinshanghai.jpg?v=1567143214',
   'publishedAt': '2019-08-30T06:17:00Z',
   'content': "Didi Chuxing, China's version of Uber, announced plans to launch a robotaxi service in Shanghai where users will be able to hail driverless cars from the app.\r\nThe ride-hailing company will deploy 30 different models of so-called level four autonomous vehicle… [+1165 ch

In [186]:
# Look up an article by exact title on the Newsriver search API.
# The API token is read from the environment so it is never stored in the
# notebook; set NEWSRIVER_API_TOKEN before running this cell.
headers = {"Authorization": os.environ['NEWSRIVER_API_TOKEN']}
endpoint = 'https://api.newsriver.io/v2/search?'
title = "Millennials are making travel a priority more than previous generations—that's not a bad thing"
qry = 'query=title:"{title}"&sortOrder=DESC&limit=15'.format(title=title)
# qry = 'query=language:en AND website.domainName:"wsj.com"&sortOrder=DESC&limit=15'
url = endpoint + qry
response = requests.get(url, headers=headers)
out = response.json()
out

[{'id': 'DfQ6U8fs2HIZffk3CxEdt6U1fD1TUdvlbXHYfsdTx8wOZQZmsl1EA8hfWaeO1JjQFTg7hl-dnHe9D2QyrQqT5w',
  'publishDate': '2019-08-30T12:30:34',
  'discoverDate': '2019-08-30T12:51:36.329+0000',
  'title': "Millennials are making travel a priority more than previous generations—that's not a bad thing",
  'language': 'en',
  'text': 'Your Instagram feed is probably littered with photos from your friends on vacation sharing breathtaking views of exotic beaches or snapshots of once-in-a-lifetime meals. And if you think that the sheer number of these vacation photos is growing, you\'re not wrong. That\'s because millennials, more than previous generations, are making travel a priority. In 2019, the average millennial (ages 21 to 37) plans on taking roughly five trips throughout the year, three of which are expected to be international, according to AARP\'s 2019 Travel Trends report. That\'s more international trips than Gen X (ages 38 to 53) and more overall trips than Baby Boomers (ages 54 to 72

In [79]:
# Scan the downloaded articles for placeholder titles ('' or '-') and collect
# a replacement for each in `dummy`, keyed by the SHA-1 hex digest of the URL.
fnames = glob.glob('newsdata/downloaded/*.json')
dummy = {}

for fname in tqdm(fnames):
    
    with open(fname, encoding='UTF-8-sig') as f:
        content = json.load(f)
        title = content['title']
        url = content['url']
        description = content['description']
        hash_url = hashlib.sha1(url.encode('utf-8')).hexdigest()

        if title=='' or title=='-':
            # Re-fetch the page and try to extract a title from the raw HTML.
            html = requests.get(url).text
            extracted_title = extraction.Extractor().extract(html, source_url=url).title
            
            if extracted_title=='' or extracted_title=='-':
                # Fall back to the stored description; if that is empty too,
                # print the URL and record nothing for it.
                if description!='':
                    dummy[hash_url] = description
                else:
                    print(url)
                    
            else:
                dummy[hash_url] = extracted_title

HBox(children=(IntProgress(value=0, max=12783), HTML(value='')))




In [124]:
# Find downloaded articles whose 'top_image' ends in '.ico'.
# Their file paths are collected in `dummy`, which the following cells count
# and rewrite; previously the list was created but never filled.
fnames = glob.glob('newsdata/downloaded/*.json')
dummy = []

for fname in tqdm(fnames):

    with open(fname, encoding='UTF-8-sig') as f:
        content = json.load(f)

    url = content['url']
    if content['top_image'].split('.')[-1]=='ico':
        print(url)
        dummy.append(fname)

HBox(children=(IntProgress(value=0, max=13368), HTML(value='')))




In [122]:
# Number of entries collected in `dummy`.
len(dummy)

286

In [123]:
# Blank out 'top_image' in every file listed in `dummy`.
for file in dummy:
    with open(file, 'r') as f:
        data = json.load(f)

    data['top_image'] = ''

    # Serialise first, then overwrite in place. Opening with 'w' already
    # truncates, so deleting the file beforehand is unnecessary and would
    # lose the record if the write then failed.
    payload = json.dumps(data)
    with open(file, 'w') as f:
        f.write(payload)

In [159]:
# Paths of all stored article files (relative to the working directory).
fnames = glob.glob('newsdata/downloaded/*.json')

In [160]:
# Collect the files whose stored URL still contains a '#' or a newline.
fnames_update = []
for fname in tqdm(fnames):
    js = json.loads(Path(fname).read_text())
    url = js['url']
    if ('#' in url) or ('\n' in url):
        fnames_update.append(fname)

HBox(children=(IntProgress(value=0, max=75264), HTML(value='')))

In [161]:
# Show the files that still need their URL fixed.
fnames_update

[]

In [86]:
def fix_url(url):
    """Cut a URL at the first '/#', then the first '#', then the first newline.

    Cutting at '/#' first also removes the slash that precedes a fragment.

    Args:
        url: URL string.

    Returns:
        The truncated URL; unchanged if none of the markers occur.
    """
    # str.partition returns the whole string when the separator is absent,
    # so no exception handling is needed to express "cut if present".
    for marker in ('/#', '#', '\n'):
        url = url.partition(marker)[0]

    return url

In [103]:
# Build the corrected records: fix each URL, recompute its hash, and key the
# record by the new target path under newsdata/downloaded/.
# Records whose fixed URLs are identical share a target path, so a later one
# replaces an earlier one in `tmp`.
tmp = {}
basedir = os.path.join(os.getcwd(), 'newsdata')

for fname in tqdm(fnames_update[:]):
    js = json.loads(Path(fname).read_text())
    url = fix_url(js['url'])
    js['url'] = url
    
    hash_url = hashlib.sha1(url.encode('utf-8')).hexdigest()
    file = os.path.join(basedir, 'downloaded', hash_url + '.json')    
    tmp[file] = js

HBox(children=(IntProgress(value=0, max=755), HTML(value='')))

In [132]:
# Number of distinct target files after fixing the URLs.
len(tmp)

723

In [125]:
# Write each corrected record to its new path (an existing file at that path is overwritten).
for file, js in tmp.items():
    with open(file, 'w') as f:
        json.dump(js, f)

In [130]:
# Number of source files that had a '#' or newline in their URL.
len(fnames_update)

755

In [133]:
# Delete the original files that were keyed by the uncorrected URLs.
# NOTE(review): run only after the corrected records have been written.
for fname in fnames_update:
    os.remove(fname)

In [145]:
# Number of paths held in `fnames`.
len(fnames)

74865