In [175]:
import feedparser as fp
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime
import pandas as pd

# Set the limit for number of articles to download
LIMIT = 5000

data = {}
data['newspapers'] = {}

# Loads the JSON files with news sites
with open('NewsPapers.json') as data_file:
    companies = json.load(data_file)

count = 1

# Iterate through each news company
for company, value in companies.items():
    # If a RSS link is provided in the JSON file, this will be the first choice.
    # Reason for this is that, RSS feeds often give more consistent and correct data.
    # If you do not want to scrape from the RSS-feed, just leave the RSS attr empty in the JSON file.
    if 'rss' in value:
        d = fp.parse(value['rss'])
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": value['rss'],
            "link": value['link'],
            "articles": []
        }
        for entry in d.entries:
            # Check if publish date is provided, if no the article is skipped.
            # This is done to keep consistency in the data and to keep the script from crashing.
            if hasattr(entry, 'published'):
                if count > LIMIT:
                    break
                article = {}
                article['link'] = entry.link
                date = entry.published_parsed
                article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
                try:
                    content = Article(entry.link)
                    content.download()
                    content.parse()
                except Exception as e:
                    # If the download for some reason fails (ex. 404) the script will continue downloading
                    # the next article.
                    print(e)
                    print("continuing...")
                    continue
                article['title'] = content.title
                article['text'] = content.text
                newsPaper['articles'].append(article)
                print(count, "articles downloaded from", company, ", url: ", entry.link)
                count = count + 1
    else:
        # This is the fallback method if a RSS-feed link is not provided.
        # It uses the python newspaper library to extract articles
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        noneTypeCount = 0
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                print(e)
                print("continuing...")
                continue
            # Again, for consistency, if there is no found publish date the article will be skipped.
            # After 100 downloaded articles from the same newspaper without publish date, the company will be skipped.
            if content.publish_date is None:
                print(count, " Article has date of type None...")
                noneTypeCount = noneTypeCount + 1
                if noneTypeCount > 100:
                    print("Too many noneType dates, aborting...")
                    noneTypeCount = 0
                    break
                count = count + 1
                continue
            article = {}
            article['title'] = content.title
            article['text'] = content.text
            article['link'] = content.url
            article['published'] = content.publish_date.isoformat()
            newsPaper['articles'].append(article)
            print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
            count = count + 1
            noneTypeCount = 0
    count = 1
    data['newspapers'][company] = newsPaper

# Finally it saves the articles as a JSON-file.
try:
    with open('scraped_articles.json', 'w') as outfile:
        json.dump(data, outfile)
except Exception as e: print(e)


Building site for  yahoo finance
1 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/video/eurasia-group-impeachment-may-cause-203414770.html
2 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/news/micron-earnings-what-to-know-in-markets-thursday-000457651.html
3 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/news/trump-impeachment-inquiry-could-kill-the-usmca-233627048.html
4 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/news/one-way-trump-could-send-stocks-soaring-233628338.html
5 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/news/blackberry-plunges-below-devilish-6-135130404.html
6 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/news/fairfax-watsa-lost-121-million-164238512.html
7 articles downloaded from yahoo finance  using newspaper, ur

55 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/news/fitbit-versa-2-review-130030290.html
56 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/news/sonos-ceo-investors-are-underestimating-the-power-of-our-new-speakers-152630312.html
57 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/news/sonos-move-portable-speaker-130025107.html
58 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/news/slacks-first-quarterly-earnings-report-after-ipo-190935065.html
59 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/news/roku-smart-soundbar-subwoofer-speaker-130147197.html
60 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/calendar/earnings?day=2019-09-26
61 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/calendar/splits?

112 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/video/author-actress-jill-kargman-money-200000592.html
113 articles downloaded from yahoo finance  using newspaper, url:  https://finance.yahoo.com/video/trumps-risky-tax-hike-213310868.html
Downloading articles from  globenewswire
Downloading articles from  cnn
Downloading articles from  google
1 articles downloaded from google , url:  https://www.nytimes.com/2019/09/25/us/politics/trump-ukraine-whistleblower.html
2 articles downloaded from google , url:  https://www.cnn.com/2019/09/25/politics/mike-quigley-trump-whistleblower-complaint-cnntv/index.html
3 articles downloaded from google , url:  https://www.politico.com/news/2019/09/25/mcconnell-ukraine-trump-001034
4 articles downloaded from google , url:  https://www.npr.org/2019/09/25/764453663/pentagon-letter-undercuts-trump-assertion-on-delaying-aid-to-ukraine-over-corrup
5 articles downloaded from google , url:  https://www.politico.com/s

15 articles downloaded from theguardian , url:  https://www.theguardian.com/us-news/2019/sep/25/stanford-sexual-assault-victim-chanel-miller-interview
16 articles downloaded from theguardian , url:  https://www.theguardian.com/stage/2019/sep/26/glass-kill-bluebeard-imp-review-finding-fascination-in-bloodshot-fables
17 articles downloaded from theguardian , url:  https://www.theguardian.com/politics/2019/sep/25/incredible-sulks-anger-is-followed-by-ranting-of-geoffrey-cox
18 articles downloaded from theguardian , url:  https://www.theguardian.com/politics/2019/sep/25/labour-media-tactics-more-trumpian-admit-jeremy-corbyn
19 articles downloaded from theguardian , url:  https://www.theguardian.com/stage/2019/sep/25/two-ladies-review-zoe-wanamaker-bridge-theatre-london-nancy-harris-nicholas-hytner
20 articles downloaded from theguardian , url:  https://www.theguardian.com/politics/audio/2019/sep/26/can-labour-unite-and-plot-a-path-to-power
21 articles downloaded from theguardian , url:  ht

67 articles downloaded from theguardian , url:  https://www.theguardian.com/music/2019/sep/25/bowie-gnome-and-dr-dre-hospital-musicians-embarrassing-early-songs
68 articles downloaded from theguardian , url:  https://www.theguardian.com/film/2019/sep/25/to-tokyo-review-caspar-seale-jones-florence-kosky
69 articles downloaded from theguardian , url:  https://www.theguardian.com/music/2019/sep/25/louis-tomlinson-one-direction-dark-side-gives-me-strength
70 articles downloaded from theguardian , url:  https://www.theguardian.com/live-victoriously/2019/sep/20/early-afternoon-delight-the-boundless-appeal-of-brunching
71 articles downloaded from theguardian , url:  https://www.theguardian.com/live-victoriously/2019/sep/17/neil-rankin-if-you-keep-your-food-and-drink-simple-the-night-is-more-sociable
72 articles downloaded from theguardian , url:  https://www.theguardian.com/live-victoriously/2019/sep/18/jessie-ware-i-feel-much-more-comfortable-in-my-30s-im-living-victoriously
73 articles down

91  Article has date of type None...
92  Article has date of type None...
93  Article has date of type None...
94  Article has date of type None...
95  Article has date of type None...
96  Article has date of type None...
97  Article has date of type None...
98  Article has date of type None...
99  Article has date of type None...
100  Article has date of type None...
101  Article has date of type None...
102  Article has date of type None...
103  Article has date of type None...
104  Article has date of type None...
105  Article has date of type None...
106  Article has date of type None...
107  Article has date of type None...
108  Article has date of type None...
109  Article has date of type None...
110  Article has date of type None...
111  Article has date of type None...
112  Article has date of type None...
113  Article has date of type None...
114  Article has date of type None...
115  Article has date of type None...
116  Article has date of type None...
117  Article has date

48 articles downloaded from nbcnews  using newspaper, url:  https://www.nbcnews.com/news/us-news/wisconsin-woman-accused-hiding-mom-s-corpse-cash-checks-n1058796
49 articles downloaded from nbcnews  using newspaper, url:  https://www.nbcnews.com/news/us-news/florida-grandmother-accused-fatally-drugging-disabled-grandson-n1058711
50 articles downloaded from nbcnews  using newspaper, url:  https://www.nbcnews.com/news/us-news/man-arrested-after-video-shows-attack-los-angeles-real-estate-n1058526
51 articles downloaded from nbcnews  using newspaper, url:  https://www.nbcnews.com/news/us-news/iowa-football-fan-carson-king-whose-viral-sign-raised-1m-n1058601
52 articles downloaded from nbcnews  using newspaper, url:  https://www.nbcnews.com/tech/social-media/how-anti-vaxxers-target-grieving-moms-turn-them-crusaders-n1057566
53 articles downloaded from nbcnews  using newspaper, url:  https://www.nbcnews.com/think/opinion/trump-s-impeachment-was-once-politically-risky-democrats-not-anymore-nc

14 articles downloaded from washingtonpost , url:  https://www.washingtonpost.com/local/weather/capital-weather-gang/three-key-takeaways-from-the-new-un-climate-report-on-the-earths-oceans-glaciers-and-ice-sheets/2019/09/25/d31244c0-ce03-4af8-af1f-1366c4b728ad_story.html
15 articles downloaded from washingtonpost , url:  https://www.washingtonpost.com/world/trump-impeachment-proceedings-trigger-caution-and-a-little-schadenfreude-overseas/2019/09/25/77b2aa08-df49-11e9-be7f-4cc85017c36f_story.html
16 articles downloaded from washingtonpost , url:  https://www.washingtonpost.com/politics/monkey-cage/saudi-uae-twitter-takedowns-wont-curb-rampant-disinformation-on-arab-twitter/2019/09/25/0c5d2012-26fb-4d35-82fa-4f33f5b56d04_story.html
17 articles downloaded from washingtonpost , url:  https://www.washingtonpost.com/national/morning-mix/a-college-student-died-in-his-dorm-room-it-took-almost-two-months-for-anyone-to-notice/2019/09/25/2679f548-8ea0-4124-98b5-f81fc4c384cf_story.html
18 articles

47 articles downloaded from bbc , url:  https://www.bbc.co.uk/news/world-us-canada-49800181
48 articles downloaded from bbc , url:  https://www.bbc.co.uk/sport/football/49736856
49 articles downloaded from bbc , url:  https://www.bbc.co.uk/sport/football/49834154
50 articles downloaded from bbc , url:  https://www.bbc.co.uk/sport/football/49736850
51 articles downloaded from bbc , url:  https://www.bbc.co.uk/sport/tennis/49823518
52 articles downloaded from bbc , url:  https://www.bbc.co.uk/sport/judo/49833292
53 articles downloaded from bbc , url:  https://www.bbc.co.uk/sport/av/athletics/49799142
54 articles downloaded from bbc , url:  https://www.bbc.co.uk/sport/av/cycling/49832085
55 articles downloaded from bbc , url:  https://www.bbc.co.uk/news/uk-politics-49833561
56 articles downloaded from bbc , url:  https://www.bbc.co.uk/news/uk-politics-49827455
57 articles downloaded from bbc , url:  https://www.bbc.co.uk/news/uk-politics-49832110
58 articles downloaded from bbc , url:  ht

40 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2019/09/25/joe-biden-trumps-ukraine-call-is-a-tragedy-for-this-country/
41 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2019/09/25/joe-biden-trumps-ukraine-call-is-a-tragedy-for-this-country/#disqus_thread
42 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/2020-election/2019/09/25/npr-democrats-think-trumps-base-is-too-dumb-to-support-impeachment/
43 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/2020-election/2019/09/25/npr-democrats-think-trumps-base-is-too-dumb-to-support-impeachment/#disqus_thread
44 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2019/09/25/nancy-pelosi-swells-gun-violence-deaths-66-percent-2/
45 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2019/09/25/nancy-pe

87 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/the-media/2019/09/24/pollak-5-times-fox-news-judge-andrew-napolitano-said-trump-committed-a-crime-and-was-wrong/#disqus_thread
88 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2019/09/25/ap-lawmakers-pursue-secondhand-complaint-after-ukraine-denies-pressure-from-trump/
89 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2019/09/25/ap-lawmakers-pursue-secondhand-complaint-after-ukraine-denies-pressure-from-trump/#disqus_thread
90 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2019/09/25/ocasio-cortez-impeachment-inquiry-puts-democrat-party-new-direction/
91 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2019/09/25/ocasio-cortez-impeachment-inquiry-puts-democrat-party-new-direction/#disqus_thread
92 articles downloaded fro

133 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2019/09/25/police-homeless-man-threatened-to-shoot-all-the-rich-kids-at-university-of-texas/
134 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2019/09/25/police-homeless-man-threatened-to-shoot-all-the-rich-kids-at-university-of-texas/#disqus_thread
135 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2019/09/25/nz-college-student-died-in-dorm-room-found-two-months-later/
136 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2019/09/25/nz-college-student-died-in-dorm-room-found-two-months-later/#disqus_thread
137 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2019/09/25/columbia-university-welcomes-antisemitic-malaysian-pm-for-speech/
138 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2019/09

177 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/entertainment/2019/09/25/harrison-ford-demands-world-get-the-hell-out-of-greta-thunbergs-way/
178 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/entertainment/2019/09/25/harrison-ford-demands-world-get-the-hell-out-of-greta-thunbergs-way/#disqus_thread
179 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/environment/2019/09/25/climate-alarmist-greta-thunberg-wins-alternative-nobel-prize/
180 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/environment/2019/09/25/climate-alarmist-greta-thunberg-wins-alternative-nobel-prize/#disqus_thread
181 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2019/09/25/watch-students-world-wide-gather-to-pray-during-see-you-at-the-pole-event/
182 articles downloaded from breitbart  using newspaper, url:  http://www.breit

222 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/border/2019/09/23/juarez-rages-20-murders-in-40-hours/
223 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/border/2019/09/23/watch-large-group-of-migrant-families-cross-arizona-border/
224 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/border/2019/09/25/migracion-mexicana-detiene-a-tres-migrantes-de-georgia-rumbo-a-texas/
225 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/border/2019/09/23/cocaine-meth-busts-by-border-patrol-hit-5-year-high-in-2019/
226 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/border/2019/09/22/woman-bites-camels-testicles-at-louisiana-truck-stop/
227 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/border/2019/09/22/video-pirates-rob-tourists-on-party-boat-in-mexico/
228 articles downloaded from b

305 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/latin-america/2019/09/24/socialist-venezuela-envoy-pretends-to-read-during-trump-u-n-speech/
306 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/environment/2019/09/24/bolsonaro-asks-u-n-to-fight-colonialist-macron-environmentalists-on-amazon-fires/
307 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/latin-america/2019/09/24/brazils-bolsonaro-accuses-u-n-of-aiding-slave-labor-at-general-assembly/
308 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/latin-america/2019/09/23/uninvited-venezuelas-guaido-sends-delegation-u-n-general-assembly/
309 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/national-security/2019/09/20/u-s-expels-cuban-diplomats-threatening-national-security/
310 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.c

350 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/middle-east/2019/09/23/joint-list-of-arab-parties-back-benny-gantz-as-pm/
351 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/middle-east/2019/09/23/lieberman-wont-recommend-gantz-or-netanyahu-as-pm/
352 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/clips/2019/09/22/dem-sen-cardin-trump-isolated-america-rather-than-iran-when-he-withdrew-from-iran-nuclear-deal/
353 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/clips/2019/09/22/kerry-one-way-or-another-iran-was-responsible-for-saudi-attack/
354 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/clips/2019/09/22/iranian-foreign-minister-if-u-s-starts-a-war-they-will-not-be-the-one-who-finishes-it/
355 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/middle-east/2019/09/22/cra

398 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/economy/2019/09/25/capital-spending-plans-of-u-s-companies-hits-record-high/#disqus_thread
399 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/economy/2019/09/25/repo-madness-day-7-banks-seek-92-billion-of-repo-funding-from-ny-fed/
400 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2019/09/25/report-georgia-loses-more-than-300-million-in-payroll-from-h-1b-workers/
401 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/economy/2019/09/25/veterans-thriving-in-strong-post-recession-economy/
402 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2019/09/24/report-wework-ceo-adam-neumann-stepping-down/
403 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/economy/2019/09/24/stocks-rebound-2/
404 articles downloaded from

444 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/sports/2019/09/24/watch-pat-fitzgerald-to-social-media-critics-my-e-mail-is-hashtag-i-dont-care/
445 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/sports/2019/09/24/patriots-withhold-first-installment-antonio-browns-signing-bonus/
446 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/sports/2019/09/24/olympics-protesters-tommie-smith-john-carlos-get-hall-fame-nod-after-recent-sanctions/
447 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/sports/2019/09/24/pga-tour-cancels-china-series-event-in-hong-kong-for-safety/
448 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/sports/2019/09/24/kansas-receives-notice-of-allegations-from-ncaa-in-mens-hoops/
449 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2019/09/24/change

490 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2019/09/25/jjosh-hawley-leftist-reporters-twist-and-distort-facts-to-advance-agenda/
491 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2019/09/25/jjosh-hawley-leftist-reporters-twist-and-distort-facts-to-advance-agenda/#disqus_thread
494 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/entertainment/2019/09/25/larry-david-debra-messing-starring-in-mueller-report-play-fundraiser-for-democratic-pac/
495 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/entertainment/2019/09/25/larry-david-debra-messing-starring-in-mueller-report-play-fundraiser-for-democratic-pac/#disqus_thread
496 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/entertainment/2019/09/25/mattel-introduces-gender-neutral-barbie-doll-line/#disqus_thread
497 articles download

535 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2019/09/25/blue-collars-boosting-wages-switching-jobs-11514597/#disqus_thread
536 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/economy/2019/09/25/juul-ceo-steps-down-after-trump-cracks-down-on-vaping/
537 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/economy/2019/09/25/juul-ceo-steps-down-after-trump-cracks-down-on-vaping/#disqus_thread
538 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/economy/2019/09/25/repo-madness-new-york-fed-boosts-overnight-funds-to-100-billion/
539 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/economy/2019/09/25/repo-madness-new-york-fed-boosts-overnight-funds-to-100-billion/#disqus_thread
540 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/economy/2019/09/25/blockbuster-new-

581 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/economy/2019/09/24/sec-charges-comscore-former-ceo-with-accounting-and-disclosure-fraud/
582 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/economy/2019/09/24/sec-charges-comscore-former-ceo-with-accounting-and-disclosure-fraud/#disqus_thread
583 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/tech/2019/09/25/report-leaked-documents-show-tiktok-censors-topics-for-chinese-government/
584 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/tech/2019/09/25/report-leaked-documents-show-tiktok-censors-topics-for-chinese-government/#disqus_thread
585 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/tech/2019/09/25/sony-teams-up-with-the-un-to-fight-climate-change-with-video-games/
586 articles downloaded from breitbart  using newspaper, url:  https://www.breitbar

626 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2019/09/25/senate-staffer-indicates-gun-control-dead-impeachment-focus/#disqus_thread
627 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2019/09/25/man-allegedly-pulls-gun-on-estranged-wife-gets-shot-dead/
628 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2019/09/25/man-allegedly-pulls-gun-on-estranged-wife-gets-shot-dead/#disqus_thread
629 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/entertainment/2019/09/25/some-aurora-shooting-survivors-relatives-urge-joker-studio-to-support-gun-control/
630 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/entertainment/2019/09/25/some-aurora-shooting-survivors-relatives-urge-joker-studio-to-support-gun-control/#disqus_thread
631 articles downloaded from breitbart  using newspaper, url

670 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/clips/2019/09/24/mccarthy-pelosis-announcement-made-no-difference-theyve-been-investigating-trump-before-he-was-elected/#disqus_thread
671 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2019/09/24/president-trump-democrats-gave-up-gun-control-chance-hurt-gop/
672 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2019/09/24/president-trump-democrats-gave-up-gun-control-chance-hurt-gop/#disqus_thread
673 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2019/09/24/arabella-advisors-front-groups-behind-push-for-federally-funded-abortions/
674 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2019/09/24/arabella-advisors-front-groups-behind-push-for-federally-funded-abortions/#disqus_thread
675 articles downloaded from breitba

715 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/africa/2019/09/18/zimbabwes-regime-wont-say-how-much-robert-mugabes-mausoleum-will-cost/
716 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/national-security/2019/09/16/al-qaedas-somali-branch-kills-17-people-over-weekend/
717 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/national-security/2019/09/16/libya-warlord-forces-airstrikes-government-held-positions-isirte/
718 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/africa/2019/09/16/central-african-president-no-stability-in-europe-if-there-is-none-in-africa/
719 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2019/09/11/doj-global-sweep-nets-281-email-scammers/
720 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/africa/2019/09/10/zimbabweans-plan-to-protest-robert

16 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/war-on-reality-intensifies-transgender-man-upset-to-not-be-listed-as-father-of-child-she-birthed/
17 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/war-on-reality-intensifies-transgender-man-upset-to-not-be-listed-as-father-of-child-she-birthed/#vuukle-comments
18 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/alex-jones-sues-young-turks-official-statement/
19 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/alex-jones-sues-young-turks-official-statement/#vuukle-comments
20 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/msm-deception-watch-dishonest-pundits-blatantly-lie-about-trump-ukraine-transcript/
21 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/msm-deception-watch-dishonest-pundits-blatantly-lie-about-trump-ukraine-tran

68 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/watch-live-dems-drink-impeachment-kool-aid/#vuukle-comments
69 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/can-beto-come-and-take-it/
70 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/can-beto-come-and-take-it/#vuukle-comments
71 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/not-so-shocking-liberals-triggered-at-chicago-anti-trump-rally/
72 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/not-so-shocking-liberals-triggered-at-chicago-anti-trump-rally/#vuukle-comments
73 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/pueblo-sin-fronteras-united-methodist-church-in-chicago/
74 articles downloaded from infowars  using newspaper, url:  https://www.infowars.com/pueblo-sin-fronteras-united-methodist-church-in-chicago/#vuukle-com

38  Article has date of type None...
39  Article has date of type None...
40  Article has date of type None...
41  Article has date of type None...
42  Article has date of type None...
43  Article has date of type None...
44  Article has date of type None...
45  Article has date of type None...
46  Article has date of type None...
47  Article has date of type None...
48  Article has date of type None...
49  Article has date of type None...
50  Article has date of type None...
51  Article has date of type None...
52  Article has date of type None...
53  Article has date of type None...
54  Article has date of type None...
55  Article has date of type None...
56  Article has date of type None...
57  Article has date of type None...
58  Article has date of type None...
59  Article has date of type None...
60  Article has date of type None...
61  Article has date of type None...
62  Article has date of type None...
63  Article has date of type None...
64  Article has date of type None...
6

4  Article has date of type None...
5  Article has date of type None...
6  Article has date of type None...
7  Article has date of type None...
8  Article has date of type None...
9  Article has date of type None...
10  Article has date of type None...
11  Article has date of type None...
Building site for  rsschomp
1  Article has date of type None...
2  Article has date of type None...
3  Article has date of type None...
4 articles downloaded from rsschomp  using newspaper, url:  https://commonthought.net/what-is-the-difference-between-tired-and-sleepy/?utm_source=rss&utm_medium=rss&utm_campaign=what-is-the-difference-between-tired-and-sleepy
5  Article has date of type None...
6  Article has date of type None...
Building site for  realtyfeed
1  Article has date of type None...
2  Article has date of type None...
3  Article has date of type None...
4  Article has date of type None...
5  Article has date of type None...
Downloading articles from  topsite
Building site for  feedsee
Down

Article `download()` failed with 404 Client Error: Not Found for url: https://www.thestreet.com/_yahoo/video/strategysession/10438485.html?cm_ven=YAHOOV&cm_cat=FREE&cm_ite=NA#10438485 on URL http://www.thestreet.com/_yahoo/video/strategysession/10438485.html?cm_ven=YAHOOV&cm_cat=FREE&cm_ite=NA#10438485
continuing...
41 articles downloaded from philstockworld  using newspaper, url:  http://www.fool.com/investing/high-growth/2008/09/19/amds-fusion-confusion.aspx
Article `download()` failed with 403 Client Error: Forbidden for url: https://seekingalpha.com/article/96355-the-next-smart-green-grid-tech-wimax?source=yahoo on URL http://seekingalpha.com/article/96355-the-next-smart-green-grid-tech-wimax?source=yahoo
continuing...
42  Article has date of type None...
43 articles downloaded from philstockworld  using newspaper, url:  http://www.philstockworld.com/2009/08/28/educational-videos/
44 articles downloaded from philstockworld  using newspaper, url:  http://www.philstockworld.com/2010/

In [228]:
import pandas as pd
df=pd.DataFrame(columns=["STOCK_ID", "URL", "DATE_OF_PUBLISH", "HEADLINE", "CONTENT"])
content=[]
stock=[]
url=[]
dop=[]
title=[]
# with open("scraped_articles.json",'r') as load_f:
#       data4 = json.load(load_f)
newspapers=data["newspapers"]
# print(newspapers)
articles=newspapers["theguardian"]
#print (articles)

for articles in newspapers:



    news=newspapers[articles]
#     print (d)
#     print (d["articles"])

    if news["articles"]!= []:
        e=news['articles']
#         print(e)
        for i in range(len(e)):
            j=e[i]
            
            content.append(j['text'])
            title.append(j['title'])
            url.append(j['link'])
            dop.append(j['published'])
            
df["URL"]=url
df["DATE_OF_PUBLISH"]=dop
df["HEADLINE"]=title
df["CONTENT"]=content
df.head(100)
df.drop_duplicates(inplace=True)
df.to_csv("C:/data/CS/CS_NEWSARTICLE_SCRAPE.csv")