In [1]:
import io
import json
import os
import re
import sys
import traceback
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from newspaper import Article

import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/lisa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def save_article(symbol, link):
    """Download one news article and store it as a JSON file under ``articles/``.

    The page at ``link`` is downloaded, parsed and run through newspaper's
    ``nlp()`` step, then a record with the symbol, link, title, keywords,
    summary, authors, publish date, top image, plain text and raw HTML is
    written to ``articles/<symbol>-<stamp>.json``.

    ``<stamp>`` is the ISO-8601 publish date when one is available. When the
    article has no publish date, the record stores ``null`` for
    ``publish_date`` and the stamp is derived from the link instead, so the
    article is still saved and undated articles do not overwrite each other.

    Args:
        symbol: Ticker symbol the article belongs to; stored in the record
            and used as the filename prefix.
        link: URL of the article to download.

    Returns:
        None. Any exception is printed to stdout together with its traceback
        and then swallowed, so a single bad article does not stop a batch.
    """
    try:
        print("<<< save_article({}, {}) >>>".format(symbol, link))

        article = Article(link)
        article.download()
        article.parse()
        article.nlp()

        # publish_date can be None when no date was extracted from the page
        # (NOTE(review): per newspaper's behaviour - confirm for the installed
        # version); calling .isoformat() on it would raise and drop the article.
        published = article.publish_date
        publish_date = published.isoformat() if published is not None else None  # e.g. 2020-05-05T20:44:00-08:00

        item = {
            "symbol": symbol,
            "link": link,
            "title": article.title,
            "keywords": article.keywords,
            "summary": article.summary,
            "authors": article.authors,
            "publish_date": publish_date,
            "top_image": article.top_image,
            "text": article.text,
            "html": article.html,
        }

        if publish_date is not None:
            stamp = publish_date
        else:
            # No date to key on: build a filesystem-safe slug from the tail of
            # the link so each undated article gets its own file.
            stamp = "undated-" + re.sub(r"[^A-Za-z0-9]+", "-", link).strip("-")[-100:]

        # open() does not create missing directories.
        os.makedirs("articles", exist_ok=True)

        filename = "articles/%s-%s.json" % (symbol, stamp)
        with open(filename, 'w') as outfile:
            json.dump(item, outfile)
    except Exception as ex:
        # Deliberate best-effort: report the failure and carry on.
        print(ex)
        traceback.print_exc(file=sys.stdout)

In [3]:
# save_article('FB', "https://www.marketwatch.com/story/online-ad-market-is-recovering-from-coronavirus-shock-but-little-else-is-clear-2020-05-05?siteid=yhoof2")

In [4]:
def scrape_article_links(url, timeout=30):
    """Fetch a page and return its news-link anchors.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The ``<a>`` tags whose class is ``tab-link-news``, in document order.
        An empty list is returned when the request or the parsing fails, so
        callers can always iterate over the result. The error and its
        traceback are printed to stdout.
    """
    try:
        print("<<< scrape_article_links({}) >>>".format(url))

        # A non-default User-Agent is sent with the request
        # (presumably the site rejects the default one - TODO confirm).
        headers = {
            'User-Agent': 'PostmanRuntime/7.24.1'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        # Surface HTTP errors (403, 404, 5xx) instead of scraping an error page.
        response.raise_for_status()

        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response.content, "html.parser")
        return soup.find_all('a', {'class': 'tab-link-news'})

    except Exception as ex:
        print(ex)
        traceback.print_exc(file=sys.stdout)
        # Keep the return type iterable on failure.
        return []

In [5]:
# scrape_article_links("https://finviz.com/quote.ashx?t=FB")

In [6]:
def collect_articles(symbol):
    """Scrape the finviz quote page for ``symbol`` and save every linked article.

    Args:
        symbol: Ticker symbol; appended to the finviz quote URL and passed on
            to ``save_article`` for each link found.

    Returns:
        None. Each article is handled by ``save_article``.
    """
    print("<<< collect_articles(", symbol, ") >>>")
    url = "https://finviz.com/quote.ashx?t=" + symbol
    # The scraper may hand back None (or nothing) when it failed; treat that
    # as "no links" so one failing symbol does not abort the whole run.
    links = scrape_article_links(url) or []
    for link in links:
        href = link.get("href")
        if not href:
            # Anchor without a target: nothing to download.
            continue
        save_article(symbol, href)


In [None]:
# Ticker symbols to collect articles for, processed in this order.
symbols = [
    "FB",
    "AMZN",
    "AAPL",
    "NFLX",
    "GOOG",
]

for symbol in symbols:
    collect_articles(symbol)

<<< collect_articles( FB ) >>>
<<< scrape_article_links(https://finviz.com/quote.ashx?t=FB) >>>
<<< save_article(FB, https://finance.yahoo.com/news/facebook-takes-down-white-nationalist-014817673.html) >>>
<<< save_article(FB, https://finance.yahoo.com/news/facebook-invests-indonesia-gojek-whatsapp-010000918.html) >>>
<<< save_article(FB, https://finance.yahoo.com/news/tech-companies-support-racial-justice-004958039.html) >>>
<<< save_article(FB, https://www.marketwatch.com/story/is-the-market-totally-heartless-cnbcs-jim-cramer-says-nobody-is-investing-to-make-the-world-a-better-place-2020-06-02?siteid=yhoof2) >>>
<<< save_article(FB, https://finance.yahoo.com/video/amazon-balances-worker-safety-customer-000112047.html) >>>
<<< save_article(FB, https://finance.yahoo.com/video/facebook-employees-criticize-zuckerbergs-inaction-000041479.html) >>>
<<< save_article(FB, https://finance.yahoo.com/video/facebook-ceo-zuckerberg-defends-trump-235903735.html) >>>
<<< save_article(FB, https://fin