# Stocks News Scraper & Summarizer | Felix GG

In [4]:
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

In [None]:
# Checkpoint name handed to both from_pretrained() loaders below
model_name = "human-centered-summarization/financial-summarization-pegasus"
# Tokenizer used by later cells to encode article text and decode generated ids
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# TensorFlow Pegasus model used by later cells to generate the summaries
model = TFPegasusForConditionalGeneration.from_pretrained(model_name)

## Summarizing a single test article

In [6]:
url = "https://finance.yahoo.com/news/amd-tracks-nvidia-gains-wall-093932551.html"
# requests applies no timeout by default, so bound the wait instead of
# letting the cell hang forever on an unresponsive server
r = requests.get(url, timeout=10)
# Parse the raw HTML with BeautifulSoup so elements can be looked up by tag
soup = BeautifulSoup(r.content, "html.parser")
# Collect every <p> element of the page
paragraphs = soup.find_all("p")

In [7]:
# Pull the text out of each <p> element
text = [p_tag.text for p_tag in paragraphs]
# Merge everything into one string and split it on single spaces ...
words = ' '.join(text).split(' ')
# ... then keep only the first 500 pieces
words = words[:500]
# Stitch the kept pieces back together into the text that gets summarized
ARTICLE = ' '.join(words)

In [8]:
ARTICLE



In [10]:
# Convert the article text to a tensor of token ids. Truncate to the model's
# position limit so an over-long article cannot run past the positional
# embeddings (a 500-word cut-off does not guarantee a short enough token count).
input_ids = tokenizer.encode(
    ARTICLE,
    return_tensors="tf",
    truncation=True,
    max_length=model.config.max_position_embeddings,
)

# Generate the summary with beam search (5 beams), capped at 100 tokens
output = model.generate(input_ids, max_length=100, num_beams=5, early_stopping=True)
# Decode the first returned sequence back to text, dropping special tokens
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [11]:
summary

'Shares of the chipmaker have surged this year on bets on AI. AMD’s strategy is focused on data-center chips, not AI'

## Building a News and Sentiment Pipeline to Automate the Summarization

In [65]:
# Stock ticker symbols to track; the per-ticker dicts built in later cells are keyed by these
monitored_tickets = ['NVDA', 'ASML', 'AAPL']

### Searching for Stock News using Google and Yahoo Finance

After some testing directly with Google Search, I ran into restrictions around cookies and scraping Google News. I decided to use the `newsapi-python` library to fetch the news instead. This also lets me query news for multiple tickers and removes the need to clean the output URLs, since the API returns them ready to use.

In [None]:
!pip install newsapi-python

In [66]:
# Function to automate the process of listing latest news for specific tickers.
from newsapi import NewsApiClient

def search_for_news_urls(ticker):
    """Return the URLs of English-language news articles found for ``ticker``.

    Calls ``NewsApiClient.get_everything`` with the ticker plus a few finance
    publisher names, then drops every result whose URL contains one of the
    known unwanted substrings.

    Args:
        ticker: Stock ticker symbol to search for, e.g. ``"NVDA"``.

    Returns:
        A list of article URL strings, in the order the API returned them.
    """
    import os

    # Prefer a key supplied through the NEWSAPI_KEY environment variable so the
    # credential does not have to live in the notebook. The literal is kept
    # only as a fallback so existing runs keep working.
    # NOTE(review): this key is committed in source -- rotate it and remove the fallback.
    api_key = os.environ.get('NEWSAPI_KEY', 'e2dcc510ce164a4a99fde3c1d32100b4')
    newsapi = NewsApiClient(api_key=api_key)

    # Get articles related to the ticker and specific sources.
    # NOTE(review): the ticker and the OR-ed publisher phrases are not grouped
    # with parentheses; confirm against the NewsAPI query syntax that this
    # means "ticker AND (publisher OR ...)" rather than matching any publisher.
    all_articles = newsapi.get_everything(q=f'{ticker} "Motley Fool" OR "Yahoo Finance" OR "Marketwatch"',
                                          language='en')

    unwanted_substrings = ('biztoc', 'consent', 'bossip', 'sparkfun', 'avclub', 'rapradar', 'archdaily',
                           'chaos2ch', 'huffpost', 'lwn', 'spring', 'comics', 'libsyn')

    hrefs = []
    for article in all_articles.get('articles') or []:
        url = article.get('url')
        # Skip entries without a URL (a None here would make the substring
        # test raise TypeError) and entries pointing at an unwanted site.
        if url and not any(substring in url for substring in unwanted_substrings):
            hrefs.append(url)

    return hrefs

In [67]:
# Map each monitored ticker to the list of article URLs found for it
raw_urls = {ticker:search_for_news_urls(ticker) for ticker in monitored_tickets}
# Bare expression so the notebook displays the resulting dict
raw_urls

{'NVDA': ['https://www.fool.com/investing/2023/06/04/nvidia-stock-2-reasons-to-wait-for-the-dip-and-1-r/',
  'https://www.marketwatch.com/story/investors-should-put-more-money-in-gold-and-cash-as-rally-in-stocks-wont-last-top-jpmorgan-analyst-says-e5fce3bb',
  'https://markets.businessinsider.com/news/stocks/nvidia-stock-price-artificial-intelligence-stocks-ai-chips-gold-rush-2023-5',
  'https://www.fool.com/earnings/call-transcripts/2023/05/24/nvidia-nvda-q1-2024-earnings-call-transcript/',
  'https://www.marketwatch.com/story/stock-markets-ai-frenzy-reinforces-this-crucial-rule-for-traders-renmac-says-fc248f84',
  'https://www.businessinsider.com/nvidia-stock-market-outlook-earnings-ai-race-economy-bank-bofa-2023-5',
  'https://www.fool.com/investing/2023/05/24/nvidia-stock-rockets-25-after-earnings-nvda-stock/',
  'https://www.marketwatch.com/story/u-s-stock-futures-slip-as-debt-ceiling-fretting-lingers-e80c49bc',
  'https://www.marketwatch.com/story/the-dow-has-worst-month-by-this-

### Search and Scrape Desired News

In [68]:
def scrape_and_process(URLs, max_words=350, timeout=10):
    """Download each URL and return the leading paragraph text of every page.

    For each page, the text of all ``<p>`` elements is joined with spaces,
    split on single spaces, cut to the first ``max_words`` pieces and joined
    back into one string.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated pieces kept per page.
            The default of 350 is the limit that used to be hard-coded.
        timeout: Seconds to wait on each HTTP request. ``requests`` applies no
            timeout by default, so without it one unresponsive host would
            stall the whole run.

    Returns:
        A list with one text string per URL, in input order.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    ARTICLES = []
    for url in URLs:
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, "html.parser")
        text = [paragraph.text for paragraph in soup.find_all("p")]
        # Splitting on ' ' (not arbitrary whitespace) keeps newlines inside pieces
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [69]:
# Scrape the article text behind every URL collected for each monitored ticker
articles = {ticker:scrape_and_process(raw_urls[ticker]) for ticker in monitored_tickets}

In [74]:
len(articles["AAPL"])

83

In [71]:
articles["NVDA"][2]

'Jump to   \n Instead of trying to pick a winner in the artificial intelligence race, why not just buy the company that will be supplying all AI competitors with the necessary hardware? That\'s the idea behind a recent note from Bank of America, which said Nvidia is well positioned to monetize the AI race that\'s heating up between Microsoft and Alphabet — no matter which company ultimately reigns supreme. Nvidia is the "picks and shovels leader in the AI gold rush," BofA said, referencing the idea that merchants who sold supplies during the 1850s gold rush did better than the actual gold miners. Nvidia sells AI accelerator graphics processing units, which help power the large language models that are behind the conversational AI chatbots like ChatGPT and Bard. According to the note, the total addressable market for these chips could grow to $60 billion by 2027, and Nvidia currently dominates about 75% of the market. The semiconductor company\'s H100 accelerator has surged in demand in

### Summarizing all articles

In [76]:
# Function to summarize all of the scraped articles
def summarize(articles):
    """Summarize each article text with the module-level tokenizer and model.

    Args:
        articles: Iterable of article text strings.

    Returns:
        A list with one decoded summary string per article, in input order.
    """
    # Loop-invariant: the longest input the model's position embeddings cover.
    max_input_tokens = model.config.max_position_embeddings

    summaries = []
    for article in articles:
        # Truncate so an unusually token-dense article cannot exceed the
        # model's position limit; a word cut-off alone does not bound tokens.
        input_ids = tokenizer.encode(
            article,
            return_tensors="tf",
            truncation=True,
            max_length=max_input_tokens,
        )
        # Beam search with 5 beams, output capped at 100 tokens
        output = model.generate(input_ids, max_length=100, num_beams=5, early_stopping=True)
        # Decode the first returned sequence, dropping special tokens
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [None]:
# Summarize every scraped article, keeping the results grouped by ticker
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickets}
# Bare expression so the notebook displays the resulting dict
summaries