# Stocks News Scraper & Summarizer | Felix GG

In [2]:
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

In [None]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = TFPegasusForConditionalGeneration.from_pretrained(model_name)

## Summarizing a single test article

In [4]:
url = "https://finance.yahoo.com/news/amd-tracks-nvidia-gains-wall-093932551.html"
r = requests.get(url)
# Using BeautifulSoup to parse the HTML and make it easier to work with the data
soup = BeautifulSoup(r.content, "html.parser")
# identify paragraphs in the text
paragraphs = soup.find_all("p")

In [8]:
text = [paragraph.text for paragraph in paragraphs]
# Joining all text together into one string, then splitting into words, and limiting words to 500
words = ' '.join(text).split(' ')[:400]
ARTICLE = ' '.join(words)

In [9]:
ARTICLE

'(Bloomberg) -- Advanced Micro Devices Inc.’s stellar share-price performance this year reflects its place in the eyes of investors looking to make an artificial intelligence trade: the best backup plan. Most Read from Bloomberg Jeff Bezos Has Gained $10 on Mystery Purchase of One Amazon Share Odey Sacked From His Hedge Fund Firm After Assault Allegations Americans Are Leaving Portugal as Golden Visa Honeymoon Ends Crypto Weekend Slump Compounds Jitters of Investors Already on Edge Trump’s Own Words Play a Star Role in the Case Against Him The stock’s 87% surge would make it the top performer on the Philadelphia Stock Exchange Semiconductor Index if it wasn’t for the stratospheric gain of rival Nvidia Corp. Nvidia briefly became the first chipmaker to have a trillion-dollar market value after delivering concrete evidence that the rush to develop new AI services is translating into a surge in orders for hardware. AMD gets its shot to prove it’s also a player in AI computing on Tuesday w

In [10]:
# Converting text to tensor of ids
input_ids = tokenizer.encode(ARTICLE, return_tensors="tf")

# Generate Summary
output= model.generate(input_ids, max_length=100, num_beams=5, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [11]:
summary

'Shares of the world’s top two chipmakers are trading at record levels. Investors are looking to see if AMD can make money from artificial intelligence'

## Building a News and Sentiment Pipeline to Automate the Summarization

In [12]:
monitored_tickets = ['NVDA', 'ASML', 'AAPL']

### Searching for Stock News using Google and Yahoo Finance

After some testing directly from google search, I got some restrictions regarding cookies and scraping google news. I decided to use the newsapi python library to scrape the news instead. This also allows me to scrape news for multiple tickers and remove the need to clean the output URLs as this is done automatically. 

In [None]:
!pip install newsapi-python

In [13]:
# Function to automate the process of listing latest news for specific tickers.
from newsapi import NewsApiClient

def search_for_news_urls(ticker):
    newsapi = NewsApiClient(api_key='APIKEY')

    # Get articles related to the ticker and specific sources
    all_articles = newsapi.get_everything(q=f'{ticker} "Motley Fool" OR "Yahoo Finance" OR "Marketwatch"',
                                          language='en')

    unwanted_substrings = ['biztoc', 'consent', 'bossip', 'sparkfun', 'avclub', 'rapradar', 'archdaily', 'chaos2ch', 'huffpost', 'lwn', 'spring', 'comics', 'libsyn']

    hrefs = []
    for article in all_articles['articles']:
        url = article['url']
        # Only include the URL if it doesn't contain any of the unwanted substrings
        if not any(substring in url for substring in unwanted_substrings):
            hrefs.append(url)

    return hrefs[:10]

In [14]:
raw_urls = {ticker:search_for_news_urls(ticker) for ticker in monitored_tickets}
raw_urls

{'NVDA': ['https://www.fool.com/investing/2023/06/04/nvidia-stock-2-reasons-to-wait-for-the-dip-and-1-r/',
  'https://www.marketwatch.com/story/investors-should-put-more-money-in-gold-and-cash-as-rally-in-stocks-wont-last-top-jpmorgan-analyst-says-e5fce3bb',
  'https://markets.businessinsider.com/news/stocks/nvidia-stock-price-artificial-intelligence-stocks-ai-chips-gold-rush-2023-5',
  'https://www.fool.com/earnings/call-transcripts/2023/05/24/nvidia-nvda-q1-2024-earnings-call-transcript/',
  'https://www.marketwatch.com/story/stock-markets-ai-frenzy-reinforces-this-crucial-rule-for-traders-renmac-says-fc248f84',
  'https://www.businessinsider.com/nvidia-stock-market-outlook-earnings-ai-race-economy-bank-bofa-2023-5',
  'https://www.fool.com/investing/2023/05/24/nvidia-stock-rockets-25-after-earnings-nvda-stock/',
  'https://www.marketwatch.com/story/u-s-stock-futures-slip-as-debt-ceiling-fretting-lingers-e80c49bc',
  'https://www.marketwatch.com/story/the-dow-has-worst-month-by-this-

### Search and Scrape Desired News

In [15]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        paragraphs = soup.find_all("p")
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

In [16]:
articles = {ticker:scrape_and_process(raw_urls[ticker]) for ticker in monitored_tickets}

In [17]:
len(articles["AAPL"])

10

In [18]:
articles["NVDA"][2]

'Jump to   \n Instead of trying to pick a winner in the artificial intelligence race, why not just buy the company that will be supplying all AI competitors with the necessary hardware? That\'s the idea behind a recent note from Bank of America, which said Nvidia is well positioned to monetize the AI race that\'s heating up between Microsoft and Alphabet — no matter which company ultimately reigns supreme. Nvidia is the "picks and shovels leader in the AI gold rush," BofA said, referencing the idea that merchants who sold supplies during the 1850s gold rush did better than the actual gold miners. Nvidia sells AI accelerator graphics processing units, which help power the large language models that are behind the conversational AI chatbots like ChatGPT and Bard. According to the note, the total addressable market for these chips could grow to $60 billion by 2027, and Nvidia currently dominates about 75% of the market. The semiconductor company\'s H100 accelerator has surged in demand in

### Summarizing all articles

In [19]:
# Function to summarize all of the scraped articles
def summarize(articles):
    summaries = []
    for article in articles:
        input_ids = tokenizer.encode(article, return_tensors="tf")
        output= model.generate(input_ids, max_length=100, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

In [20]:
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickets}
summaries

{'NVDA': ['AI chipmaker Nvidia is going to sell off, analyst says.',
  'Global Markets Strategist Marko Kolanovic is advising clients to increase allocations. He says equities are ‘overpriced’, ‘unsustainable’',
  'Graphics chipmaker said it bought AI competitors for $340.',
  'Call to discuss second-quarter fiscal 2019 results.',
  '‘Unprecedented’ surge for chipmaker doesn’t mean momentum is ‘concentrated,’ analyst says.',
  'Shares of the chipmaker have more than doubled in the past year. CEO says the company is in the early stages of self-drive',
  'Nvidia is up over 140% year to date at $389 after hours',
  'Fed minutes show concern about U.S. debt-ceiling talks. Investors see ‘just beginnings,’ Invesco’s Hooper says',
  'Dow Jones Industrial Average suffered its worst month since 2012 in May. S&P 500 also had the worst May in terms of gains since 2012',
  'Nvidia is nearly certain to be one of the top profit-makers'],
 'ASML': ['Chinese phone manufacturer is restricting designs. 

## Adding Sentiment Analysis to the summaries

In [None]:
from transformers import pipeline
sentiment = pipeline("sentiment-analysis")

In [24]:
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickets}
scores

{'NVDA': [{'label': 'NEGATIVE', 'score': 0.9987630844116211},
  {'label': 'NEGATIVE', 'score': 0.9974051117897034},
  {'label': 'NEGATIVE', 'score': 0.992994487285614},
  {'label': 'NEGATIVE', 'score': 0.7813846468925476},
  {'label': 'POSITIVE', 'score': 0.9326522350311279},
  {'label': 'NEGATIVE', 'score': 0.992727518081665},
  {'label': 'NEGATIVE', 'score': 0.8145501613616943},
  {'label': 'NEGATIVE', 'score': 0.5292481780052185},
  {'label': 'NEGATIVE', 'score': 0.9994052648544312},
  {'label': 'POSITIVE', 'score': 0.9996417760848999}],
 'ASML': [{'label': 'NEGATIVE', 'score': 0.9884271025657654},
  {'label': 'POSITIVE', 'score': 0.9976325035095215},
  {'label': 'NEGATIVE', 'score': 0.9784945249557495},
  {'label': 'POSITIVE', 'score': 0.9984912872314453},
  {'label': 'POSITIVE', 'score': 0.8668683767318726},
  {'label': 'POSITIVE', 'score': 0.9803920388221741},
  {'label': 'POSITIVE', 'score': 0.5313007831573486},
  {'label': 'NEGATIVE', 'score': 0.9984279870986938},
  {'label': '

In [27]:
print(summaries["NVDA"][0], scores["NVDA"][0]["label"], scores["NVDA"][0]["score"])

AI chipmaker Nvidia is going to sell off, analyst says. NEGATIVE 0.9987630844116211


## Exporting results to a CSV and automating the processs

In [29]:
# creating a function to output an array of all final summaries with their sentiment scores as well.
def create_output_array(summaries, scores, urls):
    output_array = []
    for ticker in monitored_tickets:
        for counter in range(len(summaries[ticker])):
            output = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]["label"],
                scores[ticker][counter]["score"],
                urls[ticker][counter],
            ]
            output_array.append(output)
    return output_array

In [30]:
final_output = create_output_array(summaries, scores, raw_urls)
final_output

[['NVDA',
  'AI chipmaker Nvidia is going to sell off, analyst says.',
  'NEGATIVE',
  0.9987630844116211,
  'https://www.fool.com/investing/2023/06/04/nvidia-stock-2-reasons-to-wait-for-the-dip-and-1-r/'],
 ['NVDA',
  'Global Markets Strategist Marko Kolanovic is advising clients to increase allocations. He says equities are ‘overpriced’, ‘unsustainable’',
  'NEGATIVE',
  0.9974051117897034,
  'https://www.marketwatch.com/story/investors-should-put-more-money-in-gold-and-cash-as-rally-in-stocks-wont-last-top-jpmorgan-analyst-says-e5fce3bb'],
 ['NVDA',
  'Graphics chipmaker said it bought AI competitors for $340.',
  'NEGATIVE',
  0.992994487285614,
  'https://markets.businessinsider.com/news/stocks/nvidia-stock-price-artificial-intelligence-stocks-ai-chips-gold-rush-2023-5'],
 ['NVDA',
  'Call to discuss second-quarter fiscal 2019 results.',
  'NEGATIVE',
  0.7813846468925476,
  'https://www.fool.com/earnings/call-transcripts/2023/05/24/nvidia-nvda-q1-2024-earnings-call-transcript/'],

In [32]:
final_output.insert(0, ["Ticker", "Summary", "Label", "Confidence", "URL"])
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['NVDA',
  'AI chipmaker Nvidia is going to sell off, analyst says.',
  'NEGATIVE',
  0.9987630844116211,
  'https://www.fool.com/investing/2023/06/04/nvidia-stock-2-reasons-to-wait-for-the-dip-and-1-r/'],
 ['NVDA',
  'Global Markets Strategist Marko Kolanovic is advising clients to increase allocations. He says equities are ‘overpriced’, ‘unsustainable’',
  'NEGATIVE',
  0.9974051117897034,
  'https://www.marketwatch.com/story/investors-should-put-more-money-in-gold-and-cash-as-rally-in-stocks-wont-last-top-jpmorgan-analyst-says-e5fce3bb'],
 ['NVDA',
  'Graphics chipmaker said it bought AI competitors for $340.',
  'NEGATIVE',
  0.992994487285614,
  'https://markets.businessinsider.com/news/stocks/nvidia-stock-price-artificial-intelligence-stocks-ai-chips-gold-rush-2023-5'],
 ['NVDA',
  'Call to discuss second-quarter fiscal 2019 results.',
  'NEGATIVE',
  0.7813846468925476,
  'https://www.fool.com/earnings/call-transcripts/2023/

In [36]:
# Downloading contructed array into a csv file
import csv
with open('tickersummaries.csv', mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)