# Stocks News Scraper & Summarizer | Felix GG

In [None]:
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

In [None]:
# Name of the pretrained checkpoint passed to both from_pretrained() calls below
model_name = "human-centered-summarization/financial-summarization-pegasus"
# Load the matching tokenizer and the TensorFlow Pegasus model for that checkpoint
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = TFPegasusForConditionalGeneration.from_pretrained(model_name)

## Summarizing a single test article

In [3]:
# Article used as a one-off test input for the summarizer
url = "https://finance.yahoo.com/news/amd-tracks-nvidia-gains-wall-093932551.html"
# Download the page; the response is used as-is (its status code is not checked)
r = requests.get(url)
# Using BeautifulSoup to parse the HTML and make it easier to work with the data
soup = BeautifulSoup(r.content, "html.parser")
# Collect every <p> element on the page; their text is treated as the article body
paragraphs = soup.find_all("p")

In [4]:
# Pull the text content out of each <p> element collected above
text = [tag.text for tag in paragraphs]
# Merge everything into one string and break it apart on single spaces
words = ' '.join(text).split(' ')
# Keep only the first 500 pieces so the model input stays short
words = words[:500]
# Rebuild the shortened article as a single string
ARTICLE = ' '.join(words)

In [5]:
# Display the truncated article text that will be fed to the summarizer
ARTICLE



In [7]:
# Converting text to a TensorFlow tensor of token ids.
# truncation=True clips the input to the tokenizer's configured maximum length:
# ARTICLE is capped at 500 space-separated words, but a word can map to several
# tokens, so that cap alone does not guarantee the sequence fits the model.
input_ids = tokenizer.encode(ARTICLE, return_tensors="tf", truncation=True)

# Generate the summary with beam search (5 beams), at most 200 tokens long
output = model.generate(input_ids, max_length=200, num_beams=5, early_stopping=True)
# Decode the first returned sequence back to text, dropping special tokens
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [8]:
# Display the generated summary of the test article
summary

'Shares of the chipmaker have surged this year on bets on AI. AMD’s strategy is focused on data-center chips, not AI'

## Building a News and Sentiment Pipeline to Automate the Summarization

In [None]:
# Ticker symbols to search news for.
# NOTE(review): the name reads "tickets" but the values are tickers; left unchanged
# because other cells may reference this name.
monitored_tickets = ['NVDA', 'ASML', 'DIS']

### Searching for Stock News using Google and Yahoo Finance

In [11]:
# Function to automate the process of listing latest news for specific tickers.
def search_for_news_urls(ticker):
    """Return the href of every link found on a Google News search for a ticker.

    The search query is "yahoo finance <ticker>" restricted to the News tab
    (``tbm=nws``). The hrefs are returned raw: no filtering or de-duplication
    is applied here.

    Args:
        ticker: Ticker symbol interpolated into the search query.

    Returns:
        A list with the ``href`` value of each ``<a>`` tag that has one.
    """
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.content, "html.parser")
    # href=True selects only anchors that carry an href attribute; indexing
    # link['href'] on an anchor without one would raise KeyError.
    atags = soup.find_all("a", href=True)
    return [link["href"] for link in atags]