In [1]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

In [2]:
# Set up the NLP model.
# The tokenizer and the summarization model are both loaded from the same
# pretrained checkpoint name.
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

In [3]:
# Test summarizing a single article: download the page and collect its <p> tags.
url = "https://finance.yahoo.com/news/aia-agrees-buy-china-post-092005949.html"
# Always pass a timeout: without one, requests waits indefinitely when the
# server accepts the connection but never answers, freezing the notebook.
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [4]:
# Peek at the text of the first <p> tag to check what was scraped.
paragraphs[0].text

'(Bloomberg) -- AIA Group Ltd. has agreed to acquire a stake in China Post Life Insurance Co. for about 12 billion yuan ($1.9 billion) to bolster its China expansion.'

In [14]:
# Join the text of all paragraphs into one string, then keep only the first
# 400 space-separated words.
text = [paragraph.text for paragraph in paragraphs]
words = ' '.join(text).split(' ')[:400]
article = ' '.join(words)

In [15]:
# Display the assembled (word-limited) article text.
article

'(Bloomberg) -- AIA Group Ltd. has agreed to acquire a stake in China Post Life Insurance Co. for about 12 billion yuan ($1.9 billion) to bolster its China expansion. The Hong Kong-based insurance giant will buy a 24.99% stake in China Post Life, subject to regulatory approvals, AIA said in a filing to the Hong Kong exchange on Tuesday. AIA closed 0.8% higher at HK$96.5 in Hong Kong on Wednesday. The deal marks AIA’s second effort this year to boost its distribution network in the world’s most populous nation. AIA is accelerating expansion in mainland China as Covid travel restrictions continue to curb sales of insurance to Chinese visitors, previously a key driver for its Hong Kong unit. Mainland China has eclipsed Hong Kong as the largest contributor of new business value for the insurer. China Post Life, a subsidiary of China Post Group Co., is a bank-affiliated life insurer focusing on the mass-affluent market in China. It has access to a retail distribution network of around 40,00

In [16]:
# Feed the article into the Pegasus NLP model.
# truncation=True clips the token sequence to the tokenizer's maximum input
# length; the 400-word cut above limits words, not tokens, so an article can
# still tokenize to more positions than the model accepts.
input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
# Beam search with 5 beams, capped at 55 generated tokens.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
# generate() returns a batch; take its single sequence and drop special tokens.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ../aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [17]:
# Display the generated summary.
summary

'Hong Kong insurer to buy a 24.99% stake. Deal marks second effort this year to boost distribution network'

In [18]:
# Assets to monitor. Each entry is used as the search term when looking up
# news and as the key of the per-ticker dictionaries built below.
monitored_tickers = ['GME', 'TSLA', 'BTC']

In [19]:
# Search for financial news using Google and Yahoo Finance
def search_for_stock_news_urls(ticker, timeout=10):
    """Return every link target found on a Google News search for a ticker.

    The query is "yahoo finance <ticker>" restricted to the News tab
    (``tbm=nws``). The result is the raw, unfiltered list of ``href`` values
    (Google navigation links included), in document order.

    Args:
        ticker: Search term appended to the query, e.g. ``'GME'``.
        timeout: Seconds to wait for a response before ``requests`` raises
            instead of blocking forever.

    Returns:
        list[str]: The ``href`` value of each anchor tag that has one.
    """
    # Passing the query through `params` lets requests URL-encode it, so a
    # ticker containing '&', '#' or spaces cannot corrupt the query string.
    # For plain tickers the resulting URL is the same as the hand-built one.
    r = requests.get(
        "https://www.google.com/search",
        params={'q': 'yahoo finance {}'.format(ticker), 'tbm': 'nws'},
        timeout=timeout,
    )
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True skips anchors that have no href attribute; indexing those
    # with link['href'] raises KeyError.
    atags = soup.find_all('a', href=True)
    return [link['href'] for link in atags]

In [20]:
# Run the news search once per monitored ticker: maps ticker -> list of raw hrefs.
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'GME': ['/?sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQOwgC',
  '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQPAgE',
  '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=hT3cYOmSM8C2qtsPxs252AQ',
  '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQ_AUIBygA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQ_AUICSgC',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQ_AUICigD',
  'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQ_AUICygE',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQ_AUIDCgF',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQ_AUIDSgG',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=lr:lang

In [21]:
# Inspect the raw hrefs collected for a single ticker.
raw_urls['GME']

['/?sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQOwgC',
 '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQPAgE',
 '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=hT3cYOmSM8C2qtsPxs252AQ',
 '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQ_AUIBygA',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQ_AUICSgC',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQ_AUICigD',
 'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQ_AUICygE',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQ_AUIDCgF',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwip6Y3Oib_xAhVAm2oFHcZmDksQ_AUIDSgG',
 '/advanced_search',
 '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=lr:lang_1zh-CN%7Clang_1zh

In [22]:
import re

In [23]:
# Substrings that mark a link as unwanted (e.g. maps.google.com); any URL
# containing one of them is dropped by the filter below.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [24]:
# Keep only article links and strip the tracking parameters from them
def strip_unwanted_urls(urls, exclude_list):
    """Extract clean, unique article URLs from raw search-result hrefs.

    A raw href is kept only if it contains ``'https://'`` and none of the
    substrings in ``exclude_list``. From each kept href the embedded URL is
    extracted and everything from the first ``'&'`` onwards is discarded.

    Args:
        urls: Iterable of raw href strings.
        exclude_list: Substrings; an href containing any of them is skipped.

    Returns:
        list[str]: Unique cleaned URLs in first-seen order.
    """
    cleaned = []
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        match = re.search(r'https?://\S+', url)
        if match is None:
            # e.g. a bare 'https://' with nothing after it: there is no URL
            # to extract, so skip it instead of failing on an empty result.
            continue
        cleaned.append(match.group(0).split('&')[0])
    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # reorder the URLs differently on every interpreter run (string hash
    # randomization), making downstream rows non-reproducible.
    return list(dict.fromkeys(cleaned))

In [25]:
# Filter each ticker's raw hrefs: maps ticker -> list of unique cleaned URLs.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'GME': ['https://finance.yahoo.com/news/meme-stocks-show-that-community-is-profitable-reddit-co-founder-173909883.html',
  'https://finance.yahoo.com/news/gamestop-amc-slip-russell-index-063837020.html',
  'https://ca.finance.yahoo.com/news/forget-gme-stock-consider-shopify-181532573.html',
  'https://finance.yahoo.com/news/gamestop-completes-market-equity-offering-104500035.html',
  'https://finance.yahoo.com/news/options-market-predicting-spike-gamestop-125312793.html',
  'https://finance.yahoo.com/news/tell-whether-amc-gamestop-russell-120012129.html',
  'https://finance.yahoo.com/news/forget-amc-gamestop-10-stocks-205845496.html',
  'https://uk.finance.yahoo.com/news/gamestop-gme-share-price-keep-114619031.html',
  'https://finance.yahoo.com/video/ryan-cohen-elon-musk-goofballs-153407421.html',
  'https://uk.finance.yahoo.com/news/happening-gamestop-gme-share-price-103815992.html'],
 'TSLA': ['https://finance.yahoo.com/news/ex-tesla-president-sold-stocks-000304336.html',
  'https:

In [26]:
# Search and scrape from the cleaned urls
def scrape_and_process(URLs, max_words=350, timeout=10):
    """Download each URL and return its paragraph text, trimmed to a word limit.

    For every URL the page is fetched, the text of all ``<p>`` tags is joined
    with spaces, and only the first ``max_words`` space-separated words are
    kept.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated words kept per article
            (presumably chosen to keep the text within the summarizer's
            input limit -- confirm against the model in use).
        timeout: Seconds to wait for each response before ``requests``
            raises instead of blocking forever.

    Returns:
        list[str]: One trimmed article string per URL, in the same order as
        ``URLs``.
    """
    ARTICLES = []
    for url in URLs:
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = [paragraph.text for paragraph in soup.find_all('p')]
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [27]:
# Scrape the pages behind every cleaned URL: maps ticker -> list of article strings.
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'GME': ["As trading platforms like Robinhood create a rise in retail investor market participation and spark a renewed interest in shares of companies dubbed “meme stocks” like GameStop (GME), BlackBerry (BB), and AMC (AMC), the world of finance is seeing an intersection with the social media sphere. “What you're seeing is a much larger trend here,” Alexis Ohanian, founder of venture capital firm Seven Seven Six and co-founder of Reddit, told Yahoo Finance Live. “It is the intersection of community and capital. It's the intersection of social media and finance.” With online communities such as Reddit’s r/WallStreetBets fueling the meme stock craze, the movement has dismissed concerns of market manipulation. Jaime Rogozinski, who founded the subreddit in 2012, says he did it looking for a more active way to trade investment ideas within a community. The forum now boasts a community of over 10.6 million subscribers. According to Ohanian, it is this sense of community and ease of communi

In [28]:
# Inspect one scraped TSLA article (index 2).
articles['TSLA'][2]

'(For a Reuters live blog on U.S., UK and European stock markets, click LIVE/ or type LIVE/ in a news window.) * Tesla rallies after opening solar charging station * Retail darlings Alfi, Torchlight extend declines (Updates with details at end of session) By Noel Randewich and Devik Jain June 23 (Reuters) - The Nasdaq climbed to a record-high close on Wednesday, fueled by a rally in Tesla Inc, with investors cheering data that showed a record peak for U.S. factory activity in June. Gains in Nvidia Corp and Facebook Inc extended a recent rebound in top-shelf growth stocks that fell out of favor in recent months as investors focused on companies expected to do well as the economy recovers from the pandemic. Data firm IHS Markit said its flash U.S. manufacturing Purchasing Managers\' Index rose to a reading of 62.6 this month, beating estimates of 61.5, but manufacturers are still struggling to secure raw materials and qualified workers, substantially raising prices. The "high level of to

In [29]:
# Summarise all articles by NLP model
def summarize(articles, max_length=55, num_beams=5):
    """Generate one summary per article with the notebook's Pegasus model.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        articles: Iterable of article strings.
        max_length: Upper bound on the length of each generated summary,
            passed to ``model.generate``.
        num_beams: Beam-search width passed to ``model.generate``.

    Returns:
        list[str]: Decoded summaries, in the same order as ``articles``.
    """
    summaries = []
    for article in articles:
        # truncation=True clips the token sequence to the tokenizer's maximum
        # input length. Articles are limited by word count upstream, which
        # does not bound the token count, so an unusually dense article could
        # otherwise exceed what the model accepts.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        output = model.generate(input_ids, max_length=max_length, num_beams=num_beams, early_stopping=True)
        # generate() returns a batch; take its single sequence.
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [30]:
# Summarize every scraped article: maps ticker -> list of summary strings.
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'GME': ['‘It’s the intersection of social media and finance,’ Ohanian says. Reddit co-founder leads $4 million investment in sports betting startup Wagr',
  'Video game retailer moved to Russell 1000 index while AMC stayed in Russell 2000.',
  'GME stock is trading at a forward price-to-sales multiple of over three. Shopify is Canada’s largest company in terms of market cap',
  'GameStop completed previously ‘at-the-market’ equity offering. Net proceeds from ATM Offering to be used for general corporate purposes',
  'Implied volatility is high for the Jul 16, 2021 Put. Options with high levels of implied volatility suggest there is a trade developing.',
  'AMC and GameStop may be removed from Russell 2000 index. Preliminary ranking for Russell 1000 announced on May 7',
  'Retail investors used forum WallStreetBets to coordinate short-selling bets.',
  'Shares of the video game retailer have surged over the past year. Even though some analysts believe the recent rally is speculation, I

In [31]:
# Inspect the summaries produced for BTC.
summaries['BTC']

['Vinnik had been sentenced last December on money-laundering charges. He was arrested in Greece in 2017 at the behest of U.S.',
 'Complaint volume more than five times average each month. Coinbase says it’s beefing up its support team',
 'Alibaba founder Jack Ma says U.S. should follow China’s lead.',
 'Fundstrat recommends reducing risk or buying some protection. Bitcoin has roughly halved from a peak near $65,000 in April',
 'Quantum not banned for regulated activities, CEO says. Bitcoin is up 1.6%, Ethereum up 8% on Tuesday',
 'Peter Smith sees ‘fantastic news’ for Bitcoin ecosystem. Beijing’s crackdown will see miners move overseas',
 'Europe Opportunity Fund bought the shares. Grayscale is owned by CoinDesk Digital Currency Group.',
 "Don Vo, VBit Technologies CEO, talks about China's crackdown on crypto.",
 'BTIG reiterates year-end 2021 price targets for Bitcoin, Ether. Rally is ‘not necessarily ‘bullish,’ says Miller Tabak',
 'BTCC to pivot to blockchain use outside of trading

In [32]:
# Perform sentiment analysis
from transformers import pipeline
# Name the checkpoint explicitly instead of relying on the task default, so a
# library upgrade that changes the default cannot silently change the scores.
sentiment = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

In [33]:
# Try the pipeline on one ticker's summaries; it is given the whole list at once.
sentiment(summaries['BTC'])

[{'label': 'NEGATIVE', 'score': 0.9611397981643677},
 {'label': 'NEGATIVE', 'score': 0.6672735810279846},
 {'label': 'NEGATIVE', 'score': 0.5721613764762878},
 {'label': 'NEGATIVE', 'score': 0.9974943399429321},
 {'label': 'NEGATIVE', 'score': 0.9271972179412842},
 {'label': 'POSITIVE', 'score': 0.9988064169883728},
 {'label': 'NEGATIVE', 'score': 0.9742448329925537},
 {'label': 'POSITIVE', 'score': 0.9794243574142456},
 {'label': 'NEGATIVE', 'score': 0.9942106604576111},
 {'label': 'NEGATIVE', 'score': 0.992952287197113}]

In [34]:
# Score every summary: maps ticker -> list of sentiment results, one per summary.
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

{'GME': [{'label': 'POSITIVE', 'score': 0.9224709868431091},
  {'label': 'NEGATIVE', 'score': 0.9849210381507874},
  {'label': 'POSITIVE', 'score': 0.9898219704627991},
  {'label': 'NEGATIVE', 'score': 0.993374228477478},
  {'label': 'NEGATIVE', 'score': 0.9728610515594482},
  {'label': 'NEGATIVE', 'score': 0.9982779026031494},
  {'label': 'NEGATIVE', 'score': 0.9912553429603577},
  {'label': 'POSITIVE', 'score': 0.9866163730621338},
  {'label': 'NEGATIVE', 'score': 0.9856416583061218},
  {'label': 'POSITIVE', 'score': 0.9879394173622131}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.964103102684021},
  {'label': 'POSITIVE', 'score': 0.5628847479820251},
  {'label': 'NEGATIVE', 'score': 0.9795437455177307},
  {'label': 'NEGATIVE', 'score': 0.993100106716156},
  {'label': 'NEGATIVE', 'score': 0.9913976192474365},
  {'label': 'NEGATIVE', 'score': 0.9942201972007751},
  {'label': 'NEGATIVE', 'score': 0.9990779161453247},
  {'label': 'POSITIVE', 'score': 0.9851698875427246},
  {'label': 'NE

In [35]:
# Spot-check one GME entry: summary text followed by its sentiment label and score.
gme_summary, gme_sentiment = summaries['GME'][3], scores['GME'][3]
print(gme_summary, gme_sentiment['label'], gme_sentiment['score'])

GameStop completed previously ‘at-the-market’ equity offering. Net proceeds from ATM Offering to be used for general corporate purposes NEGATIVE 0.993374228477478


In [36]:
# Score value of the first BTC sentiment result.
scores['BTC'][0]['score']

0.9611397981643677

In [37]:
# Index range covering the GME summaries.
range(len(summaries['GME']))

range(0, 10)

In [38]:
# The GME summary at index 3.
summaries['GME'][3]

'GameStop completed previously ‘at-the-market’ equity offering. Net proceeds from ATM Offering to be used for general corporate purposes'

In [39]:
def create_output_array(summaries, scores, urls):
    """Flatten per-ticker summaries, sentiment results and URLs into rows.

    The three arguments are parallel structures: for each ticker, the item at
    position ``i`` in ``summaries[ticker]``, ``scores[ticker]`` and
    ``urls[ticker]`` describes the same article.

    Args:
        summaries: dict mapping ticker -> list of summary strings.
        scores: dict mapping ticker -> list of dicts with ``'label'`` and
            ``'score'`` keys.
        urls: dict mapping ticker -> list of article URLs.

    Returns:
        list[list]: One ``[ticker, summary, label, score, url]`` row per
        summary, grouped by ticker in the key order of ``summaries``.

    Raises:
        KeyError: If ``scores`` or ``urls`` lacks a ticker present in
            ``summaries``.
        IndexError: If ``scores[ticker]`` or ``urls[ticker]`` is shorter than
            ``summaries[ticker]``.
    """
    output = []
    # Iterate over the tickers actually present in the argument rather than a
    # notebook-level list, so the function depends only on its inputs.
    for ticker, ticker_summaries in summaries.items():
        ticker_scores = scores[ticker]
        ticker_urls = urls[ticker]
        for counter, summary in enumerate(ticker_summaries):
            score = ticker_scores[counter]
            output.append([
                ticker,
                summary,
                score['label'],
                score['score'],
                ticker_urls[counter],
            ])
    return output

In [40]:
# Flatten everything into rows of [ticker, summary, label, score, url].
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  '‘It’s the intersection of social media and finance,’ Ohanian says. Reddit co-founder leads $4 million investment in sports betting startup Wagr',
  'POSITIVE',
  0.9224709868431091,
  'https://finance.yahoo.com/news/meme-stocks-show-that-community-is-profitable-reddit-co-founder-173909883.html'],
 ['GME',
  'Video game retailer moved to Russell 1000 index while AMC stayed in Russell 2000.',
  'NEGATIVE',
  0.9849210381507874,
  'https://finance.yahoo.com/news/gamestop-amc-slip-russell-index-063837020.html'],
 ['GME',
  'GME stock is trading at a forward price-to-sales multiple of over three. Shopify is Canada’s largest company in terms of market cap',
  'POSITIVE',
  0.9898219704627991,
  'https://ca.finance.yahoo.com/news/forget-gme-stock-consider-shopify-181532573.html'],
 ['GME',
  'GameStop completed previously ‘at-the-market’ equity offering. Net proceeds from ATM Offering to be used for general corporate purposes',
  'NEGATIVE',
  0.993374228477478,
  'https://finance

In [41]:
# Add the CSV header row exactly once. The guard makes this cell safe to
# re-run: an unconditional insert stacks another header row on every execution.
csv_header = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
if not final_output or final_output[0] != csv_header:
    final_output.insert(0, csv_header)

In [42]:
# Display the rows, which now start with the header row.
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GME',
  '‘It’s the intersection of social media and finance,’ Ohanian says. Reddit co-founder leads $4 million investment in sports betting startup Wagr',
  'POSITIVE',
  0.9224709868431091,
  'https://finance.yahoo.com/news/meme-stocks-show-that-community-is-profitable-reddit-co-founder-173909883.html'],
 ['GME',
  'Video game retailer moved to Russell 1000 index while AMC stayed in Russell 2000.',
  'NEGATIVE',
  0.9849210381507874,
  'https://finance.yahoo.com/news/gamestop-amc-slip-russell-index-063837020.html'],
 ['GME',
  'GME stock is trading at a forward price-to-sales multiple of over three. Shopify is Canada’s largest company in terms of market cap',
  'POSITIVE',
  0.9898219704627991,
  'https://ca.finance.yahoo.com/news/forget-gme-stock-consider-shopify-181532573.html'],
 ['GME',
  'GameStop completed previously ‘at-the-market’ equity offering. Net proceeds from ATM Offering to be used for general corporate purposes',

In [44]:
# Export the results to a CSV file
import csv
# The summaries contain non-ASCII characters (e.g. curly quotes). Without an
# explicit encoding, open() uses the platform default, which can raise
# UnicodeEncodeError or write a file that other tools misread.
# newline='' is what the csv module expects so it controls line endings itself.
with open('financial-news-summaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)