In [12]:
from transformers import PegasusTokenizer , PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests


In [13]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
# convert word into unique identify -> translator (encode , decode) -> translate every words that model can work with
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# loading models 
model = PegasusForConditionalGeneration.from_pretrained(model_name)


### Summerize a single article

In [21]:
url = "https://www.marketwatch.com/story/nvidia-is-about-to-make-a-big-announcement-during-a-difficult-time-11663368850?siteid=yhoof2"
r = requests.get(url)
soup = BeautifulSoup(r.text , 'html.parser')
paragraphs = soup.find_all('p')



'Nvidia Corp. faces a much more troublesome environment from the last time it launched a new chip architecture as it approaches a new launch.'

In [28]:
text = [paragraph.text for paragraph in paragraphs]
# model has its limit 4 hundred words
words = ' '.join(text).split(' ')[:400]
article = ' '.join(words)
article

'Nvidia Corp. faces a much more troublesome environment from the last time it launched a new chip architecture as it approaches a new launch. Nvidia \n        NVDA,\n        +2.08%\n       kicks off its fall GTC event on Monday, with Chief Executive Jensen Huang scheduled to deliver a keynote address Tuesday morning at 11 a.m. Eastern. At the same event in May 2020, Huang unveiled a new chip architecture in Ampere, and analyst expect he will use this year’s event to unveil the company’s next-generation chip architecture code-named “Lovelace,” after the 19th century English mathematician Ada Lovelace, generally considered to be the world’s first computer programmer for her work on Charles Babbage’s theoretical Analytical Engine.  Ahead of the event, though, Nvidia has landed in a tough sport, with pandemic-era demand plunging its stock price by half so far this year. The company recently took a $1.22 billion inventory charge\xa0as it seeks to clear out old inventory prior to the launch.

In [57]:
# encode the article that model can work with it ( ecodeed set of input id)
# specify [ return_tensor = 'pt' we use pytorch tensor (not python list) ]
input_ids = tokenizer.encode(article , return_tensors = 'pt')
# put the article is encoded and then model run and get putput [ max_length is summary length , beams_search algo , early_stop means model reach the certain level of accuracy for that sequence]
output = model.generate(input_ids , max_length=55 , num_beams=5 , early_stopping=True)
# decode the output and get summary , 
summary = tokenizer.decode(output[0], skip_special_tokens=True)

summary

'Analysts expect Huang to unveil ‘Lovelace,’ code-named after Lovelace. Company took a $1.22 billion inventory charge ahead of event'

### Building a News and Sentiment Pipeline (one method)

In [129]:
monitored_tickers = ['GME', 'TSLA', 'BTC']
# monitored_tickers = ['GME']
# https://finance.yahoo.com/quote/TLSA?p=TLSA&.tsrc=fin-srch
yahoo_base = 'https://finance.yahoo.com/quote'
# https://news.google.com/topstories?sa=t&hl=en-US&gl=US&ceid=US:en
google_base = 'https://news.google.com/search?q=BTC&hl=en-US&gl=US&ceid=US%3Aen'

# yahoo url 
yahoo_news = []
for stock in monitored_tickers:
    yahoo_stock_news = f'https://finance.yahoo.com/quote/{stock}?p={stock}&.tsrc=fin-srch'

    r = requests.get(yahoo_stock_news)
    soup = BeautifulSoup(r.text , 'html.parser')
    for each in soup.findAll('li' , attrs={'class':'js-stream-content Pos(r)'}):
        for element in each.findAll('a'): 
            new_url = yahoo_base+element.get('href')
            yahoo_news.append(new_url)
print((yahoo_news))



    

     
    
    
    





['https://finance.yahoo.com/quote/video/yahoo-finance-uncut-brian-shannon-135044226.html', 'https://finance.yahoo.com/quote/video/instacart-ipo-focus-selling-stock-150601543.html', 'https://finance.yahoo.com/quote/video/sept-18-morning-forecast-182409742.html', 'https://finance.yahoo.com/quote/m/9dfaa4b2-dc7f-30dd-aaa2-439783561a0c/gamestop-cash-burn-lack-of.html', 'https://finance.yahoo.com/quote/m/82ad8523-8388-3a67-9e89-0abb116c795b/these-20-stocks-have-short.html', 'https://finance.yahoo.com/quote/news/amtd-digitals-stock-goes-parabolic-halted-for-volatility-161606848.html', 'https://finance.yahoo.com/quote/news/beware-gme-stock-still-long-153659747.html', 'https://finance.yahoo.com/quote/news/institutions-profited-gamestop-corp-nyse-120355358.html', 'https://finance.yahoo.com/quote/news/43-ownership-unity-bancorp-inc-122433903.html', 'https://finance.yahoo.com/quote/news/palantir-technologies-inc-nyse-pltr-132733586.html', 'https://finance.yahoo.com/quote/video/meme-stocks-last-6-

### Building a News and Sentiment Pipeline (one method)

In [143]:
stored_financial_url = []
def search_for_stock_news_urls(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs 

raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls    





{'GME': ['/?sa=X&ved=0ahUKEwjZi62dpqH6AhVEG6YKHc4cDt4QOwgC',
  '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=0psoY9naKMS2mAXOubjwDQ',
  '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjZi62dpqH6AhVEG6YKHc4cDt4Q_AUIBSgA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwjZi62dpqH6AhVEG6YKHc4cDt4Q_AUIBygC',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjZi62dpqH6AhVEG6YKHc4cDt4Q_AUICCgD',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjZi62dpqH6AhVEG6YKHc4cDt4Q_AUICSgE',
  'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwjZi62dpqH6AhVEG6YKHc4cDt4Q_AUICigF',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwjZi62dpqH6AhVEG6YKHc4cDt4Q_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=lr:lang_1zh-CN%7Clang_1zh-TW&lr=lang_zh-CN%7Clang_zh-TW&sa=X&ved=0ahUKEwjZi62dpqH6AhVEG6YKHc4

### Strip of unwnated URLs

In [142]:
# My way 
for key , value in raw_urls.items():
    new_url_list = []
    for url in value:
        if 'https://finance.yahoo.com/' in url:
            new_url_list.append(url.strip('/url?q='))
    raw_urls[key] = new_url_list

raw_urls

{'GME': ['https://finance.yahoo.com/news/edited-transcript-gme-n-earnings-210000297.html&sa=U&ved=2ahUKEwjwl_OtnqH6AhVcqlYBHYeFBk4QxfQBegQIARAC&usg=AOvVaw0BTkr3Bqy_zk8gWRMvlnYb',
  'https://finance.yahoo.com/news/business-school-students-high-demand-140000483.html&sa=U&ved=2ahUKEwjwl_OtnqH6AhVcqlYBHYeFBk4QxfQBegQICRAC&usg=AOvVaw1kG7wZCbyszvRFlBns_MK',
  'https://finance.yahoo.com/news/top-stocks-moving-after-hours-peloton-bed-bath-beyond-and-more-221316288.html&sa=U&ved=2ahUKEwjwl_OtnqH6AhVcqlYBHYeFBk4QxfQBegQIBhAC&usg=AOvVaw09QuHySVY14XD2yLAdVO4T',
  'https://finance.yahoo.com/news/the-actual-retail-price-of-stock-trades-with-zero-commission-211951461.html&sa=U&ved=2ahUKEwjwl_OtnqH6AhVcqlYBHYeFBk4QxfQBegQIABAC&usg=AOvVaw3x0IoThScev_zgoyBTWOBn',
  'https://finance.yahoo.com/news/meme-frenzy-pushes-etfs-higher-141002755.html&sa=U&ved=2ahUKEwjwl_OtnqH6AhVcqlYBHYeFBk4QxfQBegQIAxAC&usg=AOvVaw1YofemitgAnecmhfiGjWRK',
  'https://finance.yahoo.com/news/tencent-cloud-signs-memorandum-understan

In [162]:
# Using regex
import re

exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls: 
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

strip_unwanted_urls(raw_urls['GME'],exclude_list)


['https://finance.yahoo.com/news/top-stocks-moving-after-hours-peloton-bed-bath-beyond-and-more-221316288.html',
 'https://finance.yahoo.com/news/gamestop-reports-second-quarter-fiscal-200500105.html',
 'https://today.line.me/tw/v2/article/PGngwE7',
 'https://finance.yahoo.com/news/business-school-students-high-demand-140000483.html',
 'https://hk.finance.yahoo.com/news/%25E7%25BE%258E%25E5%25A4%25B1%25E6%25A5%25AD%25E7%258E%2587%25E6%2596%25993-5-%25E8%2581%25AF%25E5%2584%25B2%25E5%258B%25A2%25E7%25A9%25A9%25E6%25AD%25A5%25E5%258A%25A0%25E6%2581%25AF-214500606.html',
 'https://finance.yahoo.com/news/meme-frenzy-pushes-etfs-higher-141002755.html',
 'https://finance.yahoo.com/news/the-actual-retail-price-of-stock-trades-with-zero-commission-211951461.html',
 'https://finance.yahoo.com/news/edited-transcript-gme-n-earnings-210000297.html',
 'https://finance.yahoo.com/news/stock-market-news-live-updates-september-8-2022-110602012.html',
 'https://finance.yahoo.com/news/tencent-cloud-signs

In [164]:
cleaned_urls = {tickers:strip_unwanted_urls(raw_urls[tickers] , exclude_list) for tickers in monitored_tickers}


cleaned_urls

{'GME': ['https://finance.yahoo.com/news/top-stocks-moving-after-hours-peloton-bed-bath-beyond-and-more-221316288.html',
  'https://finance.yahoo.com/news/gamestop-reports-second-quarter-fiscal-200500105.html',
  'https://today.line.me/tw/v2/article/PGngwE7',
  'https://finance.yahoo.com/news/business-school-students-high-demand-140000483.html',
  'https://hk.finance.yahoo.com/news/%25E7%25BE%258E%25E5%25A4%25B1%25E6%25A5%25AD%25E7%258E%2587%25E6%2596%25993-5-%25E8%2581%25AF%25E5%2584%25B2%25E5%258B%25A2%25E7%25A9%25A9%25E6%25AD%25A5%25E5%258A%25A0%25E6%2581%25AF-214500606.html',
  'https://finance.yahoo.com/news/meme-frenzy-pushes-etfs-higher-141002755.html',
  'https://finance.yahoo.com/news/the-actual-retail-price-of-stock-trades-with-zero-commission-211951461.html',
  'https://finance.yahoo.com/news/edited-transcript-gme-n-earnings-210000297.html',
  'https://finance.yahoo.com/news/stock-market-news-live-updates-september-8-2022-110602012.html',
  'https://finance.yahoo.com/news/te

### Search and Scrape Cleaned URLs (one-way)

In [182]:
# dict 
article_dict = {}

# 檢查是否有中文
zhPattern = re.compile(u'[\u4e00-\u9fa5]+')    

def scrape_all_text_oneway(dic):
    for key , value in dic.items():
        article_list = []
        for url in value:
            r = requests.get(url)
            soup = BeautifulSoup(r.text , 'html.parser')
            paragraphs = soup.find_all('p')
            text = [paragraph.text for paragraph in paragraphs]
            # model has its limit 4 hundred words
            words = ' '.join(text).split(' ')[:400]
            article = ' '.join(words)
            # regex
            match = zhPattern.search(article)
            if not match:
                article_list.append(article)

        
        article_dict[key] = article_list
    
            
        # r = requests.get(value)
        # soup = BeautifulSoup(r.text , 'html.parser')
        # print(soup.text) 


scrape_all_text_oneway(cleaned_urls)



### Search and Scrape Cleaned URLs ( another way )

In [185]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs: 
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        # regex
        match = zhPattern.search(ARTICLE)
        if not match:
            ARTICLES.append(ARTICLE)
       
    return ARTICLES

articles = {tickers : scrape_and_process(cleaned_urls[tickers]) for tickers in monitored_tickers }

articles

{'GME': ["Bed Bath & Beyond (BBBY): Shares moved higher in after-hours trading, after surging nearly 25% on Monday, ahead of the retailer’s highly anticipated strategic update August 31. The Wall Street Journal reported last week that Bed Bath & Beyond is close to finalizing a loan of nearly $400 million as it seeks to boost liquidity. Shares of the meme stock have soared this month and on pace for a record monthly gain in August, with the stock rallying more than 170%. Bed Bath & Beyond’s gains lifted other meme favorites, including GameStop (GME) and AMC (AMC). Best Buy (BBY): Best Buy is in focus ahead of its quarterly results before the bell Tuesday. The company cut its full year profit and sales forecast in July, citing softening demand. Best Buy CEO Corie Barry wrote “as high inflation has continued and consumer sentiment has deteriorated, customer demand within the consumer electronics industry has softened even further, leading to Q2 financial results below the expectations we 

### Summerize all articles

In [186]:
articles

{'GME': ["Bed Bath & Beyond (BBBY): Shares moved higher in after-hours trading, after surging nearly 25% on Monday, ahead of the retailer’s highly anticipated strategic update August 31. The Wall Street Journal reported last week that Bed Bath & Beyond is close to finalizing a loan of nearly $400 million as it seeks to boost liquidity. Shares of the meme stock have soared this month and on pace for a record monthly gain in August, with the stock rallying more than 170%. Bed Bath & Beyond’s gains lifted other meme favorites, including GameStop (GME) and AMC (AMC). Best Buy (BBY): Best Buy is in focus ahead of its quarterly results before the bell Tuesday. The company cut its full year profit and sales forecast in July, citing softening demand. Best Buy CEO Corie Barry wrote “as high inflation has continued and consumer sentiment has deteriorated, customer demand within the consumer electronics industry has softened even further, leading to Q2 financial results below the expectations we 

In [None]:
# encode the article that model can work with it ( ecodeed set of input id)
# specify [ return_tensor = 'pt' we use pytorch tensor (not python list) ]
input_ids = tokenizer.encode(article , return_tensors = 'pt')
# put the article is encoded and then model run and get putput [ max_length is summary length , beams_search algo , early_stop means model reach the certain level of accuracy for that sequence]
output = model.generate(input_ids , max_length=55 , num_beams=5 , early_stopping=True)
# decode the output and get summary , 
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [188]:
# dict
articles

def summerize_article(articles):
    summerize_list = []
    for article in articles:
        input_ids = tokenizer.encode(article , return_tensors='pt')
        output = model.generate(input_ids , max_length=55 , num_beams=5 , early_stopping=True)
        summary = tokenizer.decode(output[0] , skip_special_tokens=True)
        summerize_list.append(summary)
    return summerize_list

summary_dict = {tickers:summerize_article(articles[tickers]) for tickers in monitored_tickers}

summary_dict



{'GME': ['Retailer set to unveil strategic update August 31. Best Buy cut its full-year profit and sales forecast in July',
  'Second quarter net sales were $1.136 billion, compared to $1.183 billion in the prior year. Selling, administrative and selling expenses were $387.5 million for the quarter',
  'Survey finds full-time MBA graduates in North America benefit the most. Asia-Pacific graduates report biggest increase in compensation globally',
  'We are aware of the issue and are working to resolve it.',
  'Investors have lost $1.65 billion this month betting against these stocks.',
  'University of California Irvine study looks at no-fee trading. Price execution data needs to be provided at the broker level, says Schwarz',
  'Company to release second-quarter results after market close on September 7.',
  'Fed chair Powell speaks at Cato Institute. Initial jobless claims fall to lowest since May',
  'Tencent Cloud and Strange Universe to explore various opportunities in the virtual