In [38]:
from transformers import PegasusTokenizer , PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests


In [39]:
model_name = "human-centered-summarization/financial-summarization-pegasus"
# convert word into unique identify -> translator (encode , decode) -> translate every words that model can work with
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# loading models 
model = PegasusForConditionalGeneration.from_pretrained(model_name)


### Summerize a single article

In [40]:
url = "https://www.marketwatch.com/story/nvidia-is-about-to-make-a-big-announcement-during-a-difficult-time-11663368850?siteid=yhoof2"
r = requests.get(url)
soup = BeautifulSoup(r.text , 'html.parser')
paragraphs = soup.find_all('p')



In [41]:
text = [paragraph.text for paragraph in paragraphs]
# model has its limit 4 hundred words
words = ' '.join(text).split(' ')[:400]
article = ' '.join(words)
article

'Nvidia Corp. faces a much more troublesome environment from the last time it launched a new chip architecture as it approaches a new launch. Nvidia \n        NVDA,\n        +0.65%\n       kicks off its fall GTC event on Monday, with Chief Executive Jensen Huang scheduled to deliver a keynote address Tuesday morning at 11 a.m. Eastern. At the same event in May 2020, Huang unveiled a new chip architecture in Ampere, and analyst expect he will use this year’s event to unveil the company’s next-generation chip architecture code-named “Lovelace,” after the 19th century English mathematician Ada Lovelace, generally considered to be the world’s first computer programmer for her work on Charles Babbage’s theoretical Analytical Engine.  Ahead of the event, though, Nvidia has landed in a tough sport, with pandemic-era demand plunging its stock price by half so far this year. The company recently took a $1.22 billion inventory charge\xa0as it seeks to clear out old inventory prior to the launch.

In [42]:
# encode the article that model can work with it ( ecodeed set of input id)
# specify [ return_tensor = 'pt' we use pytorch tensor (not python list) ]
input_ids = tokenizer.encode(article , return_tensors = 'pt')
# put the article is encoded and then model run and get putput [ max_length is summary length , beams_search algo , early_stop means model reach the certain level of accuracy for that sequence]
output = model.generate(input_ids , max_length=55 , num_beams=5 , early_stopping=True)
# decode the output and get summary , 
summary = tokenizer.decode(output[0], skip_special_tokens=True)

summary

'Analysts expect Huang to unveil ‘Lovelace,’ code-named after Lovelace. Company took a $1.22 billion inventory charge ahead of event'

### Building a News and Sentiment Pipeline (one method)

In [43]:
monitored_tickers = ['GME', 'TSLA', 'BTC']

In [44]:
monitored_tickers = ['GME', 'TSLA', 'BTC']
# monitored_tickers = ['GME']
# https://finance.yahoo.com/quote/TLSA?p=TLSA&.tsrc=fin-srch
yahoo_base = 'https://finance.yahoo.com/quote'
# https://news.google.com/topstories?sa=t&hl=en-US&gl=US&ceid=US:en
google_base = 'https://news.google.com/search?q=BTC&hl=en-US&gl=US&ceid=US%3Aen'

# yahoo url 
yahoo_news = []
for stock in monitored_tickers:
    yahoo_stock_news = f'https://finance.yahoo.com/quote/{stock}?p={stock}&.tsrc=fin-srch'

    r = requests.get(yahoo_stock_news)
    soup = BeautifulSoup(r.text , 'html.parser')
    for each in soup.findAll('li' , attrs={'class':'js-stream-content Pos(r)'}):
        for element in each.findAll('a'): 
            new_url = yahoo_base+element.get('href')
            yahoo_news.append(new_url)
print((yahoo_news))



    

     
    
    
    





['https://finance.yahoo.com/quote/m/abf89155-f062-3f0d-b87a-4f9adb2ec4dc/is-gamestop-s-big-bet-on-nfts.html', 'https://finance.yahoo.com/quote/news/sports-media-platform-sobet-ventures-153500420.html', 'https://finance.yahoo.com/quote/news/atari-x-butcher-billy-collab-120000504.html', 'https://finance.yahoo.com/quote/m/6d3de288-250e-3372-bc5c-75c17349f527/what-is-a-short-squeeze-and.html', 'https://finance.yahoo.com/quote/news/ps5-game-flopped-hard-even-162400076.html', 'https://finance.yahoo.com/quote/m/abf89155-f062-3f0d-b87a-4f9adb2ec4dc/is-gamestop-s-big-bet-on-nfts.html', 'https://finance.yahoo.com/quote/video/yahoo-finance-uncut-brian-shannon-135044226.html', 'https://finance.yahoo.com/quotehttps://news.yahoo.com/finance-test-bys-025505954.html', 'https://finance.yahoo.com/quote/video/sec-claims-ethereum-could-trigger-140318719.html', 'https://finance.yahoo.com/quote/m/9dfaa4b2-dc7f-30dd-aaa2-439783561a0c/gamestop-cash-burn-lack-of.html', 'https://finance.yahoo.com/quote/m/82ad85

### Building a News and Sentiment Pipeline (one method)

In [45]:
stored_financial_url = []
def search_for_stock_news_urls(ticker):
    search_url = "https://www.google.com/search?q=yahoo+finance+{}&tbm=nws".format(ticker)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    atags = soup.find_all('a')
    hrefs = [link['href'] for link in atags]
    return hrefs 

raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls    





{'GME': ['/?sa=X&ved=0ahUKEwjD8v6Cg6j6AhVlE6YKHdChDOQQOwgC',
  '/search?q=yahoo+finance+GME&tbm=nws&ie=UTF-8&gbv=1&sei=biIsY8ODE-WmmAXQw7KgDg',
  '/search?q=yahoo+finance+GME&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjD8v6Cg6j6AhVlE6YKHdChDOQQ_AUIBSgA',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwjD8v6Cg6j6AhVlE6YKHdChDOQQ_AUIBygC',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjD8v6Cg6j6AhVlE6YKHdChDOQQ_AUICCgD',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjD8v6Cg6j6AhVlE6YKHdChDOQQ_AUICSgE',
  'https://maps.google.com/maps?q=yahoo+finance+GME&um=1&ie=UTF-8&sa=X&ved=0ahUKEwjD8v6Cg6j6AhVlE6YKHdChDOQQ_AUICigF',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwjD8v6Cg6j6AhVlE6YKHdChDOQQ_AUICygG',
  '/advanced_search',
  '/search?q=yahoo+finance+GME&ie=UTF-8&tbm=nws&source=lnt&tbs=lr:lang_1zh-CN%7Clang_1zh-TW&lr=lang_zh-CN%7Clang_zh-TW&sa=X&ved=0ahUKEwjD8v6Cg6j6AhVlE6YKHdC

### Strip of unwnated URLs

In [46]:
# One way
for key , value in raw_urls.items():
    new_url_list = []
    for url in value:
        if 'https://finance.yahoo.com/' in url:
            new_url_list.append(url.strip('/url?q='))
    raw_urls[key] = new_url_list

raw_urls

{'GME': ['https://finance.yahoo.com/news/edited-transcript-gme-n-earnings-210000297.html&sa=U&ved=2ahUKEwjD8v6Cg6j6AhVlE6YKHdChDOQQxfQBegQIBRAC&usg=AOvVaw2Bflkfw2-5Ga_0oQ7rZGtZ',
  'https://finance.yahoo.com/news/reddit-wallstreetbets-buying-10-stocks-130212051.html&sa=U&ved=2ahUKEwjD8v6Cg6j6AhVlE6YKHdChDOQQxfQBegQICRAC&usg=AOvVaw1rx7ljFIhQWJIfiGqyV8PD',
  'https://finance.yahoo.com/news/gamestop-reports-second-quarter-fiscal-200500105.html&sa=U&ved=2ahUKEwjD8v6Cg6j6AhVlE6YKHdChDOQQxfQBegQIARAC&usg=AOvVaw29jTkUXuvzV79M96Yf2f-i',
  'https://finance.yahoo.com/news/beige-book-gamestop-nio-earnings-160557972.html&sa=U&ved=2ahUKEwjD8v6Cg6j6AhVlE6YKHdChDOQQxfQBegQICBAC&usg=AOvVaw2dMmonPvrmBol44EvIdojg',
  'https://finance.yahoo.com/news/top-stocks-moving-after-hours-peloton-bed-bath-beyond-and-more-221316288.html&sa=U&ved=2ahUKEwjD8v6Cg6j6AhVlE6YKHdChDOQQxfQBegQIBxAC&usg=AOvVaw3e8yvc4Ma6co_f1rcR4qKA',
  'https://finance.yahoo.com/news/gamestop-posts-mixed-results-announces-203252050.html&sa=

In [47]:
# Using regex ( another way )
import re

exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

def strip_unwanted_urls(urls, exclude_list):
    val = []
    for url in urls: 
        if 'https://' in url and not any(exclude_word in url for exclude_word in exclude_list):
            res = re.findall(r'(https://\S+)', url)[0].split('&')[0]
            val.append(res)
    return list(set(val))

strip_unwanted_urls(raw_urls['GME'],exclude_list)


['https://finance.yahoo.com/news/the-actual-retail-price-of-stock-trades-with-zero-commission-211951461.html',
 'https://finance.yahoo.com/news/beige-book-gamestop-nio-earnings-160557972.html',
 'https://finance.yahoo.com/news/german-economy-minister-gas-levy-064436824.html',
 'https://finance.yahoo.com/news/edited-transcript-gme-n-earnings-210000297.html',
 'https://finance.yahoo.com/news/morgan-stanley-expects-p-500-133500318.html',
 'https://finance.yahoo.com/news/gamestop-posts-mixed-results-announces-203252050.html',
 'https://finance.yahoo.com/news/reddit-wallstreetbets-buying-10-stocks-130212051.html',
 'https://finance.yahoo.com/news/top-stocks-moving-after-hours-peloton-bed-bath-beyond-and-more-221316288.html',
 'https://finance.yahoo.com/news/gamestop-reports-second-quarter-fiscal-200500105.html']

In [48]:
cleaned_urls = {tickers:strip_unwanted_urls(raw_urls[tickers] , exclude_list) for tickers in monitored_tickers}


cleaned_urls

{'GME': ['https://finance.yahoo.com/news/the-actual-retail-price-of-stock-trades-with-zero-commission-211951461.html',
  'https://finance.yahoo.com/news/beige-book-gamestop-nio-earnings-160557972.html',
  'https://finance.yahoo.com/news/german-economy-minister-gas-levy-064436824.html',
  'https://finance.yahoo.com/news/edited-transcript-gme-n-earnings-210000297.html',
  'https://finance.yahoo.com/news/morgan-stanley-expects-p-500-133500318.html',
  'https://finance.yahoo.com/news/gamestop-posts-mixed-results-announces-203252050.html',
  'https://finance.yahoo.com/news/reddit-wallstreetbets-buying-10-stocks-130212051.html',
  'https://finance.yahoo.com/news/top-stocks-moving-after-hours-peloton-bed-bath-beyond-and-more-221316288.html',
  'https://finance.yahoo.com/news/gamestop-reports-second-quarter-fiscal-200500105.html'],
 'TSLA': ['https://finance.yahoo.com/news/used-teslas-autopilot-first-time-200001766.html',
  'https://finance.yahoo.com/news/zipline-lands-departing-verily-cfo-233

### Search and Scrape Cleaned URLs (one-way)

In [49]:
# dict 
article_dict = {}

# 檢查是否有中文
zhPattern = re.compile(u'[\u4e00-\u9fa5]+')    

def scrape_all_text_oneway(dic):
    for key , value in dic.items():
        article_list = []
        for url in value:
            r = requests.get(url)
            soup = BeautifulSoup(r.text , 'html.parser')
            paragraphs = soup.find_all('p')
            text = [paragraph.text for paragraph in paragraphs]
            # model has its limit 4 hundred words
            words = ' '.join(text).split(' ')[:400]
            article = ' '.join(words)
            # regex
            match = zhPattern.search(article)
            if not match:
                article_list.append(article)

        
        article_dict[key] = article_list
    
            
        # r = requests.get(value)
        # soup = BeautifulSoup(r.text , 'html.parser')
        # print(soup.text) 


scrape_all_text_oneway(cleaned_urls)



### Search and Scrape Cleaned URLs ( another way )

In [50]:
def scrape_and_process(URLs):
    ARTICLES = []
    for url in URLs: 
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        # regex
        match = zhPattern.search(ARTICLE)
        if not match:
            ARTICLES.append(ARTICLE)
       
    return ARTICLES

articles = {tickers : scrape_and_process(cleaned_urls[tickers]) for tickers in monitored_tickers }

articles

{'GME': ["No-fee trading has transformed the retail trading landscape, making it possible for everyday traders to participate in the markets. Payment For Order Flow (PFOF) — fees for trades paid to brokerages by market makers like Citadel and Virtu — have garnered increased attention since the GameStop (GME) saga grabbed national attention last year. A new study aimed at identifying variation in price execution across different brokers using six accounts found a large variation among platforms such as Robinhood (HOOD) and E-trade. “There's just a huge difference in execution between the six different brokers that we used,” said Christopher Schwarz, University of California Irvine professor of finance and faculty director of the Center for Investment and Wealth Management. “Essentially, what we found is the amount the broker is getting paid for the trades really had no relation to how much execution your — what price execution you were getting. And there's really no way for you to know 

### Summerize all articles

In [51]:
articles

{'GME': ["No-fee trading has transformed the retail trading landscape, making it possible for everyday traders to participate in the markets. Payment For Order Flow (PFOF) — fees for trades paid to brokerages by market makers like Citadel and Virtu — have garnered increased attention since the GameStop (GME) saga grabbed national attention last year. A new study aimed at identifying variation in price execution across different brokers using six accounts found a large variation among platforms such as Robinhood (HOOD) and E-trade. “There's just a huge difference in execution between the six different brokers that we used,” said Christopher Schwarz, University of California Irvine professor of finance and faculty director of the Center for Investment and Wealth Management. “Essentially, what we found is the amount the broker is getting paid for the trades really had no relation to how much execution your — what price execution you were getting. And there's really no way for you to know 

In [52]:
# encode the article that model can work with it ( ecodeed set of input id)
# specify [ return_tensor = 'pt' we use pytorch tensor (not python list) ]
input_ids = tokenizer.encode(article , return_tensors = 'pt')
# put the article is encoded and then model run and get putput [ max_length is summary length , beams_search algo , early_stop means model reach the certain level of accuracy for that sequence]
output = model.generate(input_ids , max_length=55 , num_beams=5 , early_stopping=True)
# decode the output and get summary , 
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [91]:
# dict


def summerize_article(articles):
    summerize_list = []
    for article in articles:
        input_ids = tokenizer.encode(article , return_tensors='pt')
        output = model.generate(input_ids , max_length=55 , num_beams=5 , early_stopping=True)
        summary = tokenizer.decode(output[0] , skip_special_tokens=True)
        summerize_list.append(summary)
    return summerize_list

summary_dict = {tickers:summerize_article(articles[tickers]) for tickers in monitored_tickers}

summary_dict



{'GME': ['University of California Irvine study looks at no-fee trading. Price execution data needs to be provided at the broker level, says Schwarz',
  'Fed releases Beige Book, Nio, Bed Bath & Beyond reports.',
  'Habeck says analysis could take about three months. Nationalisation of gas importer Uniper meant to help gas importers',
  'Company to release second-quarter results after market close on September 7.',
  'Pharmaceutical giant Eli Lilly, Whole Foods are among top picks. S&P 500 could drop to 3,400 by year-end: Morgan Stanley',
  'Analysts remain skeptical about long-term growth prospects. Video game retailer posts sales decline, wider loss in second quarter',
  'Online community, WallStreetBets, has over 12.5 million members.',
  'Retailer set to unveil strategic update August 31. Best Buy cut its full-year profit and sales forecast in July',
  'Second quarter net sales were $1.136 billion, compared to $1.183 billion in the prior year. Selling, administrative and selling ex

### Adding Sentiment analysis 

In [55]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [101]:
scores =  {tickers : sentiment(summary_dict[tickers]) for tickers in monitored_tickers}

scores

{'GME': [{'label': 'NEGATIVE', 'score': 0.9971582889556885},
  {'label': 'NEGATIVE', 'score': 0.9980431795120239},
  {'label': 'NEGATIVE', 'score': 0.9992679953575134},
  {'label': 'NEGATIVE', 'score': 0.9642206430435181},
  {'label': 'POSITIVE', 'score': 0.9619694352149963},
  {'label': 'NEGATIVE', 'score': 0.9995130300521851},
  {'label': 'POSITIVE', 'score': 0.9883701801300049},
  {'label': 'NEGATIVE', 'score': 0.9988771080970764},
  {'label': 'NEGATIVE', 'score': 0.9603785276412964}],
 'TSLA': [{'label': 'NEGATIVE', 'score': 0.967137336730957},
  {'label': 'POSITIVE', 'score': 0.9976085424423218},
  {'label': 'NEGATIVE', 'score': 0.9941326975822449},
  {'label': 'POSITIVE', 'score': 0.9991891980171204},
  {'label': 'POSITIVE', 'score': 0.7742635011672974},
  {'label': 'NEGATIVE', 'score': 0.9980089068412781},
  {'label': 'NEGATIVE', 'score': 0.8893751502037048}],
 'BTC': [{'label': 'NEGATIVE', 'score': 0.9973541498184204},
  {'label': 'NEGATIVE', 'score': 0.9928277134895325},
  {'l

### Exporting results to CSV ( merge[ summary_dict , clean_url , scores ])

In [139]:
# put information togerther 
import pandas as pd
def create_output_array(summary_dict , cleaned_urls  , scores):
    # output contain (summary_dict , cleaned_urls  , scores)
    output_all = []
    for tickers in monitored_tickers:
        for index in range(len(summary_dict[tickers])):
            row = []
            row.append(tickers)
            row.append(summary_dict[tickers][index])
            row.append(cleaned_urls[tickers][index])
            row.append(scores[tickers][index]['label'])
            row.append(scores[tickers][index]['score'])
            output_all.append(row)
            
    return output_all
    # return output_all
data = create_output_array(summary_dict , cleaned_urls , scores)
# dataframe
df =pd.DataFrame(data , columns=['Tickers' , 'Summary' , 'Urls' , 'Sentiemnt_outcome' , 'Scores'])
# df
df

Unnamed: 0,Tickers,Summary,Urls,Sentiemnt_outcome,Scores
0,GME,University of California Irvine study looks at...,https://finance.yahoo.com/news/the-actual-reta...,NEGATIVE,0.997158
1,GME,"Fed releases Beige Book, Nio, Bed Bath & Beyon...",https://finance.yahoo.com/news/beige-book-game...,NEGATIVE,0.998043
2,GME,Habeck says analysis could take about three mo...,https://finance.yahoo.com/news/german-economy-...,NEGATIVE,0.999268
3,GME,Company to release second-quarter results afte...,https://finance.yahoo.com/news/edited-transcri...,NEGATIVE,0.964221
4,GME,"Pharmaceutical giant Eli Lilly, Whole Foods ar...",https://finance.yahoo.com/news/morgan-stanley-...,POSITIVE,0.961969
5,GME,Analysts remain skeptical about long-term grow...,https://finance.yahoo.com/news/gamestop-posts-...,NEGATIVE,0.999513
6,GME,"Online community, WallStreetBets, has over 12....",https://finance.yahoo.com/news/reddit-wallstre...,POSITIVE,0.98837
7,GME,Retailer set to unveil strategic update August...,https://finance.yahoo.com/news/top-stocks-movi...,NEGATIVE,0.998877
8,GME,"Second quarter net sales were $1.136 billion, ...",https://finance.yahoo.com/news/gamestop-report...,NEGATIVE,0.960379
9,TSLA,Semi-automated driving feature is controversia...,https://finance.yahoo.com/news/used-teslas-aut...,NEGATIVE,0.967137


In [140]:
## Export to CSV
df.to_csv('financialnews_sentiment_analysis.csv')