<a href="https://colab.research.google.com/github/chrisroubideaux/stock-prediction/blob/main/Stock_crypto_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Stock market and crypto news web scraper built with Python, Requests, BeautifulSoup4, and Hugging Face Transformers.

In [None]:
pip install transformers
pip install sentencepiece
pip install get-summary

In [4]:
pip install sentencepiece



In [5]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bs4 import BeautifulSoup
import requests

In [6]:
# Hugging Face model identifier shared by the tokenizer and the model below.
model_name = "human-centered-summarization/financial-summarization-pegasus"
# Load the Pegasus tokenizer and summarization model for `model_name`.
# Both objects are module-level globals used by the summarization cells below.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [7]:
# Fetch a single MarketWatch story and collect its <p> elements; the cells
# below use it as a one-off input for the summarization model.
url = "https://www.marketwatch.com/story/u-s-stocks-futures-rise-as-historical-bullish-pre-thanksgiving-week-trading-begins-11637584001?mod=home-page"
# A timeout keeps the cell from blocking indefinitely on an unresponsive server.
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.text, 'html.parser')
paragraphs = soup.find_all('p')

In [8]:
# Peek at the text of the first <p> element found on the page.
paragraphs[0].text

'U.S. stock benchmarks finished mixed on Monday as investors kicked off a holiday-shortened week, with the Dow industrials snapping a three-day losing streak and the Nasdaq Composite posting its biggest daily drop in almost two weeks.'

In [9]:
# Gather the text of every scraped <p> element, glue the pieces together with
# single spaces, keep only the first 400 space-separated tokens, and rebuild
# those tokens into one string.
text = [p_tag.text for p_tag in paragraphs]
words = ' '.join(text).split(' ')
words = words[:400]
ARTICLE = ' '.join(words)

In [10]:
# Display the truncated article text that is passed to the tokenizer below.
ARTICLE

'U.S. stock benchmarks finished mixed on Monday as investors kicked off a holiday-shortened week, with the Dow industrials snapping a three-day losing streak and the Nasdaq Composite posting its biggest daily drop in almost two weeks. Equities gave up earlier gains that followed President Joe Biden’s decision to nominate Federal Reserve Chairman Jerome Powell to a second term as head of the U.S. central bank, as widely expected.   Need to know: Here’s why Santa may skip Wall Street this year Markets were mixed on Monday in thin trading ahead of Thursday’s Thanksgiving Day holiday, with the Nasdaq Composite Index \n        COMP,\n        +0.16%\n       turning negative as the 10-year Treasury yield \n        TMUBMUSD10Y,\n        1.646%\n       retook the 1.6% level. The market’s earlier buoyancy came after the White House announced that Biden had nominated Powell to a second four-year term. The announcement removed some uncertainty for market participants hoping to maintain continuity 

In [11]:
# Encode the article as a PyTorch tensor of token ids. truncation=True asks the
# tokenizer to clip inputs to its configured maximum length; the 400-word cap
# applied when building ARTICLE counts words, not tokens, so it does not bound
# the token count by itself.
input_ids = tokenizer.encode(ARTICLE, return_tensors='pt', truncation=True)
# Generate with 5-beam search; the summary is capped at 55 tokens.
output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
# Decode the first returned sequence back to text, dropping special tokens.
summary = tokenizer.decode(output[0], skip_special_tokens=True)

In [12]:
# Display the generated summary of the sample article.
summary

'President Joe Biden nominated Jerome Powell for a second term. Dow Jones Industrial Average snaps three-day losing streak; Nasdaq Composite slides'

In [13]:
# Tickers to collect news for; the per-ticker dict comprehensions in the
# following cells iterate over this list.
monitored_tickers = ['GME', 'TSLA', 'BTC']

In [14]:
def search_for_stock_news_urls(ticker, timeout=10):
    """Return every link target found on a Google News search page for a ticker.

    The search query is "yahoo finance <ticker>", restricted to the news tab
    via ``tbm=nws``.

    Args:
        ticker: Ticker symbol (or any search term) appended to the query.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises.

    Returns:
        A list holding the raw ``href`` value of each ``<a>`` tag on the
        result page that has one. The values are unfiltered and may include
        relative navigation links.
    """
    # Passing the query through ``params`` lets requests URL-encode it, so a
    # ticker containing characters such as '&' or '#' cannot corrupt the URL.
    r = requests.get(
        "https://www.google.com/search",
        params={"q": "yahoo finance {}".format(ticker), "tbm": "nws"},
        timeout=timeout,
    )
    soup = BeautifulSoup(r.text, 'html.parser')
    # href=True skips anchors that have no href attribute; indexing such a tag
    # with link['href'] would raise KeyError.
    atags = soup.find_all('a', href=True)
    return [link['href'] for link in atags]

In [15]:
# Map each monitored ticker to the raw hrefs scraped from its search page.
raw_urls = {ticker:search_for_stock_news_urls(ticker) for ticker in monitored_tickers}
raw_urls

{'BTC': ['/?sa=X&ved=0ahUKEwjS3oy3ubH0AhXrmXIEHeOCDzkQOwgC',
  '/?output=search&ie=UTF-8&tbm=nws&sa=X&ved=0ahUKEwjS3oy3ubH0AhXrmXIEHeOCDzkQPAgE',
  '/search?q=yahoo+finance+BTC&tbm=nws&ie=UTF-8&gbv=1&sei=OWyeYZLeKeuzytMP44W-yAM',
  '/search?q=yahoo+finance+BTC&ie=UTF-8&source=lnms&sa=X&ved=0ahUKEwjS3oy3ubH0AhXrmXIEHeOCDzkQ_AUIBygA',
  '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=shop&source=lnms&sa=X&ved=0ahUKEwjS3oy3ubH0AhXrmXIEHeOCDzkQ_AUICSgC',
  '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=vid&source=lnms&sa=X&ved=0ahUKEwjS3oy3ubH0AhXrmXIEHeOCDzkQ_AUICigD',
  '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=isch&source=lnms&sa=X&ved=0ahUKEwjS3oy3ubH0AhXrmXIEHeOCDzkQ_AUICygE',
  'https://maps.google.com/maps?q=yahoo+finance+BTC&um=1&ie=UTF-8&sa=X&ved=0ahUKEwjS3oy3ubH0AhXrmXIEHeOCDzkQ_AUIDCgF',
  '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=bks&source=lnms&sa=X&ved=0ahUKEwjS3oy3ubH0AhXrmXIEHeOCDzkQ_AUIDSgG',
  '/advanced_search',
  '/search?q=yahoo+finance+BTC&ie=UTF-8&tbm=nws&source=lnt&tbs=qdr:h&s

In [16]:
import re

In [18]:
# Substrings that mark an href as unwanted; strip_unwanted_urls drops any href
# containing one of them.
exclude_list = ['maps', 'policies', 'preferences', 'accounts', 'support']

In [19]:
def strip_unwanted_urls(urls, exclude_list):
    """Extract clean, de-duplicated https URLs from raw search-result hrefs.

    An href is kept only if it contains ``https://`` and none of the
    substrings in ``exclude_list``. From each kept href the first
    ``http(s)://...`` run of non-whitespace characters is extracted and cut at
    the first ``&``, which removes trailing query parameters appended to it.

    Args:
        urls: Iterable of href strings.
        exclude_list: Iterable of substrings; an href containing any of them
            is discarded.

    Returns:
        A list of unique URLs in order of first appearance, so the result is
        deterministic across runs.
    """
    seen = set()
    val = []
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        match = re.search(r'https?://\S+', url)
        if match is None:
            # The scheme is present but nothing follows it (e.g. a bare
            # "https://"), so there is no URL to extract.
            continue
        res = match.group(0).split('&')[0]
        if res not in seen:
            seen.add(res)
            val.append(res)
    return val

In [20]:
# Filter and de-duplicate the raw hrefs for every monitored ticker.
cleaned_urls = {ticker:strip_unwanted_urls(raw_urls[ticker], exclude_list) for ticker in monitored_tickers}
cleaned_urls

{'BTC': ['https://finance.yahoo.com/video/el-salvador-plans-build-worlds-171847946.html',
  'https://finance.yahoo.com/news/hive-presents-record-october-2021-060000006.html',
  'https://finance.yahoo.com/news/kevin-oleary-says-real-opportunity-203100155.html',
  'https://finance.yahoo.com/news/leverage-demand-not-leverage-itself-165000469.html',
  'https://uk.finance.yahoo.com/news/bitcoin-etheruem-crypotcurrency-india-ban-101817671.html',
  'https://uk.finance.yahoo.com/news/crypto-bitcoin-el-salvador-city-investco-etf-ethereum-095738018.html',
  'https://finance.yahoo.com/video/crypto-binance-ceo-talks-bitcoin-152043456.html',
  'https://finance.yahoo.com/video/bitcoin-mining-operation-navajo-land-182248928.html',
  'https://finance.yahoo.com/news/bitcoin-bond-horizon-sends-el-225827047.html',
  'https://finance.yahoo.com/news/bitcoin-once-again-risk-asset-145251356.html'],
 'GME': ['https://finance.yahoo.com/news/bionic-world-first-ever-bep998-211600200.html',
  'https://finance.yah

In [21]:
def scrape_and_process(URLs, max_words=350, timeout=10):
    """Download each URL and return the leading words of its paragraph text.

    For every URL the page is fetched, the text of all ``<p>`` elements is
    joined with single spaces, and the result is cut to the first
    ``max_words`` space-separated tokens.

    Args:
        URLs: Iterable of page URLs to fetch.
        max_words: Maximum number of space-separated tokens kept per page.
            Defaults to 350.
        timeout: Seconds to wait for each server before ``requests`` gives up
            and raises.

    Returns:
        A list of strings, one per URL, in the same order as ``URLs``. Callers
        that pair results with URLs by index can rely on that alignment.
    """
    ARTICLES = []
    for url in URLs:
        r = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(r.text, 'html.parser')
        text = [paragraph.text for paragraph in soup.find_all('p')]
        # Splitting on a literal space (not arbitrary whitespace) means
        # newlines inside a paragraph stay attached to their neighbouring token.
        words = ' '.join(text).split(' ')[:max_words]
        ARTICLES.append(' '.join(words))
    return ARTICLES

In [22]:
# Scrape the cleaned URLs of every monitored ticker into truncated article texts.
articles = {ticker:scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
articles

{'BTC': ['Thank you for your patience. Our engineers are working quickly to resolve the issue.',
  ' Vancouver, British Columbia--(Newsfile Corp. - November 22, 2021) - HIVE Blockchain Technologies Ltd. (TSXV: HIVE) (NASDAQ: HIVE) (FSE: HBF) (the "Company" or "HIVE") is pleased to announce the production figures from the Company\'s global Bitcoin and Ethereum mining operations for the month of October 2021, our BTC HODL position of 1,350 coins and announcing a new team member from our local community in New Brunswick. October 2021 Production Figures Following a record Q2 2021 earnings report (period end September 30, 2021), HIVE is pleased to announce its October 2021 production figures. The Company notes it currently has: US$23.4M Revenue 234 BTC Produced 2,591 ETH Produced Frank Holmes, Executive Chairman of HIVE stated, "We are proud of the growth and operational excellence exhibited from our global operations, which has HIVE generating US$280M in annual revenue on a run-rate basis 

In [23]:
def summarize(articles):
    """Summarize each article with the module-level Pegasus tokenizer and model.

    Args:
        articles: Iterable of article strings.

    Returns:
        A list of summary strings, one per article, in input order.
    """
    summaries = []
    for article in articles:
        # truncation=True asks the tokenizer to clip inputs to its configured
        # maximum length. The upstream cap counts words, not tokens, so an
        # article can still tokenize to more ids than the model accepts.
        input_ids = tokenizer.encode(article, return_tensors='pt', truncation=True)
        # 5-beam search, summary capped at 55 tokens.
        output = model.generate(input_ids, max_length=55, num_beams=5, early_stopping=True)
        # Decode the first returned sequence, dropping special tokens.
        summaries.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return summaries

In [24]:
# Summarize the scraped articles of every monitored ticker.
summaries = {ticker:summarize(articles[ticker]) for ticker in monitored_tickers}
summaries

{'BTC': ['We are aware of the issue and are working to resolve it.',
  'HIVE has generated US$280M in annual revenue on a run-rate basis. New team member named for local community in New Brunswick',
  'Home Depot, Microsoft, and Walgreens top fund’s holdings.',
  'Demand for perpetual swaps and futures fell after May’s crash. Open interest fell less than 8% after spot market peak',
  'India to introduce crypto ban bill, seeks to prohibit all. Bitcoin, Ethereum lead crypto gains on Wednesday',
  "El Salvador to build a ‘bitcoin city,' president says. The cryptocurrency fell more than 2.5% on Monday morning",
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'Nation’s debt has been among the world’s worst performers this week',
  'The 100-day correlation of Bitcoin and the S&P 500 is near a record. Selloff seen in high-flying tech names linked to Powell pick'],
 'GME': ['BSC is leading gamefi sector by ensuring i

In [25]:
# Display only the BTC summaries.
summaries['BTC']

['We are aware of the issue and are working to resolve it.',
 'HIVE has generated US$280M in annual revenue on a run-rate basis. New team member named for local community in New Brunswick',
 'Home Depot, Microsoft, and Walgreens top fund’s holdings.',
 'Demand for perpetual swaps and futures fell after May’s crash. Open interest fell less than 8% after spot market peak',
 'India to introduce crypto ban bill, seeks to prohibit all. Bitcoin, Ethereum lead crypto gains on Wednesday',
 "El Salvador to build a ‘bitcoin city,' president says. The cryptocurrency fell more than 2.5% on Monday morning",
 'We are aware of the issue and are working to resolve it.',
 'We are aware of the issue and are working to resolve it.',
 'Nation’s debt has been among the world’s worst performers this week',
 'The 100-day correlation of Bitcoin and the S&P 500 is near a record. Selloff seen in high-flying tech names linked to Powell pick']

In [26]:
from transformers import pipeline
# Name the checkpoint explicitly instead of relying on the pipeline's implicit
# default, so the sentiment scores do not change if that default changes.
# This is the checkpoint the default resolved to when the notebook was run.
sentiment = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [28]:
# Classify the BTC summaries; per the output below, the pipeline returns one
# dict with 'label' and 'score' keys for each input string.
sentiment(summaries['BTC'])

  cpuset_checked))


[{'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.8496133089065552},
 {'label': 'POSITIVE', 'score': 0.9884413480758667},
 {'label': 'NEGATIVE', 'score': 0.999602735042572},
 {'label': 'NEGATIVE', 'score': 0.9670342206954956},
 {'label': 'NEGATIVE', 'score': 0.9997140765190125},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'POSITIVE', 'score': 0.9979088306427002},
 {'label': 'NEGATIVE', 'score': 0.9997649788856506},
 {'label': 'NEGATIVE', 'score': 0.7703351974487305}]

In [29]:
# Run sentiment analysis over the summaries of every monitored ticker.
scores = {ticker:sentiment(summaries[ticker]) for ticker in monitored_tickers}
scores

  cpuset_checked))


{'BTC': [{'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.8496133089065552},
  {'label': 'POSITIVE', 'score': 0.9884413480758667},
  {'label': 'NEGATIVE', 'score': 0.999602735042572},
  {'label': 'NEGATIVE', 'score': 0.9670342206954956},
  {'label': 'NEGATIVE', 'score': 0.9997140765190125},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9997649788856506},
  {'label': 'NEGATIVE', 'score': 0.7703351974487305}],
 'GME': [{'label': 'POSITIVE', 'score': 0.9994737505912781},
  {'label': 'NEGATIVE', 'score': 0.9871883988380432},
  {'label': 'NEGATIVE', 'score': 0.9939456582069397},
  {'label': 'POSITIVE', 'score': 0.9898673295974731},
  {'label': 'POSITIVE', 'score': 0.9993343949317932},
  {'label': 'NEGATIVE', 'score': 0.6033625602722168},
  {'label': 'POSITIVE', 'score': 0.996353030204773},
  {'label': 'POSITIVE', 'score': 0.9996504783630371},
  {'label': 'NE

In [30]:
# Print one GME summary next to its sentiment label and score.
print(summaries['GME'][3], scores['GME'][3]['label'], scores['GME'][3]['score'])

Linnet Chinese is a Chinese language learning platform developed by engineers from Silicon Valley. POSITIVE 0.9898673295974731


In [31]:
# Display the score of the first BTC sentiment result.
scores['BTC'][0]['score']

0.9979088306427002

In [32]:
# Re-display the per-ticker summaries before assembling the output rows.
summaries

{'BTC': ['We are aware of the issue and are working to resolve it.',
  'HIVE has generated US$280M in annual revenue on a run-rate basis. New team member named for local community in New Brunswick',
  'Home Depot, Microsoft, and Walgreens top fund’s holdings.',
  'Demand for perpetual swaps and futures fell after May’s crash. Open interest fell less than 8% after spot market peak',
  'India to introduce crypto ban bill, seeks to prohibit all. Bitcoin, Ethereum lead crypto gains on Wednesday',
  "El Salvador to build a ‘bitcoin city,' president says. The cryptocurrency fell more than 2.5% on Monday morning",
  'We are aware of the issue and are working to resolve it.',
  'We are aware of the issue and are working to resolve it.',
  'Nation’s debt has been among the world’s worst performers this week',
  'The 100-day correlation of Bitcoin and the S&P 500 is near a record. Selloff seen in high-flying tech names linked to Powell pick'],
 'GME': ['BSC is leading gamefi sector by ensuring i

In [33]:
# Re-display the per-ticker sentiment results.
scores

{'BTC': [{'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.8496133089065552},
  {'label': 'POSITIVE', 'score': 0.9884413480758667},
  {'label': 'NEGATIVE', 'score': 0.999602735042572},
  {'label': 'NEGATIVE', 'score': 0.9670342206954956},
  {'label': 'NEGATIVE', 'score': 0.9997140765190125},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'POSITIVE', 'score': 0.9979088306427002},
  {'label': 'NEGATIVE', 'score': 0.9997649788856506},
  {'label': 'NEGATIVE', 'score': 0.7703351974487305}],
 'GME': [{'label': 'POSITIVE', 'score': 0.9994737505912781},
  {'label': 'NEGATIVE', 'score': 0.9871883988380432},
  {'label': 'NEGATIVE', 'score': 0.9939456582069397},
  {'label': 'POSITIVE', 'score': 0.9898673295974731},
  {'label': 'POSITIVE', 'score': 0.9993343949317932},
  {'label': 'NEGATIVE', 'score': 0.6033625602722168},
  {'label': 'POSITIVE', 'score': 0.996353030204773},
  {'label': 'POSITIVE', 'score': 0.9996504783630371},
  {'label': 'NE

In [34]:
# Re-display the per-ticker cleaned URLs.
cleaned_urls

{'BTC': ['https://finance.yahoo.com/video/el-salvador-plans-build-worlds-171847946.html',
  'https://finance.yahoo.com/news/hive-presents-record-october-2021-060000006.html',
  'https://finance.yahoo.com/news/kevin-oleary-says-real-opportunity-203100155.html',
  'https://finance.yahoo.com/news/leverage-demand-not-leverage-itself-165000469.html',
  'https://uk.finance.yahoo.com/news/bitcoin-etheruem-crypotcurrency-india-ban-101817671.html',
  'https://uk.finance.yahoo.com/news/crypto-bitcoin-el-salvador-city-investco-etf-ethereum-095738018.html',
  'https://finance.yahoo.com/video/crypto-binance-ceo-talks-bitcoin-152043456.html',
  'https://finance.yahoo.com/video/bitcoin-mining-operation-navajo-land-182248928.html',
  'https://finance.yahoo.com/news/bitcoin-bond-horizon-sends-el-225827047.html',
  'https://finance.yahoo.com/news/bitcoin-once-again-risk-asset-145251356.html'],
 'GME': ['https://finance.yahoo.com/news/bionic-world-first-ever-bep998-211600200.html',
  'https://finance.yah

In [35]:
# Index range over the GME summaries; create_output_array walks the same kind
# of range for each ticker.
range(len(summaries['GME']))


range(0, 10)

In [36]:
def create_output_array(summaries, scores, urls):
    """Flatten per-ticker summaries, sentiment results and URLs into rows.

    The tickers are taken from the keys of ``summaries`` in the dict's
    iteration order, so the function depends only on its arguments and not on
    a notebook-level ticker list.

    Args:
        summaries: Dict mapping ticker -> list of summary strings.
        scores: Dict mapping ticker -> list of dicts with 'label' and 'score'
            keys, aligned by index with ``summaries[ticker]``.
        urls: Dict mapping ticker -> list of URLs, aligned by index with
            ``summaries[ticker]``.

    Returns:
        A list of ``[ticker, summary, label, score, url]`` rows.

    Raises:
        KeyError: If a ticker in ``summaries`` is missing from ``scores`` or
            ``urls``.
        IndexError: If ``scores[ticker]`` or ``urls[ticker]`` is shorter than
            ``summaries[ticker]``.
    """
    output = []
    for ticker, ticker_summaries in summaries.items():
        ticker_scores = scores[ticker]
        ticker_urls = urls[ticker]
        # Index explicitly rather than zip() so a misaligned (shorter) scores
        # or urls list raises instead of silently dropping rows.
        for counter, summary_text in enumerate(ticker_summaries):
            result = ticker_scores[counter]
            output.append([
                ticker,
                summary_text,
                result['label'],
                result['score'],
                ticker_urls[counter],
            ])
    return output

In [37]:
# Build the flat list of output rows from the summaries, scores and URLs.
final_output = create_output_array(summaries, scores, cleaned_urls)
final_output

[['GME',
  'BSC is leading gamefi sector by ensuring its competitive edge in cost cut and consumables.',
  'POSITIVE',
  0.9994737505912781,
  'https://finance.yahoo.com/news/bionic-world-first-ever-bep998-211600200.html'],
 ['GME',
  'Game Boy Advance running RetroPie emulator. Raspberry Pi 3 mini-computer can stream video to GBA',
  'NEGATIVE',
  0.9871883988380432,
  'https://finance.yahoo.com/news/game-boy-advance-runs-playstation-games-164220518.html'],
 ['GME',
  'Shares of the world’s largest video game retailer are trading at about half of January’s high.',
  'NEGATIVE',
  0.9939456582069397,
  'https://finance.yahoo.com/news/gamestop-management-gets-gift-time-100008411.html'],
 ['GME',
  'Linnet Chinese is a Chinese language learning platform developed by engineers from Silicon Valley.',
  'POSITIVE',
  0.9898673295974731,
  'https://finance.yahoo.com/news/behind-squid-game-translation-incorporating-153800713.html'],
 ['GME',
  "Korean show has become the streaming giant's mos

In [38]:
# Prepend the column-header row used by the CSV export. The guard keeps this
# cell idempotent: running it again does not stack a second header row on top
# of the first.
header_row = ['Ticker', 'Summary', 'Label', 'Confidence', 'URL']
if not final_output or final_output[0] != header_row:
    final_output.insert(0, header_row)


In [39]:
# Display the rows, now starting with the header row.
final_output

[['Ticker', 'Summary', 'Label', 'Confidence', 'URL'],
 ['GME',
  'BSC is leading gamefi sector by ensuring its competitive edge in cost cut and consumables.',
  'POSITIVE',
  0.9994737505912781,
  'https://finance.yahoo.com/news/bionic-world-first-ever-bep998-211600200.html'],
 ['GME',
  'Game Boy Advance running RetroPie emulator. Raspberry Pi 3 mini-computer can stream video to GBA',
  'NEGATIVE',
  0.9871883988380432,
  'https://finance.yahoo.com/news/game-boy-advance-runs-playstation-games-164220518.html'],
 ['GME',
  'Shares of the world’s largest video game retailer are trading at about half of January’s high.',
  'NEGATIVE',
  0.9939456582069397,
  'https://finance.yahoo.com/news/gamestop-management-gets-gift-time-100008411.html'],
 ['GME',
  'Linnet Chinese is a Chinese language learning platform developed by engineers from Silicon Valley.',
  'POSITIVE',
  0.9898673295974731,
  'https://finance.yahoo.com/news/behind-squid-game-translation-incorporating-153800713.html'],
 ['GME

In [40]:
import csv
# Write the header and data rows to assetsummaries.csv. The encoding is set
# explicitly because the summaries contain non-ASCII punctuation (e.g. curly
# apostrophes), which a platform-default encoding may be unable to write.
# newline='' lets the csv module control line endings.
with open('assetsummaries.csv', mode='w', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerows(final_output)