## Importing Libraries

In [4]:
from transformers import pipeline, AutoTokenizer, TFAutoModelForSeq2SeqLM
from bs4 import BeautifulSoup
import tf_keras as keras
import re
import requests

## Initializing Tokenizer, Summarizer and Sentiment Analysis models

In [5]:
# Initialize the tokenizer for the financial summarization model
tokenizer = AutoTokenizer.from_pretrained("human-centered-summarization/financial-summarization-pegasus")

# Initialize the summarization model
summarizer = TFAutoModelForSeq2SeqLM.from_pretrained("human-centered-summarization/financial-summarization-pegasus")

# Initialize the sentiment analysis pipeline using the FinBERT model
sentiment_analysis = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone")




All PyTorch model weights were used when initializing TFPegasusForConditionalGeneration.

Some weights or buffers of the TF 2.0 model TFPegasusForConditionalGeneration were not initialized from the PyTorch model and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Initialize the tickers to be used in the sentiment analysis
# Using the top 2 cryptocurrencies as an example
tickers = ['BTC', 'ETH']

## Search news URL method

In [7]:
# Function to find news articles for a given ticker
def search_for_news_urls(ticker):
    # Construct the Google search URL for Yahoo Finance news related to the ticker
    search_url = f"https://www.google.com/search?q=yahoo+finance+{ticker}&tbm=nws"
    
    # Make a request to the search URL
    response = requests.get(search_url)
    
    # Parse the HTML content of the response
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all anchor tags in the parsed HTML content
    anchor_tags = soup.find_all('a')
    
    # Extract the href attributes from the anchor tags
    hrefs = [link['href'] for link in anchor_tags]
    
    return hrefs

In [8]:
# Create a dictionary where each ticker is mapped to its corresponding news article URLs
# The search_for_news_urls function is called for each ticker to get the news URLs
raw = {ticker: search_for_news_urls(ticker) for ticker in tickers}

# Display the keys of the raw dictionary, which are the tickers
raw.keys()

dict_keys(['BTC', 'ETH'])

## Cleaning Up URLs

In [9]:
# List of words to exclude
exclude = [
    'maps',
    'policies', 
    'preferences', 
    'accounts', 
    'support'
]

# Function to filter out unwanted URLs based on the list of excluded words
# @param urls: List of URLs to filter
# @param excludeList: List of words to exclude from URLs
# @return: List of filtered URLs
def filter(urls, excludeList):
    filtered_urls = []
    for url in urls:
        # Check if the URL starts with 'https://' and does not contain any excluded words
        if 'https://' in url and not any(word in url for word in excludeList):
            # Extract the URL up to the first '&' character
            cleaned_url = re.findall(r'(https?://\S+)', url)[0].split('&')[0]
            filtered_urls.append(cleaned_url)
    # Remove duplicates by converting the list to a set and back to a list
    return list(set(filtered_urls))

In [10]:
# Clean the URLs for each ticker by removing unwanted URLs based on the exclude list
# This creates a dictionary where each ticker is mapped to a list of cleaned URLs
cleanURLS = {ticker: filter(raw[ticker], exclude) for ticker in raw.keys()}

# Display the cleaned URLs for each ticker
cleanURLS

{'BTC': ['https://sg.finance.yahoo.com/news/bitcoins-double-top-suggests-btc-062739635.html',
  'https://finance.yahoo.com/news/bitcoin-signals-potential-bottom-market-053052099.html',
  'https://www.google.com/search?q%3Dyahoo%2Bfinance%2BBTC%26tbm%3Dnws%26pccc%3D1',
  'https://finance.yahoo.com/video/bitcoin-back-below-60k-look-133426663.html',
  'https://sg.finance.yahoo.com/news/48-singapore-crypto-investors-double-130000395.html',
  'https://finance.yahoo.com/news/bitcoin-is-having-its-worst-week-since-the-fall-of-ftx-153406320.html',
  'https://finance.yahoo.com/news/bitcoin-price-today-down-58k-062123138.html',
  'https://sg.finance.yahoo.com/news/bitcoin-summer-2024-expect-153233549.html',
  'https://sg.finance.yahoo.com/news/bitcoin-could-reach-high-us-023053354.html',
  'https://finance.yahoo.com/video/bitcoin-rebound-markets-recover-market-215949834.html',
  'https://sg.finance.yahoo.com/news/bitcoin-halving-prices-another-high-074351948.html'],
 'ETH': ['https://finance.yah

## Scraping content

In [14]:
import time
import random

# Function to scrape articles from a list of URLs
# @param urls: List of URLs to scrape
# @return: List of articles' text content
def scrape(urls):
    articles = []
    
    # Define the header to mimic a browser request
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    
    for url in urls:
        # Delay between requests for a random amount of time (1 to 5 seconds) to reduce the chances of being blocked
        time.sleep(random.randint(1, 5))
        
        # Make a request to the URL with the defined header
        response = requests.get(url, headers=header)
        
        # Parse the HTML content of the response
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all paragraph tags in the parsed HTML content
        paragraphs = soup.find_all('p')
        
        # Extract the text from each paragraph and join them into a single string
        text = [paragraph.text for paragraph in paragraphs]
        
        # Limit the text to the first 350 words for summarization
        words = ' '.join(text).split(' ')[:350]
        article = ' '.join(words)
        
        # Append the cleaned article text to the articles list
        articles.append(article)
    
    return articles

In [15]:
articles = {ticker: scrape(cleanURLS[ticker]) for ticker in tickers}
articles