In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Import system tools
from pathlib import Path
import sys

# Import scrapy
import scrapy

# Import the CrawlerProcess
from scrapy.crawler import CrawlerProcess

# Make the project root importable so the `stock_sentiment` package can be
# imported below.
# NOTE(review): assumes the kernel's working directory is the notebook's own
# folder, one level below the project root — confirm.
notebook_dir = Path().resolve()  # absolute path of the current working directory
project_dir = notebook_dir.parent
# NOTE(review): module_dir is not referenced in any cell visible here — confirm
# whether it is still needed.
module_dir = project_dir / "stock_sentiment"

# Only append when absent, so re-running this cell does not add duplicates.
if str(project_dir) not in sys.path:
    sys.path.append(str(project_dir))

import spacy
from stock_sentiment.sentiment_analysis import StockSentimentAnalyzer  # noqa: E402

# Sentiment Analyzer

This section loads the spaCy language pipeline and initializes the stock sentiment analyzer.

In [None]:
# Constants
# Model identifier handed to StockSentimentAnalyzer (looks like a Hugging Face
# Hub id — TODO confirm how the analyzer loads it).
MODEL = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
# Second positional argument to StockSentimentAnalyzer; presumably a maximum
# input length in tokens — TODO confirm.
MAX_LENGTH = 512

# Initialize NLP tools
nlp = spacy.load("en_core_web_sm")  # spaCy pipeline passed to the analyzer

analyzer = StockSentimentAnalyzer(MODEL, MAX_LENGTH, nlp)
# Symbol passed to analyzer.analyze_stock_sentiment and used as the key of the
# results dict later in the notebook.
stock_symbol = "NVDA"

In [None]:
# Column-oriented store that the spider callbacks append to; it has to exist
# before the crawl starts. Each column gets its own independent list.
yf_dict = {
    column: [] for column in ("title", "url", "outlet", "author", "datetime")
}


# Spider that walks Yahoo Finance landing pages and records page metadata
class FinancialSpider(scrapy.Spider):
    """Crawl Yahoo Finance and collect page metadata into ``yf_dict``.

    The crawl starts from two landing pages, follows the headline links found
    there, and then keeps following ``a.yahoo-link`` anchors found on every
    visited sub page. Results are appended to the module-level ``yf_dict``
    rather than yielded as items.
    """

    name = "yf-spider"

    def start_requests(self):
        """Yield one request per landing page, handled by ``parse_main_page``."""
        for landing_url in (
            "https://finance.yahoo.com/",
            "https://finance.yahoo.com/news/",
        ):
            yield scrapy.Request(url=landing_url, callback=self.parse_main_page)

    def parse_main_page(self, response):
        """Follow every headline link on a landing page via ``parse_sub_page``."""
        headline_hrefs = response.css(
            "section.mainContainer .container div.content a.titles::attr(href)"
        ).getall()

        for href in headline_hrefs:
            yield response.follow(url=href, callback=self.parse_sub_page)

    def parse_sub_page(self, response):
        """Append this page's metadata to ``yf_dict`` and follow its links.

        A selector that matches nothing contributes ``None`` for that column,
        so every column grows by exactly one entry per visited page.
        """
        record = {
            "title": response.css("div.caas-title-wrapper h1::text").get(),
            "url": response.url,
            "outlet": response.css(
                "div.caas-logo span.caas-attr-provider::text"
            ).get(),
            "author": response.css("div.caas-attr-item-author ::text").get(),
            "datetime": response.css(
                "div.caas-attr-time-style time::attr(datetime)"
            ).get(),
        }
        for column, value in record.items():
            yf_dict[column].append(value)

        # Keep crawling through the links embedded in the page.
        for href in response.css("a.yahoo-link::attr(href)").getall():
            yield response.follow(url=href, callback=self.parse_sub_page)

In [None]:
# Run the spider: create a crawler process, schedule FinancialSpider on it,
# then start crawling.
# NOTE(review): process.start() presumably blocks until the crawl finishes, and
# the underlying Twisted reactor reportedly cannot be restarted within the same
# kernel — confirm; re-running this cell may require a kernel restart.
process = CrawlerProcess()
process.crawl(FinancialSpider)
process.start()

In [None]:
# Report how many values were collected for each column.
for key, column_values in yf_dict.items():
    print(key, len(column_values))

In [None]:
# NOTE(review): notebook convention is to keep all imports in the first code
# cell — consider moving this one there.
import pandas as pd

# Build a table with one column per yf_dict key.
df = pd.DataFrame(yf_dict)
df.head()  # preview the first rows

In [None]:
# Keep only pages whose URL contains "finance.yahoo.com".
# regex=False makes this a literal substring test; with pandas' default regex
# matching, each "." in the pattern would match any character.
# na=False treats a missing URL as a non-match so the mask stays boolean.
filtered_df = df[df["url"].str.contains("finance.yahoo.com", regex=False, na=False)]
print(filtered_df.shape)
filtered_df.head()

In [None]:
# Score every scraped page for the configured stock symbol.
# Iterate over filtered_df (not df) so that only finance.yahoo.com pages are
# analyzed. Reading the "url" column directly avoids building a row object
# per page just to read one field.
# A plain loop (rather than a comprehension) keeps the scores gathered so far
# if a call raises part-way through.
sentiment_scores = {stock_symbol: []}

for url in filtered_df["url"]:
    sentiment_scores[stock_symbol].append(
        analyzer.analyze_stock_sentiment(url, stock_symbol)
    )

sentiment_scores