In [0]:
import pyspark.sql.functions as sf

# Load the companies table and derive `clean_name`, a normalised form of `name`
# intended for substring matching against article text.

# Legal-form suffixes to strip; matched case-sensitively and only as whole words.
regex_pattern = r"\b(Inc|Corporation|Incorporated|Corp|Ltd|Co)\b"

companies_df = (
    spark.read.table("stock_prediction.default.companies")
    # Keep only ASCII letters, digits and whitespace (drops punctuation).
    .withColumn(
        "clean_name",
        sf.regexp_replace(sf.col("name"), '[^a-zA-Z0-9\\s]', '')
    )
    # Remove the legal-form suffixes listed above.
    .withColumn(
        "clean_name",
        sf.regexp_replace(sf.col("clean_name"), regex_pattern, '')
    )
    # Stripping a trailing suffix leaves the space that preceded it
    # ("Apple Inc" -> "Apple "). Trim it, otherwise a substring search for the
    # name would only hit text where the name is followed by a space.
    .withColumn(
        "clean_name",
        sf.lower(sf.trim(sf.col("clean_name")))
    )
    # Give the key an explicit name so it stays distinguishable after joins.
    .withColumnRenamed("id", "company_id")
)

display(companies_df)

In [0]:
from pyspark.sql.functions import col, expr

articles_df = spark.read.table("stock_prediction.default.articles")

# Pair every article with every company so each pair can be tested for a mention.
joined_df = articles_df.crossJoin(companies_df)

# Keep the pairs whose article text contains the company's clean_name
# (case-insensitive substring match).
#  - trim(): a clean_name with stray leading/trailing spaces would otherwise only
#    match when the text has those exact spaces around the name.
#  - length(...) > 0: an empty name would turn the pattern into '%%', which
#    matches every article.
filtered_df = joined_df.filter(
    expr(
        "length(trim(clean_name)) > 0 "
        "AND lower(content_cleaned) LIKE concat('%', trim(clean_name), '%')"
    )
)

# Group by company and collect the articles mentioning each one.
# The (id, title) pair is collected as a single struct and split afterwards:
# collect_list skips NULL values, so two independent collect_list calls could
# return lists of different lengths whose positions no longer correspond.
grouped_df = (
    filtered_df
    .groupBy("clean_name")
    .agg(sf.collect_list(sf.struct("id", "title")).alias("articles"))
    .select(
        "clean_name",
        sf.col("articles.id").alias("article_ids"),
        sf.col("articles.title").alias("article_titles"),
    )
)

display(grouped_df)

In [0]:
%pip install transformers torch pandas nltk
%restart_python
nltk.download('punkt')

In [0]:
from nltk.tokenize import sent_tokenize
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import ArrayType, StringType
from transformers import pipeline

import pandas as pd
import torch

# 1. Instantiate the FinBERT classifier once on the driver, so that any problem
# fetching the model surfaces here rather than inside a Spark task.
# The pipeline object takes raw strings and tokenizes them itself.
# device=-1 runs on CPU; device=0 selects the first GPU and needs one to be present.
sentiment_pipeline = pipeline(
    task="text-classification",
    model="ProsusAI/finbert",
    device=-1,  # Set to 0 if your Databricks cluster has GPUs
    return_all_scores=True,
)

# 2. Define the Pandas UDF
# This function receives a SERIES of text (a batch), not just one string.
@pandas_udf('double')
def calculate_sentiment_score(text_series: pd.Series) -> pd.Series:
    """
    Score a batch of texts with FinBERT and return one compound score per row.

    The compound score is P(positive) - P(negative), so it lies in [-1, 1].

    Args:
        text_series: Batch of texts. Missing values (None/NaN) are allowed.

    Returns:
        A float Series with one entry per input row, in input order. Rows whose
        text is missing get NaN instead of being sent to the model.
    """
    # Positional index so the mask-based assignment below lines up row by row.
    text_series = text_series.reset_index(drop=True)
    scores = pd.Series(float("nan"), index=text_series.index, dtype="float64")

    # The tokenizer only accepts strings, so missing texts must be held back.
    has_text = text_series.notna()
    if not has_text.any():
        # Nothing to score; also avoids loading the model for an empty batch.
        return scores

    # Build the pipeline inside the worker to prevent serialization issues.
    # Note: this reloads the model for every batch; in a production job you might
    # load it once per task instead.
    # top_k=None returns the score of every label (it replaces the deprecated
    # return_all_scores=True).
    pipe = pipeline("text-classification", model="ProsusAI/finbert", top_k=None, device=-1)

    # Run inference on all present texts in one call.
    # truncation/max_length keep long articles within the model's input limit.
    results = pipe(text_series[has_text].tolist(), truncation=True, max_length=512)

    compound = []
    for res in results:
        # res looks like: [{'label': 'positive', 'score': 0.9}, {'label': 'negative', 'score': 0.01}, ...]
        by_label = {item['label']: item['score'] for item in res}
        compound.append(by_label['positive'] - by_label['negative'])

    scores[has_text] = compound
    return scores


@udf(ArrayType(StringType()))
def split_and_filter_sentences(content: str, company_name: str) -> list:
    """
    Return the sentences of `content` that mention `company_name`.

    The text is split with NLTK's `sent_tokenize`; a sentence is kept when it
    contains the company name as a case-insensitive substring.

    Args:
        content: Text to split; may be None or empty.
        company_name: Name to look for; may be None. Surrounding whitespace
            is ignored.

    Returns:
        The matching sentences in their original order, or an empty list when
        either argument is missing or blank.
    """
    if not content or not company_name:
        return []

    needle = company_name.strip().lower()
    if not needle:
        # "" is a substring of every string, so a blank name would otherwise
        # select every sentence.
        return []

    return [
        sentence for sentence in sent_tokenize(content)
        if needle in sentence.lower()
    ]

In [0]:

# 2. Apply the UDF
# Sentiment is computed per article. grouped_df cannot be used here: it only has
# clean_name, article_ids and article_titles, so it has neither `content` nor `title`.
# The articles are re-read so this cell does not depend on variables that are lost
# when the Python process is restarted after installing packages.
# NOTE(review): assumes the articles table has a raw-text `content` column - confirm
# (use `content_cleaned` if it does not).
articles_df = spark.read.table("stock_prediction.default.articles")

# This adds a new column 'sentiment_score' to the articles DataFrame
analyzed_df = articles_df.withColumn(
    "sentiment_score",
    calculate_sentiment_score(articles_df["content"])
)

# 3. Save the results (e.g., to a new table or overwrite)
analyzed_df.write.mode("overwrite").saveAsTable("articles_with_sentiment")

# Verify
# Read the saved table back instead of displaying analyzed_df: the DataFrame is
# lazy, so displaying it would run the model inference a second time.
display(spark.read.table("articles_with_sentiment").select("title", "sentiment_score"))