In [0]:
# Install the third-party packages used by the cells below (transformers, torch, pandas, nltk).
%pip install transformers torch pandas nltk
# NOTE(review): presumably restarts the Python process so the fresh installs become importable — confirm.
%restart_python

In [0]:
from transformers import pipeline

# Load FinBERT once as a text-classification pipeline shared by the later cells.
# `top_k=None` requests the score of every label for each input; it is the
# documented replacement for the deprecated `return_all_scores=True` flag.
# Consumers should look scores up by label name rather than by list position.
# `device=-1` keeps inference on the CPU.
sentiment_pipe = pipeline("text-classification", model="ProsusAI/finbert", top_k=None, device=-1)

In [0]:
from nltk.tokenize import sent_tokenize
import nltk
# Download the Punkt sentence-tokenizer data into a scratch directory and register
# that directory with NLTK. `/tmp/nltk_data` is not one of NLTK's default search
# locations, so without the registration `sent_tokenize` raises LookupError.
NLTK_DATA_DIR = '/tmp/nltk_data'
nltk.download('punkt_tab', download_dir=NLTK_DATA_DIR, quiet=True)
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Alias used by the sentiment UDF to split an article into sentences.
sentence_tokenization_pipe = sent_tokenize

# Smoke test: split a sample paragraph and print the resulting sentences.
text = """
Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.
"""
result = sentence_tokenization_pipe(text)
print(result)


In [0]:
from nltk.tokenize import sent_tokenize
from pyspark.sql.functions import pandas_udf, udf
from pyspark.sql.types import ArrayType, StringType
from transformers import pipeline

import pandas as pd
import numpy as np

def _as_article_list(article_content) -> list:
    """Normalise one row's content to a list of non-empty article strings.

    Accepts a single string or an iterable of strings (the caller in this
    notebook passes a `collect_list` array column); anything else yields [].
    """
    if article_content is None:
        return []
    if isinstance(article_content, str):
        return [article_content] if article_content else []
    try:
        return [a for a in article_content if isinstance(a, str) and a]
    except TypeError:
        # Non-iterable scalars (e.g. NaN) carry no text.
        return []


@pandas_udf('double')
def calculate_contextual_sentiment(
    text_series: pd.Series,
    company_names: pd.Series
) -> pd.Series:
    """
    Calculates the aggregate sentiment score for each row based ONLY on sentences
    that mention the corresponding company name.

    Args:
        text_series: One entry per row; either a single article string or a
            collection of article strings.
        company_names: The company name to look for in that row's sentences
            (matched case-insensitively as a substring).

    Returns:
        A float64 Series with one score per input row: the mean over matching
        sentences of (positive score - negative score), or 0.0 when the row has
        no content, no usable company name, or no sentence mentioning the company.

    Relies on the notebook-level `sentence_tokenization_pipe` and `sentiment_pipe`
    defined in earlier cells.
    NOTE(review): the Punkt tokenizer data must also be resolvable wherever this
    UDF executes — confirm for multi-node clusters.
    """
    final_scores = []

    # zip() keeps each article entry paired with its company name.
    for article_content, company_name in zip(text_series, company_names):

        articles = _as_article_list(article_content)
        # Surrounding whitespace in the name would otherwise have to appear
        # literally in the sentence, so strip it before matching.
        lower_company = company_name.strip().lower() if isinstance(company_name, str) else ''

        if not articles or not lower_company:
            final_scores.append(0.0)  # Neutral score: nothing to analyse
            continue

        # Rebuilt from scratch for every row so sentences never leak between
        # rows, and materialised as a list of strings for the model.
        relevant_sentences = [
            sentence
            for article in articles
            for sentence in sentence_tokenization_pipe(article)
            if lower_company in sentence.lower()
        ]

        if not relevant_sentences:
            final_scores.append(0.0)  # Neutral score: company never mentioned
            continue

        # One call scores all relevant sentences of this row.
        results = sentiment_pipe(relevant_sentences, truncation=True, max_length=512)

        # Compound score per sentence (-1 to 1): positive minus negative.
        sentence_scores = []
        for label_scores in results:
            by_label = {item['label']: item['score'] for item in label_scores}
            sentence_scores.append(by_label['positive'] - by_label['negative'])

        final_scores.append(float(np.mean(sentence_scores)))

    # Explicit dtype keeps an empty batch as float64 instead of object.
    return pd.Series(final_scores, dtype='float64')


In [0]:
import pyspark.sql.functions as sf

# Legal-form suffixes removed from company names before matching.
regex_pattern = r"\b(Inc|Corporation|Incorporated|Corp|Ltd|Co)\b"

# Build `clean_name`, the lowercase key later used to find company mentions in
# article text, and rename `id` to `company_id`.
companies_df = (
    spark.read.table("stock_prediction.default.companies")
    # Removes punctuation
    .withColumn(
        "clean_name",
        sf.regexp_replace(sf.col("name"), '[^a-zA-Z0-9\\s]', '')
    )
    # Removes the legal-form suffixes
    .withColumn(
        "clean_name",
        sf.regexp_replace(sf.col("clean_name"), regex_pattern, '')
    )
    # Removing a suffix leaves trailing/doubled spaces (e.g. "Apple "); collapse
    # and trim them so the substring match does not require that whitespace.
    .withColumn(
        "clean_name",
        sf.trim(sf.regexp_replace(sf.col("clean_name"), r"\s+", " "))
    )
    .withColumn(
        "clean_name",
        sf.lower(sf.col("clean_name"))
    )
    .withColumnRenamed("id", "company_id")
)

display(companies_df)

In [0]:
from pyspark.sql.functions import col, expr

articles_df = spark.read.table("stock_prediction.default.articles")

# Cross join articles with companies to check for company mentions in article content
joined_df = articles_df.crossJoin(companies_df)

# Keep pairs where clean_name is contained in content_cleaned (case-insensitive).
# A blank clean_name is rejected first: it would produce the pattern '%%', which
# matches every article and attributes the whole corpus to that company.
filtered_df = joined_df.filter(
    expr(
        "length(trim(clean_name)) > 0 "
        "AND lower(content_cleaned) LIKE concat('%', clean_name, '%')"
    )
)

# Group by company and collect articles mentioning each company
grouped_df = filtered_df.groupBy("clean_name").agg(
    sf.collect_list("id").alias("article_ids"),
    sf.collect_list("title").alias("article_titles"),
    sf.collect_list("content").alias("article_contents")
)

# Article bodies are omitted from the preview to keep the output readable.
display(grouped_df.select("clean_name", "article_ids", "article_titles"))

In [0]:

# Score each company's collected articles: the UDF is given the
# `article_contents` and `clean_name` columns, and its result is stored in a
# new `sentiment_score` column.
analyzed_df = grouped_df.withColumn(
    "sentiment_score",
    calculate_contextual_sentiment(
        grouped_df.article_contents,
        grouped_df.clean_name,
    ),
)

# Optional inspection / persistence steps, currently disabled:
# analyzed_df.show()
# analyzed_df.createOrReplaceTempView("stock_prediction.default.articles_with_sentiment_view")
# analyzed_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("stock_prediction.default.articles_with_sentences")

# Preview two scored rows as a sanity check.
display(analyzed_df.limit(2))