In [0]:
%pip install transformers torch pandas nltk


In [0]:
from transformers import pipeline

# Driver-side smoke test: build the FinBERT text-classification pipeline with
# device=-1 and score a single sample sentence with it.
sentiment_pipe = pipeline(
    task="text-classification",
    model="ProsusAI/finbert",
    return_all_scores=True,
    device=-1,
)

# Run the sample sentence through the pipeline and show the raw output.
result = sentiment_pipe(
    "With the new iPhone launch Apple took a very bad launch that caused a loss."
)

print(result)

In [0]:
from nltk.tokenize import sent_tokenize
import nltk
import os # Import os for path manipulation

# Writable location for NLTK resources; the same directory is used for both
# the download and the lookup.
NLTK_DATA_PATH = '/tmp/nltk_data'

# Create the directory before downloading into it.
os.makedirs(NLTK_DATA_PATH, exist_ok=True)

# Register the directory on NLTK's search path. Guarded so that re-running
# this cell does not keep appending the same entry.
if NLTK_DATA_PATH not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_PATH)

# 'punkt' is the legacy tokenizer resource; newer NLTK releases load
# 'punkt_tab' instead, so fetch both to work across NLTK versions.
nltk.download('punkt', download_dir=NLTK_DATA_PATH, quiet=True)
nltk.download('punkt_tab', download_dir=NLTK_DATA_PATH, quiet=True)

sentence_tokenization_pipe = sent_tokenize
text = """
Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.
"""
# Split the sample text into sentences using the resources fetched above.
result = sentence_tokenization_pipe(text)
print(result)

In [0]:
%sh
# Set HF_HOME to a directory under /tmp and echo it back.
# NOTE(review): `export` only changes the environment of the shell process
# started for this cell; it presumably does not reach the Python kernel or
# the Spark workers -- confirm before relying on it.
export HF_HOME='/tmp/huggingface_cache'
echo $HF_HOME

In [0]:
from nltk.tokenize import sent_tokenize
from pyspark.sql.functions import pandas_udf, udf
from pyspark.sql.types import ArrayType, StringType

import numpy as np
import pandas as pd

# Per-process singletons, populated lazily by initialize_models().
SENTIMENT_PIPE, SENTENCE_TOKENIZATION_PIPE = None, None

def initialize_models():
    """Lazily build the FinBERT pipeline and the sentence tokenizer.

    Each global is created only while it is still None, so repeated calls in
    the same Python process reuse the objects built by the first call.

    Side effects: creates '/tmp/huggingface_cache' and '/tmp/nltk_data',
    sets the HF_HOME environment variable, and downloads the model and the
    NLTK tokenizer resources.
    """
    import os
    global SENTIMENT_PIPE, SENTENCE_TOKENIZATION_PIPE

    if SENTIMENT_PIPE is None:
        CACHE_DIR = '/tmp/huggingface_cache'
        os.makedirs(CACHE_DIR, exist_ok=True)
        # Set HF_HOME *before* importing transformers: the Hugging Face
        # libraries resolve their cache location when first imported, so
        # setting it afterwards is too late. If the library was already
        # imported in this process, the explicit cache_dir below still
        # applies.
        os.environ['HF_HOME'] = CACHE_DIR
        from transformers import pipeline

        SENTIMENT_PIPE = pipeline(
            "text-classification",
            model="ProsusAI/finbert",
            # top_k=None returns the score of every label; it replaces the
            # deprecated return_all_scores=True.
            top_k=None,
            device=-1,
            model_kwargs={"cache_dir": CACHE_DIR},
        )

    if SENTENCE_TOKENIZATION_PIPE is None:
        import nltk
        NLTK_DATA_PATH = '/tmp/nltk_data'
        # Create the directory before downloading into it.
        os.makedirs(NLTK_DATA_PATH, exist_ok=True)
        if NLTK_DATA_PATH not in nltk.data.path:
            nltk.data.path.append(NLTK_DATA_PATH)

        # Newer NLTK releases load 'punkt_tab' rather than 'punkt'; fetch
        # both so sent_tokenize works regardless of the installed version.
        nltk.download('punkt', download_dir=NLTK_DATA_PATH, quiet=True)
        nltk.download('punkt_tab', download_dir=NLTK_DATA_PATH, quiet=True)

        SENTENCE_TOKENIZATION_PIPE = sent_tokenize

@pandas_udf('double')
def calculate_contextual_sentiment(sentence_lists: pd.Series) -> pd.Series:
    """Score each row's sentences with FinBERT and average the result.

    Args:
        sentence_lists: one entry per row, each a sequence of sentence
            strings (or None / empty when there is nothing to score).

    Returns:
        A float Series aligned with the input. Each value is the mean over
        the row's sentences of (positive score - negative score). Rows that
        are empty, or for which the model call raises, get 0.0.
    """
    initialize_models()

    final_scores = []

    for s_list in sentence_lists:
        # Array cells can arrive as numpy arrays, and `not array` raises
        # ValueError for arrays with more than one element, so test for
        # None and length explicitly instead of relying on truthiness.
        if s_list is None or len(s_list) == 0:
            final_scores.append(0.0)
            continue

        try:
            results = SENTIMENT_PIPE(list(s_list), truncation=True, max_length=512)
        except Exception:
            # Best effort: a failing row falls back to a neutral score
            # rather than failing the whole batch.
            final_scores.append(0.0)
            continue

        article_scores = []
        for res in results:
            # res format: [{'label': 'positive', 'score': 0.9}, ...]
            pos = next((x['score'] for x in res if x['label'] == 'positive'), 0.0)
            neg = next((x['score'] for x in res if x['label'] == 'negative'), 0.0)
            article_scores.append(pos - neg)

        if article_scores:
            final_scores.append(float(np.mean(article_scores)))
        else:
            final_scores.append(0.0)

    # Fix the dtype so an empty batch still yields a float Series.
    return pd.Series(final_scores, dtype="float64")

In [0]:
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import ArrayType, StringType
import pandas as pd
import numpy as np
import os

# Set to True once initialize_nltk() has run in this Python process.
NLTK_INITIALIZED = False

def initialize_nltk():
    """Make the NLTK sentence-tokenizer resources available in this process.

    On the first call, creates '/tmp/nltk_data', adds it to NLTK's search
    path, and downloads 'punkt' and 'punkt_tab' if they cannot be found.
    Later calls return immediately.
    """
    global NLTK_INITIALIZED
    import nltk

    if NLTK_INITIALIZED:
        return

    NLTK_DATA_PATH = '/tmp/nltk_data'
    os.makedirs(NLTK_DATA_PATH, exist_ok=True)

    if NLTK_DATA_PATH not in nltk.data.path:
        nltk.data.path.append(NLTK_DATA_PATH)

    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt', download_dir=NLTK_DATA_PATH, quiet=True)

    # Check 'punkt_tab' independently of 'punkt': a worker that already has
    # 'punkt' may still be missing 'punkt_tab', which newer NLTK releases
    # require for sent_tokenize.
    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        try:
            nltk.download('punkt_tab', download_dir=NLTK_DATA_PATH, quiet=True)
        except Exception:
            # Best effort: this download stays optional, as before.
            pass

    NLTK_INITIALIZED = True

@pandas_udf(ArrayType(StringType()))
def filter_relevant_sentences(content_series: pd.Series, company_name_series: pd.Series) -> pd.Series:
    """
    Splits articles into sentences and returns ONLY the sentences
    that contain the specific company name.

    Args:
        content_series: article text, one string per row.
        company_name_series: company name to look for, one per row.

    Returns:
        A Series of lists of sentences. A row yields an empty list when the
        article is missing or not a string, when the company name is missing,
        not a string, or blank, or when tokenization fails.

    Raises:
        LookupError: if the NLTK tokenizer resources are not installed.
    """

    initialize_nltk()
    from nltk.tokenize import sent_tokenize

    output_rows = []

    for article, company in zip(content_series, company_name_series):
        # Match case-insensitively on the trimmed company name.
        target_company = company.lower().strip() if isinstance(company, str) else ""

        # A blank name would match every sentence ("" is a substring of any
        # string), so treat it the same as a missing name.
        if not isinstance(article, str) or not article or not target_company:
            output_rows.append([])
            continue

        try:
            sentences = sent_tokenize(article)
        except LookupError:
            # Missing tokenizer data is an environment problem that affects
            # every row; surface it instead of returning empty lists.
            raise
        except Exception:
            output_rows.append([])
            continue

        relevant = [
            sent for sent in sentences
            if target_company in sent.lower()
        ]

        output_rows.append(relevant)

    return pd.Series(output_rows)

In [0]:
import pyspark.sql.functions as sf

# Suffix words removed from company names after punctuation is removed.
regex_pattern = r"\b(Inc|Corporation|Incorporated|Corp|Ltd|Co)\b"

# Build `clean_name` from `name` in three passes, then rename the key column:
#   1. remove every character that is not a letter, digit or whitespace
#   2. remove the suffix words matched by regex_pattern
#   3. lower-case the result
companies_df = (
    spark.read.table("stock_prediction.default.companies")
    .withColumn("clean_name", sf.regexp_replace(sf.col("name"), '[^a-zA-Z0-9\\s]', ''))
    .withColumn("clean_name", sf.regexp_replace(sf.col("clean_name"), regex_pattern, ''))
    .withColumn("clean_name", sf.lower(sf.col("clean_name")))
    .withColumnRenamed("id", "company_id")
)

display(companies_df)

In [0]:
from pyspark.sql.functions import col, expr

articles_df = spark.read.table("stock_prediction.default.articles")

# Pair every article with every company so mentions can be tested row by row.
joined_df = articles_df.crossJoin(companies_df)

# Keep only the pairs whose lower-cased article text contains clean_name.
filtered_df = joined_df.where(
    expr("lower(content_cleaned) LIKE concat('%', clean_name, '%')")
)

final_df = filtered_df.select(
    ["id", "company_id", "name", "title", "clean_name", "content_cleaned", "published_at"]
)

# Restrict to one company and attach the sentences that mention it.
uber = (
    final_df
    .where(sf.col("clean_name") == "uber technologies ")
    .withColumn(
        "filtered_sentences",
        filter_relevant_sentences(col("content_cleaned"), col("clean_name")),
    )
)

display(uber.limit(10))

In [0]:
# Score the sentences already extracted for each article and preview the
# first ten scores.
x = (
    uber
    .select('id', 'company_id', 'filtered_sentences')
    .withColumn(
        "sentiment_score",
        calculate_contextual_sentiment(col("filtered_sentences")),
    )
)
display(x.select("sentiment_score").limit(10))

In [0]:
# New Cell 4 (Filter Logic without Grouping)
from pyspark.sql.functions import col, expr

articles_df = spark.read.table("stock_prediction.default.articles")

# Cross join to get potential matches
joined_df = articles_df.crossJoin(companies_df)

# Filter down to only rows where the company is actually mentioned
# We keep the rows INDIVIDUAL. We do NOT group them.
filtered_df = joined_df.filter(
    expr("lower(content_cleaned) LIKE concat('%', clean_name, '%')")
)

# Optional: If you want to test with just Uber first
uber_df = filtered_df.filter(col("clean_name") == "uber technologies ")

# calculate_contextual_sentiment takes a single column of sentence arrays,
# so first extract the sentences that mention the company and then score
# them. Rows stay individual, so Spark can batch them through both UDFs.
analyzed_df = (
    uber_df
    .withColumn(
        "filtered_sentences",
        filter_relevant_sentences(col("content_cleaned"), col("clean_name")),
    )
    .withColumn(
        "sentiment_score",
        calculate_contextual_sentiment(col("filtered_sentences")),
    )
)

display(analyzed_df.select("clean_name", "title", "sentiment_score"))

In [0]:
# `uber` has no "article_contents" column, and the scoring UDF accepts a
# single column of sentence arrays. Extract the sentences from
# `content_cleaned` into "relevant_sentences" (keeping that column name), and
# store the numeric score in its own "sentiment_score" column.
analyzed_df = (
    uber
    .withColumn(
        "relevant_sentences",
        filter_relevant_sentences(uber["content_cleaned"], uber["clean_name"]),
    )
    .withColumn(
        "sentiment_score",
        calculate_contextual_sentiment(col("relevant_sentences")),
    )
)
# analyzed_df.createOrReplaceTempView("stock_prediction.default.articles_with_sentiment_view")
# analyzed_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("stock_prediction.default.articles_with_sentences")
display(analyzed_df)