In [None]:
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions
from bs4 import BeautifulSoup
import emoji # Even if less common, good to have

# Shared WordNet lemmatizer instance (used by the lemmatization step).
lemmatizer = WordNetLemmatizer()

# --- Finance-aware stop word list ---
# These words must NOT be treated as stop words: negations and words that
# carry direction or sentiment in financial text. Entries that are not in
# NLTK's English list are harmless; the set difference simply ignores them.
words_to_keep = {
    # plain negations
    "no", "not", "nor", "against",
    # contracted negations
    "don't", "doesn't", "didn't", "shouldn't", "couldn't", "won't",
    "wouldn't", "isn't", "aren't", "wasn't", "weren't",
    # direction / comparison
    "up", "down", "above", "below", "over", "under", "more", "less",
    # outcome and sentiment vocabulary
    "increase", "decrease", "positive", "negative", "profit", "loss",
    "gain", "fall", "rise",
    # trading actions
    "buy", "sell", "hold",
}

# NLTK's English stop words with the protected words taken out.
stop_words = {
    word for word in stopwords.words('english') if word not in words_to_keep
}

# Financial filler words can be added here, but only once analysis shows
# they are uninformative for the specific task (do not add them preemptively):
# financial_fillers = {"company", "market", "stock", "share", "quarter", "report", "million", "billion"}
# stop_words.update(financial_fillers)

print(f"Using {len(stop_words)} stop words.")

Stop word removal, stemming, and lemmatization simplify the data but can sometimes remove important nuances, especially for sophisticated DL models. It's often best to start with minimal processing **(cleaning, lowercasing, tokenizing)** and add steps like stop word removal or lemmatization only if needed based on initial model performance.

In [None]:
# --- Standard Cleaning (reuse from previous example) ---
def remove_html_tags(text):
    """Return the text content of *text* with HTML markup stripped.

    The string is parsed with BeautifulSoup's ``html.parser`` and only the
    extracted text is returned. Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_urls(text):
    """Delete ``http(s)://...`` and ``www....`` URLs from *text*.

    Each URL is replaced by the empty string, so surrounding whitespace is
    left as-is. Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

def lowercase_text(text):
    """Return *text* lowercased; non-string input is returned unchanged."""
    return text.lower() if isinstance(text, str) else text

def expand_contractions(text):
    """Expand contractions in *text* via ``contractions.fix``.

    Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    expanded = contractions.fix(text)
    return expanded

def handle_emojis(text):
    """Convert emojis in *text* to textual names via ``emoji.demojize``.

    Each emoji name is wrapped in the delimiters ``" _EMOJI_"`` and ``"_ "``.
    Kept in case emojis appear in the data. Non-string input is returned
    unchanged.
    """
    if not isinstance(text, str):
        return text
    emoji_delimiters = (" _EMOJI_", "_ ")
    return emoji.demojize(text, delimiters=emoji_delimiters)

# --- Financial Jargon/Abbreviations (Requires Custom Dictionary) ---
# You need to build this dictionary based on your specific data!
financial_jargon_map = {
    "qoq": "quarter over quarter",
    "yoy": "year over year",
    "eps": "earnings per share",
    "p/e": "price to earnings ratio",
    "roi": "return on investment",
    "ipo": "initial public offering",
    "fed": "federal reserve",
    # Add more financial abbreviations/jargon specific to your data
}


def handle_financial_jargon(text, mapping_dict):
    """Replace known financial jargon/abbreviations with their expansions.

    Args:
        text: Input text. Non-string values are returned unchanged.
        mapping_dict: Mapping of abbreviation -> expansion. Entries are
            applied one after another in the mapping's iteration order.

    Returns:
        The text with every whole-token, case-insensitive occurrence of each
        abbreviation replaced by its expansion.
    """
    if not isinstance(text, str):
        return text

    for abbr, full_form in mapping_dict.items():
        if not abbr:
            # An empty key would match at arbitrary positions; ignore it.
            continue
        # Lookarounds instead of \b: \b after a key that ends in punctuation
        # (e.g. "u.s.") only matches when a word character follows, so such
        # keys would never match before a space or at the end of the text.
        pattern = r'(?<!\w)' + re.escape(abbr) + r'(?!\w)'
        # A callable replacement inserts the expansion literally; a plain
        # string would be parsed as a template (backslashes, group refs).
        text = re.sub(
            pattern,
            lambda _match, expansion=full_form: expansion,
            text,
            flags=re.IGNORECASE,
        )
    return text

# --- Normalization of Financial Terms (Requires Custom Rules/Dictionary) ---
# Example: Standardize company names or metrics
term_normalization_map = {
    "q1": "first quarter",
    "q2": "second quarter",
    # ...
    "alphabet inc.": "google", # Example
    "google parent": "google"
}


def normalize_financial_terms(text, mapping_dict):
    """Standardize specific financial terms to a canonical form.

    Args:
        text: Input text. Non-string values are returned unchanged.
        mapping_dict: Mapping of term -> normalized term. Entries are applied
            one after another in the mapping's iteration order.

    Returns:
        The text with every whole-token, case-insensitive occurrence of each
        term replaced by its normalized form.
    """
    if not isinstance(text, str):
        return text

    for term, normalized_term in mapping_dict.items():
        if not term:
            # An empty key would match at arbitrary positions; ignore it.
            continue
        # Lookarounds instead of \b: a trailing \b after "alphabet inc."
        # requires a word character right after the period, so the term
        # would never match before a space or at the end of the text.
        pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)'
        # A callable replacement inserts the normalized term literally; a
        # plain string would be parsed as a template (backslashes, group refs).
        text = re.sub(
            pattern,
            lambda _match, replacement=normalized_term: replacement,
            text,
            flags=re.IGNORECASE,
        )
    return text


# --- Handling Financial Indicators & Punctuation ---
def clean_financial_punctuation_and_symbols(text, keep_numbers=True):
    """Remove punctuation but keep essential financial symbols and, optionally, numbers.

    Args:
        text: Input text. Non-string values are returned unchanged.
        keep_numbers: When False, integers and decimals are deleted.

    Returns:
        The text restricted to ASCII letters, digits, whitespace and the
        characters ``. $ % -``, with cashtags such as ``$AAPL`` replaced by
        the token ``_TICKER_``.
    """
    if not isinstance(text, str):
        return text

    # Characters to KEEP: period (decimals), dollar, percent, hyphen.
    financial_chars_to_keep = ".$%-"
    # A set gives O(1) membership checks per character.
    allowed_chars = frozenset(
        string.ascii_letters + string.digits + string.whitespace + financial_chars_to_keep
    )
    cleaned_text = ''.join(char for char in text if char in allowed_chars)

    # Optional alternative: replace '$' / '%' with tokens such as ' _DOLLAR_ '
    # and ' _PERCENT_ ' instead of keeping the raw symbols.

    # Replace stock tickers (e.g. $AAPL, $TSLA) with a generic token. This
    # runs BEFORE number removal, otherwise "$5M" would be reduced to "$M"
    # and then mistaken for a ticker. Matching is case-insensitive because
    # callers may lowercase the text before cleaning.
    cleaned_text = re.sub(
        r'\$[A-Z]{1,5}\b', '_TICKER_', cleaned_text, flags=re.IGNORECASE
    )

    # Numbers are kept by default. (Another option would be replacing them
    # with a placeholder token such as '_NUMBER_'.)
    if not keep_numbers:
        cleaned_text = re.sub(r'\d+(\.\d+)?', '', cleaned_text)  # integers and decimals

    return cleaned_text


# --- Tokenization (NLTK) ---
def tokenize_text_nltk(text):
    """Split *text* into tokens with NLTK's ``word_tokenize``.

    Returns an empty list when *text* is not a string.
    """
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

# --- Removing Stop Words (Using customized list) ---
def remove_stopwords(tokens):
    """Filter *tokens* against the finance-customized ``stop_words`` set.

    A token is dropped when it is in ``stop_words`` or when it is at most
    one character long; the remaining tokens are returned in order.
    """
    kept = []
    for token in tokens:
        if token in stop_words or len(token) <= 1:
            continue
        kept.append(token)
    return kept

# --- Lemmatization (NLTK) ---
def lemmatize_tokens(tokens):
    """Return a list with each token passed through the WordNet lemmatizer."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# --- Negation Handling (Basic - covered by keeping 'not' etc. in stop words) ---
# Advanced negation handling (e.g., marking scope) often requires dedicated libraries
# like NegEx or relies on contextual models (Transformers) to understand implicitly.
# For ML/basic DL, ensuring negations AREN'T removed by stop words is the first step.

# --- Domain Adaptation ---
# This is a MODELING technique, not a text preprocessing step.
# It involves using a model pre-trained on general data and fine-tuning it
# on your specific (potentially smaller) financial dataset.
# Example concept: Fine-tune a general BERT model on financial news.

In [None]:
def preprocess_financial_text_pipeline(
    text,
    remove_html=True,
    remove_url=True,
    handle_emoji=False,  # Less likely needed
    expand_contract=True,
    do_lowercase=True,
    handle_jargon=True,  # Enable if you build the map
    jargon_map=financial_jargon_map,
    normalize_terms=False,  # Enable if you build the map
    norm_map=term_normalization_map,
    clean_punct_symbols=True,
    keep_numbers=True,  # Keep numbers by default for finance
    do_tokenize=True,
    remove_stop=True,
    do_lemmatize=True,
):
    """Run the configurable preprocessing stages for financial text.

    String stages run in a fixed order, each only when its flag is set:
    HTML removal, URL removal, emoji conversion, contraction expansion,
    lowercasing, jargon expansion (``jargon_map``), term normalization
    (``norm_map``), then punctuation/symbol cleaning (``keep_numbers`` is
    forwarded to it).

    Returns:
        With ``do_tokenize`` false: the processed string with whitespace
        runs collapsed to single spaces. Otherwise: a list of tokens,
        optionally stop-word filtered and lemmatized, with empty tokens
        removed. Non-string input yields ``[]`` when tokenizing and ``""``
        otherwise.
    """
    if not isinstance(text, str):
        if do_tokenize:
            return []
        return ""

    # (flag, stage) pairs in execution order. Domain-specific replacements
    # come before punctuation/number cleaning because cleaning can alter the
    # patterns those replacements look for.
    string_stages = (
        (remove_html, remove_html_tags),
        (remove_url, remove_urls),
        (handle_emoji, handle_emojis),
        (expand_contract, expand_contractions),
        (do_lowercase, lowercase_text),
        (handle_jargon, lambda s: handle_financial_jargon(s, jargon_map)),
        (normalize_terms, lambda s: normalize_financial_terms(s, norm_map)),
        (
            clean_punct_symbols,
            lambda s: clean_financial_punctuation_and_symbols(s, keep_numbers=keep_numbers),
        ),
    )
    for enabled, stage in string_stages:
        if enabled:
            text = stage(text)

    if not do_tokenize:
        # String output: normalize whitespace only.
        return ' '.join(text.split())

    tokens = tokenize_text_nltk(text)

    # Stop-word removal uses the customized list.
    if remove_stop:
        tokens = remove_stopwords(tokens)

    # Lemmatization (preferred over stemming).
    if do_lemmatize:
        tokens = lemmatize_tokens(tokens)

    # Drop any empty tokens left over from cleaning.
    return list(filter(None, tokens))

### For financial text:
- Handling Financial Jargon and Abbreviations: Financial texts contain specific terminology and abbreviations that might need to be handled appropriately, possibly through expansion or standardization.   
- Normalization of Financial Terms: Ensuring consistent representation of financial entities and concepts.
- Removal of Financial Indicators: Depending on the task, symbols like '$', '€', or stock tickers might be removed or treated specially.
- Domain-Specific Stop Word Lists: Using stop word lists tailored to the financial domain, as some common words might carry sentiment in a financial context.   
- Handling Numerical Data: Deciding how to treat numerical values, which can be significant in financial texts.
- Negation Handling: Crucial in finance as negations can significantly alter the sentiment (e.g., "not profitable"). Specific negation handling algorithms might be employed.   
- Domain Adaptation: When labeled financial data is scarce, techniques like domain adaptation might be used to leverage sentiment knowledge from other domains.  

These items will be handled later; for now, I have to build a simple pre-processing pipeline to run and test the results.

In [None]:
# Load the financial news training CSV; the file is decoded as latin-1.
financial_news = pd.read_csv(
    '../data/processed/financial_news/financial_news_train.csv',
    encoding='latin-1',
)