In [None]:
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy
from bs4 import BeautifulSoup
import contractions
from spellchecker import SpellChecker
import emoji

# spaCy English pipeline (optional: only needed when spaCy does the tokenization/lemmatization)
nlp = spacy.load('en_core_web_sm')

# NLTK normalizers and the spell checker shared by the token-level helpers
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
spell = SpellChecker()

# --- CRITICAL: Customize Stopwords ---
# Negations and intensifiers can flip or amplify sentiment, so they must
# survive stop word removal.
words_to_keep = {
    "no", "not", "nor", "very", "too", "against",
    "don't", "doesn't", "didn't", "shouldn't", "couldn't",
    "won't", "wouldn't", "isn't", "aren't", "wasn't", "weren't",
}

# Standard English stop words (customize this list!) minus the words kept above
stop_words = set(stopwords.words('english')) - words_to_keep

# Add any other domain-specific stop words if needed
# stop_words.add("book") # Example if 'book' is too common and uninformative

Removing stop words, stemming, and lemmatization simplify the data, but they can also remove important nuances, especially for sophisticated DL models. It's often best to start with minimal processing **(cleaning, lowercasing, tokenizing)** and add steps like stop word removal or lemmatization only if initial model performance shows they are needed.

In [None]:
# --- Text Cleaning ---
def remove_html_tags(text):
    """Strip HTML markup from *text* and return only its text content.

    Non-string input is handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_urls(text):
    """Remove URLs from text.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character. Matching is
    case-insensitive, so ``HTTP://EXAMPLE.COM`` is removed as well; this
    matters when the function runs before any lowercasing step.

    Args:
        text: The value to clean.

    Returns:
        The text with URLs deleted, or the input unchanged when it is not
        a string. Surrounding whitespace is left as-is.
    """
    if not isinstance(text, str):
        return text
    # \b keeps "www." from matching in the middle of a word (e.g. "awww.so").
    return re.sub(r'(?:https?://|\bwww\.)\S+', '', text, flags=re.IGNORECASE)

# --- Lowercasing ---
def lowercase_text(text):
    """Return *text* in lowercase; non-string values pass through untouched."""
    return text.lower() if isinstance(text, str) else text

# --- Handling Rare Words and Slang (Requires Custom Dictionary) ---
# This is highly domain/data specific. Example placeholder:
# Placeholder slang/abbreviation dictionary; keys are matched against
# whitespace-separated words exactly as written (so keep them lowercase
# if the text is lowercased first).
rare_word_map = {
    "imho": "in my humble opinion",
    "brb": "be right back"
}


def handle_rare_slang(text, mapping_dict):
    """Replaces known slang/rare words based on a dictionary.

    The text is split on whitespace and each word is looked up in
    ``mapping_dict``. A word that does not match as-is is retried with its
    leading/trailing ASCII punctuation stripped, so "imho," or "(brb)" are
    still expanded; the stripped punctuation is re-attached around the
    replacement.

    Args:
        text: The value to process.
        mapping_dict: Mapping from slang/rare word to its replacement.

    Returns:
        The rewritten text with words joined by single spaces, or the
        input unchanged when it is not a string.
    """
    if not isinstance(text, str):
        return text

    def _replace(word):
        # An exact hit wins, so keys that contain punctuation keep working.
        if word in mapping_dict:
            return mapping_dict[word]
        core = word.strip(string.punctuation)
        if core and core != word and core in mapping_dict:
            lead = len(word) - len(word.lstrip(string.punctuation))
            return word[:lead] + mapping_dict[core] + word[lead + len(core):]
        return word

    return " ".join(_replace(word) for word in text.split())

# --- Expanding Contractions ---
def expand_contractions(text):
    """Expand contractions in *text* (for example, don't -> do not).

    Delegates to ``contractions.fix``; non-string input is returned as-is.
    """
    if not isinstance(text, str):
        return text
    expanded = contractions.fix(text)
    return expanded

# --- Removing Punctuation ---
def remove_punctuation(text):
    """Delete every ASCII punctuation character except the apostrophe.

    Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Apostrophes are spared so contractions/possessives left in the text
    # stay intact. To keep other marks too (e.g. '!' or '?'), exclude them
    # from the characters collected here as well.
    doomed = (ch for ch in string.punctuation if ch != "'")
    deletion_table = dict.fromkeys(map(ord, doomed))
    return text.translate(deletion_table)


# --- Tokenization ---
def tokenize_text_nltk(text):
    """Split *text* into word tokens with NLTK's ``word_tokenize``.

    Anything that is not a string yields an empty list.
    """
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

# --- Removing Stop Words ---
def remove_stopwords(tokens):
    """Filter out stop words and single-character tokens.

    Membership is checked against the module-level ``stop_words`` set.
    """
    kept = []
    for token in tokens:
        # Drop stop words, and also anything only one character long.
        if token in stop_words or len(token) <= 1:
            continue
        kept.append(token)
    return kept

# --- Stemming and Lemmatization ---
def stem_tokens(tokens):
    """Return the Porter stem of every token, in order."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

def lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token, in order.

    No part-of-speech tag is passed to the lemmatizer, so it falls back to
    its default treatment of each word. PoS-aware lemmatization is usually
    more accurate but needs a tagging pass first.
    """
    def _to_lemma(token):
        return lemmatizer.lemmatize(token)

    return list(map(_to_lemma, tokens))

# --- Spell Checking and Correction (Use with Caution!) ---
def correct_spelling(tokens):
    """Corrects spelling for a list of tokens. Can be slow and sometimes inaccurate.

    Tokens reported by ``spell.unknown`` are replaced with
    ``spell.correction(token)``; when no correction is available the
    original token is kept. Each distinct misspelled token is corrected
    only once per call, because looking up a correction is the slow part.

    Args:
        tokens: Iterable of word tokens.

    Returns:
        A new list with the same length and order as ``tokens``.
    """
    # Materialise once: the tokens are walked twice (unknown() and the loop),
    # which would silently yield nothing for a generator input.
    tokens = list(tokens)

    # Find potentially misspelled words
    misspelled = spell.unknown(tokens)
    if not misspelled:
        return tokens

    # NOTE(review): the membership test below is an exact string match; if
    # spell.unknown() normalises case, capitalised tokens are never
    # corrected -- confirm against the spell checker's behaviour.
    corrections = {}
    corrected_tokens = []
    for word in tokens:
        if word not in misspelled:
            corrected_tokens.append(word)
            continue
        if word not in corrections:
            # Only correct if a correction is found, otherwise keep original
            corrections[word] = spell.correction(word) or word
        corrected_tokens.append(corrections[word])
    return corrected_tokens

# --- Handling Emojis and Emoticons ---
def handle_emojis(text):
    """Replace emojis in *text* with descriptive placeholder text.

    Each emoji is passed through ``emoji.demojize`` using " _EMOJI_" and
    "_ " as delimiters; the surrounding spaces keep the placeholder
    separate from neighbouring words for tokenization. Non-string input is
    returned unchanged.
    """
    if not isinstance(text, str):
        return text
    # Alternative: drop emojis entirely with emoji.replace_emoji(text, replace='')
    return emoji.demojize(text, delimiters=(" _EMOJI_", "_ "))

In [None]:
def preprocess_text_pipeline(text,
                             remove_html=True,
                             remove_url=True,
                             handle_emoji=True, # Convert emojis to text
                             expand_contract=True,
                             do_lowercase=True,
                             handle_slang=False, # Set to True if you have a good slang_map
                             slang_map=rare_word_map,
                             remove_punct=True,
                             do_spell_correct=False, # CAUTION: Slow and potentially error-prone
                             do_tokenize=True,
                             remove_stop=True,
                             do_lemmatize=False, # Choose lemmatize OR stem
                             do_stem=False):
    """Applies a sequence of preprocessing steps to raw text.

    String-level steps run first, in this order: HTML removal, URL removal,
    emoji conversion, contraction expansion, lowercasing, slang replacement,
    punctuation removal. If ``do_tokenize`` is set, the text is then
    tokenized and the token-level steps run: spell correction, stop word
    removal, and lemmatization or stemming.

    Args:
        text: Raw input text.
        remove_html: Strip HTML tags.
        remove_url: Strip URLs.
        handle_emoji: Convert emojis to placeholder text.
        expand_contract: Expand contractions.
        do_lowercase: Lowercase the text.
        handle_slang: Replace slang/rare words using ``slang_map``.
        slang_map: Dictionary used when ``handle_slang`` is set.
        remove_punct: Remove punctuation.
        do_spell_correct: Spell-correct tokens (only when tokenizing).
        do_tokenize: Tokenize and return a list instead of a string.
        remove_stop: Remove stop words (only when tokenizing).
        do_lemmatize: Lemmatize tokens; takes precedence over ``do_stem``.
        do_stem: Stem tokens; ignored when ``do_lemmatize`` is set.

    Returns:
        A list of tokens when ``do_tokenize`` is true, otherwise the cleaned
        string with whitespace collapsed. Non-string input yields the empty
        value of the matching type (``[]`` or ``""``) so callers always get
        one consistent return type per configuration.
    """
    if not isinstance(text, str):
        return [] if do_tokenize else ""

    # 1. Clean HTML and URLs
    if remove_html:
        text = remove_html_tags(text)
    if remove_url:
        text = remove_urls(text)

    # 2. Handle Emojis (convert to text before lowercasing/punctuation removal)
    if handle_emoji:
        text = handle_emojis(text)

    # 3. Expand Contractions (before punctuation removal)
    if expand_contract:
        text = expand_contractions(text)

    # 4. Lowercase
    if do_lowercase:
        text = lowercase_text(text)

    # 5. Handle Slang/Rare words (after lowercasing)
    if handle_slang:
        text = handle_rare_slang(text, slang_map)

    # 6. Remove Punctuation (after contractions)
    if remove_punct:
        text = remove_punctuation(text)

    # --- Tokenization is the central point ---
    if not do_tokenize:
        # Not tokenizing: return the cleaned string, collapsing the extra
        # whitespace left behind by the removals above.
        return ' '.join(text.split())

    tokens = tokenize_text_nltk(text) # Use NLTK tokenizer

    # 7. Spell Correction (on tokens - USE WITH CAUTION)
    if do_spell_correct:
        tokens = correct_spelling(tokens)

    # 8. Remove Stopwords (on tokens)
    if remove_stop:
        tokens = remove_stopwords(tokens)

    # 9. Lemmatize OR Stem (on tokens); lemmatization wins if both are set
    if do_lemmatize:
        tokens = lemmatize_tokens(tokens)
    elif do_stem:
        tokens = stem_tokens(tokens)

    # Return list of processed tokens
    return tokens # Or join back into string: ' '.join(tokens)

### For book review:
- Handle irony for negative review
- Handling abbreviations, correcting improper punctuation, and fixing spelling mistakes (the text is written by humans, so there will certainly be grammatical or lexical errors; how should those be handled?)

### For financial text:
- All the standard preprocessing steps mentioned above (cleaning, lowercasing, tokenization, stop word removal, stemming/lemmatization, handling special characters, etc.) are generally applied.   
- Handling Financial Jargon and Abbreviations: Financial texts contain specific terminology and abbreviations that might need to be handled appropriately, possibly through expansion or standardization.   
- Normalization of Financial Terms: Ensuring consistent representation of financial entities and concepts.
- Removal of Financial Indicators: Depending on the task, symbols like '$', '€', or stock tickers might be removed or treated specially.
- Domain-Specific Stop Word Lists: Using stop word lists tailored to the financial domain, as some common words might carry sentiment in a financial context.   
- Handling Numerical Data: Deciding how to treat numerical values, which can be significant in financial texts.
- Negation Handling: Crucial in finance as negations can significantly alter the sentiment (e.g., "not profitable"). Specific negation handling algorithms might be employed.   
- Domain Adaptation: When labeled financial data is scarce, techniques like domain adaptation might be used to leverage sentiment knowledge from other domains.  

These points will be handled later; for now I need to build a simple preprocessing pipeline to run and test the results.

In [31]:
# Load the two corpora. Each file is read with its own encoding:
# 'utf-8-sig' for the book reviews, 'latin-1' for the financial news.
book_reviews = pd.read_csv(
    '../data/processed/book_reviews/book_reviews_sample.csv',
    encoding='utf-8-sig',
)
financial_news = pd.read_csv(
    '../data/processed/financial_news/financial_news_train.csv',
    encoding='latin-1',
)