In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import textstat

# Download required NLTK data.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the tokenizer used by word_tokenize from that resource (TODO confirm against
# the installed NLTK version); without it tokenization raises LookupError and
# the preprocessing function silently drops to its simplified fallback.
NLTK_PACKAGES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon')

failed_packages = []
for package in NLTK_PACKAGES:
    try:
        # nltk.download signals most failures by returning False rather than
        # raising, so the return value has to be checked explicitly.
        if not nltk.download(package, quiet=True):
            failed_packages.append(package)
    except Exception:
        # Best effort: a failed download must not stop the notebook, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        failed_packages.append(package)

if failed_packages:
    print("NLTK data download failed - some features may not work")
    print(f"Missing NLTK packages: {', '.join(failed_packages)}")

print("Libraries imported successfully!")

In [None]:
# Read the combined dataset from disk (presumably the CSV written by the
# data collection notebook - verify the path if that notebook changes).
INPUT_CSV_PATH = "../data/combined_final.csv"
df = pd.read_csv(INPUT_CSV_PATH)

## Text Preprocessing and Cleaning

In [None]:
# Advanced text preprocessing functions
# Stop words used only when the NLTK pipeline is unavailable.
_FALLBACK_STOPWORDS = frozenset({
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
})

# Filled on first successful use so the stop-word list and the lemmatizer are
# built once instead of once per processed row.
_NLTK_RESOURCES = {}


def _get_nltk_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _NLTK_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Publish both entries together, only after both were built.
        _NLTK_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['lemmatizer']


def advanced_text_preprocessing(text):
    """Comprehensive text preprocessing for NLP analysis.

    Steps: lowercase, strip URLs and e-mail addresses, drop every character
    that is not an ASCII letter or whitespace, collapse whitespace, then
    tokenize, remove English stop words, drop tokens shorter than three
    characters and lemmatize.

    Args:
        text: Raw value to clean. Anything that is not missing is converted
            with ``str()`` first.

    Returns:
        str: The cleaned tokens joined by single spaces; ``""`` when ``text``
        is missing (``pd.isna``) or nothing survives the cleaning.

    If any NLTK step raises (for example because the corpora are not
    installed), a simplified fallback is used instead: whitespace split, a
    small built-in stop-word list, the same minimum length, no lemmatization.
    """
    if pd.isna(text):
        return ""

    # Convert to string and lowercase
    text = str(text).lower()

    # Remove URLs ('http\S+' already covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove special characters and digits. Characters are deleted, not
    # replaced by a space, so "well-known" becomes "wellknown".
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    try:
        stop_words, lemmatizer = _get_nltk_resources()

        # Tokenize, drop stop words and short tokens, then lemmatize.
        tokens = [
            lemmatizer.lemmatize(word)
            for word in word_tokenize(text)
            if word not in stop_words and len(word) >= 3
        ]
        return ' '.join(tokens)

    except Exception:
        # Fallback processing if NLTK components fail
        words = [
            word for word in text.split()
            if word not in _FALLBACK_STOPWORDS and len(word) >= 3
        ]
        return ' '.join(words)

print("ADVANCED TEXT PREPROCESSING")
print("=" * 60)

# Show examples of text preprocessing
sample_texts = df['text'].dropna().head(3).tolist()

print("BEFORE AND AFTER PREPROCESSING EXAMPLES:")
print("-" * 50)

for i, text in enumerate(sample_texts, 1):
    # Non-string cells (e.g. numbers) cannot be sliced or measured with len().
    text = str(text)
    print(f"\nExample {i}:")
    print("ORIGINAL:")
    print(text[:200] + "..." if len(text) > 200 else text)

    processed = advanced_text_preprocessing(text)
    print("\nPROCESSED:")
    print(processed[:200] + "..." if len(processed) > 200 else processed)
    print("-" * 50)

# Apply advanced preprocessing to the dataset
print("\nApplying advanced preprocessing to the entire dataset...")
df['title_processed'] = df['title'].apply(advanced_text_preprocessing)
df['text_processed'] = df['text'].apply(advanced_text_preprocessing)

# Calculate processing statistics.
# Missing cells count as zero characters: astype(str) alone would turn NaN
# into the literal "nan" and add three phantom characters per missing cell.
original_title_chars = df['title'].fillna('').astype(str).str.len().sum()
processed_title_chars = df['title_processed'].str.len().sum()
original_text_chars = df['text'].fillna('').astype(str).str.len().sum()
processed_text_chars = df['text_processed'].str.len().sum()


def _reduction_pct(original, processed):
    """Percentage of characters removed; 0.0 when the original count is zero."""
    return (original - processed) / original * 100 if original else 0.0


print("\nPREPROCESSING STATISTICS:")
print("-" * 30)
print(f"Title characters - Original: {original_title_chars:,}, Processed: {processed_title_chars:,}")
print(f"Text characters - Original: {original_text_chars:,}, Processed: {processed_text_chars:,}")
print(f"Title reduction: {_reduction_pct(original_title_chars, processed_title_chars):.1f}%")
print(f"Text reduction: {_reduction_pct(original_text_chars, processed_text_chars):.1f}%")

## Feature Engineering for ML Model

In [None]:
def engineer_features(df):
    """Simple and safe feature engineering for text + title columns.

    Works on a copy, so the caller's frame is not modified.

    Args:
        df: DataFrame with at least ``title`` and ``text`` columns. Missing
            values are treated as empty strings.

    Returns:
        pandas.DataFrame: A copy of ``df`` in which ``title`` and ``text`` are
        plain strings, plus these columns: ``title_length``, ``text_length``,
        ``title_word_count``, ``text_word_count``,
        ``title_text_length_ratio``, ``title_text_word_ratio``,
        ``title_exclamation_count``, ``title_question_count``,
        ``text_exclamation_count``, ``text_question_count``,
        ``title_caps_ratio``, ``text_digit_count``,
        ``text_special_char_count``, ``contains_url`` and
        ``fake_keywords_count``.
    """
    df = df.copy()

    # --- Clean text ---
    # Fill missing values before casting: astype(str) on its own turns NaN/None
    # into the literal strings "nan"/"None", which would then be measured as
    # real 3-4 character, one-word texts by every feature below.
    df['title'] = df['title'].fillna('').astype(str)
    df['text'] = df['text'].fillna('').astype(str)

    # --- Basic text stats ---
    df['title_length'] = df['title'].apply(len)
    df['text_length'] = df['text'].apply(len)
    df['title_word_count'] = df['title'].apply(lambda x: len(x.split()))
    df['text_word_count'] = df['text'].apply(lambda x: len(x.split()))

    # --- Ratios --- (+1 in the denominator avoids division by zero)
    df['title_text_length_ratio'] = df['title_length'] / (df['text_length'] + 1)
    df['title_text_word_ratio'] = df['title_word_count'] / (df['text_word_count'] + 1)

    # --- Punctuation and capitalization ---
    df['title_exclamation_count'] = df['title'].str.count('!')
    df['title_question_count'] = df['title'].str.count(r'\?')
    df['text_exclamation_count'] = df['text'].str.count('!')
    df['text_question_count'] = df['text'].str.count(r'\?')
    df['title_caps_ratio'] = df['title'].apply(lambda x: sum(c.isupper() for c in x) / (len(x) + 1))

    # --- Special chars, digits, URLs ---
    df['text_digit_count'] = df['text'].str.count(r'\d')
    df['text_special_char_count'] = df['text'].apply(lambda x: sum(not c.isalnum() and not c.isspace() for c in x))
    df['contains_url'] = df['text'].str.contains(r'http|www|\.com|\.org|\.net', case=False, na=False).astype(int)

    # --- Fake news keywords ---
    keywords = ('breaking', 'urgent', 'exclusive', 'shocking', 'leaked', 'exposed', 'viral')

    def _keyword_hits(value):
        # Number of distinct keywords present (substring match), not the
        # number of occurrences; lowercase once instead of once per keyword.
        lowered = value.lower()
        return sum(keyword in lowered for keyword in keywords)

    df['fake_keywords_count'] = df['title'].apply(_keyword_hits) + df['text'].apply(_keyword_hits)

    print("Features engineered successfully!")
    print(f"Created {df.shape[1]} columns in total.")
    return df


# Build the engineered-feature frame from the current dataset.
# NOTE(review): df_engineered is not referenced again in the cells visible
# here, and the save cell writes `df` - confirm whether these features are
# meant to be persisted.
df_engineered = engineer_features(df)

# Preview a handful of the derived columns for the first rows.
preview_columns = [
    'title_text_length_ratio',
    'title_text_word_ratio',
    'title_exclamation_count',
    'text_exclamation_count',
    'title_caps_ratio',
    'contains_url',
    'fake_keywords_count',
]
print("\nSample engineered features:")
print(df_engineered[preview_columns].head(5))

## Data Cleaning for Modeling

In [None]:
def clean_mojibake(text):
    """Repair UTF-8 text that was mis-decoded as Latin-1 and normalise odd spaces.

    Args:
        text: Value to clean. Non-string values are returned unchanged.

    Returns:
        For strings: the text re-decoded as UTF-8 when its Latin-1 bytes form
        valid UTF-8 (otherwise left as is), with non-breaking spaces and
        zero-width spaces replaced by a regular space. For anything else: the
        input value itself.
    """
    if isinstance(text, str):
        try:
            # Try re-decoding bytes to fix mis-encoded text
            text = text.encode('latin1').decode('utf-8')
        except UnicodeError:
            # Covers both UnicodeEncodeError (characters outside Latin-1) and
            # UnicodeDecodeError (bytes that are not valid UTF-8): the text
            # was not mojibake, so keep it unchanged.
            pass
        # Basic cleanup
        text = text.replace('\xa0', ' ').replace('\u200b', ' ')
    return text

# Prepend the title to the body and repair encoding artefacts.
# NOTE(review): this overwrites df['text'], so re-running the cell prepends
# the title a second time - run it once per kernel session.
df['text'] = (df['title'].fillna('') + ' ' + df['text'].fillna('')).apply(clean_mojibake)

# Keep only rows whose label is numeric. The cast to int is applied to the
# converted values: casting the raw column would raise on strings such as
# "1.0", which pass the numeric check but cannot be parsed by int().
numeric_label = pd.to_numeric(df['label'], errors='coerce')
has_label = numeric_label.notna()
# .copy() gives an independent frame so the assignment below does not hit a
# filtered view (SettingWithCopyWarning).
df = df.loc[has_label].copy()
df['label'] = numeric_label[has_label].astype(int).to_numpy()

In [None]:
def normalize_series(series: pd.Series) -> pd.Series:
    """Lowercase and strip a text Series down to space-separated word characters.

    Missing values become empty strings. URLs are removed, every non-word
    character is replaced by a space, runs of spaces are collapsed and the
    result is trimmed.
    """
    # Applied in this order; each entry is (regex pattern, replacement).
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),  # URLs
        (r'\W', ' '),                    # non-word characters
        (r'\n', ''),                     # newlines (already turned into spaces by the \W step)
        (r' +', ' '),                    # repeated spaces
    )

    cleaned = series.fillna('').astype(str).str.lower()
    for pattern, replacement in substitutions:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    return cleaned.str.strip()

In [None]:
# Build the combined modelling text. Missing values are filled first: a NaN on
# either side would otherwise make the whole concatenation NaN and the row
# would be dropped even though the other field has content.
# NOTE(review): the cleaning cell above already rewrote df['text'] as
# title + ' ' + text, so the title appears twice in title_text - confirm
# whether that duplication is intended.
title_text = df["title"].fillna('') + " " + df["text"].fillna('')
label_numeric = pd.to_numeric(df['label'], errors='coerce')

keep_rows = (
    title_text.notna()                    # remove NaN texts
    & (title_text.str.strip() != '')      # remove empty strings
    & label_numeric.isin([0, 1])          # keep binary labels only
)

# assign + .copy() yields an independent frame, so the int cast below does not
# write into a filtered view (SettingWithCopyWarning).
df = df.assign(title_text=title_text, label=label_numeric).loc[keep_rows].copy()
df['label'] = df['label'].astype(int)
print(df['title_text'].isna().sum())  # should be 0
print(df['label'].isna().sum())       # should be 0

## Save Preprocessed Data

In [None]:
# Persist the cleaned frame (without the index column) for the modeling step.
OUTPUT_CSV_PATH = '../data/preprocessed_data.csv'
df.to_csv(OUTPUT_CSV_PATH, index=False)
print("Preprocessing complete!")