In [2]:
import os
import re
import pandas as pd
import numpy as np
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Setup: directory that holds the input CSV and receives the preprocessed output
data_dir = "hyperpartisan_data"
os.makedirs(data_dir, exist_ok=True)

# Download NLTK resources
# 'punkt' serves older NLTK releases; newer releases (3.9+) make word_tokenize
# load the pickle-free 'punkt_tab' resource instead, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

class TextPreprocessor:
    """Configurable cleaning pipeline that turns raw article text into tokens.

    The full pipeline (see ``preprocess``) strips HTML markup, drops every
    character that is not an ASCII letter or whitespace, collapses runs of
    whitespace, lowercases and tokenizes the text, and then optionally
    removes English stopwords, lemmatizes and stems the tokens.
    """

    # Matches any character that is not an ASCII letter or whitespace.
    # Compiled once at class level because it is applied to every document.
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

    def __init__(self, remove_stopwords=True, stemming=False, lemmatization=True):
        """Store the pipeline switches and build the NLTK helpers.

        Args:
            remove_stopwords: Drop NLTK English stopwords from the tokens.
            stemming: Apply Porter stemming to the tokens (after
                lemmatization, when both are enabled).
            lemmatization: Apply WordNet lemmatization to the tokens.
        """
        self.remove_stopwords = remove_stopwords
        self.stemming = stemming
        self.lemmatization = lemmatization

        # Initialize tools
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        # Created lazily by stem_tokens so the default configuration
        # (stemming=False) never builds a stemmer it does not use.
        self.stemmer = None

    def clean_html(self, html_text):
        """Return the text content of ``html_text`` with HTML tags removed.

        Non-string input yields an empty string. Text from separate nodes is
        joined with a single space so that adjacent elements such as
        ``<p>a</p><p>b</p>`` do not fuse into one word.
        """
        if not isinstance(html_text, str):
            return ""
        soup = BeautifulSoup(html_text, "html.parser")
        return soup.get_text(separator=' ')

    def remove_special_characters(self, text):
        """Remove every character that is not an ASCII letter or whitespace.

        Digits and punctuation are deleted (not replaced). Non-string input
        yields an empty string.
        """
        if not isinstance(text, str):
            return ""
        return self._NON_LETTER_RE.sub('', text)

    def remove_extra_whitespace(self, text):
        """Collapse whitespace runs to single spaces and trim both ends.

        Non-string input yields an empty string.
        """
        if not isinstance(text, str):
            return ""
        return ' '.join(text.split())

    def tokenize(self, text):
        """Lowercase ``text`` and split it into word tokens.

        Returns an empty list for non-string or empty input.
        """
        if not isinstance(text, str) or not text:
            return []
        return word_tokenize(text.lower())

    def remove_stopwords_from_tokens(self, tokens):
        """Return ``tokens`` without stopwords, or unchanged if disabled."""
        if not self.remove_stopwords:
            return tokens
        return [word for word in tokens if word not in self.stop_words]

    def lemmatize_tokens(self, tokens):
        """Return lemmatized ``tokens``, or ``tokens`` unchanged if disabled."""
        if not self.lemmatization:
            return tokens
        return [self.lemmatizer.lemmatize(word) for word in tokens]

    def stem_tokens(self, tokens):
        """Return Porter-stemmed ``tokens``, or ``tokens`` unchanged if disabled."""
        if not self.stemming:
            return tokens
        if getattr(self, 'stemmer', None) is None:
            # Local import: the file's top-level imports do not bring in
            # PorterStemmer, and it is only needed when stemming is enabled.
            from nltk.stem import PorterStemmer
            self.stemmer = PorterStemmer()
        return [self.stemmer.stem(word) for word in tokens]

    def preprocess(self, text, keep_original=False):
        """Apply the full preprocessing pipeline to one document.

        Args:
            text: Raw document text. Anything that is not a ``str`` (NaN,
                ``None``, lists, arrays, ...) is treated as invalid input.
            keep_original: When true, return ``(original, processed)``
                instead of just the processed string.

        Returns:
            The processed text as a space-joined token string, or a tuple
            ``(original_text, processed_text)`` when ``keep_original`` is
            true. Invalid input yields ``""`` (or ``(text, "")``).
        """
        # The isinstance test alone rejects NaN/None; calling pd.isna first
        # would raise ValueError on list/array input (ambiguous truth value).
        if not isinstance(text, str):
            return (text, "") if keep_original else ""

        original = text

        # Clean HTML, drop non-letters, then normalize whitespace
        cleaned = self.clean_html(text)
        cleaned = self.remove_special_characters(cleaned)
        cleaned = self.remove_extra_whitespace(cleaned)

        # Tokenize, then run the optional token-level steps; each helper is
        # a no-op when its switch is off.
        tokens = self.tokenize(cleaned)
        tokens = self.remove_stopwords_from_tokens(tokens)
        tokens = self.lemmatize_tokens(tokens)
        tokens = self.stem_tokens(tokens)

        # Join tokens back to text
        processed_text = ' '.join(tokens)

        if keep_original:
            return original, processed_text
        return processed_text

# Run the text preprocessor over one column of a DataFrame
def preprocess_dataset(df, text_column='text', keep_original=True):
    """Return a copy of ``df`` with the texts in ``text_column`` preprocessed.

    Prints the number of NaN entries in ``text_column`` first. When
    ``keep_original`` is true the processed text is written to a new
    ``processed_text`` column; otherwise it overwrites ``text_column``.
    The input DataFrame itself is not modified.
    """
    nan_count = df[text_column].isna().sum()
    print(f"NaN values in {text_column} column: {nan_count}")

    # Work on a copy so the caller's frame is untouched and pandas does not
    # emit SettingWithCopyWarning.
    result = df.copy()

    cleaner = TextPreprocessor(remove_stopwords=True, lemmatization=True)

    # The two modes differ only in where the processed text is stored.
    target_column = 'processed_text' if keep_original else text_column
    result[target_column] = result[text_column].apply(cleaner.preprocess)

    return result

# Load the by-article dataset, preprocess it and write the result next to it
csv_path = os.path.join(data_dir, "articles_byarticle.csv")
if not os.path.exists(csv_path):
    print(f"Dataset not found at {csv_path}")
else:
    print(f"Loading articles from {csv_path}")
    articles_df = pd.read_csv(csv_path)

    # Report the row count and the label distribution
    print(f"Loaded {articles_df.shape[0]} articles")
    label_counts = articles_df['hyperpartisan'].value_counts()
    print(f"Hyperpartisan distribution: {label_counts}")

    # Adds a 'processed_text' column (keep_original defaults to True)
    articles_df = preprocess_dataset(articles_df, text_column='text')

    # Persist the preprocessed frame without the index column
    preprocessed_path = os.path.join(data_dir, "articles_preprocessed.csv")
    articles_df.to_csv(preprocessed_path, index=False)
    print(f"Preprocessed dataset saved to {preprocessed_path}")

    # Show the first row as a quick sanity check
    print("\nSample preprocessed text:")
    sample = articles_df.iloc[0]
    print(f"Original title: {sample['title']}")
    excerpt = sample['processed_text'][:200]
    print(f"Processed text (excerpt): {excerpt}...")

[nltk_data] Downloading package punkt to /Users/dlqnt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/dlqnt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/dlqnt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading articles from hyperpartisan_data/articles_byarticle.csv
Loaded 645 articles
Hyperpartisan distribution: hyperpartisan
False    645
Name: count, dtype: int64
NaN values in text column: 3
Preprocessed dataset saved to hyperpartisan_data/articles_preprocessed.csv

Sample preprocessed text:
Original title: Kucinich: Reclaiming the money power
Processed text (excerpt): money wall closing congress terrifying wall water hurricane harvey irma damage totaled could rise half trillion dollar wall war multitrillion dollar ongoing cost afghanistan iraq intervention crumblin...
