# In [None]:
# Feature Engineering for Hyperpartisan News Detection

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
from nltk.tokenize import word_tokenize
from collections import Counter
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# NLTK data packages fetched up front (quiet mode suppresses progress output)
for nltk_resource in ('vader_lexicon', 'punkt', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

# Shared VADER analyzer; the sentiment feature extractor reads this global
sid = SentimentIntensityAnalyzer()

# Read the preprocessed articles CSV and report how many rows were loaded
data_dir = "hyperpartisan_data"
articles_df = pd.read_csv(f"{data_dir}/articles_preprocessed.csv")
print(f"Loaded {len(articles_df)} articles")

# Output directory for pickled vectorizers/scaler and the feature CSVs
features_dir = "hyperpartisan_features"
os.makedirs(features_dir, exist_ok=True)

# Lookup table of politically charged terms (every term maps to 1),
# selected from the EDA findings. A dict is used so that `term in table`
# membership checks are constant-time.
politically_charged_terms = dict.fromkeys(
    (
        # Political figures
        'trump', 'clinton', 'hillary', 'obama', 'donald', 'biden',

        # Political parties/ideologies
        'republican', 'democrat', 'conservative', 'liberal', 'progressive',
        'left', 'right', 'leftist', 'rightist', 'gop', 'democratic',

        # Charged political terms
        'fake', 'propaganda', 'elite', 'mainstream', 'establishment',
        'racist', 'fascist', 'socialist', 'communist', 'radical',
        'corruption', 'scandal', 'conspiracy', 'freedom', 'patriot',
        'america', 'american', 'nationalism', 'globalist', 'populist',
        'lying', 'hoax', 'crooked', 'swamp', 'drain', 'deep state',
    ),
    1,
)

# 1. ARTICLE LENGTH FEATURES
def extract_length_features(df):
    """Compute length and structure statistics for every article.

    Reads the ``text`` and ``processed_text`` columns of ``df`` (missing
    values are treated as empty strings) and returns a new DataFrame, on the
    same index, with these columns:

    word_count, char_count, avg_word_length, unique_word_count,
    vocab_diversity, sentence_count, avg_sentence_length, paragraph_count,
    avg_paragraph_length.

    The input frame is not modified.
    """
    print("Extracting length-based features...")

    # Work on a private copy so the caller's frame is left untouched
    df = df.copy()

    raw_text = df['text'].fillna("")
    cleaned_text = df['processed_text'].fillna("")

    def safe_ratio(numerator_col, denominator_col):
        """Row-wise numerator / denominator, or 0 when the denominator is not positive."""
        def per_row(row):
            denominator = row[denominator_col]
            if denominator > 0:
                return row[numerator_col] / denominator
            return 0
        return df.apply(per_row, axis=1)

    # Raw-text sizes; char_count is the full string length (whitespace included)
    df['word_count'] = raw_text.apply(lambda article: len(article.split()))
    df['char_count'] = raw_text.apply(len)
    df['avg_word_length'] = safe_ratio('char_count', 'word_count')

    # Token statistics on the preprocessed text
    df['processed_word_count'] = cleaned_text.apply(lambda article: len(article.split()))
    df['unique_word_count'] = cleaned_text.apply(lambda article: len(set(article.split())))

    # Share of distinct tokens among all preprocessed tokens
    df['vocab_diversity'] = safe_ratio('unique_word_count', 'processed_word_count')

    # Sentence-level structure via NLTK's sentence tokenizer
    df['sentence_count'] = raw_text.apply(lambda article: len(nltk.sent_tokenize(article)))
    df['avg_sentence_length'] = safe_ratio('word_count', 'sentence_count')

    # Paragraphs are approximated by blank-line separators (double newlines)
    df['paragraph_count'] = raw_text.apply(lambda article: article.count('\n\n') + 1)
    df['avg_paragraph_length'] = safe_ratio('word_count', 'paragraph_count')

    # processed_word_count is only an intermediate and is not returned
    selected_columns = [
        'word_count', 'char_count', 'avg_word_length',
        'unique_word_count', 'vocab_diversity',
        'sentence_count', 'avg_sentence_length',
        'paragraph_count', 'avg_paragraph_length',
    ]
    return df[selected_columns]

# 2. LEXICAL FEATURES - POLITICALLY CHARGED TERMS
def extract_lexical_features(df):
    """Extract features based on the presence of politically charged terms.

    Reads the ``text`` column of ``df`` (missing values are treated as empty
    strings) and returns a new DataFrame, on the same index, containing:

    * ``count_<term>`` - whole-word, case-insensitive occurrence counts for a
      fixed list of single terms;
    * ``political_terms_count`` - number of tokens found in the module-level
      ``politically_charged_terms`` table;
    * ``political_terms_ratio`` - that count divided by the number of tokens
      (0 for empty text);
    * ``count_<w1>_<w2>`` - occurrence counts for a fixed list of bigrams.

    Only the columns created here are returned. Selecting by name pattern, as
    was done previously, silently dropped ``political_terms_count`` and could
    pick up unrelated input columns that happened to match the pattern.

    The input frame is not modified.
    """
    print("Extracting lexical features...")

    # Work on a private copy so the caller's frame is left untouched
    df = df.copy()
    texts = df['text'].fillna("")

    # Names of the columns produced by this function, in creation order
    feature_columns = []

    def add_phrase_count(column, phrase):
        """Add a column with the whole-word match count of ``phrase`` per text."""
        # Compiled once per phrase; escaping keeps the phrase literal
        pattern = re.compile(r'\b' + re.escape(phrase) + r'\b')
        df[column] = texts.apply(lambda x: len(pattern.findall(x.lower())))
        feature_columns.append(column)

    def charged_term_stats(text):
        """Return (charged-token count, charged-token ratio) for one text."""
        if not isinstance(text, str) or not text:
            return 0, 0
        # Tokenize once and reuse the tokens for both the count and the ratio
        words = word_tokenize(text.lower())
        if len(words) == 0:
            return 0, 0
        # Tokens are single words, so a multi-word key in the table
        # (e.g. 'deep state') can never match here; bigrams are counted below.
        hits = sum(1 for word in words if word in politically_charged_terms)
        return hits, hits / len(words)

    # Counts for specific single political terms
    for term in ['trump', 'clinton', 'hillary', 'obama', 'republican', 'democrat',
                 'conservative', 'liberal', 'fake', 'america', 'american']:
        add_phrase_count(f'count_{term}', term)

    # Aggregate features over the whole charged-term table
    stats = texts.apply(charged_term_stats)
    df['political_terms_count'] = stats.apply(lambda pair: pair[0])
    df['political_terms_ratio'] = stats.apply(lambda pair: pair[1])
    feature_columns.extend(['political_terms_count', 'political_terms_ratio'])

    # Counts for political bigrams
    political_bigrams = ['fake news', 'deep state', 'white house',
                         'hillary clinton', 'donald trump', 'president trump',
                         'white supremacist', 'mainstream media', 'ruling class']

    for bigram in political_bigrams:
        add_phrase_count(f'count_{bigram.replace(" ", "_")}', bigram)

    return df[feature_columns]

# 3. SENTIMENT FEATURES
def extract_sentiment_features(df):
    """Compute VADER sentiment features from each article's ``text``.

    Uses the module-level ``sid`` analyzer. Missing text is treated as an
    empty string, which yields all-zero scores. Returns a new DataFrame, on
    the same index, with the columns:

    sentiment_compound, sentiment_positive, sentiment_negative,
    sentiment_neutral, sentiment_emotional_ratio, sentiment_variance.

    The input frame is not modified.
    """
    print("Extracting sentiment features...")

    # Work on a private copy so the caller's frame is left untouched
    df = df.copy()
    raw_text = df['text'].fillna("")

    def score_article(text):
        """Return the VADER score dict, or all zeros for empty/non-string input."""
        if isinstance(text, str) and text:
            return sid.polarity_scores(text)
        return {'compound': 0, 'pos': 0, 'neg': 0, 'neu': 0}

    # Whole-article scores, then one column per score component
    df['sentiment'] = raw_text.apply(score_article)
    component_columns = (
        ('sentiment_compound', 'compound'),
        ('sentiment_positive', 'pos'),
        ('sentiment_negative', 'neg'),
        ('sentiment_neutral', 'neu'),
    )
    for column, key in component_columns:
        # key is bound as a default so each lambda keeps its own component
        df[column] = df['sentiment'].apply(lambda scores, key=key: scores[key])

    def emotional_ratio(row):
        """(positive + negative) / neutral, or 0 when neutral is not positive."""
        neutral = row['sentiment_neutral']
        if neutral > 0:
            return (row['sentiment_positive'] + row['sentiment_negative']) / neutral
        return 0

    df['sentiment_emotional_ratio'] = df.apply(emotional_ratio, axis=1)

    def compound_variance(text):
        """Variance of per-sentence compound scores; 0 with fewer than two sentences."""
        if not isinstance(text, str) or not text:
            return 0
        sentences = nltk.sent_tokenize(text)
        if len(sentences) <= 1:
            return 0
        per_sentence = [sid.polarity_scores(sentence)['compound'] for sentence in sentences]
        return np.var(per_sentence)

    df['sentiment_variance'] = raw_text.apply(compound_variance)

    selected_columns = [
        'sentiment_compound', 'sentiment_positive', 'sentiment_negative',
        'sentiment_neutral', 'sentiment_emotional_ratio', 'sentiment_variance',
    ]
    return df[selected_columns]

# 4. TF-IDF FEATURES
def extract_tfidf_features(df, max_features=1000):
    """Extract TF-IDF features from the ``processed_text`` column.

    Args:
        df: DataFrame with a ``processed_text`` column; missing values are
            treated as empty documents.
        max_features: Upper bound on the vocabulary size passed to
            ``TfidfVectorizer``.

    Returns:
        DataFrame with one row per row of ``df``, carrying ``df``'s index,
        and columns named ``tfidf_0`` ... ``tfidf_{k-1}``.

    Side effects:
        Pickles the fitted vectorizer to ``tfidf_vectorizer.pkl`` inside the
        module-level ``features_dir``.
    """
    print("Extracting TF-IDF features...")

    # Create a TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(
        max_features=max_features,
        min_df=3,
        max_df=0.95,
        stop_words='english'
    )

    # Fit and transform the processed text
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'].fillna(""))

    # Convert to DataFrame. Reuse the input index: the caller concatenates this
    # frame column-wise with the other feature frames, and pd.concat(axis=1)
    # aligns on index labels, so a fresh RangeIndex would misalign rows (and
    # introduce NaNs) whenever df does not have a default 0..n-1 index.
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        index=df.index,
        columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])]
    )

    # Save the vectorizer for future use
    with open(os.path.join(features_dir, 'tfidf_vectorizer.pkl'), 'wb') as f:
        pickle.dump(tfidf_vectorizer, f)

    print(f"Created {tfidf_matrix.shape[1]} TF-IDF features")
    return tfidf_df

# 5. N-GRAM FEATURES
def extract_ngram_features(df, max_features=100):
    """Extract bigram count features from the ``processed_text`` column.

    Args:
        df: DataFrame with a ``processed_text`` column; missing values are
            treated as empty documents.
        max_features: Upper bound on the number of bigrams kept by
            ``CountVectorizer``.

    Returns:
        DataFrame with one row per row of ``df``, carrying ``df``'s index,
        and one ``bigram_<w1>_<w2>`` count column per retained bigram.

    Side effects:
        Pickles the fitted vectorizer to ``bigram_vectorizer.pkl`` inside the
        module-level ``features_dir``.
    """
    print("Extracting n-gram features...")

    # Create bigram vectorizer (bigrams only)
    bigram_vectorizer = CountVectorizer(
        ngram_range=(2, 2),
        max_features=max_features,
        min_df=5
    )

    # Fit and transform the processed text
    bigram_matrix = bigram_vectorizer.fit_transform(df['processed_text'].fillna(""))

    # Get feature names
    feature_names = bigram_vectorizer.get_feature_names_out()

    # Convert to DataFrame. Reuse the input index: the caller concatenates this
    # frame column-wise with the other feature frames, and pd.concat(axis=1)
    # aligns on index labels, so a fresh RangeIndex would misalign rows (and
    # introduce NaNs) whenever df does not have a default 0..n-1 index.
    bigram_df = pd.DataFrame(
        bigram_matrix.toarray(),
        index=df.index,
        columns=[f'bigram_{name.replace(" ", "_")}' for name in feature_names]
    )

    # Save the vectorizer for future use
    with open(os.path.join(features_dir, 'bigram_vectorizer.pkl'), 'wb') as f:
        pickle.dump(bigram_vectorizer, f)

    print(f"Created {len(feature_names)} bigram features")
    return bigram_df

# 6. COMBINED FEATURE EXTRACTION
def create_feature_matrix(df, include_tfidf=True, include_ngrams=True):
    """Build the full feature matrix for ``df``.

    Length, lexical and sentiment features are always computed. The TF-IDF
    and bigram blocks are appended, in that order, when their flags are set.
    All blocks are joined column-wise into a single DataFrame, which is
    returned.
    """
    print("Creating combined feature matrix...")

    # Base feature blocks, computed in a fixed order
    base_blocks = [
        extract_length_features(df),
        extract_lexical_features(df),
        extract_sentiment_features(df),
    ]
    feature_matrix = pd.concat(base_blocks, axis=1)

    # Optional (potentially wide) blocks: (enabled?, extractor)
    optional_extractors = (
        (include_tfidf, extract_tfidf_features),
        (include_ngrams, extract_ngram_features),
    )
    for enabled, extractor in optional_extractors:
        if not enabled:
            continue
        feature_matrix = pd.concat([feature_matrix, extractor(df)], axis=1)

    print(f"Created combined feature matrix with {feature_matrix.shape[1]} features")
    return feature_matrix

# MAIN EXECUTION
# Step 1: Create combined feature set
print("Starting feature engineering process...")
features_df = create_feature_matrix(articles_df)
# NOTE(review): the TF-IDF and bigram vectorizers are fit inside
# create_feature_matrix on all of articles_df, i.e. before the train/test
# split below, so their vocabulary and IDF weights also see the test
# articles. Consider fitting them on the training split only.

# Step 2: Add target variable
features_df['hyperpartisan'] = articles_df['hyperpartisan']

# Step 3: Split into training and testing sets
X = features_df.drop('hyperpartisan', axis=1)
y = features_df['hyperpartisan']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Take explicit copies so the in-place column assignment below operates on
# independent frames rather than on possible slices of X
X_train = X_train.copy()
X_test = X_test.copy()

# Step 4: Scale numerical features
# Get only numeric columns; the scaler is fit on the training split only
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Step 5: Save processed data
with open(os.path.join(features_dir, 'scaler.pkl'), 'wb') as f:
    pickle.dump(scaler, f)

X_train.to_csv(os.path.join(features_dir, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(features_dir, 'X_test.csv'), index=False)
y_train.to_csv(os.path.join(features_dir, 'y_train.csv'), index=False)
y_test.to_csv(os.path.join(features_dir, 'y_test.csv'), index=False)

# Save a sample of features (unscaled, target included) for inspection
features_df.head(10).to_csv(os.path.join(features_dir, 'features_sample.csv'), index=False)

# Print feature categories counts
# The column names produced by each extractor do not depend on the data, so
# resolve them once from a one-row sample. Calling the extractors on the full
# dataset inside the comprehension condition would re-run them once per
# feature column.
sample_df = articles_df.head(1)
length_cols = set(extract_length_features(sample_df).columns)
lexical_cols = set(extract_lexical_features(sample_df).columns)
sentiment_cols = set(extract_sentiment_features(sample_df).columns)

length_features = [col for col in features_df.columns if col in length_cols]
lexical_features = [col for col in features_df.columns if col in lexical_cols]
sentiment_features = [col for col in features_df.columns if col in sentiment_cols]
tfidf_features = [col for col in features_df.columns if col.startswith('tfidf_')]
ngram_features = [col for col in features_df.columns if col.startswith('bigram_')]

print("\nFeature Summary:")
print(f"Total features: {len(features_df.columns) - 1}")  # Exclude target
print(f"Length features: {len(length_features)}")
print(f"Lexical features: {len(lexical_features)}")
print(f"Sentiment features: {len(sentiment_features)}")
print(f"TF-IDF features: {len(tfidf_features)}")
print(f"N-gram features: {len(ngram_features)}")

print("\nFeature engineering complete. Files saved to:", features_dir)
print("Now you can proceed to model training with these engineered features.")