In [1]:
# %% [markdown]
# # Feature Engineering for Hyperpartisan News Detection (Using Official Splits)
# 
# **Purpose:** Create engineered features suitable for baseline machine learning models, using the official SemEval training and testing datasets. This ensures that vectorizers and scalers are fitted ONLY on the training data.
# 
# **Input:** Reads `official_train_data.csv` and `official_test_data.csv` (generated by `xml_parse.ipynb`) from the `hyperpartisan_data_official` directory.
# 
# **Output:** Creates scaled feature matrices (`X_train_scaled.csv`, `X_test_scaled.csv`) and target files (`y_train.csv`, `y_test.csv`) in the `hyperpartisan_features_official` directory. The fitted scaler and the TF-IDF/bigram vectorizers are pickled to the same directory.

# %%
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
from nltk.tokenize import word_tokenize, sent_tokenize # Use NLTK tokenizer
import pickle
import os
from sklearn.preprocessing import StandardScaler
import warnings

# Suppress every warning category for the rest of the session. This keeps the
# cell output compact, but it also hides deprecation and runtime warnings.
warnings.filterwarnings('ignore')

# %%
# Download required NLTK resources.
# - vader_lexicon: lexicon used by SentimentIntensityAnalyzer.
# - punkt / punkt_tab: models behind word_tokenize / sent_tokenize. Recent NLTK
#   releases load `punkt_tab` instead of the pickled `punkt` models, so both
#   are requested to keep tokenization working across NLTK versions.
# - stopwords: kept for backward compatibility. The TF-IDF and bigram
#   vectorizers in this notebook use scikit-learn's built-in English list
#   (stop_words='english'), not the NLTK corpus.
for _nltk_resource in ('vader_lexicon', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource, quiet=True)

# %%
# Initialize sentiment analyzer (shared by the sentiment feature extractor)
sid = SentimentIntensityAnalyzer()

# %%
# --- Configuration ---
# Source directory (official CSVs) and destination directory (engineered features).
official_data_dir = "hyperpartisan_data_official"
features_dir = "hyperpartisan_features_official"
os.makedirs(features_dir, exist_ok=True)

# Input file paths, both inside official_data_dir
train_csv_path, test_csv_path = (
    os.path.join(official_data_dir, csv_name)
    for csv_name in ("official_train_data.csv", "official_test_data.csv")
)

# Output file paths, all inside features_dir
(
    X_train_output_path,
    X_test_output_path,
    y_train_output_path,
    y_test_output_path,
    scaler_path,
    tfidf_vectorizer_path,
    bigram_vectorizer_path,
) = (
    os.path.join(features_dir, artifact_name)
    for artifact_name in (
        'X_train_scaled.csv',
        'X_test_scaled.csv',
        'y_train.csv',
        'y_test.csv',
        'scaler.pkl',
        'tfidf_vectorizer.pkl',
        'bigram_vectorizer.pkl',
    )
)

# Vocabulary caps for the two vectorizers
TFIDF_MAX_FEATURES = 1000
NGRAM_MAX_FEATURES = 100

# Politically charged terms (from EDA). Every term maps to 1; the dictionary is
# consulted by membership when counting charged tokens.
politically_charged_terms = dict.fromkeys(
    (
        'trump', 'clinton', 'hillary', 'obama', 'donald', 'biden',
        'republican', 'democrat', 'conservative', 'liberal', 'progressive',
        'left', 'right', 'leftist', 'rightist', 'gop', 'democratic',
        'fake', 'propaganda', 'elite', 'mainstream', 'establishment',
        'racist', 'fascist', 'socialist', 'communist', 'radical',
        'corruption', 'scandal', 'conspiracy', 'freedom', 'patriot',
        'america', 'american', 'nationalism', 'globalist', 'populist',
        'lying', 'hoax', 'crooked', 'swamp', 'drain', 'deep state',
    ),
    1,
)

# Political bigrams (from EDA), counted as whole phrases in the lowercased text
political_bigrams = [
    'fake news',
    'deep state',
    'white house',
    'hillary clinton',
    'donald trump',
    'president trump',
    'white supremacist',
    'mainstream media',
    'ruling class',
]

# %%
# --- Load Data ---
print("--- Loading Official Train/Test Data ---")
# Both CSVs must exist before anything is read.
if not (os.path.exists(train_csv_path) and os.path.exists(test_csv_path)):
    raise FileNotFoundError(f"Official train/test CSV files not found in '{official_data_dir}'. Please run the XML parsing script first.")

# Read each split and normalise its text column in one step: missing article
# bodies become "" and every value is coerced to str.
train_df = pd.read_csv(train_csv_path).assign(
    text=lambda frame: frame['text'].fillna("").astype(str)
)
test_df = pd.read_csv(test_csv_path).assign(
    text=lambda frame: frame['text'].fillna("").astype(str)
)

print(f"Loaded Training Data: {train_df.shape}")
print(f"Loaded Test Data: {test_df.shape}")

# Target labels as integers, one Series per split
y_train = train_df['hyperpartisan'].astype(int)
y_test = test_df['hyperpartisan'].astype(int)

# %%
# --- Feature Extraction Functions (Operating on Raw Text) ---

def safe_word_tokenize(text):
    """Lowercase ``text`` and split it into word tokens with NLTK.

    The input is coerced with ``str()`` first, so non-string values are
    tokenized by their string form rather than raising.

    Args:
        text: Value to tokenize, typically a raw article string.

    Returns:
        list: Lowercased tokens, or ``[]`` if tokenizing this particular
        input fails.

    Raises:
        LookupError: If the NLTK tokenizer data is not installed. This is an
            environment problem rather than a bad document, so it is surfaced
            instead of silently turning every article into an empty token list.
    """
    try:
        return word_tokenize(str(text).lower())
    except LookupError:
        # Missing NLTK resource: fail loudly so features are not silently zeroed.
        raise
    except Exception:
        # Best-effort fallback for a single problematic document.
        return []

def extract_length_features(df):
    """Compute length and structure statistics from the raw article text.

    All ratios are computed column-wise (vectorized) and fall back to 0 when
    the denominator is not positive, so the function is also well-defined for
    empty articles and for an empty frame.

    Args:
        df: DataFrame with a ``text`` column holding the raw article strings.

    Returns:
        pd.DataFrame: A new frame indexed like ``df`` (``df`` is not modified)
        with these columns, in this order:

        - ``char_count``: ``len()`` of the raw text.
        - ``word_count_raw``: number of tokens from ``safe_word_tokenize``.
        - ``unique_word_count_raw``: number of distinct tokens.
        - ``vocab_diversity_raw``: unique tokens / tokens.
        - ``sentence_count``: number of sentences from ``sent_tokenize``.
        - ``avg_sentence_length``: tokens / sentences.
        - ``avg_word_length``: ``char_count`` / tokens. Note that the
          numerator counts every character of the text (whitespace included)
          and the denominator counts every token.
        - ``paragraph_count``: occurrences of a blank line (``'\\n\\n'``) + 1.
        - ``avg_paragraph_length``: tokens / paragraphs.
    """
    print("Extracting length-based features...")
    raw_text = df['text']

    def safe_ratio(numerator, denominator):
        """Element-wise ``numerator / denominator``; 0 where ``denominator <= 0``."""
        # Non-positive denominators are masked to NaN so the division never
        # hits zero; the resulting NaNs become the 0 fallback.
        return numerator.div(denominator.where(denominator > 0)).fillna(0.0)

    # Tokenize once; the token lists are only an intermediate and are not returned.
    tokens = raw_text.apply(safe_word_tokenize)

    features = pd.DataFrame(index=df.index)  # new frame, original stays untouched

    # The int64 casts are no-ops for non-empty input; they only keep the count
    # columns numeric when the input frame has no rows.
    features['char_count'] = raw_text.apply(len).astype('int64')
    features['word_count_raw'] = tokens.apply(len).astype('int64')
    features['unique_word_count_raw'] = tokens.apply(lambda toks: len(set(toks))).astype('int64')
    features['vocab_diversity_raw'] = safe_ratio(
        features['unique_word_count_raw'], features['word_count_raw']
    )

    features['sentence_count'] = raw_text.apply(
        lambda x: len(sent_tokenize(str(x))) if pd.notna(x) else 0
    ).astype('int64')
    features['avg_sentence_length'] = safe_ratio(
        features['word_count_raw'], features['sentence_count']
    )
    features['avg_word_length'] = safe_ratio(
        features['char_count'], features['word_count_raw']
    )

    # Paragraph features (less reliable but kept from original)
    features['paragraph_count'] = raw_text.apply(
        lambda x: str(x).count('\n\n') + 1
    ).astype('int64')
    features['avg_paragraph_length'] = safe_ratio(
        features['word_count_raw'], features['paragraph_count']
    )

    return features

def extract_lexical_features(df):
    """Count politically charged terms and phrases in the raw article text.

    Each article is lowercased once and tokenized once; every count below
    reuses those two intermediates.

    Args:
        df: DataFrame with a ``text`` column holding the raw article strings.

    Returns:
        pd.DataFrame: A new frame indexed like ``df`` with, in order:

        - ``count_<term>`` for eleven selected terms: whole-word regex matches
          in the lowercased text.
        - ``political_terms_count``: number of tokens that are keys of
          ``politically_charged_terms``.
        - ``political_terms_ratio``: that count divided by the number of
          tokens (0 for an article with no tokens).
        - ``count_<bigram>`` for each entry of ``political_bigrams`` (spaces
          replaced by underscores in the column name): whole-phrase regex
          matches in the lowercased text.
    """
    print("Extracting lexical features...")
    features = pd.DataFrame(index=df.index)
    raw_text = df['text']

    # Shared intermediates, computed once per article.
    lowered_text = raw_text.apply(lambda x: str(x).lower())
    tokens = raw_text.apply(safe_word_tokenize)

    def count_whole_matches(phrase):
        """Per-article count of ``phrase`` bounded by word boundaries."""
        # re.escape() guarantees a valid pattern, so compilation cannot fail.
        pattern = re.compile(r'\b' + re.escape(phrase) + r'\b')
        # The int64 cast is a no-op unless the frame is empty.
        return lowered_text.apply(lambda text: len(pattern.findall(text))).astype('int64')

    # Specific term counts
    for term in ['trump', 'clinton', 'hillary', 'obama', 'republican', 'democrat',
                 'conservative', 'liberal', 'fake', 'america', 'american']:
        features[f'count_{term}'] = count_whole_matches(term)

    # Dictionary-based counts work on single tokens.
    # NOTE(review): a multi-word key such as 'deep state' presumably never
    # equals a single token, so it would only be counted via the bigram
    # columns below - confirm against the tokenizer output.
    token_totals = tokens.apply(len).astype('int64')
    charged_totals = tokens.apply(
        lambda words: sum(1 for word in words if word in politically_charged_terms)
    ).astype('int64')
    features['political_terms_count'] = charged_totals
    # Mask empty articles to NaN before dividing, then fall back to 0.
    features['political_terms_ratio'] = charged_totals.div(
        token_totals.where(token_totals > 0)
    ).fillna(0.0)

    # Political bigram counts
    for bigram in political_bigrams:
        features[f'count_{bigram.replace(" ", "_")}'] = count_whole_matches(bigram)

    return features

def extract_sentiment_features(df):
    """Compute VADER sentiment features from the raw article text.

    Args:
        df: DataFrame with a ``text`` column holding the raw article strings.

    Returns:
        pd.DataFrame: A new frame indexed like ``df`` with, in order:

        - ``sentiment_compound``, ``sentiment_positive``,
          ``sentiment_negative``, ``sentiment_neutral``: the ``compound``,
          ``pos``, ``neg`` and ``neu`` entries of ``sid.polarity_scores``
          for the whole article.
        - ``sentiment_emotional_ratio``: (positive + negative) / neutral,
          or 0 when neutral is not positive.
        - ``sentiment_variance``: variance of the per-sentence compound
          scores; 0 for articles with at most one sentence.

    Raises:
        LookupError: If the NLTK sentence tokenizer data is not installed.
    """
    print("Extracting sentiment features...")
    features = pd.DataFrame(index=df.index)
    raw_text = df['text']

    def get_sentiment(text):
        """Score one article; fall back to a neutral result if scoring fails."""
        try:
            # Ensure text is string for VADER
            return sid.polarity_scores(str(text))
        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C still interrupts
            # a long run instead of being swallowed row by row.
            return {'compound': 0, 'pos': 0, 'neg': 0, 'neu': 1.0}

    sentiment_scores = raw_text.apply(get_sentiment)
    for column, score_key in (
        ('sentiment_compound', 'compound'),
        ('sentiment_positive', 'pos'),
        ('sentiment_negative', 'neg'),
        ('sentiment_neutral', 'neu'),
    ):
        # The float cast only matters for dtype consistency (e.g. empty input).
        features[column] = sentiment_scores.apply(
            lambda scores, score_key=score_key: scores[score_key]
        ).astype(float)

    # Column-wise ratio; a non-positive neutral score is masked to NaN before
    # dividing and then mapped to the 0 fallback.
    neutral = features['sentiment_neutral']
    emotional = features['sentiment_positive'] + features['sentiment_negative']
    features['sentiment_emotional_ratio'] = emotional.div(
        neutral.where(neutral > 0)
    ).fillna(0.0)

    def sentence_sentiment_variance(text):
        """Variance of sentence-level compound scores for one article."""
        try:
            sentences = sent_tokenize(str(text))
            if len(sentences) <= 1:
                return 0
            sentiments = [sid.polarity_scores(s)['compound'] for s in sentences]
            return np.var(sentiments)
        except LookupError:
            # Missing tokenizer data is a setup error, not a bad document.
            raise
        except Exception:
            return 0  # best-effort fallback for a single problematic article

    features['sentiment_variance'] = raw_text.apply(sentence_sentiment_variance).astype(float)

    return features

# %%
# --- Feature Generation ---
print("--- Starting Feature Engineering Process ---")

# Hand-crafted (non-vectorizer) features: the same three extractors are run,
# in the same order, on each split.
print("\nProcessing Training Set Features...")
train_length_features, train_lexical_features, train_sentiment_features = [
    extract(train_df)
    for extract in (extract_length_features, extract_lexical_features, extract_sentiment_features)
]

print("\nProcessing Test Set Features...")
test_length_features, test_lexical_features, test_sentiment_features = [
    extract(test_df)
    for extract in (extract_length_features, extract_lexical_features, extract_sentiment_features)
]

# --- TF-IDF Features ---
print("\nExtracting TF-IDF Features (Fit on Train, Transform Train & Test)...")
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=3,
    max_df=0.95,
    max_features=TFIDF_MAX_FEATURES,
)
# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely projected onto that fitted vocabulary.
train_tfidf_matrix = tfidf_vectorizer.fit_transform(train_df['text'])
test_tfidf_matrix = tfidf_vectorizer.transform(test_df['text'])

# Positional (generic) column names, one per fitted TF-IDF dimension
_, n_tfidf_features = train_tfidf_matrix.shape
tfidf_feature_names = [f'tfidf_{i}' for i in range(n_tfidf_features)]

# Dense DataFrames sharing the same columns and each split's own index
X_train_tfidf, X_test_tfidf = (
    pd.DataFrame(matrix.toarray(), index=split.index, columns=tfidf_feature_names)
    for matrix, split in ((train_tfidf_matrix, train_df), (test_tfidf_matrix, test_df))
)

print(f"Created {X_train_tfidf.shape[1]} TF-IDF features")
# Persist the fitted vectorizer alongside the features
with open(tfidf_vectorizer_path, 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)
print(f"Saved TF-IDF vectorizer to {tfidf_vectorizer_path}")

# --- N-Gram Features (Bigrams) ---
print("\nExtracting N-gram Features (Fit on Train, Transform Train & Test)...")
bigram_vectorizer = CountVectorizer(
    stop_words='english',  # same stop-word handling as the TF-IDF vectorizer
    min_df=5,
    max_features=NGRAM_MAX_FEATURES,
    ngram_range=(2, 2),
)
# Vocabulary comes from the training text only; test text is transformed with it.
train_bigram_matrix = bigram_vectorizer.fit_transform(train_df['text'])
test_bigram_matrix = bigram_vectorizer.transform(test_df['text'])

# Column names derived from the fitted vocabulary: "white house" -> "bigram_white_house"
bigram_feature_names = [
    'bigram_' + name.replace(" ", "_")
    for name in bigram_vectorizer.get_feature_names_out()
]

# Dense DataFrames sharing the same columns and each split's own index
X_train_bigram, X_test_bigram = (
    pd.DataFrame(matrix.toarray(), index=split.index, columns=bigram_feature_names)
    for matrix, split in ((train_bigram_matrix, train_df), (test_bigram_matrix, test_df))
)

print(f"Created {X_train_bigram.shape[1]} bigram features")
# Persist the fitted vectorizer alongside the features
with open(bigram_vectorizer_path, 'wb') as vectorizer_file:
    pickle.dump(bigram_vectorizer, vectorizer_file)
print(f"Saved Bigram vectorizer to {bigram_vectorizer_path}")

# %%
# --- Combine All Features ---
print("\nCombining all features...")

# Column-wise concatenation; every part of a split shares that split's index.
X_train_combined = pd.concat([
    train_length_features,
    train_lexical_features,
    train_sentiment_features,
    X_train_tfidf,
    X_train_bigram
], axis=1)

X_test_combined = pd.concat([
    test_length_features,
    test_lexical_features,
    test_sentiment_features,
    X_test_tfidf,
    X_test_bigram
], axis=1)

print(f"Final Training Feature Matrix shape: {X_train_combined.shape}")
print(f"Final Test Feature Matrix shape: {X_test_combined.shape}")

# --- Sanity Check: Ensure columns match exactly ---
# Index.equals() compares labels and order and simply returns False when the
# two indexes differ in length. An element-wise `==` raises a ValueError in
# that case, which would skip the diagnostics and alignment below exactly when
# they are needed.
if not X_train_combined.columns.equals(X_test_combined.columns):
    print("\nWARNING: Train and Test columns do not match perfectly!")
    # Find differences if any (debugging)
    train_cols = set(X_train_combined.columns)
    test_cols = set(X_test_combined.columns)
    print("Columns in Train but not Test:", sorted(list(train_cols - test_cols)))
    print("Columns in Test but not Train:", sorted(list(test_cols - train_cols)))
    # Attempt to align columns - adding missing ones with 0s
    print("Attempting to align columns by adding missing ones with 0...")
    all_cols = X_train_combined.columns.union(X_test_combined.columns)
    X_train_combined = X_train_combined.reindex(columns=all_cols, fill_value=0)
    X_test_combined = X_test_combined.reindex(columns=all_cols, fill_value=0)
    print(f"Aligned Training Feature Matrix shape: {X_train_combined.shape}")
    print(f"Aligned Test Feature Matrix shape: {X_test_combined.shape}")

# %%
# --- Scale Numerical Features ---
print("\nScaling numerical features...")
# Only numeric columns are scaled; any other dtype is passed through unchanged.
numeric_cols = list(X_train_combined.select_dtypes(include=np.number).columns)
print(f"Found {len(numeric_cols)} numerical features to scale.")

scaler = StandardScaler()

# Training split: the scaler's statistics are learned here and nowhere else.
X_train_scaled = X_train_combined.copy()
X_train_scaled[numeric_cols] = scaler.fit_transform(X_train_combined[numeric_cols])

# Test split: transformed with the statistics learned from the training split.
X_test_scaled = X_test_combined.copy()
X_test_scaled[numeric_cols] = scaler.transform(X_test_combined[numeric_cols])
print("Features scaled.")

# --- Save Processed Data ---
print("\nSaving scaled features and labels...")
# Persist the fitted scaler
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
print(f"Saved Scaler to {scaler_path}")

# Feature matrices are written without the index column
for _matrix, _csv_path in ((X_train_scaled, X_train_output_path),
                           (X_test_scaled, X_test_output_path)):
    _matrix.to_csv(_csv_path, index=False)
# Label Series are written with a header row and no index column
for _labels, _csv_path in ((y_train, y_train_output_path),
                           (y_test, y_test_output_path)):
    _labels.to_csv(_csv_path, index=False, header=True)

print(f"Saved training features to {X_train_output_path}")
print(f"Saved test features to {X_test_output_path}")
print(f"Saved training labels to {y_train_output_path}")
print(f"Saved test labels to {y_test_output_path}")

# First ten scaled training rows, saved separately for quick inspection
sample_csv_path = os.path.join(features_dir, 'X_train_scaled_sample.csv')
X_train_scaled.head(10).to_csv(sample_csv_path, index=False)

# --- Final Feature Summary ---
print("\n--- Feature Engineering Summary ---")
print(f"Total features generated: {X_train_scaled.shape[1]}")

# Group the final columns by feature family: hand-crafted families by
# membership in their source frames, vectorizer families by name prefix.
final_cols = X_train_scaled.columns
length_features_final = list(final_cols[final_cols.isin(train_length_features.columns)])
lexical_features_final = list(final_cols[final_cols.isin(train_lexical_features.columns)])
sentiment_features_final = list(final_cols[final_cols.isin(train_sentiment_features.columns)])
tfidf_features_final = list(filter(lambda col: col.startswith('tfidf_'), final_cols))
ngram_features_final = list(filter(lambda col: col.startswith('bigram_'), final_cols))

print(f"Length features: {len(length_features_final)}")
print(f"Lexical features: {len(lexical_features_final)}")
print(f"Sentiment features: {len(sentiment_features_final)}")
print(f"TF-IDF features: {len(tfidf_features_final)}")
print(f"N-gram features: {len(ngram_features_final)}")

print("\nFeature engineering for baseline models complete.")
print(f"Output files are in: {features_dir}")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


--- Loading Official Train/Test Data ---
Loaded Training Data: (645, 5)
Loaded Test Data: (628, 5)
--- Starting Feature Engineering Process ---

Processing Training Set Features...
Extracting length-based features...
Extracting lexical features...
Extracting sentiment features...

Processing Test Set Features...
Extracting length-based features...
Extracting lexical features...
Extracting sentiment features...

Extracting TF-IDF Features (Fit on Train, Transform Train & Test)...
Created 1000 TF-IDF features
Saved TF-IDF vectorizer to hyperpartisan_features_official/tfidf_vectorizer.pkl

Extracting N-gram Features (Fit on Train, Transform Train & Test)...
Created 100 bigram features
Saved Bigram vectorizer to hyperpartisan_features_official/bigram_vectorizer.pkl

Combining all features...
Final Training Feature Matrix shape: (645, 1137)
Final Test Feature Matrix shape: (628, 1137)

Scaling numerical features...
Found 1137 numerical features to scale.
Features scaled.

Saving scaled feat