In [None]:
# fake_news_text_preprocessing.py
# Text Preprocessing Module - Lowercasing & URL Removal

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os

# Set display options for better readability:
# show every column and allow up to 300 characters per cell when printing.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 300)

# Set matplotlib style safely.
# An unknown style name makes matplotlib raise OSError; catch only that so
# KeyboardInterrupt, SystemExit and unrelated errors are not swallowed.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')  # Fallback for older versions
    print("Using seaborn style (seaborn-v0_8 not available)")

print("=== FAKE NEWS DETECTION - TEXT PREPROCESSING MODULE ===\n")


## 1. DATA LOADING

In [None]:
print("1. LOADING AND PREPARING DATASET...")

# Primary path: unpack ../data/archive.zip into ./unzipped_data and read the
# two CSV files from there. Any FileNotFoundError raised in this path (from
# opening the archive or from reading the extracted CSVs) triggers the
# fallback, which looks for Fake.csv / True.csv in the working directory.
try:
    # Extract dataset from zip file
    print("Extracting dataset from archive...")
    with zipfile.ZipFile("../data/archive.zip", 'r') as zip_ref:
        zip_ref.extractall("unzipped_data")
    print("✓ Dataset extracted successfully!")

    # Load the datasets
    df_fake = pd.read_csv("unzipped_data/Fake.csv")
    df_true = pd.read_csv("unzipped_data/True.csv")

except FileNotFoundError:
    print("⚠ Zip file not found. Trying to load CSV files directly...")
    try:
        df_fake = pd.read_csv("Fake.csv")
        df_true = pd.read_csv("True.csv")
        print("✓ CSV files loaded directly!")
    except FileNotFoundError:
        print("❌ Error: Could not find dataset files.")
        print("Please ensure the dataset files are in the correct location.")
        # Stop with a non-zero status: nothing below can run without the data.
        # `exit()` is an interactive helper injected by the `site` module and
        # would report success (status 0); raise SystemExit directly instead.
        raise SystemExit(1)


In [None]:
# Tag every row with its class so the origin survives the merge below
for frame, tag in ((df_fake, "FAKE"), (df_true, "TRUE")):
    frame["label"] = tag

# Stack fake rows first, then real rows, under a fresh 0..n-1 index
df = pd.concat([df_fake, df_true], axis=0, ignore_index=True)

# Report the size of each part and the resulting column layout
print(f"✓ Dataset loaded successfully!")
for stat_line in (
    f"   - Total articles: {len(df)}",
    f"   - Fake articles: {len(df_fake)}",
    f"   - Real articles: {len(df_true)}",
    f"   - Columns: {list(df.columns)}\n",
):
    print(stat_line)

# Preview the first two rows of the columns used downstream
print("Sample of original data:")
print(df.head(2)[['title', 'text', 'label']])
print("\n")

## 2. TEXT PREPROCESSING FUNCTIONS

In [None]:
# URL matcher used by preprocess_text_lowercase_url, compiled once.
#   - IGNORECASE: removal runs BEFORE lowercasing, so without it
#     "HTTP://EXAMPLE.ORG" or "WWW.Example.COM" would survive the cleanup.
#   - \b after the TLD list: without it any token containing ".co", ".io",
#     ".gov", ... followed by more letters ("said.conservatives",
#     "visited.iowa") would be deleted as if it were a bare domain.
_URL_PATTERN = re.compile(
    r'https?://\S+'                                    # http:// or https:// links
    r'|www\.\S+'                                       # www.-prefixed addresses
    r'|\S+\.(?:com|org|net|edu|gov|io|co|uk)\b\S*'     # bare domains with a listed TLD
    r'|bit\.ly/\S+'                                    # bit.ly short links
    r'|t\.co/\S+',                                     # t.co short links
    re.IGNORECASE,
)


def preprocess_text_lowercase_url(text):
    """
    Clean a single text value: strip URLs, lowercase, normalise whitespace.

    Steps, in order:
    - Missing values (None / NaN as reported by ``pd.isna``) become "".
    - Any other value is converted with ``str()``.
    - URLs, www addresses, bare domains with a listed TLD and
      bit.ly / t.co short links are removed (case-insensitively).
    - The remaining text is lowercased.
    - Runs of whitespace collapse to one space; ends are stripped.

    Args:
        text: A scalar cell value (typically a string, or a missing value).

    Returns:
        The cleaned string; "" for missing input.
    """
    # Handle missing values (check None first so pd.isna only sees real values)
    if text is None or pd.isna(text):
        return ""

    # Convert to string to ensure consistent processing
    text = str(text)

    # Remove all URLs from text
    text = _URL_PATTERN.sub('', text)

    # Convert entire text to lowercase for consistency
    text = text.lower()

    # Clean up extra whitespace (also closes the gaps left by removed URLs)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

print("✓ Preprocessing functions defined!\n")

## 3. APPLY PREPROCESSING

In [None]:
# Keep untouched copies of both raw columns so the quality checks can
# compare before/after. Both *_original columns are created before any
# *_clean column, which fixes the column order of the exported CSV.
for raw_column in ('title', 'text'):
    df[f'{raw_column}_original'] = df[raw_column].copy()

# Clean each raw column into its *_clean counterpart, announcing each pass
cleaning_passes = (
    ('title', "Processing titles..."),
    ('text', "Processing article texts..."),
)
for raw_column, progress_message in cleaning_passes:
    print(progress_message)
    df[f'{raw_column}_clean'] = df[raw_column].apply(preprocess_text_lowercase_url)

print("✓ Text preprocessing completed!\n")

## 4. QUALITY CONTROL AND VERIFICATION

In [None]:
def contains_url(text):
    """Return True when the lowercased string form of `text` looks like it holds a URL.

    A hit is any of: "http://" / "https://", "www.", or a dot immediately
    followed by two or more lowercase letters. The last rule is broad:
    it also fires on ordinary text such as "end.next".
    """
    lowered = str(text).lower()
    hit = re.search(r'https?://|www\.|\.[a-z]{2,}', lowered)
    return hit is not None

def count_uppercase(text):
    """Return how many characters of ``str(text)`` are uppercase."""
    # str.isupper yields a bool per character; summing the bools counts the hits
    return sum(map(str.isupper, str(text)))

# For each column, count how many entries contain_url flags
# (one hit per entry, not one per individual URL).
urls_original_title, urls_clean_title, urls_original_text, urls_clean_text = (
    df[column_name].apply(contains_url).sum()
    for column_name in (
        'title_original',
        'title_clean',
        'text_original',
        'text_clean',
    )
)

In [None]:
# Check case conversion: total uppercase characters before and after cleaning
uppercase_original = df['text_original'].apply(count_uppercase).sum()
uppercase_clean = df['text_clean'].apply(count_uppercase).sum()

# Verify no data loss occurred.
# An original entry only counts as content when it is not missing and not
# whitespace-only. The cleaner turns both of those into "", so counting
# them (NaN stringifies to "nan", " " has length 1) would trigger a
# spurious "data loss" warning below.
original_non_empty = df['text_original'].apply(
    lambda x: pd.notna(x) and str(x).strip() != ""
).sum()
clean_non_empty = df['text_clean'].apply(lambda x: len(str(x)) > 0).sum()

# NOTE: the "URLs" figures are counts of entries flagged by contains_url,
# i.e. one per title/text, not one per individual URL.
print("URL Removal Results:")
print(f"  Titles: {urls_original_title} URLs → {urls_clean_title} URLs remaining")
print(f"  Texts:  {urls_original_text} URLs → {urls_clean_text} URLs remaining")
print(f"  Total URLs removed: {urls_original_title + urls_original_text - urls_clean_title - urls_clean_text}")

print("\nCase Conversion Results:")
print(f"  Uppercase characters: {uppercase_original} → {uppercase_clean}")
print(f"  Reduction: {uppercase_original - uppercase_clean} characters")

print("\nData Integrity Check:")
print(f"  Non-empty original texts: {original_non_empty}")
print(f"  Non-empty cleaned texts: {clean_non_empty}")

# A mismatch means some text with real content was reduced to an empty string
if original_non_empty == clean_non_empty:
    print("✓ No data loss detected!\n")
else:
    print("⚠ Warning: Possible data loss detected!\n")


## 5. SHOW SAMPLE RESULTS

In [None]:
# Print the first article's title and text, before and after cleaning.
# A value is cut to `limit` characters plus "..." only when its string
# form is longer than `limit`; otherwise it is printed unchanged.
sample_sections = (
    ("BEFORE PREPROCESSING (First article):", "original"),
    ("\nAFTER PREPROCESSING (First article):", "clean"),
)
sample_fields = (
    ("Title:", "title", 100),
    ("Text:", "text", 200),
)
for heading, column_suffix in sample_sections:
    print(heading)
    print("-" * 50)
    for field_label, column_prefix, limit in sample_fields:
        first_value = df[f"{column_prefix}_{column_suffix}"].iloc[0]
        if len(str(first_value)) > limit:
            print(field_label, first_value[:limit] + "...")
        else:
            print(field_label, first_value)

## 6. SIMPLE VISUALIZATION

In [None]:
try:
    # One figure, two side-by-side panels:
    # left = URL-flagged entries, right = uppercase character totals
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # --- Left panel: grouped bars, original vs. cleaned, per column type ---
    categories = ['Titles', 'Texts']
    x = np.arange(len(categories))
    width = 0.35

    # (horizontal offset, bar heights, legend label, colour) for each series
    grouped_series = (
        (-width / 2, [urls_original_title, urls_original_text], 'Original', 'red'),
        (width / 2, [urls_clean_title, urls_clean_text], 'Cleaned', 'green'),
    )
    for offset, heights, series_label, bar_color in grouped_series:
        ax1.bar(x + offset, heights, width, label=series_label, color=bar_color, alpha=0.7)

    ax1.set_title('URL Removal Effectiveness')
    ax1.set_ylabel('Number of URLs')
    ax1.set_xticks(x)
    ax1.set_xticklabels(categories)
    ax1.legend()

    # --- Right panel: total uppercase characters before and after ---
    ax2.bar(
        ['Original', 'Cleaned'],
        [uppercase_original, uppercase_clean],
        color=['blue', 'orange'],
        alpha=0.7,
    )
    ax2.set_title('Uppercase Character Reduction')
    ax2.set_ylabel('Total Uppercase Characters')

    # Write the image to disk before showing it
    plt.tight_layout()
    plt.savefig('preprocessing_results.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("✓ Visualization saved as 'preprocessing_results.png'")

except Exception as e:
    # Plotting is best-effort: report the problem and let the script continue
    print(f"⚠ Visualization failed: {e}")
    print("Continuing without visualization...")

## 7. SAVE PROCESSED DATA

In [None]:
print("\n7. SAVING PROCESSED DATA...")

# Full export: every column currently in df, without the index
output_filename = 'fake_news_preprocessed.csv'
df.to_csv(output_filename, index=False)

for status_line in (
    f"✓ Processed data saved as '{output_filename}'",
    f"✓ File contains {len(df)} articles",
):
    print(status_line)

# Slim export: only the cleaned title, cleaned text and label columns
essential_columns = ['title_clean', 'text_clean', 'label']
df.loc[:, essential_columns].to_csv('fake_news_clean.csv', index=False)
print(f"✓ Cleaned data saved as 'fake_news_clean.csv' (essential columns only)")

## 8. FINAL SUMMARY

In [None]:
# Closing report: a banner, then four bullet sections printed in order.
banner = "=" * 60

print("\n" + banner)
print("PREPROCESSING COMPLETE - SUMMARY")
print(banner)

summary_sections = (
    (
        "✅ TASKS COMPLETED:",
        (
            "   - URLs and hyperlinks successfully removed",
            "   - All text converted to lowercase",
            "   - Data integrity maintained",
            "   - Processed data exported",
        ),
    ),
    (
        "\n📊 KEY RESULTS:",
        (
            f"   - URLs removed: {urls_original_title + urls_original_text - urls_clean_title - urls_clean_text}",
            f"   - Uppercase characters reduced: {uppercase_original - uppercase_clean}",
            f"   - Articles processed: {len(df)}",
        ),
    ),
    (
        "\n📁 OUTPUT FILES:",
        (
            "   - fake_news_preprocessed.csv (full dataset with original + cleaned)",
            "   - fake_news_clean.csv (cleaned data only, for next stage)",
            "   - preprocessing_results.png (results visualization)",
        ),
    ),
    (
        "\n🎯 NEXT STEPS:",
        (
            "   - Data is ready for tokenization and further NLP processing",
            "   - Pass 'fake_news_clean.csv' to the next team member",
        ),
    ),
)
for section_heading, bullet_lines in summary_sections:
    print(section_heading)
    for bullet in bullet_lines:
        print(bullet)

print(banner)