# NLP Data Pipeline for Fake News Detection

This notebook implements a comprehensive NLP data pipeline that preprocesses and extracts features from the news articles in the Kaggle fake news detection dataset.

## Pipeline Components:
1. **Data Loading & Exploration** - Load and examine the dataset
2. **Text Preprocessing** - Clean and normalize text data
3. **Tokenization & Processing** - Advanced text processing with NLTK/spaCy
4. **Feature Extraction** - TF-IDF, n-grams, word embeddings
5. **Advanced Features** - Sentiment analysis, named entities, readability
6. **Visualization** - Data exploration and analysis
7. **Pipeline Class** - Reusable processing pipeline

In [None]:
# Install required packages
!pip install kagglehub pandas numpy matplotlib seaborn nltk spacy textblob wordcloud scikit-learn plotly
!python -m spacy download en_core_web_sm

# Import required libraries
import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")


## 1. Data Loading and Exploration


In [None]:
# Download dataset from Kaggle
path = kagglehub.dataset_download("emineyetm/fake-news-detection-datasets")
print("Path to dataset files:", path)

# List available files in the dataset.
# The whole directory tree is walked (not just the top level) because downloaded
# archives are frequently unpacked into a sub-folder; entries are stored as paths
# relative to `path`, so a top-level file keeps its plain file name.
dataset_files = sorted(
    os.path.relpath(os.path.join(root, name), path)
    for root, _dirs, names in os.walk(path)
    for name in names
)
print("\nAvailable files in the dataset:")
for file in dataset_files:
    file_path = os.path.join(path, file)
    size = os.path.getsize(file_path) / (1024*1024)  # Size in MB
    print(f"  - {file} ({size:.2f} MB)")

# Load the main dataset files (extension match is case-insensitive)
csv_files = [f for f in dataset_files if f.lower().endswith('.csv')]
print(f"\nLoading {len(csv_files)} CSV file(s)...")

# Maps each CSV's relative path to its loaded DataFrame
dataframes = {}
for file in csv_files:
    file_path = os.path.join(path, file)
    try:
        df = pd.read_csv(file_path)
        dataframes[file] = df
        print(f"Loaded {file}: {df.shape[0]} rows × {df.shape[1]} columns")
    except Exception as e:
        # Best effort: report the unreadable file and keep loading the others
        print(f"Error loading {file}: {e}")


In [None]:
# Explore the dataset structure
print("Dataset Exploration Summary")
print("=" * 50)

# Select the main dataframe (usually the largest one)
if dataframes:
    main_df_name = max(dataframes.keys(), key=lambda k: dataframes[k].shape[0])
    df = dataframes[main_df_name]
    print(f"\nWorking with main dataset: {main_df_name}")
    print(f"Shape: {df.shape}")

    print("\nColumn names and types:")
    print(df.dtypes)

    print("\nFirst few rows:")
    print(df.head())

    print("\nBasic statistics:")
    print(df.describe(include='all'))

    print("\nMissing values:")
    missing_values = df.isnull().sum()
    print(missing_values[missing_values > 0])

    # Identify text columns.
    # A column qualifies when it holds strings (classic object dtype OR pandas'
    # dedicated string dtype) and either has a well-known name or its string
    # values average more than 20 characters.
    text_columns = []
    for col in df.columns:
        column = df[col]
        holds_strings = (pd.api.types.is_object_dtype(column)
                         or pd.api.types.is_string_dtype(column))
        if not holds_strings:
            continue
        # str(col) so that non-string column labels (e.g. integers) do not crash
        if str(col).lower() in ['text', 'title', 'content', 'article', 'news', 'headline', 'body']:
            text_columns.append(col)
            continue
        # Mean length over the actual string values only; non-strings and missing
        # values are skipped, which also avoids the .str accessor raising on
        # object columns that contain no strings at all.
        string_lengths = [len(value) for value in column if isinstance(value, str)]
        if string_lengths and sum(string_lengths) / len(string_lengths) > 20:
            text_columns.append(col)

    print(f"\nIdentified text columns: {text_columns}")

    # Identify label column: either a well-known name, or a low-cardinality
    # integer/string column (likely categorical)
    label_columns = []
    for col in df.columns:
        column = df[col]
        if str(col).lower() in ['label', 'class', 'target', 'fake', 'real', 'category']:
            label_columns.append(col)
        elif column.nunique() <= 10 and (pd.api.types.is_integer_dtype(column)
                                         or pd.api.types.is_object_dtype(column)
                                         or pd.api.types.is_string_dtype(column)):
            label_columns.append(col)

    print(f"Identified potential label columns: {label_columns}")

    # Show label distribution if available
    if label_columns:
        for col in label_columns[:2]:  # Show first 2 potential label columns
            print(f"\nDistribution of '{col}':")
            print(df[col].value_counts())
else:
    print("No datasets loaded!")


## 2. Text Preprocessing Pipeline


In [None]:
import re
import string
from urllib.parse import urlparse
import nltk

# Download required NLTK data
nltk.download('punkt', quiet=True)
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of the
# pickled 'punkt' package; fetch both so word_tokenize works on old and new versions.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

class TextPreprocessor:
    """Configurable cleaning pipeline that turns raw text into a normalized token string.

    Each constructor flag switches one cleaning step on or off. The steps run in
    a fixed order: HTML tags, URLs, e-mail addresses, whitespace collapsing,
    lowercasing, punctuation, digits, tokenization, stop-word removal,
    lemmatization and finally dropping tokens shorter than ``min_length``.
    """

    # Translation table deleting every ASCII punctuation character. Built once
    # here instead of on every clean_text() call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self,
                 lowercase=True,
                 remove_punctuation=True,
                 remove_numbers=False,
                 remove_stopwords=True,
                 lemmatize=True,
                 remove_urls=True,
                 remove_emails=True,
                 remove_html=True,
                 min_length=2):
        """Store the step switches and load the NLTK resources the enabled steps need.

        Args:
            lowercase: Convert the text to lower case.
            remove_punctuation: Delete ASCII punctuation characters.
            remove_numbers: Delete runs of digits.
            remove_stopwords: Drop English stop words (NLTK list).
            lemmatize: Lemmatize tokens with WordNet.
            remove_urls: Delete http(s)/www URLs.
            remove_emails: Delete e-mail addresses.
            remove_html: Delete HTML tags.
            min_length: Minimum number of characters a token must have to be kept.
        """
        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.remove_numbers = remove_numbers
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize
        self.remove_urls = remove_urls
        self.remove_emails = remove_emails
        self.remove_html = remove_html
        self.min_length = min_length

        # Initialize NLTK components. Both attributes always exist so that code
        # inspecting the instance never hits an AttributeError; the NLTK data is
        # only touched for the steps that are actually enabled.
        self.stop_words = set(stopwords.words('english')) if self.remove_stopwords else set()
        self.lemmatizer = WordNetLemmatizer() if self.lemmatize else None

    def clean_text(self, text):
        """Apply all enabled cleaning steps to one text.

        Args:
            text: The raw text. Anything that is not a ``str`` (None, NaN,
                numbers, lists, ...) is treated as missing.

        Returns:
            The cleaned tokens joined by single spaces; ``""`` for missing input.
        """
        # isinstance alone covers None/NaN and, unlike `pd.isna(text) or ...`,
        # does not raise on list/array values (whose isna() result is an array
        # with no single truth value).
        if not isinstance(text, str):
            return ""

        # Remove HTML tags
        if self.remove_html:
            text = re.sub(r'<.*?>', '', text)

        # Remove URLs
        if self.remove_urls:
            text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Remove email addresses
        if self.remove_emails:
            text = re.sub(r'\S+@\S+', '', text)

        # Remove extra whitespaces, newlines, tabs
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()

        # Convert to lowercase
        if self.lowercase:
            text = text.lower()

        # Remove punctuation
        if self.remove_punctuation:
            text = text.translate(self._PUNCTUATION_TABLE)

        # Remove numbers
        if self.remove_numbers:
            text = re.sub(r'\d+', '', text)

        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords. The stop-word list is lower case, so compare the
        # lower-cased token; otherwise "The" survives when lowercase=False.
        if self.remove_stopwords:
            tokens = [token for token in tokens if token.lower() not in self.stop_words]

        # Lemmatize
        if self.lemmatize:
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

        # Remove short tokens
        tokens = [token for token in tokens if len(token) >= self.min_length]

        return ' '.join(tokens)

    def process_dataframe(self, df, text_column, output_column=None):
        """Clean one DataFrame column and print before/after length statistics.

        Args:
            df: DataFrame to process. The output column is added to this object
                in place.
            text_column: Name of the column holding the raw text.
            output_column: Name of the column to write; defaults to
                ``"<text_column>_processed"``.

        Returns:
            The same DataFrame object, with the output column added.
        """
        if output_column is None:
            output_column = f"{text_column}_processed"

        print(f"Processing {len(df)} texts...")
        df[output_column] = df[text_column].apply(self.clean_text)

        # Calculate processing statistics (missing texts count as length 0)
        original_lengths = df[text_column].str.len().fillna(0)
        processed_lengths = df[output_column].str.len().fillna(0)
        original_mean = original_lengths.mean()
        processed_mean = processed_lengths.mean()

        print(f"Original text - Average length: {original_mean:.1f}, Max: {original_lengths.max()}")
        print(f"Processed text - Average length: {processed_mean:.1f}, Max: {processed_lengths.max()}")
        # Guard the percentage: an empty or all-missing column has a zero/NaN
        # mean and would otherwise divide by zero.
        if original_mean > 0:
            print(f"Length reduction: {((original_mean - processed_mean) / original_mean * 100):.1f}%")
        else:
            print("Length reduction: n/a (no original text)")

        return df

# Build a preprocessor that uses the default configuration
preprocessor = TextPreprocessor()

# Report the active settings, one per line
print("Text Preprocessor initialized with default settings:")
for setting_label, setting_value in (
    ("Lowercase", preprocessor.lowercase),
    ("Remove punctuation", preprocessor.remove_punctuation),
    ("Remove stopwords", preprocessor.remove_stopwords),
    ("Lemmatize", preprocessor.lemmatize),
    ("Remove URLs", preprocessor.remove_urls),
    ("Remove emails", preprocessor.remove_emails),
    ("Remove HTML", preprocessor.remove_html),
    ("Minimum token length", preprocessor.min_length),
):
    print(f"- {setting_label}: {setting_value}")


In [None]:
# Apply preprocessing to the dataset
# Defines main_text_column and processed_column, and rebinds df to the cleaned,
# filtered frame; these names only exist when the condition below holds.
if dataframes and text_columns:
    # Use the first identified text column
    main_text_column = text_columns[0]
    print(f"Preprocessing text column: '{main_text_column}'")
    
    # Show before and after examples
    print("\n" + "="*80)
    print("BEFORE AND AFTER PREPROCESSING EXAMPLES:")
    print("="*80)
    
    # First three non-missing texts; previews are cut to 200 characters
    sample_texts = df[main_text_column].dropna().head(3)
    for i, (idx, original_text) in enumerate(sample_texts.items(), 1):
        processed_text = preprocessor.clean_text(original_text)
        
        print(f"\nExample {i}:")
        print(f"Original ({len(original_text)} chars):")
        print(f"'{original_text[:200]}{'...' if len(original_text) > 200 else ''}'")
        print(f"\nProcessed ({len(processed_text)} chars):")
        print(f"'{processed_text[:200]}{'...' if len(processed_text) > 200 else ''}'")
        print("-" * 60)
    
    # Apply preprocessing to the entire dataset
    # process_dataframe writes the new column onto the frame it is given, i.e. the
    # object that is also stored in `dataframes`.
    df = preprocessor.process_dataframe(df, main_text_column)
    processed_column = f"{main_text_column}_processed"
    
    # Remove rows with empty processed text
    # The boolean filter keeps the original index labels (no reset_index), so the
    # index may have gaps from here on.
    original_rows = len(df)
    df = df[df[processed_column].str.len() > 0]
    print(f"\nRemoved {original_rows - len(df)} rows with empty processed text")
    print(f"Final dataset size: {len(df)} rows")
    
else:
    print("No text columns identified for preprocessing!")


## 3. Advanced Tokenization and Text Processing


In [None]:
import spacy
from collections import Counter
import textstat

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_sm")
    print("spaCy model loaded successfully!")
except OSError:
    print("spaCy model not found. Installing...")
    import importlib
    import subprocess
    import sys

    # Run the download with the interpreter of the running kernel (a bare
    # "python" on PATH may belong to a different environment) and fail loudly
    # if the download does not succeed.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    # Make the freshly installed model package visible to this process
    importlib.invalidate_caches()
    nlp = spacy.load("en_core_web_sm")

class AdvancedTextProcessor:
    """Linguistic feature extraction on top of a spaCy-style pipeline.

    The pipeline object passed to the constructor is called with a text and must
    return a document that can be iterated for tokens (``text``, ``pos_``,
    ``lemma_``, ``is_alpha``) and exposes ``ents`` and ``sents``.
    """

    def __init__(self, nlp_model):
        """Keep a reference to the pipeline used to parse texts."""
        self.nlp = nlp_model

    def extract_linguistic_features(self, text, max_length=1000000):
        """Compute token, POS, entity, readability and complexity features for one text.

        Args:
            text: The text to analyse. Non-string or empty input yields the
                all-zero feature dict from ``_empty_features``.
            max_length: Texts longer than this many characters are truncated
                before parsing.

        Returns:
            dict mapping feature name to its numeric value.
        """
        # isinstance first: `pd.isna(text) or ...` raises on list/array input
        if not isinstance(text, str) or len(text) == 0:
            return self._empty_features()

        # Truncate text if too long (spaCy has limits)
        if len(text) > max_length:
            text = text[:max_length]

        doc = self.nlp(text)

        # Gather everything that needs a pass over the document once, instead of
        # re-walking the tokens / sentences for every single feature.
        token_total = len(doc)
        sentences = list(doc.sents)
        pos_counts = Counter(token.pos_ for token in doc)
        entity_label_counts = Counter(ent.label_ for ent in doc.ents)
        alpha_lemmas = {token.lemma_.lower() for token in doc if token.is_alpha}

        features = {
            # Token-level features
            'token_count': token_total,
            'sentence_count': len(sentences),
            'avg_token_length': np.mean([len(token.text) for token in doc]) if token_total else 0,

            # POS tag distribution (a Counter returns 0 for tags that never occur)
            'noun_count': pos_counts['NOUN'],
            'verb_count': pos_counts['VERB'],
            'adj_count': pos_counts['ADJ'],
            'adv_count': pos_counts['ADV'],

            # Named entities
            'entity_count': len(doc.ents),
            'person_count': entity_label_counts['PERSON'],
            'org_count': entity_label_counts['ORG'],
            'gpe_count': entity_label_counts['GPE'],  # Geopolitical entities

            # Readability metrics
            'flesch_reading_ease': textstat.flesch_reading_ease(text),
            'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
            'gunning_fog': textstat.gunning_fog(text),

            # Complexity metrics
            # Sentence length is measured in tokens.
            'avg_sentence_length': np.mean([len(sent) for sent in sentences]) if sentences else 0,
            # Distinct alphabetic lemmas divided by ALL tokens (punctuation included)
            'unique_token_ratio': len(alpha_lemmas) / token_total if token_total else 0,
        }

        return features

    def _empty_features(self):
        """Return the feature dict used for missing/empty text: every feature set to 0."""
        return {
            'token_count': 0, 'sentence_count': 0, 'avg_token_length': 0,
            'noun_count': 0, 'verb_count': 0, 'adj_count': 0, 'adv_count': 0,
            'entity_count': 0, 'person_count': 0, 'org_count': 0, 'gpe_count': 0,
            'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0, 'gunning_fog': 0,
            'avg_sentence_length': 0, 'unique_token_ratio': 0
        }

    def extract_entities(self, text, max_length=1000000):
        """Extract named entities from text.

        Args:
            text: The text to analyse; non-string input yields ``[]``.
            max_length: Only the first ``max_length`` characters are parsed.

        Returns:
            List of ``(entity_text, entity_label)`` tuples in document order.
        """
        if not isinstance(text, str):
            return []

        doc = self.nlp(text[:max_length])  # Limit text length
        return [(ent.text, ent.label_) for ent in doc.ents]

    def get_pos_tags(self, text, max_length=1000000):
        """Get part-of-speech tags for the alphabetic tokens of a text.

        Args:
            text: The text to analyse; non-string input yields ``[]``.
            max_length: Only the first ``max_length`` characters are parsed.

        Returns:
            List of ``(token_text, pos_tag, lemma)`` tuples in document order.
        """
        if not isinstance(text, str):
            return []

        doc = self.nlp(text[:max_length])  # Limit text length
        return [(token.text, token.pos_, token.lemma_) for token in doc if token.is_alpha]

# Initialize advanced processor, wrapping the spaCy pipeline loaded above
advanced_processor = AdvancedTextProcessor(nlp)
print("Advanced text processor initialized with spaCy!")


In [None]:
# Demonstrate advanced tokenization on sample texts
if 'df' in locals() and not df.empty and text_columns:
    print("Demonstrating Advanced Text Processing")
    print("=" * 50)

    # Take a sample text for detailed analysis
    sample_idx = df.index[0]
    sample_text = df.loc[sample_idx, main_text_column]

    if pd.notna(sample_text) and len(sample_text) > 50:
        print(f"\nSample Text Analysis:")
        print(f"Text preview: {sample_text[:200]}...")

        # Extract linguistic features
        features = advanced_processor.extract_linguistic_features(sample_text)

        print(f"\nLinguistic Features:")
        for feature, value in features.items():
            print(f"  {feature}: {value:.2f}" if isinstance(value, float) else f"  {feature}: {value}")

        # Extract named entities
        entities = advanced_processor.extract_entities(sample_text)
        if entities:
            print(f"\nNamed Entities (first 10):")
            for entity, label in entities[:10]:
                print(f"  {entity} ({label})")

        # Show POS tags for the first alphabetic tokens
        pos_tags = advanced_processor.get_pos_tags(sample_text)
        if pos_tags:
            print(f"\nPart-of-Speech Tags (first 15 tokens):")
            for token, pos, lemma in pos_tags[:15]:
                print(f"  {token} -> {pos} (lemma: {lemma})")

    # Extract linguistic features for a subset of the data (for performance)
    print(f"\nExtracting linguistic features for first 100 rows...")
    sample_df = df.head(100).copy()

    # Extract features, one dict per row, in row order
    linguistic_features = []
    for idx, row in sample_df.iterrows():
        text = row[main_text_column]
        features = advanced_processor.extract_linguistic_features(text)
        linguistic_features.append(features)

    # Convert to DataFrame.
    # Reuse sample_df's index: df was filtered earlier without resetting its
    # index, so a fresh 0..n-1 index would not line up with sample_df's labels
    # and the column assignment below would attach features to the wrong rows
    # (or fill them with NaN).
    features_df = pd.DataFrame(linguistic_features, index=sample_df.index)

    print(f"\nLinguistic Features Summary:")
    print(features_df.describe())

    # Store the features for later use
    for col in features_df.columns:
        sample_df[f'ling_{col}'] = features_df[col]

    print(f"\nAdded {len(features_df.columns)} linguistic features to the dataset!")
else:
    print("No valid data available for advanced processing demonstration.")


## 4. Basic Feature Extraction (TF-IDF, N-grams, Word Counts)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter
import numpy as np

class FeatureExtractor:
    """Extract surface statistics, TF-IDF matrices and n-gram counts from texts.

    Fitted TF-IDF vectorizers are kept in ``tfidf_vectorizers`` under a key
    derived from their n-gram range, so several ranges can coexist.
    """

    def __init__(self):
        """Start with no fitted vectorizers."""
        self.tfidf_vectorizers = {}
        self.count_vectorizers = {}
        self.fitted = False

    @staticmethod
    def _as_strings(texts):
        """Replace every non-string entry (None, NaN, numbers, lists, ...) with ""."""
        return [text if isinstance(text, str) else "" for text in texts]

    def extract_basic_features(self, texts):
        """Extract basic statistical features from texts.

        Args:
            texts: Iterable of texts; non-string entries are treated as "".

        Returns:
            DataFrame with one row per input text and one column per statistic.
        """
        features = []

        # _as_strings relies on isinstance only, so list/array entries do not
        # raise the way `pd.isna(text) or ...` would.
        for text in self._as_strings(texts):
            words = text.split()

            # Basic counts
            char_count = len(text)
            word_count = len(words)
            exclamation_count = text.count('!')
            question_count = text.count('?')
            # Sentences are approximated by counting terminators
            sentence_count = text.count('.') + exclamation_count + question_count

            # Character statistics
            uppercase_count = sum(1 for c in text if c.isupper())
            digit_count = sum(1 for c in text if c.isdigit())
            special_char_count = sum(1 for c in text if not c.isalnum() and not c.isspace())
            punctuation_count = sum(1 for c in text if c in string.punctuation)

            # max(..., 1) keeps every ratio defined for empty texts
            safe_chars = max(char_count, 1)
            safe_sentences = max(sentence_count, 1)

            features.append({
                'char_count': char_count,
                'word_count': word_count,
                'sentence_count': safe_sentences,  # At least 1 sentence
                'avg_word_length': np.mean([len(word) for word in words]) if words else 0,
                'avg_sentence_length': word_count / safe_sentences,
                'uppercase_ratio': uppercase_count / safe_chars,
                'digit_ratio': digit_count / safe_chars,
                'special_char_ratio': special_char_count / safe_chars,
                'punctuation_ratio': punctuation_count / safe_chars,
                'exclamation_count': exclamation_count,
                'question_count': question_count,
            })

        return pd.DataFrame(features)

    def fit_tfidf(self, texts, ngram_range=(1, 1), max_features=5000, min_df=2, max_df=0.95):
        """Fit a TF-IDF vectorizer and store it under a key derived from ``ngram_range``.

        Args:
            texts: Iterable of texts; non-string entries are treated as "".
            ngram_range: ``(min_n, max_n)`` passed to ``TfidfVectorizer``.
            max_features: Vocabulary size cap.
            min_df: Minimum document frequency for a term.
            max_df: Maximum document frequency for a term.

        Returns:
            The key (``"tfidf_<min_n>_<max_n>"``) to pass to ``transform_tfidf``.
            Fitting the same range again replaces the stored vectorizer.
        """
        vectorizer_key = f"tfidf_{ngram_range[0]}_{ngram_range[1]}"

        vectorizer = TfidfVectorizer(
            ngram_range=ngram_range,
            max_features=max_features,
            min_df=min_df,
            max_df=max_df,
            stop_words='english',
            sublinear_tf=True
        )
        vectorizer.fit(self._as_strings(texts))

        self.tfidf_vectorizers[vectorizer_key] = vectorizer
        self.fitted = True

        return vectorizer_key

    def transform_tfidf(self, texts, vectorizer_key):
        """Transform texts using a previously fitted TF-IDF vectorizer.

        Args:
            texts: Iterable of texts; non-string entries are treated as "".
            vectorizer_key: Key returned by ``fit_tfidf``.

        Returns:
            Tuple ``(tfidf_matrix, feature_names)``.

        Raises:
            ValueError: If no vectorizer is stored under ``vectorizer_key``.
        """
        if vectorizer_key not in self.tfidf_vectorizers:
            raise ValueError(f"Vectorizer {vectorizer_key} not found. Fit first.")

        vectorizer = self.tfidf_vectorizers[vectorizer_key]
        tfidf_matrix = vectorizer.transform(self._as_strings(texts))
        feature_names = vectorizer.get_feature_names_out()

        return tfidf_matrix, feature_names

    def get_top_tfidf_features(self, tfidf_matrix, feature_names, top_k=20):
        """Get the features with the highest mean TF-IDF score across all documents.

        Args:
            tfidf_matrix: Document-by-feature matrix (dense or sparse).
            feature_names: Feature names, indexed like the matrix columns.
            top_k: Number of features to return; ``<= 0`` yields an empty list.

        Returns:
            List of ``(feature_name, mean_score)`` tuples, best first.
        """
        # Calculate mean TF-IDF scores
        mean_scores = np.array(tfidf_matrix.mean(axis=0)).flatten()

        # Reverse first, then slice: slicing with [-top_k:] would return ALL
        # features for top_k == 0 because -0 == 0.
        top_indices = mean_scores.argsort()[::-1][:max(top_k, 0)]

        return [(feature_names[i], mean_scores[i]) for i in top_indices]

    def extract_ngrams(self, texts, n=2, top_k=50):
        """Extract the most frequent word n-grams across all texts.

        Args:
            texts: Iterable of texts; non-string entries are skipped.
            n: Size of the n-grams.
            top_k: Number of n-grams to return.

        Returns:
            List of ``(ngram_tuple, count)`` pairs, most frequent first.
        """
        from nltk.util import ngrams
        from nltk.tokenize import word_tokenize

        # Count text by text so the full list of every n-gram in the corpus never
        # has to be held in memory at once.
        ngram_counts = Counter()
        for text in texts:
            if not isinstance(text, str):
                continue
            ngram_counts.update(ngrams(word_tokenize(text.lower()), n))

        return ngram_counts.most_common(top_k)

# Initialize feature extractor (starts with no fitted vectorizers)
feature_extractor = FeatureExtractor()
print("Feature extractor initialized!")


In [None]:
# Apply basic feature extraction
# Defines text_data, original_text_data, the TF-IDF matrices/feature names and
# feature_data; these names only exist when the condition below holds.
if 'df' in locals() and not df.empty and text_columns:
    print("Extracting Basic Text Features")
    print("=" * 40)
    
    # Use processed text for feature extraction
    # text_data feeds TF-IDF / n-grams; original_text_data keeps the raw text
    if processed_column in df.columns:
        text_data = df[processed_column].fillna("").tolist()
        original_text_data = df[main_text_column].fillna("").tolist()
    else:
        text_data = df[main_text_column].fillna("").tolist()
        original_text_data = text_data.copy()
    
    # Extract basic statistical features from original text
    print("Extracting basic statistical features...")
    basic_features_df = feature_extractor.extract_basic_features(original_text_data)
    
    print(f"Basic Features Shape: {basic_features_df.shape}")
    print("\nBasic Features Summary:")
    print(basic_features_df.describe())
    
    # Fit and transform TF-IDF features
    # Each vectorizer is fitted and applied on the same full text_data (there is
    # no train/test split at this point).
    print(f"\nFitting TF-IDF vectorizers...")
    
    # Unigrams TF-IDF
    unigram_key = feature_extractor.fit_tfidf(text_data, ngram_range=(1, 1), max_features=1000)
    unigram_tfidf, unigram_features = feature_extractor.transform_tfidf(text_data, unigram_key)
    
    # Bigrams TF-IDF
    bigram_key = feature_extractor.fit_tfidf(text_data, ngram_range=(2, 2), max_features=500)
    bigram_tfidf, bigram_features = feature_extractor.transform_tfidf(text_data, bigram_key)
    
    # Trigrams TF-IDF  
    trigram_key = feature_extractor.fit_tfidf(text_data, ngram_range=(3, 3), max_features=200)
    trigram_tfidf, trigram_features = feature_extractor.transform_tfidf(text_data, trigram_key)
    
    print(f"Unigram TF-IDF Shape: {unigram_tfidf.shape}")
    print(f"Bigram TF-IDF Shape: {bigram_tfidf.shape}")
    print(f"Trigram TF-IDF Shape: {trigram_tfidf.shape}")
    
    # Get top features for each n-gram type
    top_unigrams = feature_extractor.get_top_tfidf_features(unigram_tfidf, unigram_features, top_k=15)
    top_bigrams = feature_extractor.get_top_tfidf_features(bigram_tfidf, bigram_features, top_k=15)
    top_trigrams = feature_extractor.get_top_tfidf_features(trigram_tfidf, trigram_features, top_k=15)
    
    print(f"\nTop 15 Unigrams (by average TF-IDF score):")
    for feature, score in top_unigrams:
        print(f"  {feature}: {score:.4f}")
    
    print(f"\nTop 15 Bigrams:")
    for feature, score in top_bigrams:
        print(f"  {feature}: {score:.4f}")
    
    print(f"\nTop 15 Trigrams:")
    for feature, score in top_trigrams:
        print(f"  {feature}: {score:.4f}")
    
    # Extract traditional n-grams using NLTK
    # These are raw frequency counts, unlike the TF-IDF rankings above
    print(f"\nExtracting traditional n-grams...")
    top_bigrams_nltk = feature_extractor.extract_ngrams(text_data, n=2, top_k=10)
    top_trigrams_nltk = feature_extractor.extract_ngrams(text_data, n=3, top_k=10)
    
    print(f"\nTop 10 Most Frequent Bigrams:")
    for ngram, count in top_bigrams_nltk:
        print(f"  {' '.join(ngram)}: {count}")
    
    print(f"\nTop 10 Most Frequent Trigrams:")
    for ngram, count in top_trigrams_nltk:
        print(f"  {' '.join(ngram)}: {count}")
    
    # Store the extracted features for later use
    feature_data = {
        'basic_features': basic_features_df,
        'unigram_tfidf': unigram_tfidf,
        'bigram_tfidf': bigram_tfidf,
        'trigram_tfidf': trigram_tfidf,
        'unigram_features': unigram_features,
        'bigram_features': bigram_features,
        'trigram_features': trigram_features
    }
    
    print(f"\nFeature extraction completed! Stored features for {len(text_data)} documents.")
else:
    print("No data available for feature extraction!")


## 5. Advanced Features (Sentiment Analysis, Named Entities, Readability)


In [None]:
from textblob import TextBlob
import nltk

# Download required NLTK data for sentiment analysis
nltk.download('vader_lexicon', quiet=True)
from nltk.sentiment.vader import SentimentIntensityAnalyzer

class AdvancedFeatureExtractor:
    """Extract sentiment, emotion-lexicon, readability and writing-style features.

    Every ``extract_*`` method takes an iterable of texts and returns a
    DataFrame with one row per text. Non-string entries are treated as
    missing/empty text.
    """

    # Small hand-written emotion lexicon (simplified version); all entries are
    # lower-case single words.
    EMOTION_WORDS = {
        'anger': ['angry', 'furious', 'rage', 'hate', 'mad', 'annoyed', 'irritated'],
        'fear': ['afraid', 'scared', 'terrified', 'anxious', 'worried', 'nervous'],
        'joy': ['happy', 'joyful', 'excited', 'delighted', 'pleased', 'glad'],
        'sadness': ['sad', 'depressed', 'upset', 'disappointed', 'miserable'],
        'surprise': ['surprised', 'amazed', 'shocked', 'astonished', 'stunned'],
        'trust': ['trust', 'confident', 'reliable', 'believe', 'faith'],
        'disgust': ['disgusting', 'repulsive', 'revolting', 'gross', 'awful']
    }

    # Row used for missing/empty text in extract_sentiment_features
    _NEUTRAL_SENTIMENT = {
        'textblob_polarity': 0.0,
        'textblob_subjectivity': 0.0,
        'vader_positive': 0.0,
        'vader_negative': 0.0,
        'vader_neutral': 0.0,
        'vader_compound': 0.0
    }

    # Row used for missing/empty text, or when textstat fails, in
    # extract_complexity_features
    _ZERO_COMPLEXITY = {
        'flesch_reading_ease': 0,
        'flesch_kincaid_grade': 0,
        'gunning_fog': 0,
        'coleman_liau_index': 0,
        'automated_readability_index': 0,
        'avg_syllables_per_word': 0,
        'difficult_words_ratio': 0
    }

    def __init__(self):
        """Create the VADER analyzer used by ``extract_sentiment_features``."""
        self.vader_analyzer = SentimentIntensityAnalyzer()

    def extract_sentiment_features(self, texts):
        """Extract TextBlob polarity/subjectivity and VADER scores per text.

        Args:
            texts: Iterable of texts. Non-string or empty entries get an
                all-zero row.

        Returns:
            DataFrame with the columns listed in ``_NEUTRAL_SENTIMENT``.
        """
        features = []

        for text in texts:
            # isinstance first: `pd.isna(text) or ...` raises on list/array input
            if not isinstance(text, str) or len(text) == 0:
                features.append(dict(self._NEUTRAL_SENTIMENT))
                continue

            # TextBlob sentiment
            blob_sentiment = TextBlob(text).sentiment

            # VADER sentiment
            vader_scores = self.vader_analyzer.polarity_scores(text)

            features.append({
                'textblob_polarity': blob_sentiment.polarity,
                'textblob_subjectivity': blob_sentiment.subjectivity,
                'vader_positive': vader_scores['pos'],
                'vader_negative': vader_scores['neg'],
                'vader_neutral': vader_scores['neu'],
                'vader_compound': vader_scores['compound']
            })

        return pd.DataFrame(features)

    def extract_emotional_features(self, texts):
        """Count, per emotion, how many lexicon words occur in each text.

        For every emotion in ``EMOTION_WORDS`` two columns are produced:
        ``<emotion>_count`` (number of distinct lexicon words present as whole
        words) and ``<emotion>_ratio`` (that count divided by the number of
        whitespace-separated words, at least 1).

        Args:
            texts: Iterable of texts; non-string entries are treated as "".

        Returns:
            DataFrame with one ``_count`` and one ``_ratio`` column per emotion.
        """
        features = []

        for text in texts:
            if not isinstance(text, str):
                text = ""

            text_lower = text.lower()
            word_count = max(len(text_lower.split()), 1)  # Avoid division by zero

            # Match whole words only. A plain substring test would count "made"
            # as "mad", "average" as "rage" or "ambassador" as "sad".
            present_words = set(re.findall(r"[a-z']+", text_lower))

            emotion_features = {}
            for emotion, emotion_word_list in self.EMOTION_WORDS.items():
                emotion_count = sum(1 for word in emotion_word_list if word in present_words)
                emotion_features[f'{emotion}_count'] = emotion_count
                emotion_features[f'{emotion}_ratio'] = emotion_count / word_count

            features.append(emotion_features)

        return pd.DataFrame(features)

    def extract_complexity_features(self, texts):
        """Extract readability scores computed with textstat.

        Args:
            texts: Iterable of texts. Non-string or empty entries get an
                all-zero row.

        Returns:
            DataFrame with the columns listed in ``_ZERO_COMPLEXITY``. If
            textstat raises for a text, that text also gets the all-zero row.
        """
        features = []

        for text in texts:
            if not isinstance(text, str) or len(text) == 0:
                features.append(dict(self._ZERO_COMPLEXITY))
                continue

            # Various readability metrics
            try:
                word_count = len(text.split())
                row = {
                    'flesch_reading_ease': textstat.flesch_reading_ease(text),
                    'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
                    'gunning_fog': textstat.gunning_fog(text),
                    'coleman_liau_index': textstat.coleman_liau_index(text),
                    'automated_readability_index': textstat.automated_readability_index(text),
                    # Syllable analysis: this must be the syllables-per-word
                    # metric, not avg_sentence_per_word.
                    'avg_syllables_per_word': textstat.avg_syllables_per_word(text),
                    'difficult_words_ratio': textstat.difficult_words(text) / max(word_count, 1)
                }
            except Exception:
                # Deliberate best effort: one text textstat cannot handle must not
                # abort the whole batch. `Exception` (not a bare except) lets
                # KeyboardInterrupt/SystemExit through.
                row = dict(self._ZERO_COMPLEXITY)

            features.append(row)

        return pd.DataFrame(features)

    def extract_style_features(self, texts):
        """Extract punctuation-density and sentence-length style features.

        Args:
            texts: Iterable of texts; non-string entries are treated as "".

        Returns:
            DataFrame with densities (per whitespace-separated word, at least
            1), the ellipsis count, and sentence-length mean/std/count, where
            sentences are the non-empty pieces between runs of ``.``, ``!``, ``?``.
        """
        features = []

        for text in texts:
            if not isinstance(text, str):
                text = ""

            words = text.split()
            safe_word_count = max(len(words), 1)

            # Count different types of punctuation and style elements
            exclamations = text.count('!')
            questions = text.count('?')
            quotes = text.count('"') + text.count("'")
            ellipsis = text.count('...')
            # Fully upper-case words of at least two characters
            caps_words = sum(1 for word in words if word.isupper() and len(word) > 1)

            # Sentence length variation
            sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]
            if sentences:
                sentence_lengths = [len(s.split()) for s in sentences]
                avg_sentence_length = np.mean(sentence_lengths)
                sentence_length_std = np.std(sentence_lengths)
            else:
                avg_sentence_length = sentence_length_std = 0

            features.append({
                'exclamation_density': exclamations / safe_word_count,
                'question_density': questions / safe_word_count,
                'quote_density': quotes / safe_word_count,
                'ellipsis_count': ellipsis,
                'caps_words_ratio': caps_words / safe_word_count,
                'avg_sentence_length': avg_sentence_length,
                'sentence_length_variation': sentence_length_std,
                'total_sentences': len(sentences)
            })

        return pd.DataFrame(features)

# Initialize advanced feature extractor (its constructor creates the VADER analyzer)
advanced_feature_extractor = AdvancedFeatureExtractor()
print("Advanced feature extractor initialized!")


In [None]:
# Apply advanced feature extraction.
# The guard checks for `original_text_data` because that is the variable this
# cell actually reads; checking an unrelated name could let the cell run into
# a NameError instead of printing the "no data" message.
if 'df' in locals() and not df.empty and 'original_text_data' in locals():
    print("Extracting Advanced Text Features")
    print("=" * 45)
    
    # Use a subset for demonstration (first 100 rows for performance).
    # Slicing is already a no-op when fewer than 100 texts are available.
    demo_texts = original_text_data[:100]
    
    print(f"Analyzing {len(demo_texts)} texts for advanced features...")
    
    # Extract sentiment features
    print("\nExtracting sentiment features...")
    sentiment_features = advanced_feature_extractor.extract_sentiment_features(demo_texts)
    print(f"Sentiment Features Shape: {sentiment_features.shape}")
    print("Sentiment Features Summary:")
    print(sentiment_features.describe())
    
    # Extract emotional features
    print("\nExtracting emotional features...")
    emotional_features = advanced_feature_extractor.extract_emotional_features(demo_texts)
    print(f"Emotional Features Shape: {emotional_features.shape}")
    print("Top Emotional Features:")
    emotion_summary = emotional_features.mean().sort_values(ascending=False)
    print(emotion_summary.head(10))
    
    # Extract complexity features
    print("\nExtracting complexity features...")
    complexity_features = advanced_feature_extractor.extract_complexity_features(demo_texts)
    print(f"Complexity Features Shape: {complexity_features.shape}")
    print("Complexity Features Summary:")
    print(complexity_features.describe())
    
    # Extract style features
    print("\nExtracting style features...")
    style_features = advanced_feature_extractor.extract_style_features(demo_texts)
    print(f"Style Features Shape: {style_features.shape}")
    print("Style Features Summary:")
    print(style_features.describe())
    
    # Combine all advanced features column-wise (one row per document)
    advanced_features_combined = pd.concat([
        sentiment_features,
        emotional_features,
        complexity_features,
        style_features
    ], axis=1)
    
    print(f"\nCombined Advanced Features Shape: {advanced_features_combined.shape}")
    print(f"Total advanced features extracted: {advanced_features_combined.shape[1]}")
    
    # Show some interesting insights
    print("\n" + "="*50)
    print("ADVANCED FEATURES INSIGHTS")
    print("="*50)
    
    # Sentiment insights: label the corpus by its mean polarity/subjectivity
    avg_polarity = sentiment_features['textblob_polarity'].mean()
    avg_subjectivity = sentiment_features['textblob_subjectivity'].mean()
    
    sentiment_label = "Positive" if avg_polarity > 0.1 else "Negative" if avg_polarity < -0.1 else "Neutral"
    subjectivity_label = "Subjective" if avg_subjectivity > 0.5 else "Objective"
    
    print(f"Average Sentiment: {sentiment_label} (polarity: {avg_polarity:.3f})")
    print(f"Average Subjectivity: {subjectivity_label} ({avg_subjectivity:.3f})")
    
    # Readability insights: bucket the mean Flesch score into a named level
    avg_flesch = complexity_features['flesch_reading_ease'].mean()
    reading_level = ("Very Easy" if avg_flesch >= 90 else
                    "Easy" if avg_flesch >= 80 else
                    "Fairly Easy" if avg_flesch >= 70 else
                    "Standard" if avg_flesch >= 60 else
                    "Fairly Difficult" if avg_flesch >= 50 else
                    "Difficult" if avg_flesch >= 30 else
                    "Very Difficult")
    
    print(f"Reading Level: {reading_level} (Flesch Score: {avg_flesch:.1f})")
    
    # Emotional insights: only the '*_ratio' columns are compared
    top_emotions = emotional_features[[col for col in emotional_features.columns if col.endswith('_ratio')]].mean().sort_values(ascending=False)
    if len(top_emotions) > 0:
        dominant_emotion = top_emotions.index[0].replace('_ratio', '').capitalize()
        emotion_score = top_emotions.iloc[0]
        print(f"Dominant Emotion: {dominant_emotion} (ratio: {emotion_score:.4f})")
    
    # Style insights: exclamation usage takes precedence over question usage
    avg_exclamation = style_features['exclamation_density'].mean()
    avg_question = style_features['question_density'].mean()
    
    if avg_exclamation > 0.01:
        print(f"Writing Style: Emphatic (high exclamation usage: {avg_exclamation:.4f})")
    elif avg_question > 0.01:
        print(f"Writing Style: Inquisitive (high question usage: {avg_question:.4f})")
    else:
        print("Writing Style: Formal/Neutral")
    
    # Store advanced features for later use (read by the visualization cell)
    advanced_feature_data = {
        'sentiment_features': sentiment_features,
        'emotional_features': emotional_features,
        'complexity_features': complexity_features,
        'style_features': style_features,
        'combined_features': advanced_features_combined
    }
    
    print("\nAdvanced feature extraction completed!")
    print(f"Total features per document: {advanced_features_combined.shape[1]}")
    
else:
    print("No data available for advanced feature extraction!")


## 6. Word Embeddings and Semantic Features


In [None]:
# Install additional packages for embeddings
!pip install gensim transformers torch sentence-transformers

import gensim.downloader as api
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

class EmbeddingExtractor:
    """Extract various types of word and sentence embeddings.

    Models are loaded lazily through the ``load_*`` methods; every feature
    method returns ``None`` (after printing a notice where applicable) when
    the model it needs has not been loaded.
    """
    
    def __init__(self):
        self.word2vec_model = None
        self.sentence_transformer = None
        self.embedding_models = {}
        
    def load_word2vec_model(self, model_name='word2vec-google-news-300'):
        """Load a pre-trained word-vector model via ``gensim.downloader``.

        Args:
            model_name: Name passed to ``api.load``.

        Returns:
            bool: True on success, False if loading raised (best-effort: the
            error is printed and the pipeline continues without the model).
        """
        try:
            print(f"Loading Word2Vec model: {model_name}")
            self.word2vec_model = api.load(model_name)
            print(f"Word2Vec model loaded! Vocabulary size: {len(self.word2vec_model.key_to_index)}")
            return True
        except Exception as e:
            print(f"Error loading Word2Vec model: {e}")
            print("Continuing without Word2Vec features...")
            return False
    
    def load_sentence_transformer(self, model_name='all-MiniLM-L6-v2'):
        """Load a sentence transformer model.

        Args:
            model_name: Name passed to ``SentenceTransformer``.

        Returns:
            bool: True on success, False if loading raised (best-effort: the
            error is printed and the pipeline continues without the model).
        """
        try:
            print(f"Loading Sentence Transformer: {model_name}")
            self.sentence_transformer = SentenceTransformer(model_name)
            print("Sentence Transformer loaded successfully!")
            return True
        except Exception as e:
            print(f"Error loading Sentence Transformer: {e}")
            print("Continuing without sentence transformer features...")
            return False
    
    def get_word2vec_features(self, texts, embedding_size=300):
        """Extract averaged word-vector features from texts.

        Each document vector is the mean of the vectors of its lower-cased,
        whitespace-separated words that exist in the model vocabulary.

        Args:
            texts: Iterable of documents; non-``str`` elements get a zero
                vector.
            embedding_size: Fallback dimensionality for zero vectors, used
                only when the loaded model exposes no ``vector_size``.

        Returns:
            np.ndarray of shape (n_texts, dim), or None if no model is loaded.
        """
        if self.word2vec_model is None:
            print("Word2Vec model not loaded. Skipping Word2Vec features.")
            return None
        
        # Zero vectors must match the model's real dimensionality; otherwise
        # mixing them with averaged vectors produces a ragged array whenever
        # the caller's embedding_size differs from the loaded model.
        vector_size = getattr(self.word2vec_model, 'vector_size', embedding_size)
        vocabulary = self.word2vec_model.key_to_index
        
        features = []
        
        for text in texts:
            # isinstance alone covers None/NaN and is safe for list-like
            # elements, for which pd.isna() would return an array.
            if not isinstance(text, str):
                features.append(np.zeros(vector_size))
                continue
            
            word_vectors = [
                self.word2vec_model[word]
                for word in text.lower().split()
                if word in vocabulary
            ]
            
            if word_vectors:
                # Average word vectors to get document vector
                doc_vector = np.mean(word_vectors, axis=0)
            else:
                # No words found in vocabulary
                doc_vector = np.zeros(vector_size)
            
            features.append(doc_vector)
        
        return np.array(features)
    
    def get_sentence_transformer_features(self, texts):
        """Extract sentence transformer embeddings.

        Args:
            texts: Iterable of documents; non-``str`` elements are encoded as
                the empty string.

        Returns:
            The result of ``SentenceTransformer.encode``, or None when no
            model is loaded or encoding raised (the error is printed).
        """
        if self.sentence_transformer is None:
            print("Sentence Transformer not loaded. Skipping sentence embeddings.")
            return None
        
        # Clean texts
        clean_texts = [text if isinstance(text, str) else "" for text in texts]
        
        try:
            embeddings = self.sentence_transformer.encode(clean_texts, show_progress_bar=True)
            return embeddings
        except Exception as e:
            print(f"Error generating sentence embeddings: {e}")
            return None
    
    def extract_semantic_similarity_features(self, texts, reference_texts=None):
        """Extract cosine-similarity statistics against reference texts.

        Args:
            texts: Documents to describe.
            reference_texts: Documents to compare against; defaults to the
                first (up to) 10 elements of ``texts``.

        Returns:
            pd.DataFrame with columns ``max_similarity``, ``mean_similarity``,
            ``min_similarity`` and ``similarity_std`` (one row per text), or
            None when the model is not loaded or encoding failed.
        """
        if self.sentence_transformer is None:
            return None
        
        # Use first few texts as reference if not provided
        if reference_texts is None:
            reference_texts = texts[:min(10, len(texts))]
        
        # Get embeddings
        text_embeddings = self.get_sentence_transformer_features(texts)
        ref_embeddings = self.get_sentence_transformer_features(reference_texts)
        
        if text_embeddings is None or ref_embeddings is None:
            return None
        
        # Rows: texts, columns: reference texts
        similarities = cosine_similarity(text_embeddings, ref_embeddings)
        
        # Summarise each row across all references
        features = {
            'max_similarity': np.max(similarities, axis=1),
            'mean_similarity': np.mean(similarities, axis=1),
            'min_similarity': np.min(similarities, axis=1),
            'similarity_std': np.std(similarities, axis=1)
        }
        
        return pd.DataFrame(features)
    
    def reduce_embeddings_pca(self, embeddings, n_components=50):
        """Reduce embedding dimensionality using PCA.

        Args:
            embeddings: 2-D array of shape (n_samples, n_features), or None.
            n_components: Requested number of components; capped at
                ``min(n_samples, n_features)``, the most PCA can produce.

        Returns:
            ``(reduced_embeddings, fitted_pca)``, or a bare None when
            ``embeddings`` is None (callers must check before unpacking).
        """
        if embeddings is None:
            return None
        
        # PCA rejects n_components above min(n_samples, n_features), so bound
        # the request by both axes rather than by the feature count alone.
        max_components = min(embeddings.shape[0], embeddings.shape[1])
        pca = PCA(n_components=min(n_components, max_components))
        reduced_embeddings = pca.fit_transform(embeddings)
        
        print(f"PCA: Reduced from {embeddings.shape[1]} to {reduced_embeddings.shape[1]} dimensions")
        print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.3f}")
        
        return reduced_embeddings, pca

# Create the EmbeddingExtractor instance used by the embedding cell below.
# No model is loaded at this point: the constructor only sets the model
# attributes to None and creates an empty `embedding_models` dict.
embedding_extractor = EmbeddingExtractor()
print("Embedding extractor initialized!")


In [None]:
# Apply word embedding extraction.
# The guard checks for `original_text_data` because that is the variable this
# cell actually reads.
if 'df' in locals() and not df.empty and 'original_text_data' in locals():
    print("Extracting Word Embeddings and Semantic Features")
    print("=" * 55)
    
    # Use a smaller subset for embeddings (computationally expensive).
    # Slicing is already a no-op when fewer than 50 texts are available.
    embedding_subset = original_text_data[:50]
    print(f"Processing {len(embedding_subset)} texts for embeddings...")
    
    # Try to load sentence transformer (more reliable than Word2Vec)
    print("\nLoading embedding models...")
    st_loaded = embedding_extractor.load_sentence_transformer('all-MiniLM-L6-v2')
    
    embedding_features = {}
    
    if st_loaded:
        print("\nExtracting sentence transformer embeddings...")
        sentence_embeddings = embedding_extractor.get_sentence_transformer_features(embedding_subset)
        
        if sentence_embeddings is not None:
            print(f"Sentence embeddings shape: {sentence_embeddings.shape}")
            
            # Reduce dimensionality for visualization and efficiency.
            # PCA cannot yield more components than there are samples, so the
            # request is bounded by the number of encoded documents.
            reduced_embeddings, pca = embedding_extractor.reduce_embeddings_pca(
                sentence_embeddings, n_components=min(20, len(sentence_embeddings))
            )
            
            print(f"Reduced embeddings shape: {reduced_embeddings.shape}")
            
            # Convert to DataFrame for easier handling
            embedding_df = pd.DataFrame(
                reduced_embeddings, 
                columns=[f'embedding_{i}' for i in range(reduced_embeddings.shape[1])]
            )
            
            embedding_features['sentence_embeddings'] = embedding_df
            
            # Extract semantic similarity features against the first 5 texts
            print("\nExtracting semantic similarity features...")
            similarity_features = embedding_extractor.extract_semantic_similarity_features(
                embedding_subset, reference_texts=embedding_subset[:5]
            )
            
            if similarity_features is not None:
                print(f"Similarity features shape: {similarity_features.shape}")
                print("Similarity features summary:")
                print(similarity_features.describe())
                embedding_features['similarity_features'] = similarity_features
    
    # Try to load Word2Vec (optional, may be slow/fail)
    print("\nAttempting to load Word2Vec model (this may take time or fail)...")
    try:
        # Use a smaller Word2Vec model for faster loading
        w2v_loaded = embedding_extractor.load_word2vec_model('glove-wiki-gigaword-50')
        
        if w2v_loaded:
            print("Extracting Word2Vec features...")
            w2v_embeddings = embedding_extractor.get_word2vec_features(embedding_subset, embedding_size=50)
            
            if w2v_embeddings is not None:
                print(f"Word2Vec embeddings shape: {w2v_embeddings.shape}")
                
                # Convert to DataFrame
                w2v_df = pd.DataFrame(
                    w2v_embeddings,
                    columns=[f'w2v_{i}' for i in range(w2v_embeddings.shape[1])]
                )
                
                embedding_features['word2vec_embeddings'] = w2v_df
    except Exception as e:
        # Best-effort: Word2Vec features are optional for the pipeline
        print(f"Word2Vec loading failed: {e}")
        print("Continuing without Word2Vec features...")
    
    # Summary of extracted embedding features
    print("\n" + "="*50)
    print("EMBEDDING FEATURES SUMMARY")
    print("="*50)
    
    total_embedding_features = 0
    for feature_type, features in embedding_features.items():
        feature_count = features.shape[1] if hasattr(features, 'shape') else 0
        total_embedding_features += feature_count
        print(f"{feature_type}: {feature_count} features")
    
    print(f"\nTotal embedding features: {total_embedding_features}")
    
    if embedding_features:
        print("\nEmbedding extraction completed successfully!")
        
        # Show some interesting analysis
        if 'sentence_embeddings' in embedding_features:
            print("\nSample embedding analysis:")
            
            # Pairwise similarities on the full-dimensional embeddings of the
            # first few documents
            sample_similarities = cosine_similarity(
                sentence_embeddings[:5], 
                sentence_embeddings[:5]
            )
            
            print("Pairwise similarities between first 5 documents:")
            # Iterate over the rows actually present; there may be fewer than
            # five documents, in which case a fixed range(5) would overrun.
            n_sample_docs = sample_similarities.shape[0]
            for i in range(n_sample_docs):
                for j in range(i+1, n_sample_docs):
                    sim_score = sample_similarities[i, j]
                    print(f"Doc {i} - Doc {j}: {sim_score:.3f}")
    else:
        print("No embedding features were successfully extracted.")
        
    # Store embedding features for later use (read by the visualization cell)
    if embedding_features:
        embedding_feature_data = embedding_features
        
else:
    print("No data available for embedding extraction!")


## 7. Data Visualization and Exploratory Analysis


In [None]:
# Install additional visualization packages
!pip install wordcloud plotly

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import matplotlib.pyplot as plt

class DataVisualizer:
    """Create comprehensive visualizations for text data analysis.

    Every ``plot_*`` / ``create_*`` method renders a matplotlib figure with
    ``plt.show()`` and returns None. Methods print a short notice and return
    early when there is nothing to plot.
    """
    
    def __init__(self, figsize=(12, 8)):
        self.figsize = figsize
        # Note: this changes matplotlib's global default figure size
        plt.rcParams['figure.figsize'] = figsize
    
    def plot_text_length_distribution(self, texts, title="Text Length Distribution"):
        """Plot distribution of text lengths (in characters).

        Args:
            texts: Iterable of documents; missing values count as length 0,
                other values are measured via ``len(str(text))``.
            title: Figure title.
        """
        lengths = [len(str(text)) if pd.notna(text) else 0 for text in texts]
        
        # np.min / np.max raise on an empty sequence, so bail out early
        if not lengths:
            print("No text data available for length distribution")
            return
        
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle(title, fontsize=16, fontweight='bold')
        
        # Histogram
        axes[0, 0].hist(lengths, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
        axes[0, 0].set_title('Text Length Histogram')
        axes[0, 0].set_xlabel('Character Count')
        axes[0, 0].set_ylabel('Frequency')
        
        # Box plot
        axes[0, 1].boxplot(lengths, vert=True)
        axes[0, 1].set_title('Text Length Box Plot')
        axes[0, 1].set_ylabel('Character Count')
        
        # Log scale histogram
        axes[1, 0].hist(lengths, bins=50, alpha=0.7, color='lightcoral', edgecolor='black', log=True)
        axes[1, 0].set_title('Text Length Histogram (Log Scale)')
        axes[1, 0].set_xlabel('Character Count')
        axes[1, 0].set_ylabel('Log Frequency')
        
        # Statistics text
        stats_text = f"""Statistics:
        Mean: {np.mean(lengths):.1f}
        Median: {np.median(lengths):.1f}
        Std: {np.std(lengths):.1f}
        Min: {np.min(lengths)}
        Max: {np.max(lengths)}
        """
        axes[1, 1].text(0.1, 0.5, stats_text, fontsize=12, transform=axes[1, 1].transAxes,
                        bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray", alpha=0.7))
        axes[1, 1].set_title('Length Statistics')
        axes[1, 1].axis('off')
        
        plt.tight_layout()
        plt.show()
    
    def create_wordcloud(self, texts, title="Word Cloud", max_words=200):
        """Create word cloud from texts.

        Args:
            texts: Iterable of documents; missing values are skipped.
            title: Figure title.
            max_words: Maximum number of words passed to ``WordCloud``.
        """
        # Combine all texts
        all_text = ' '.join([str(text) for text in texts if pd.notna(text)])
        
        if len(all_text.strip()) == 0:
            print("No valid text data for word cloud")
            return
        
        # Create word cloud
        wordcloud = WordCloud(
            width=800, height=400,
            max_words=max_words,
            background_color='white',
            colormap='viridis',
            collocations=False
        ).generate(all_text)
        
        plt.figure(figsize=(15, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(title, fontsize=18, fontweight='bold', pad=20)
        plt.axis('off')
        plt.tight_layout(pad=0)
        plt.show()
    
    def plot_feature_correlation_matrix(self, features_df, title="Feature Correlation Matrix"):
        """Plot the lower-triangle correlation heatmap of numeric features.

        Args:
            features_df: DataFrame of features; non-numeric columns are
                ignored.
            title: Figure title.
        """
        if features_df.empty:
            print("No features available for correlation analysis")
            return
        
        # Restrict to numeric columns: DataFrame.corr() raises on non-numeric
        # data in recent pandas versions instead of silently dropping it.
        numeric_df = features_df.select_dtypes(include=[np.number])
        if numeric_df.shape[1] == 0:
            print("No numeric features available for correlation analysis")
            return
        
        # Calculate correlation matrix
        corr_matrix = numeric_df.corr()
        
        # Create heatmap; the mask hides the redundant upper triangle
        plt.figure(figsize=(12, 10))
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
                   square=True, linewidths=0.5, cbar_kws={"shrink": .5}, fmt='.2f')
        plt.title(title, fontsize=16, fontweight='bold', pad=20)
        plt.tight_layout()
        plt.show()
    
    def plot_feature_distributions(self, features_df, title="Feature Distributions", max_features=12):
        """Plot histograms of numerical features on a 4-column grid.

        Args:
            features_df: DataFrame of features; only numeric columns are used.
            title: Figure title.
            max_features: Maximum number of numeric columns to plot.
        """
        if features_df.empty:
            print("No features available for distribution analysis")
            return
        
        # Select numerical columns
        numeric_cols = features_df.select_dtypes(include=[np.number]).columns[:max_features]
        
        n_features = len(numeric_cols)
        # With nothing to plot the grid would have zero rows, which
        # plt.subplots rejects.
        if n_features == 0:
            print("No numeric features available for distribution analysis")
            return
        
        n_cols = 4
        n_rows = (n_features + n_cols - 1) // n_cols
        
        # squeeze=False keeps `axes` two-dimensional even for a single row,
        # so the grid can be flattened uniformly (row-major order).
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), squeeze=False)
        fig.suptitle(title, fontsize=16, fontweight='bold')
        flat_axes = axes.ravel()
        
        for ax, col in zip(flat_axes, numeric_cols):
            features_df[col].hist(bins=30, alpha=0.7, ax=ax, color='skyblue', edgecolor='black')
            ax.set_title(f'{col}', fontsize=12, fontweight='bold')
            ax.set_xlabel('Value')
            ax.set_ylabel('Frequency')
        
        # Hide unused subplots
        for ax in flat_axes[n_features:]:
            ax.axis('off')
        
        plt.tight_layout()
        plt.show()
    
    def plot_sentiment_analysis(self, sentiment_features, title="Sentiment Analysis"):
        """Plot sentiment analysis results.

        Args:
            sentiment_features: DataFrame with the columns
                ``textblob_polarity``, ``textblob_subjectivity``,
                ``vader_compound``, ``vader_positive``, ``vader_negative``
                and ``vader_neutral``; None or empty skips plotting.
            title: Figure title.
        """
        if sentiment_features is None or sentiment_features.empty:
            print("No sentiment features available for plotting")
            return
        
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle(title, fontsize=16, fontweight='bold')
        
        # TextBlob Polarity
        axes[0, 0].hist(sentiment_features['textblob_polarity'], bins=30, alpha=0.7, color='green', edgecolor='black')
        axes[0, 0].set_title('TextBlob Polarity')
        axes[0, 0].set_xlabel('Polarity (-1: Negative, +1: Positive)')
        axes[0, 0].set_ylabel('Frequency')
        axes[0, 0].axvline(x=0, color='red', linestyle='--', alpha=0.7)
        
        # TextBlob Subjectivity
        axes[0, 1].hist(sentiment_features['textblob_subjectivity'], bins=30, alpha=0.7, color='blue', edgecolor='black')
        axes[0, 1].set_title('TextBlob Subjectivity')
        axes[0, 1].set_xlabel('Subjectivity (0: Objective, 1: Subjective)')
        axes[0, 1].set_ylabel('Frequency')
        
        # VADER Compound
        axes[0, 2].hist(sentiment_features['vader_compound'], bins=30, alpha=0.7, color='purple', edgecolor='black')
        axes[0, 2].set_title('VADER Compound Score')
        axes[0, 2].set_xlabel('Compound Score (-1: Negative, +1: Positive)')
        axes[0, 2].set_ylabel('Frequency')
        axes[0, 2].axvline(x=0, color='red', linestyle='--', alpha=0.7)
        
        # VADER Components
        vader_cols = ['vader_positive', 'vader_negative', 'vader_neutral']
        vader_means = [sentiment_features[col].mean() for col in vader_cols]
        
        axes[1, 0].bar(['Positive', 'Negative', 'Neutral'], vader_means, 
                      color=['green', 'red', 'gray'], alpha=0.7, edgecolor='black')
        axes[1, 0].set_title('Average VADER Sentiment Components')
        axes[1, 0].set_ylabel('Average Score')
        
        # Sentiment Scatter: Polarity vs Subjectivity
        axes[1, 1].scatter(sentiment_features['textblob_polarity'], 
                          sentiment_features['textblob_subjectivity'], 
                          alpha=0.6, color='orange')
        axes[1, 1].set_title('Polarity vs Subjectivity')
        axes[1, 1].set_xlabel('Polarity')
        axes[1, 1].set_ylabel('Subjectivity')
        axes[1, 1].grid(True, alpha=0.3)
        
        # Sentiment Summary
        summary_text = f"""Sentiment Summary:
        
        TextBlob:
        • Avg Polarity: {sentiment_features['textblob_polarity'].mean():.3f}
        • Avg Subjectivity: {sentiment_features['textblob_subjectivity'].mean():.3f}
        
        VADER:
        • Avg Compound: {sentiment_features['vader_compound'].mean():.3f}
        • Avg Positive: {sentiment_features['vader_positive'].mean():.3f}
        • Avg Negative: {sentiment_features['vader_negative'].mean():.3f}
        """
        
        axes[1, 2].text(0.05, 0.95, summary_text, fontsize=10, transform=axes[1, 2].transAxes,
                       verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", 
                       facecolor="lightblue", alpha=0.7))
        axes[1, 2].set_title('Summary Statistics')
        axes[1, 2].axis('off')
        
        plt.tight_layout()
        plt.show()
    
    def plot_label_distribution(self, labels, title="Label Distribution"):
        """Plot distribution of labels/categories as a bar and a pie chart.

        Args:
            labels: Sequence of label values; skipped when every value is
                missing.
            title: Figure title.
        """
        if pd.isna(labels).all():
            print("No label data available for plotting")
            return
        
        label_counts = pd.Series(labels).value_counts()
        
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))
        fig.suptitle(title, fontsize=16, fontweight='bold')
        
        # Bar plot
        label_counts.plot(kind='bar', ax=axes[0], color='lightblue', edgecolor='black')
        axes[0].set_title('Label Counts')
        axes[0].set_ylabel('Count')
        axes[0].tick_params(axis='x', rotation=45)
        
        # Pie chart
        axes[1].pie(label_counts.values, labels=label_counts.index, autopct='%1.1f%%', startangle=90)
        axes[1].set_title('Label Proportions')
        
        plt.tight_layout()
        plt.show()

# Create the DataVisualizer instance used by the visualization cell below.
# It is built with the default figsize=(12, 8), which the constructor also
# writes into plt.rcParams['figure.figsize'].
visualizer = DataVisualizer()
print("Data visualizer initialized!")


In [None]:
# Create comprehensive data visualizations.
# Every optional input produced by earlier cells is checked for existence
# before use, so the cell still runs when a stage was skipped.
if 'df' in locals() and not df.empty and 'original_text_data' in locals():
    print("Creating Comprehensive Data Visualizations")
    print("=" * 50)
    
    # 1. Text Length Distribution
    print("\n1. Plotting text length distribution...")
    visualizer.plot_text_length_distribution(original_text_data, "Original Text Length Distribution")
    
    # 2. Word Cloud for original text
    print("\n2. Creating word cloud from original text...")
    visualizer.create_wordcloud(original_text_data[:200], "Word Cloud - Original Text")  # Use subset for performance
    
    # 3. Word Cloud for processed text
    if 'text_data' in locals():
        print("\n3. Creating word cloud from processed text...")
        visualizer.create_wordcloud(text_data[:200], "Word Cloud - Processed Text")
    
    # 4. Label Distribution (if available). `label_columns` gets the same
    # existence check as the other optional variables to avoid a NameError.
    if 'label_columns' in locals() and label_columns:
        for label_col in label_columns[:2]:  # Show first 2 label columns
            print(f"\n4. Plotting distribution for label: {label_col}")
            visualizer.plot_label_distribution(df[label_col], f"Distribution of {label_col}")
    
    # 5. Basic Features Visualization
    if 'basic_features_df' in locals():
        print("\n5. Plotting basic feature distributions...")
        visualizer.plot_feature_distributions(basic_features_df, "Basic Text Features Distribution")
        
        print("\n6. Creating correlation matrix for basic features...")
        visualizer.plot_feature_correlation_matrix(basic_features_df, "Basic Features Correlation Matrix")
    
    # 6. Advanced Features Visualization
    if 'advanced_feature_data' in locals():
        
        # Sentiment Analysis
        if 'sentiment_features' in advanced_feature_data:
            print("\n7. Creating sentiment analysis visualizations...")
            visualizer.plot_sentiment_analysis(advanced_feature_data['sentiment_features'], 
                                              "Comprehensive Sentiment Analysis")
        
        # Combined advanced features
        if 'combined_features' in advanced_feature_data:
            print("\n8. Plotting advanced feature distributions...")
            combined_features = advanced_feature_data['combined_features']
            
            # Select a subset of interesting features for visualization
            interesting_features = []
            for col in combined_features.columns:
                if any(keyword in col.lower() for keyword in ['polarity', 'compound', 'flesch', 'subjectivity', 'complexity']):
                    interesting_features.append(col)
            
            if interesting_features:
                subset_features = combined_features[interesting_features[:12]]
                visualizer.plot_feature_distributions(subset_features, 
                                                    "Key Advanced Features Distribution")
    
    # 7. Feature Summary Dashboard
    print("\n" + "="*60)
    print("FEATURE EXTRACTION SUMMARY DASHBOARD")
    print("="*60)
    
    # Maps category name -> number of feature columns in that category
    feature_summary = {}
    
    # Basic features
    if 'basic_features_df' in locals():
        feature_summary['Basic Text Features'] = basic_features_df.shape[1]
    
    # TF-IDF features: sum the widths of every entry whose key mentions 'tfidf'
    if 'feature_data' in locals():
        tfidf_features = 0
        for key in feature_data.keys():
            if 'tfidf' in key:
                tfidf_features += feature_data[key].shape[1]
        feature_summary['TF-IDF Features'] = tfidf_features
    
    # Advanced features
    if 'advanced_feature_data' in locals():
        if 'combined_features' in advanced_feature_data:
            feature_summary['Advanced Features'] = advanced_feature_data['combined_features'].shape[1]
    
    # Embedding features. The counter has its own name so that it does not
    # overwrite the `embedding_features` dict created by the embedding cell.
    if 'embedding_feature_data' in locals():
        embedding_feature_count = 0
        for feature_type, features in embedding_feature_data.items():
            if hasattr(features, 'shape'):
                embedding_feature_count += features.shape[1]
        feature_summary['Embedding Features'] = embedding_feature_count
    
    # Create summary visualization
    if feature_summary:
        plt.figure(figsize=(12, 8))
        categories = list(feature_summary.keys())
        counts = list(feature_summary.values())
        
        colors = plt.cm.Set3(np.linspace(0, 1, len(categories)))
        bars = plt.bar(categories, counts, color=colors, edgecolor='black', alpha=0.8)
        
        # Add value labels on bars, lifted 1% of the tallest bar above each top
        for bar, count in zip(bars, counts):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(counts)*0.01,
                    str(count), ha='center', va='bottom', fontweight='bold', fontsize=12)
        
        plt.title('NLP Pipeline Feature Extraction Summary', fontsize=16, fontweight='bold', pad=20)
        plt.xlabel('Feature Categories', fontsize=14)
        plt.ylabel('Number of Features', fontsize=14)
        plt.xticks(rotation=45, ha='right')
        plt.grid(axis='y', alpha=0.3)
        
        total_features = sum(counts)
        plt.text(0.02, 0.98, f'Total Features Extracted: {total_features}', 
                transform=plt.gca().transAxes, fontsize=14, fontweight='bold',
                bbox=dict(boxstyle="round,pad=0.3", facecolor="lightyellow", alpha=0.8))
        
        plt.tight_layout()
        plt.show()
        
        print(f"\nTotal features extracted across all categories: {total_features}")
        
        # Feature density analysis. `df` is known to exist and be non-empty
        # here because the outer guard already verified both.
        print(f"Features per document: {total_features}")
        print(f"Documents processed: {len(df)}")
        print(f"Feature density: {total_features / len(df):.2f} features per document")
    
    print("\n" + "="*50)
    print("VISUALIZATION COMPLETE!")
    print("="*50)
    print("All visualizations have been generated successfully.")
    print("The NLP pipeline has extracted and visualized comprehensive")
    print("text features for machine learning applications.")
    
else:
    print("No data available for visualization!")


## 8. Complete NLP Pipeline Class
