In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Text processing and NLP libraries
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
from wordcloud import WordCloud
from textblob import TextBlob
import textstat

# Plot styling: reset matplotlib to its stock style, then apply seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# Pandas display limits: show every column, cap printed rows at 100
for option_name, option_value in (('display.max_columns', None), ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully!")

# Read the combined dataset from disk into `df`
df = pd.read_csv("../data/combined_final.csv")

**Missing-Value Summary and Visualization**

In [None]:
# Per-column missing-value summary: absolute count and share of all rows

missing_values = df.isna().sum()
missing_percentage = missing_values / len(df) * 100

missing_df = pd.DataFrame(
    {
        'Column': missing_values.index,
        'Missing_Count': missing_values.values,
        'Missing_Percentage': missing_percentage.values,
    }
)
# Worst-affected columns first
missing_df = missing_df.sort_values(by='Missing_Count', ascending=False)

print(missing_df)

In [None]:
# Two-panel figure: where values are missing (left) and how much per column (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
heatmap_ax, bar_ax = axes

# Left panel: one cell per (row, column), coloured by whether the value is null
sns.heatmap(df.isnull(), ax=heatmap_ax, cmap='viridis', cbar=True, yticklabels=False)
heatmap_ax.set_title('Missing Values Heatmap', fontsize=14, fontweight='bold')

# Right panel: percentage of missing values for each column
missing_df.plot(kind='bar', x='Column', y='Missing_Percentage', color='coral', ax=bar_ax)
bar_ax.set_title('Missing Values Percentage by Column', fontsize=14, fontweight='bold')
bar_ax.set_ylabel('Missing Percentage (%)')
bar_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

**Computing Text-Length Features and Visualizing Their Distributions**

In [None]:
# Calculate text lengths and create new features.
# Missing values are replaced with '' before the string conversion: a bare
# astype(str) turns NaN into the literal "nan", which would be reported as
# 3 characters / 1 word for rows that actually have no title or text.
title_as_text = df['title'].fillna('').astype(str)
body_as_text = df['text'].fillna('').astype(str)

df['title_length'] = title_as_text.str.len()
df['text_length'] = body_as_text.str.len()
df['title_word_count'] = title_as_text.str.split().str.len()
df['text_word_count'] = body_as_text.str.split().str.len()

# One histogram per feature: (column, bins, colour, panel title, x-axis label)
length_panels = (
    ('title_length', 50, 'skyblue', 'Distribution of Title Length (Characters)', 'Title Length'),
    ('text_length', 50, 'lightgreen', 'Distribution of Text Length (Characters)', 'Text Length'),
    ('title_word_count', 30, 'coral', 'Distribution of Title Word Count', 'Word Count'),
    ('text_word_count', 50, 'plum', 'Distribution of Text Word Count', 'Word Count'),
)

# Create visualizations (axes.flat walks the 2x2 grid row by row)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
for ax, (column, bins, color, panel_title, x_label) in zip(axes.flat, length_panels):
    ax.hist(df[column], bins=bins, alpha=0.7, color=color, edgecolor='black')
    ax.set_title(panel_title, fontweight='bold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

**Generating Word Cloud for fake news and real news**

In [None]:
# Generating Word Cloud
from wordcloud import WordCloud

# Build one corpus per class (label 0 = fake, label 1 = real).
# Missing texts are dropped instead of being cast to the string "nan", which
# would otherwise be counted as a word in the clouds.
fake_text = " ".join(df.loc[df["label"] == 0, "text"].dropna().astype(str))
real_text = " ".join(df.loc[df["label"] == 1, "text"].dropna().astype(str))

wc_fake = WordCloud(width=600, height=400, background_color="black").generate(fake_text)
wc_real = WordCloud(width=600, height=400, background_color="white").generate(real_text)

# Side-by-side panels: fake on the left, real on the right
plt.figure(figsize=(12, 6))
cloud_panels = ((wc_fake, "Fake News Word Cloud"), (wc_real, "Real News Word Cloud"))
for position, (cloud, panel_title) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(panel_title, fontsize=14)

plt.show()

**Average Length Statistics by Label, with Visualizations**

In [None]:
# Mean character length of text and title for each label value
stats = df.groupby("label")[["text_length", "title_length"]].mean()
print("\nAverage Length Stats by Label:\n", stats)

# One row of three panels, addressed through explicit Axes objects
fig, (length_ax, corr_ax, count_ax) = plt.subplots(1, 3, figsize=(18, 5))

# Panel 1: average text length per label
sns.barplot(x=stats.index, y=stats["text_length"], hue=stats.index,
            palette="Set2", legend=False, ax=length_ax)
length_ax.set_title("Average Text Length per Label")
length_ax.set_xticks([0, 1])
length_ax.set_xticklabels(["Fake", "Real"])
length_ax.set_ylabel("Avg Text Length")

# Panel 2: pairwise correlation of the two length features and the label
corr_data = df[["text_length", "title_length", "label"]].corr()
sns.heatmap(corr_data, annot=True, cmap="coolwarm", fmt=".2f", ax=corr_ax)
corr_ax.set_title("Correlation Heatmap")

# Panel 3: number of rows per label
sns.countplot(x='label', data=df, hue='label', palette='coolwarm', legend=False, ax=count_ax)
count_ax.set_title("Fake (0) vs Real (1) Distribution")
count_ax.set_xlabel("Label")
count_ax.set_ylabel("Count")
count_ax.set_xticks([0, 1])
count_ax.set_xticklabels(["Fake (0)", "Real (1)"])

# Tidy spacing and render the three panels side by side
plt.tight_layout()
plt.show()

**Comparing Text Length Distributions by Label**

In [None]:
# Box plots and per-label means of the four length features (skipped without a label column)
if 'label' in df.columns:
    print("COMPARING TEXT LENGTHS BY NEWS TYPE")
    print("=" * 60)

    # (column, panel title, y-axis label) for each cell of the 2x2 grid, row by row
    boxplot_panels = (
        ('title_length', 'Title Length Distribution by News Type', 'Title Length (Characters)'),
        ('text_length', 'Text Length Distribution by News Type', 'Text Length (Characters)'),
        ('title_word_count', 'Title Word Count by News Type', 'Word Count'),
        ('text_word_count', 'Text Word Count by News Type', 'Word Count'),
    )

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    for ax, (column, panel_title, y_label) in zip(axes.flat, boxplot_panels):
        df.boxplot(column=column, by='label', ax=ax)
        ax.set_title(panel_title)
        ax.set_xlabel('News Type')
        ax.set_ylabel(y_label)

    plt.suptitle('')  # pandas adds a "grouped by" super-title; blank it out
    plt.tight_layout()
    plt.show()

    # Per-label means, printed as "<caption>: <mean> <unit>"
    print("\nSTATISTICAL COMPARISON BY LABEL:")
    print("-" * 40)
    mean_report = (
        ('Average title length', 'title_length', 'characters'),
        ('Average text length', 'text_length', 'characters'),
        ('Average title words', 'title_word_count', 'words'),
        ('Average text words', 'text_word_count', 'words'),
    )
    for label in df['label'].unique():
        print(f"\n{label} News:")
        subset = df[df['label'] == label]
        for caption, column, unit in mean_report:
            print(f"{caption}: {subset[column].mean():.1f} {unit}")

**Label Distribution and Class balance**

In [None]:
# Analyze label distribution: counts, percentages, three charts and a balance check.
# When the dataset has no 'label' column, only the available columns are listed.
if 'label' in df.columns:
    print("LABEL DISTRIBUTION ANALYSIS")
    print("=" * 60)

    # Count and percentage distribution (both sorted by descending frequency)
    label_counts = df['label'].value_counts()
    label_percentages = df['label'].value_counts(normalize=True) * 100

    print("Label Distribution:")
    print("-" * 30)
    for label in label_counts.index:
        print(f"{label}: {label_counts[label]:,} ({label_percentages[label]:.2f}%)")

    # Create visualizations
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    # Pie chart (title previously read 'News Type Distributio')
    axes[0].pie(label_counts.values, labels=label_counts.index, autopct='%1.1f%%',
                startangle=90)
    axes[0].set_title('News Type Distribution', fontweight='bold')

    # Bar chart of absolute counts
    label_counts.plot(kind='bar', ax=axes[1], color=['coral', 'skyblue'])
    axes[1].set_title('News Type Distribution', fontweight='bold')
    axes[1].set_ylabel('Count')
    axes[1].tick_params(axis='x', rotation=45)

    # Horizontal bar chart with percentages
    bar_positions = range(len(label_percentages))
    axes[2].barh(bar_positions, label_percentages.values,
                 color=['lightgreen', 'lightcoral'])
    axes[2].set_yticks(bar_positions)
    axes[2].set_yticklabels(label_percentages.index)
    axes[2].set_xlabel('Percentage (%)')
    axes[2].set_title('News Type Distribution', fontweight='bold')

    # Write each percentage just past the end of its bar
    for i, v in enumerate(label_percentages.values):
        axes[2].text(v + 0.5, i, f'{v:.1f}%', va='center')

    plt.tight_layout()
    plt.show()

    # Class balance analysis: ratio of the largest to the smallest class
    print(f"\nCLASS BALANCE ANALYSIS:")
    print("-" * 30)
    minority_class = label_counts.min()
    majority_class = label_counts.max()
    imbalance_ratio = majority_class / minority_class

    print(f"Minority class size: {minority_class:,}")
    print(f"Majority class size: {majority_class:,}")
    print(f"Imbalance ratio: {imbalance_ratio:.2f}:1")

    # 1.5:1 is the cut-off used here for flagging imbalance
    if imbalance_ratio > 1.5:
        print("Dataset shows class imbalance - consider sampling techniques")
    else:
        print("Dataset is relatively balanced")

else:
    print("No 'label' column found in the dataset")
    print("Available columns:", df.columns.tolist())

**Word Frequency Analysis**

In [None]:
# Prepare text for analysis
def preprocess_text_basic(text):
    """Basic text preprocessing for word frequency analysis.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, and the remaining tokens are filtered to those that
    are longer than two characters and are not stopwords.

    Args:
        text: Raw text. Non-string values are converted with ``str``.

    Returns:
        str: The surviving tokens joined by single spaces, or ``""`` when
        ``text`` is missing according to ``pd.isna``.
    """
    if pd.isna(text):
        return ""

    # Convert to lowercase and remove special characters
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize and pick the stopword list
    try:
        stop_words = set(stopwords.words('english'))
        words = word_tokenize(text)
    except Exception:
        # Best-effort fallback when the NLTK resources cannot be used: split on
        # whitespace and use a small built-in stopword list. `Exception` (not a
        # bare `except:`) so that KeyboardInterrupt still stops a long `.apply`.
        words = text.split()
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those'}

    return ' '.join(word for word in words if word not in stop_words and len(word) > 2)

print("WORD FREQUENCY ANALYSIS")
print("=" * 60)

# Clean the title and body columns with preprocess_text_basic
print("Preprocessing text data...")
for raw_column, clean_column in (('title', 'title_clean'), ('text', 'text_clean')):
    df[clean_column] = df[raw_column].apply(preprocess_text_basic)

# One long string: every row contributes its cleaned body followed by its cleaned title
per_row_text = df['text_clean'].fillna('') + ' ' + df['title_clean'].fillna('')
all_text = ' '.join(per_row_text)
all_words = all_text.split()

# Token counts over the whole corpus, and the 20 most frequent tokens
word_freq = Counter(all_words)
most_common_words = word_freq.most_common(20)

print(f"Total unique words: {len(word_freq):,}")
print(f"Total words: {len(all_words):,}")

# Left: ranked bar chart. Right: word cloud of the same corpus.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, cloud_ax = axes

words, counts = zip(*most_common_words)
bar_positions = range(len(words))
bar_ax.barh(bar_positions, counts, color='lightblue')
bar_ax.set_yticks(bar_positions)
bar_ax.set_yticklabels(words)
bar_ax.set_xlabel('Frequency')
bar_ax.set_title('Top 20 Most Common Words', fontweight='bold')
bar_ax.invert_yaxis()  # most frequent word at the top

# The cloud is only attempted for a non-empty corpus; a failure is shown in the panel
if all_text:
    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white',
                             max_words=100, colormap='viridis').generate(all_text)
        cloud_ax.imshow(wordcloud, interpolation='bilinear')
        cloud_ax.axis('off')
        cloud_ax.set_title('Word Cloud - All News Articles', fontweight='bold')
    except Exception as e:
        cloud_ax.text(0.5, 0.5, f'Word cloud generation failed:\n{str(e)}',
                    ha='center', va='center', transform=cloud_ax.transAxes)
        cloud_ax.set_title('Word Cloud - Generation Failed', fontweight='bold')

plt.tight_layout()
plt.show()

# Same ranking as the bar chart, as text
print("\nTOP 20 MOST COMMON WORDS:")
print("-" * 30)
for i, (word, count) in enumerate(most_common_words, start=1):
    print(f"{i:2d}. {word:15} : {count:,}")

## N-gram Analysis (Unigrams, Bigrams, Trigrams)

In [None]:
# N-gram analysis functions
from collections import defaultdict

def get_ngrams(text, n):
    """Return every run of ``n`` consecutive whitespace-separated tokens in ``text``.

    Each n-gram is a single space-joined string, listed in order of appearance.
    An empty list is returned when ``text`` has fewer than ``n`` tokens.
    """
    tokens = text.split()
    window_count = len(tokens) - n + 1
    if window_count <= 0:
        return []

    grams = []
    for start in range(window_count):
        grams.append(' '.join(tokens[start:start + n]))
    return grams

def analyze_ngrams(text_series, n, top_k=15):
    """Count n-grams across a text series and return the ``top_k`` most frequent.

    Missing entries are treated as empty strings. The result is a list of
    ``(ngram, count)`` pairs as produced by ``Counter.most_common``.
    """
    ngram_freq = Counter()
    for document in text_series.fillna(''):
        # Tally this document's n-grams directly into the running counter
        ngram_freq.update(get_ngrams(document, n))
    return ngram_freq.most_common(top_k)

print("N-GRAM ANALYSIS")
print("=" * 60)

# Combine the cleaned body and title of every row into one string per row.
# The cleaned columns are named 'text_clean' / 'title_clean'; the previously
# referenced '*_processed' columns are never created, which raised a KeyError.
all_processed_text = df['text_clean'].fillna('') + ' ' + df['title_clean'].fillna('')

# Analyze unigrams, bigrams, and trigrams (top 20 / 15 / 10 respectively)
unigrams = analyze_ngrams(all_processed_text, 1, 20)
bigrams = analyze_ngrams(all_processed_text, 2, 15)
trigrams = analyze_ngrams(all_processed_text, 3, 10)

# One horizontal bar chart per n-gram order, stacked vertically
fig, axes = plt.subplots(3, 1, figsize=(14, 16))

# (ranked pairs, bar colour, panel title) for each row of the figure
ngram_panels = (
    (unigrams, 'lightblue', 'Top 20 Unigrams (Single Words)'),
    (bigrams, 'lightgreen', 'Top 15 Bigrams (Two-word Phrases)'),
    (trigrams, 'lightcoral', 'Top 10 Trigrams (Three-word Phrases)'),
)
for ax, (ranked, bar_color, panel_title) in zip(axes, ngram_panels):
    if not ranked:
        # Nothing to draw for this order; leave the panel empty
        continue
    words, counts = zip(*ranked)
    bar_positions = range(len(words))
    ax.barh(bar_positions, counts, color=bar_color)
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(words)
    ax.set_xlabel('Frequency')
    ax.set_title(panel_title, fontweight='bold', fontsize=14)
    ax.invert_yaxis()  # most frequent entry at the top

plt.tight_layout()
plt.show()

# Text report: (heading, entries to list, column width used to pad the n-gram)
report_sections = (
    ("TOP UNIGRAMS:", unigrams[:15], 20),
    (f"\nTOP BIGRAMS:", bigrams[:10], 30),
    (f"\nTOP TRIGRAMS:", trigrams[:10], 40),
)
for heading, ranked, width in report_sections:
    print(heading)
    print("-" * 30)
    for i, (gram, count) in enumerate(ranked, 1):
        print(f"{i:2d}. {gram:{width}} : {count:,}")

## Sentiment Analysis Distribution

In [None]:
# Sentiment analysis using TextBlob
def get_sentiment_score(text):
    """Calculate sentiment polarity using TextBlob.

    Args:
        text: Raw text. Non-string values are converted with ``str``.

    Returns:
        The ``sentiment.polarity`` of ``TextBlob(str(text))``, or ``0`` when
        ``text`` is missing, empty, or TextBlob raises an error.
    """
    if pd.isna(text) or text == "":
        return 0
    try:
        return TextBlob(str(text)).sentiment.polarity
    except Exception:
        # Best-effort: an article that cannot be scored counts as 0.
        # `Exception` (not a bare `except:`) so KeyboardInterrupt still
        # stops a long-running `.apply`.
        return 0

def get_subjectivity_score(text):
    """Calculate subjectivity using TextBlob.

    Args:
        text: Raw text. Non-string values are converted with ``str``.

    Returns:
        The ``sentiment.subjectivity`` of ``TextBlob(str(text))``, or ``0``
        when ``text`` is missing, empty, or TextBlob raises an error.
    """
    if pd.isna(text) or text == "":
        return 0
    try:
        return TextBlob(str(text)).sentiment.subjectivity
    except Exception:
        # Best-effort: an article that cannot be scored counts as 0.
        # `Exception` (not a bare `except:`) so KeyboardInterrupt still
        # stops a long-running `.apply`.
        return 0

def categorize_sentiment(score):
    """Bucket a polarity score: above 0.1 is 'Positive', below -0.1 is 'Negative', else 'Neutral'."""
    if score > 0.1:
        return 'Positive'
    return 'Negative' if score < -0.1 else 'Neutral'

print("SENTIMENT ANALYSIS")
print("=" * 60)

# Score the raw title and body: polarity columns first, then subjectivity columns
print("Calculating sentiment scores...")
score_functions = (('sentiment', get_sentiment_score), ('subjectivity', get_subjectivity_score))
for suffix, scorer in score_functions:
    for field in ('title', 'text'):
        df[f'{field}_{suffix}'] = df[field].apply(scorer)

# Bucket each polarity score into Positive / Negative / Neutral
for field in ('title', 'text'):
    df[f'{field}_sentiment_category'] = df[f'{field}_sentiment'].apply(categorize_sentiment)

# Create visualizations: score histograms on the top row, category pies and a
# polarity-vs-subjectivity scatter on the bottom row
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Top row: (column, colour, title, x label, draw a reference line at 0?)
score_panels = (
    ('title_sentiment', 'skyblue', 'Title Sentiment Distribution', 'Sentiment Score (-1 to 1)', True),
    ('text_sentiment', 'lightgreen', 'Text Sentiment Distribution', 'Sentiment Score (-1 to 1)', True),
    ('text_subjectivity', 'coral', 'Text Subjectivity Distribution', 'Subjectivity Score (0 to 1)', False),
)
for ax, (column, color, panel_title, x_label, mark_zero) in zip(axes[0], score_panels):
    ax.hist(df[column], bins=30, alpha=0.7, color=color, edgecolor='black')
    ax.set_title(panel_title, fontweight='bold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    if mark_zero:
        ax.axvline(x=0, color='red', linestyle='--', alpha=0.7)

# Sentiment categories. Colours are looked up by category name so that a given
# category has the same colour in both pies regardless of which category is most
# frequent (value_counts orders by frequency, so positional colours would shift).
sentiment_colors = {'Negative': 'lightcoral', 'Neutral': 'lightgray', 'Positive': 'lightgreen'}

title_sent_counts = df['title_sentiment_category'].value_counts()
axes[1,0].pie(title_sent_counts.values, labels=title_sent_counts.index, autopct='%1.1f%%',
              colors=[sentiment_colors[category] for category in title_sent_counts.index])
axes[1,0].set_title('Title Sentiment Categories', fontweight='bold')

text_sent_counts = df['text_sentiment_category'].value_counts()
axes[1,1].pie(text_sent_counts.values, labels=text_sent_counts.index, autopct='%1.1f%%',
              colors=[sentiment_colors[category] for category in text_sent_counts.index])
axes[1,1].set_title('Text Sentiment Categories', fontweight='bold')

# Sentiment vs Subjectivity scatter plot, coloured by polarity
scatter = axes[1,2].scatter(df['text_sentiment'], df['text_subjectivity'],
                           alpha=0.5, c=df['text_sentiment'], cmap='RdYlBu', s=10)
axes[1,2].set_xlabel('Sentiment Score')
axes[1,2].set_ylabel('Subjectivity Score')
axes[1,2].set_title('Sentiment vs Subjectivity', fontweight='bold')
plt.colorbar(scatter, ax=axes[1,2])

plt.tight_layout()
plt.show()

# Print sentiment statistics
print("SENTIMENT ANALYSIS STATISTICS:")
print("-" * 40)
print(f"Title Sentiment - Mean: {df['title_sentiment'].mean():.3f}, Std: {df['title_sentiment'].std():.3f}")
print(f"Text Sentiment - Mean: {df['text_sentiment'].mean():.3f}, Std: {df['text_sentiment'].std():.3f}")
print(f"Text Subjectivity - Mean: {df['text_subjectivity'].mean():.3f}, Std: {df['text_subjectivity'].std():.3f}")

print(f"\nTITLE SENTIMENT CATEGORIES:")
print(title_sent_counts)

print(f"\nTEXT SENTIMENT CATEGORIES:")
print(text_sent_counts)

In [None]:
# Sentiment broken down by label: box plots, category shares, and per-label means
if 'label' in df.columns:
    print("SENTIMENT COMPARISON BY NEWS TYPE")
    print("=" * 60)

    fig, axes = plt.subplots(2, 2, figsize=(16, 10))

    # First three cells of the grid (row by row): (column, panel title, y label)
    boxplot_panels = (
        ('title_sentiment', 'Title Sentiment by News Type', 'Sentiment Score'),
        ('text_sentiment', 'Text Sentiment by News Type', 'Sentiment Score'),
        ('text_subjectivity', 'Text Subjectivity by News Type', 'Subjectivity Score'),
    )
    for ax, (column, panel_title, y_label) in zip(axes.flat, boxplot_panels):
        df.boxplot(column=column, by='label', ax=ax)
        ax.set_title(panel_title)
        ax.set_xlabel('News Type')
        ax.set_ylabel(y_label)

    # Last cell: share of each sentiment category within every label
    sentiment_cross = pd.crosstab(df['label'], df['text_sentiment_category'])
    sentiment_cross_pct = pd.crosstab(df['label'], df['text_sentiment_category'], normalize='index') * 100

    category_ax = axes[1,1]
    sentiment_cross_pct.plot(kind='bar', ax=category_ax, color=['lightcoral', 'lightgray', 'lightgreen'])
    category_ax.set_title('Sentiment Categories by News Type (%)')
    category_ax.set_xlabel('News Type')
    category_ax.set_ylabel('Percentage')
    category_ax.tick_params(axis='x', rotation=45)
    category_ax.legend(title='Sentiment')

    plt.suptitle('')  # pandas adds a "grouped by" super-title; blank it out
    plt.tight_layout()
    plt.show()

    # Per-label means followed by the category breakdown
    print("SENTIMENT STATISTICS BY LABEL:")
    print("-" * 40)
    mean_report = (
        ('Title sentiment', 'title_sentiment'),
        ('Text sentiment', 'text_sentiment'),
        ('Text subjectivity', 'text_subjectivity'),
    )
    for label in df['label'].unique():
        subset = df[df['label'] == label]
        print(f"\n{label} News:")
        for caption, column in mean_report:
            print(f"  {caption} - Mean: {subset[column].mean():.3f}")

        print(f"  Sentiment categories:")
        categories = subset['text_sentiment_category'].value_counts()
        for cat, count in categories.items():
            pct = (count / len(subset)) * 100
            print(f"    {cat}: {count:,} ({pct:.1f}%)")

## Text Complexity Metrics

In [None]:
# Calculate text complexity metrics
def safe_textstat_metric(text, metric_func, default_value=0):
    """Safely calculate a text metric, falling back to a default on failure.

    Args:
        text: Raw text. Non-string values are converted with ``str`` before
            being passed to ``metric_func``.
        metric_func: Callable taking one string and returning the metric.
        default_value: Value returned when ``text`` is missing or empty, or
            when ``metric_func`` raises an error.

    Returns:
        ``metric_func(str(text))`` or ``default_value``.
    """
    try:
        if pd.isna(text) or text == "":
            return default_value
        return metric_func(str(text))
    except Exception:
        # Best-effort: one unscorable document must not abort the whole run.
        # `Exception` (not a bare `except:`) so KeyboardInterrupt and
        # SystemExit still propagate.
        return default_value

def calculate_complexity_metrics(text):
    """Calculate various text complexity metrics.

    Args:
        text: Raw text. Non-string values are converted with ``str``.

    Returns:
        dict: Keys ``readability_score``, ``avg_sentence_length``,
        ``syllable_count``, ``flesch_kincaid_grade`` and ``sentence_count``.
        All values are 0 when ``text`` is missing or empty.
    """
    if pd.isna(text) or text == "":
        return {
            'readability_score': 0,
            'avg_sentence_length': 0,
            'syllable_count': 0,
            'flesch_kincaid_grade': 0,
            'sentence_count': 0
        }

    text_str = str(text)

    try:
        # Use textstat if available; each call is individually guarded and
        # falls back to the default given as its last argument
        readability = safe_textstat_metric(text_str, textstat.flesch_reading_ease, 0)
        fk_grade = safe_textstat_metric(text_str, textstat.flesch_kincaid_grade, 0)
        syllables = safe_textstat_metric(text_str, textstat.syllable_count, 0)
        sentences = safe_textstat_metric(text_str, textstat.sentence_count, 1)

        # Calculate average sentence length (whitespace-delimited words per sentence)
        words = len(text_str.split())
        avg_sent_length = words / sentences if sentences > 0 else 0

    except Exception:
        # Fallback calculations. `Exception` (not a bare `except:`) so that
        # KeyboardInterrupt still stops a long-running `.apply`.
        # Sentences are approximated by counting terminal punctuation marks.
        sentences = text_str.count('.') + text_str.count('!') + text_str.count('?')
        sentences = max(sentences, 1)
        words = len(text_str.split())

        readability = 50  # Default neutral score
        fk_grade = 8     # Default grade level
        syllables = words * 1.5  # Rough estimate
        avg_sent_length = words / sentences

    return {
        'readability_score': readability,
        'avg_sentence_length': avg_sent_length,
        'syllable_count': syllables,
        'flesch_kincaid_grade': fk_grade,
        'sentence_count': sentences
    }

print("TEXT COMPLEXITY ANALYSIS")
print("=" * 60)

print("Calculating text complexity metrics...")

# Calculate complexity metrics for a sample (to avoid long processing time)
sample_size = min(5000, len(df))
sample_df = df.sample(n=sample_size, random_state=42).copy()

# Calculate metrics: one dict per sampled row, expanded into one column per metric.
# The frame is built on sample_df's own index. With a default 0..n-1 index the
# column assignment below would align on labels, not positions, and attach
# NaN or another row's metrics to each sampled article.
complexity_results = sample_df['text'].apply(calculate_complexity_metrics)
complexity_df = pd.DataFrame(complexity_results.tolist(), index=sample_df.index)

# Add metrics to sample dataframe
for col in complexity_df.columns:
    sample_df[col] = complexity_df[col]

# 2x3 grid: five metric histograms followed by a correlation heatmap
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# (metric column, fill colour, panel title, x label), laid out row by row
histogram_panels = (
    ('readability_score', 'lightblue', 'Flesch Reading Ease Score Distribution', 'Reading Ease Score (0-100)'),
    ('avg_sentence_length', 'lightgreen', 'Average Sentence Length Distribution', 'Average Words per Sentence'),
    ('flesch_kincaid_grade', 'coral', 'Flesch-Kincaid Grade Level Distribution', 'Grade Level'),
    ('syllable_count', 'plum', 'Syllable Count Distribution', 'Number of Syllables'),
    ('sentence_count', 'lightyellow', 'Sentence Count Distribution', 'Number of Sentences'),
)
for ax, (metric, fill, panel_title, x_label) in zip(axes.flat, histogram_panels):
    ax.hist(sample_df[metric], bins=30, alpha=0.7, color=fill, edgecolor='black')
    ax.set_title(panel_title, fontweight='bold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

# Reference line at a score of 50 on the readability panel
axes[0,0].axvline(x=50, color='red', linestyle='--', alpha=0.7, label='Average')
axes[0,0].legend()

# Last cell: correlation matrix of the five metrics, drawn as an image
metric_columns = ['readability_score', 'avg_sentence_length', 'syllable_count',
                  'flesch_kincaid_grade', 'sentence_count']
complexity_corr = sample_df[metric_columns].corr()

corr_ax = axes[1,2]
im = corr_ax.imshow(complexity_corr, cmap='coolwarm', vmin=-1, vmax=1)
corr_ax.set_title('Complexity Metrics Correlation', fontweight='bold')

tick_positions = range(len(complexity_corr.columns))
tick_labels = [col.replace('_', '\n') for col in complexity_corr.columns]
corr_ax.set_xticks(tick_positions)
corr_ax.set_yticks(tick_positions)
corr_ax.set_xticklabels(tick_labels, rotation=45)
corr_ax.set_yticklabels(tick_labels)

# Write each coefficient into its cell; black text where |r| < 0.5, white otherwise
for i in tick_positions:
    for j in tick_positions:
        coefficient = complexity_corr.iloc[i, j]
        text_color = 'black' if abs(coefficient) < 0.5 else 'white'
        corr_ax.text(j, i, f'{coefficient:.2f}', ha='center', va='center', color=text_color)

plt.colorbar(im, ax=corr_ax)
plt.tight_layout()
plt.show()

# Descriptive statistics for the five complexity metrics of the sample
print(f"COMPLEXITY METRICS STATISTICS (Sample of {sample_size:,} articles):")
print("-" * 60)
complexity_stats = sample_df[['readability_score', 'avg_sentence_length', 'syllable_count',
                            'flesch_kincaid_grade', 'sentence_count']].describe()
print(complexity_stats)

# Translate the mean readability score into a band
print(f"\nREADABILITY INTERPRETATION:")
print("-" * 30)
mean_readability = sample_df['readability_score'].mean()

# (minimum score, band name), highest cut-off first; the first match wins
readability_bands = (
    (90, "Very Easy (5th grade)"),
    (80, "Easy (6th grade)"),
    (70, "Fairly Easy (7th grade)"),
    (60, "Standard (8th-9th grade)"),
    (50, "Fairly Difficult (10th-12th grade)"),
    (30, "Difficult (college level)"),
)
for cutoff, band_name in readability_bands:
    if mean_readability >= cutoff:
        level = band_name
        break
else:
    # Below every cut-off
    level = "Very Difficult (graduate level)"

print(f"Average readability score: {mean_readability:.1f} ({level})")

## Summary and Next Steps

### Key Findings from EDA:

1. **Dataset Overview**: The combined dataset contains news articles with text, titles, and labels for fake/real classification
2. **Text Characteristics**: Analyzed length distributions, word counts, and complexity metrics
3. **Label Distribution**: Examined class balance and potential imbalance issues
4. **Language Patterns**: Identified distinctive words and phrases in fake vs real news
5. **Sentiment Analysis**: Discovered sentiment patterns that may distinguish fake from real news
6. **Feature Engineering**: Created new features based on text statistics, sentiment, and linguistic patterns

### Recommendations for Machine Learning Model:

1. **Data Preprocessing**:
   - Handle missing values appropriately
   - Apply text cleaning and normalization
   - Consider sampling techniques if class imbalance exists

2. **Feature Selection**:
   - Use the engineered features showing high correlation with target
   - Apply dimensionality reduction techniques if needed
   - Consider TF-IDF or word embeddings for text features

3. **Model Selection**:
   - Start with baseline models (Logistic Regression, Naive Bayes)
   - Try ensemble methods (Random Forest, XGBoost)
   - Consider deep learning approaches (LSTM, BERT) for text classification

4. **Evaluation**:
   - Use appropriate metrics (accuracy, precision, recall, F1-score)
   - Implement cross-validation for robust evaluation
   - Monitor for overfitting

5. **Real-time Deployment**:
   - Design efficient preprocessing pipeline
   - Optimize model for speed and memory usage
   - Implement monitoring and retraining mechanisms

**Next Step**: Begin building machine learning models using the insights and features from this analysis!