# Metadata Analysis and Category Similarity Study
## Setup and Imports

In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import json
import os

## Metadata Feature Distribution Analysis

In [7]:
def analyze_metadata_distribution():
    """
    Report how often each metadata flag is set, overall and per label.

    Reads ``enhanced_dataset/training_ready_dataset.csv`` (relative to the
    current working directory), prints a report and returns the same figures.
    The analysis is meant to show why the enhanced text approach failed to
    distinguish between slanted and unreliable source content.

    Returns:
        dict with three keys:
            'overall_distribution': feature -> {value: count} for every
                non-missing value found in that column.
            'category_analysis': label -> {'sample_count': int,
                'feature_rates': {feature: {'count': int, 'rate': float}}},
                where 'rate' is a percentage rounded to one decimal.
            'signal_overlap': feature -> {'slanted_rate', 'unreliable_rate',
                'overlap_score'} (percentages); left empty unless both the
                'slanted' and 'unreliable source' labels occur.

    Raises:
        FileNotFoundError: if the CSV file does not exist.
        KeyError: if the dataset has no 'label' column.
    """

    print("Metadata Feature Distribution Analysis")
    print("=" * 60)

    # Load enhanced dataset
    enhanced_df = pd.read_csv("enhanced_dataset/training_ready_dataset.csv")

    # Define metadata features
    metadata_features = [
        "meta_misleadingUnverifiedClaimAsFact",
        "meta_misleadingMissingImportantContext",
        "meta_misleadingFactualError",
        "meta_misleadingOutdatedInformation",
        "meta_misleadingManipulatedMedia",
        "meta_misleadingSatire",
    ]
    # Features absent from this dataset are skipped silently in every section.
    present_features = [
        feature for feature in metadata_features if feature in enhanced_df.columns
    ]

    results = {
        'overall_distribution': {},
        'category_analysis': {},
        'signal_overlap': {}
    }

    total_count = len(enhanced_df)

    print("Overall Metadata Feature Distribution:")
    for feature in present_features:
        values = enhanced_df[feature].value_counts()
        # Store plain ints so the result does not leak numpy scalar types.
        results['overall_distribution'][feature] = {
            value: int(count) for value, count in values.items()
        }
        active_count = int((enhanced_df[feature] == 1.0).sum())
        # An empty dataset has no meaningful rate; report 0 rather than crash.
        percentage = (active_count / total_count) * 100 if total_count else 0.0
        print(f"  {feature}: {active_count}/{total_count} ({percentage:.1f}%)")

    print("\nCategory-specific Analysis:")

    # Missing labels never compare equal to themselves, so they would yield an
    # empty "nan" category with undefined rates; drop them up front.
    for label in enhanced_df['label'].dropna().unique():
        label_df = enhanced_df[enhanced_df['label'] == label]
        sample_count = len(label_df)
        feature_rates = {}
        results['category_analysis'][label] = {
            'sample_count': sample_count,
            'feature_rates': feature_rates
        }

        print(f"\n{label} (n={sample_count}):")

        for feature in present_features:
            active_count = int((label_df[feature] == 1.0).sum())
            rate = (active_count / sample_count) * 100
            feature_rates[feature] = {
                'count': active_count,
                'rate': round(rate, 1)
            }

            feature_short = feature.replace('meta_misleading', '')
            print(f"  {feature_short}: {active_count}/{sample_count} ({rate:.1f}%)")

    # Calculate overlap between slanted and unreliable source.
    # Labels are matched case-insensitively; astype(str) keeps this working
    # when the column holds missing or non-string values.
    labels_lower = enhanced_df['label'].astype(str).str.lower()
    slanted_df = enhanced_df[labels_lower == 'slanted']
    unreliable_df = enhanced_df[labels_lower == 'unreliable source']

    if len(slanted_df) > 0 and len(unreliable_df) > 0:
        print("\nOverlap Analysis: Slanted vs Unreliable Source")
        for feature in present_features:
            slanted_rate = float((slanted_df[feature] == 1.0).mean() * 100)
            unreliable_rate = float((unreliable_df[feature] == 1.0).mean() * 100)
            # Overlap score is the smaller of the two activation rates.
            overlap = min(slanted_rate, unreliable_rate)

            feature_short = feature.replace('meta_misleading', '')
            results['signal_overlap'][feature] = {
                'slanted_rate': round(slanted_rate, 1),
                'unreliable_rate': round(unreliable_rate, 1),
                'overlap_score': round(overlap, 1)
            }

            print(f"  {feature_short}:")
            print(f"    Slanted: {slanted_rate:.1f}% | Unreliable: {unreliable_rate:.1f}% | Overlap: {overlap:.1f}%")

    return results

## Similarity Analysis Between Categories

In [8]:
def _extract_tweet_text(text):
    """Return the part of *text* before the first '[SEP]' marker, stripped.

    Values without a marker are returned as ``str(text)`` unchanged.
    """
    text = str(text)
    if '[SEP]' in text:
        return text.split('[SEP]')[0].strip()
    return text


def find_similar_examples():
    """
    Rank every slanted / unreliable-source pair by metadata similarity.

    Reads ``enhanced_dataset/enhanced_dataset.csv`` (relative to the current
    working directory), keeps rows whose 'has_metadata' is true, and compares
    each slanted row with each unreliable-source row using the cosine
    similarity of their ``meta_*`` columns. The aim is to demonstrate why
    classification between the two categories failed.

    Returns:
        list of dicts sorted by 'similarity_score' (highest first), each with
        the keys 'similarity_score', 'slanted_content', 'unreliable_content',
        'matching_features', 'differing_features', 'num_matching' and
        'num_differing'. Returns None when either category has no rows or the
        dataset has no ``meta_*`` columns.

    Raises:
        FileNotFoundError: if the CSV file does not exist.
        KeyError: if 'label', 'has_metadata' or 'text' is missing.
    """

    print("\nSimilarity Analysis: Slanted vs Unreliable Source")
    print("=" * 60)

    # Load enhanced dataset
    df = pd.read_csv("enhanced_dataset/enhanced_dataset.csv")

    # Filter for slanted and unreliable source with metadata
    labels_lower = df['label'].str.lower()
    has_metadata = df['has_metadata'] == True  # noqa: E712 - element-wise comparison
    slanted = df[(labels_lower == 'slanted') & has_metadata].copy()
    unreliable = df[(labels_lower == 'unreliable source') & has_metadata].copy()

    print(f"Analyzing {len(slanted)} slanted vs {len(unreliable)} unreliable source examples")

    if len(slanted) == 0 or len(unreliable) == 0:
        print("Insufficient examples for similarity analysis")
        return None

    # NOTE(review): every column prefixed 'meta_' feeds the vectors, not only
    # the binary meta_misleading* flags. A large-magnitude column would
    # dominate cosine similarity; the recorded run reports a mean and median
    # of 1.000, so confirm which columns are actually included.
    metadata_cols = [col for col in df.columns if col.startswith('meta_')]
    if not metadata_cols:
        # Without feature columns there is nothing to compare.
        print("No metadata columns found for similarity analysis")
        return None

    # Convert to numeric and handle missing values
    for col in metadata_cols:
        slanted[col] = pd.to_numeric(slanted[col], errors='coerce').fillna(0)
        unreliable[col] = pd.to_numeric(unreliable[col], errors='coerce').fillna(0)

    # NOTE(review): .title() lower-cases the interior capitals of camelCase
    # names (e.g. 'FactualError' -> 'Factualerror'); kept as-is so exported
    # feature names do not change.
    feature_names = [
        col.replace('meta_misleading', '').replace('_', ' ').title()
        for col in metadata_cols
    ]

    slanted_matrix = slanted[metadata_cols].to_numpy(dtype=float)
    unreliable_matrix = unreliable[metadata_cols].to_numpy(dtype=float)

    # One call yields the full (n_slanted, n_unreliable) similarity matrix
    # instead of one library call per pair.
    similarity_matrix = cosine_similarity(slanted_matrix, unreliable_matrix)

    # Per-row work is done once here rather than once per pair.
    slanted_tweets = [_extract_tweet_text(text) for text in slanted['text']]
    unreliable_tweets = [_extract_tweet_text(text) for text in unreliable['text']]

    similarities = []

    for i, slanted_vec in enumerate(slanted_matrix):
        for j, unreliable_vec in enumerate(unreliable_matrix):
            matching_features = []
            differing_features = []

            for name, slanted_val, unreliable_val in zip(feature_names, slanted_vec, unreliable_vec):
                if slanted_val == unreliable_val:
                    # A feature only counts as matching when it is active in both.
                    if slanted_val == 1:
                        matching_features.append(name)
                elif slanted_val == 1 or unreliable_val == 1:
                    differing_features.append(name)

            similarities.append({
                'similarity_score': float(similarity_matrix[i, j]),
                'slanted_content': slanted_tweets[i],
                'unreliable_content': unreliable_tweets[j],
                'matching_features': matching_features,
                'differing_features': differing_features,
                'num_matching': len(matching_features),
                'num_differing': len(differing_features),
            })

    # Sort by similarity score (stable, so ties keep their pairing order)
    similarities.sort(key=lambda pair: pair['similarity_score'], reverse=True)

    scores = [pair['similarity_score'] for pair in similarities]

    print("Similarity Analysis Results:")
    print(f"  Total pairs analyzed: {len(similarities)}")
    print(f"  Highest similarity: {scores[0]:.3f}")
    print(f"  Average similarity: {np.mean(scores):.3f}")
    print(f"  Median similarity: {np.median(scores):.3f}")

    return similarities

## Export Results for Thesis

In [9]:
def export_analysis_results():
    """
    Run both analyses and write their results for inclusion in the thesis.

    Calls analyze_metadata_distribution() and find_similar_examples(), then
    writes into the ``metadata_analysis_results/`` folder (created if needed):

        metadata_distribution_analysis.txt  - overall and per-category rates
        similar_examples_analysis.txt       - summary plus the top 5 pairs
        top_similar_examples.csv            - the top 20 pairs
        thesis_summary_statistics.txt       - key findings

    The two similarity files are written only when find_similar_examples()
    returned at least one pair.

    Returns:
        None. Progress is reported on stdout.
    """

    print("\nExporting Analysis Results")
    print("=" * 60)

    # Create output directory
    output_dir = "metadata_analysis_results"
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created output directory: {output_dir}/")

    # Perform analyses
    distribution_results = analyze_metadata_distribution()
    similarity_results = find_similar_examples()

    # The mean similarity is reported in two files; compute it once.
    average_similarity = None
    if similarity_results:
        average_similarity = np.mean([s['similarity_score'] for s in similarity_results])

    # Export metadata distribution analysis
    with open(f"{output_dir}/metadata_distribution_analysis.txt", "w", encoding="utf-8") as f:
        f.write("METADATA FEATURE DISTRIBUTION ANALYSIS\n")
        f.write("=" * 50 + "\n\n")

        f.write("Overall Feature Distribution:\n")
        for feature, values in distribution_results['overall_distribution'].items():
            active_count = values.get(1.0, 0)
            total_count = sum(values.values())
            # A feature with no recorded values has no rate; avoid dividing by zero.
            percentage = (active_count / total_count) * 100 if total_count else 0.0
            feature_short = feature.replace('meta_misleading', '')
            f.write(f"  {feature_short}: {active_count}/{total_count} ({percentage:.1f}%)\n")

        f.write("\nCategory-specific Rates:\n")
        for category, data in distribution_results['category_analysis'].items():
            f.write(f"\n{category} (n={data['sample_count']}):\n")
            for feature, stats in data['feature_rates'].items():
                feature_short = feature.replace('meta_misleading', '')
                f.write(f"  {feature_short}: {stats['count']}/{data['sample_count']} ({stats['rate']:.1f}%)\n")

        if distribution_results['signal_overlap']:
            f.write("\nSlanted vs Unreliable Source Overlap:\n")
            for feature, overlap_data in distribution_results['signal_overlap'].items():
                feature_short = feature.replace('meta_misleading', '')
                f.write(f"  {feature_short}:\n")
                f.write(f"    Slanted: {overlap_data['slanted_rate']:.1f}%\n")
                f.write(f"    Unreliable: {overlap_data['unreliable_rate']:.1f}%\n")
                f.write(f"    Overlap Score: {overlap_data['overlap_score']:.1f}%\n")

    print(f"Saved: {output_dir}/metadata_distribution_analysis.txt")

    # Export top similar examples
    if similarity_results:
        with open(f"{output_dir}/similar_examples_analysis.txt", "w", encoding="utf-8") as f:
            f.write("SIMILAR EXAMPLES: SLANTED VS UNRELIABLE SOURCE\n")
            f.write("=" * 50 + "\n\n")

            f.write("Analysis Summary:\n")
            f.write(f"  Total pairs analyzed: {len(similarity_results)}\n")
            f.write(f"  Highest similarity: {similarity_results[0]['similarity_score']:.3f}\n")
            f.write(f"  Average similarity: {average_similarity:.3f}\n\n")

            f.write("Top 5 Most Similar Examples:\n")
            f.write("-" * 40 + "\n")

            for i, pair in enumerate(similarity_results[:5], 1):
                f.write(f"\nExample {i} (Similarity: {pair['similarity_score']:.3f})\n")
                # Content is cut to 200 characters; the ellipsis is always appended.
                f.write(f"Slanted Content:\n  {pair['slanted_content'][:200]}...\n")
                f.write(f"Unreliable Content:\n  {pair['unreliable_content'][:200]}...\n")
                f.write(f"Matching Features ({pair['num_matching']}): {', '.join(pair['matching_features'])}\n")
                f.write(f"Differing Features ({pair['num_differing']}): {', '.join(pair['differing_features'])}\n")
                f.write("-" * 40 + "\n")

        print(f"Saved: {output_dir}/similar_examples_analysis.txt")

        # Save detailed similarity results as CSV
        similarity_df = pd.DataFrame(similarity_results[:20])  # Top 20
        similarity_df.to_csv(f"{output_dir}/top_similar_examples.csv", index=False)
        print(f"Saved: {output_dir}/top_similar_examples.csv")

    # Export summary statistics for thesis
    with open(f"{output_dir}/thesis_summary_statistics.txt", "w", encoding="utf-8") as f:
        f.write("SUMMARY STATISTICS FOR THESIS\n")
        f.write("=" * 50 + "\n\n")

        f.write("Key Findings:\n")

        # Key overlap statistics
        if distribution_results['signal_overlap']:
            missing_context = distribution_results['signal_overlap'].get('meta_misleadingMissingImportantContext', {})
            if missing_context:
                f.write("1. Missing Important Context overlap:\n")
                f.write(f"   - Slanted: {missing_context['slanted_rate']:.1f}%\n")
                f.write(f"   - Unreliable Source: {missing_context['unreliable_rate']:.1f}%\n")
                f.write("   - Demonstrates substantial overlap in metadata signals\n\n")

        # Similarity findings
        if similarity_results:
            high_similarity_count = sum(1 for s in similarity_results if s['similarity_score'] > 0.8)
            f.write("2. Category Similarity Analysis:\n")
            f.write(f"   - {high_similarity_count}/{len(similarity_results)} pairs with >80% similarity\n")
            f.write(f"   - Highest similarity score: {similarity_results[0]['similarity_score']:.3f}\n")
            f.write(f"   - Average similarity: {average_similarity:.3f}\n\n")

        f.write("3. Implications:\n")
        f.write("   - Metadata enhancement failed due to overlapping signal patterns\n")
        f.write("   - Traditional text-only classification may be more effective\n")
        f.write("   - Need for more discriminative metadata features\n")

    # Report the full path, matching the other "Saved:" messages.
    print(f"Saved: {output_dir}/thesis_summary_statistics.txt")
    print("\nAll analysis results exported successfully.")


## Execute Analysis

In [10]:
if __name__ == "__main__":
    # Script entry point: run the complete metadata analysis and similarity
    # study. Results are saved to text files for thesis inclusion.
    for banner_line in (
        "Metadata Analysis and Category Similarity Study",
        "Bachelor Thesis - Enhanced Text Classification Analysis",
        "=" * 70,
    ):
        print(banner_line)

    try:
        export_analysis_results()
        print("\nAnalysis completed successfully.")
        print("Check the metadata_analysis_results/ folder for thesis inclusion files.")
    except FileNotFoundError as missing_file:
        # A missing input CSV gets its own hint about where the data belongs.
        print(f"Error: Required data file not found - {missing_file}")
        print("Ensure enhanced_dataset/ directory contains the required CSV files.")
    except Exception as failure:
        # Any other failure is reported on stdout instead of a traceback.
        print(f"Error during analysis: {failure}")
        print("Check data format and file paths.")

Metadata Analysis and Category Similarity Study
Bachelor Thesis - Enhanced Text Classification Analysis

Exporting Analysis Results
Created output directory: metadata_analysis_results/
Metadata Feature Distribution Analysis
Overall Metadata Feature Distribution:
  meta_misleadingUnverifiedClaimAsFact: 100/331 (30.2%)
  meta_misleadingMissingImportantContext: 179/331 (54.1%)
  meta_misleadingFactualError: 168/331 (50.8%)
  meta_misleadingOutdatedInformation: 46/331 (13.9%)
  meta_misleadingManipulatedMedia: 49/331 (14.8%)
  meta_misleadingSatire: 30/331 (9.1%)

Category-specific Analysis:

false (n=80):
  UnverifiedClaimAsFact: 18/80 (22.5%)
  MissingImportantContext: 27/80 (33.8%)
  FactualError: 28/80 (35.0%)
  OutdatedInformation: 7/80 (8.8%)
  ManipulatedMedia: 40/80 (50.0%)
  Satire: 16/80 (20.0%)

repurposed (n=90):
  UnverifiedClaimAsFact: 31/90 (34.4%)
  MissingImportantContext: 46/90 (51.1%)
  FactualError: 53/90 (58.9%)
  OutdatedInformation: 20/90 (22.2%)
  ManipulatedMedia: 

Analyzing 67 slanted vs 16 unreliable source examples
Similarity Analysis Results:
  Total pairs analyzed: 1072
  Highest similarity: 1.000
  Average similarity: 1.000
  Median similarity: 1.000
Saved: metadata_analysis_results/metadata_distribution_analysis.txt
Saved: metadata_analysis_results/similar_examples_analysis.txt
Saved: metadata_analysis_results/top_similar_examples.csv
Saved: thesis_summary_statistics.txt

All analysis results exported successfully.

Analysis completed successfully.
Check the metadata_analysis_results/ folder for thesis inclusion files.
