# Enhanced Dataset Creation with Community Notes Metadata

## Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os
from pathlib import Path
import re
from fuzzywuzzy import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Silence every warning category for the rest of the session so the emoji
# progress output below is not interleaved with library warnings.
warnings.filterwarnings('ignore')
# Use matplotlib's stock style sheet for all figures produced later on.
plt.style.use('default')

## Configuration

In [None]:
# Pipeline settings: input files, output location, matching strategy and
# thresholds, and the metadata columns copied onto matched rows.
CONFIG = dict(
    rel_typ_file="rel_typ.xlsx",
    metadata_file="for_bsc_project.csv",
    output_dir="enhanced_dataset",
    # Options: "text_similarity", "fuzzy_match", "hybrid"
    matching_method="hybrid",
    # Minimum cosine similarity (0-1) accepted by text similarity matching
    similarity_threshold=0.8,
    # Minimum fuzzy ratio (0-100) accepted by fuzzy string matching
    fuzzy_threshold=85,
    # Metadata columns to include for classification
    metadata_columns=[
        'misleadingOther',
        'misleadingFactualError',
        'misleadingManipulatedMedia',
        'misleadingOutdatedInformation',
        'misleadingMissingImportantContext',
        'misleadingUnverifiedClaimAsFact',
        'misleadingSatire',
        'notMisleadingOther',
        'notMisleadingFactuallyCorrect',
        'notMisleadingOutdatedButNotWhenWritten',
        'notMisleadingClearlySatire',
        'notMisleadingPersonalOpinion',
        'trustworthySources',
        'believable',
        'harmful',
        'validationDifficulty',
    ],
    # Additional useful columns
    additional_columns=[
        'tweetId',
        'noteId',
        'classification',
        'summary',
        'created_at',
        'favorite_count',
        'retweet_count',
        'reply_count',
        'quote_count',
    ],
)

# Make sure the plot and report folders exist before anything writes to them
for subdir in ('plots', 'reports'):
    Path(CONFIG['output_dir'], subdir).mkdir(parents=True, exist_ok=True)

print("🔗 Enhanced Dataset Creation with Community Notes Metadata")
print(f"Output directory: {CONFIG['output_dir']}")

## Load Original Dataset

In [None]:
def load_rel_typ_data(file_path):
    """Load and clean the original rel_typ Excel dataset.

    Reads ``file_path`` with ``pd.read_excel``, prints a short summary, strips
    the ``tweet_text``, ``note_text`` and ``label`` columns, and adds a
    combined ``text`` column of the form ``"<tweet_text> [SEP] <note_text>"``.

    Args:
        file_path: Path of the Excel workbook. It must provide the columns
            ``tweet_text``, ``note_text`` and ``label``.

    Returns:
        The cleaned ``DataFrame``, or ``None`` when reading or cleaning raised
        (the error is printed instead of being propagated, so callers must
        check for ``None``).
    """
    print(f"\n📊 Loading original dataset: {file_path}")

    try:
        df = pd.read_excel(file_path)
        print(f"✅ Loaded {len(df)} rows from {file_path}")

        # Show basic info
        print(f"📋 Columns: {list(df.columns)}")
        print(f"🏷️ Labels: {df['label'].value_counts().to_dict()}")

        # Fill missing cells *before* casting to str: casting first turns NaN
        # into the literal text "nan", which fillna can no longer see and
        # which would then end up inside the combined text used for matching.
        for col in ("tweet_text", "note_text"):
            df[col] = df[col].fillna("").astype(str).str.strip()
        df["label"] = df["label"].astype(str).str.strip()

        # Create combined text like in training
        df["text"] = df["tweet_text"] + " [SEP] " + df["note_text"]

        return df

    except Exception as e:
        # Best-effort loader for notebook use: report and signal failure.
        print(f"❌ Error loading {file_path}: {e}")
        return None

# Read the base dataset; the loader returns None on failure
rel_typ_df = load_rel_typ_data(CONFIG["rel_typ_file"])

if rel_typ_df is not None:
    print(f"📊 Ready to enhance {len(rel_typ_df)} samples")
else:
    print("❌ Failed to load rel_typ dataset")

## Load Community Notes Metadata

In [None]:
def load_metadata_file(file_path):
    """Load the metadata CSV, trying several text encodings in turn.

    After loading, prints the frame shape, which of
    ``CONFIG["metadata_columns"]`` are present or missing, and which candidate
    text columns (``full_text``, ``tweet_text``, ``summary``, ``text``) exist.

    Args:
        file_path: Path of the CSV file.

    Returns:
        The loaded ``DataFrame``, or ``None`` when the file could not be read
        (the error is printed instead of being propagated).
    """
    print(f"\n📈 Loading metadata file: {file_path}")

    try:
        # Strict codecs first. latin-1 (a.k.a. iso-8859-1) maps every byte to
        # a character and therefore never raises UnicodeDecodeError, so it has
        # to be the last resort: anything listed after it is never tried.
        encodings = ['utf-8', 'cp1252', 'latin-1']
        df = None

        for encoding in encodings:
            try:
                df = pd.read_csv(file_path, encoding=encoding, low_memory=False)
            except UnicodeDecodeError:
                continue
            print(f"✅ Loaded {len(df)} rows with {encoding} encoding")
            break

        if df is None:
            print("❌ Could not read file with any encoding")
            return None

        print(f"📋 Shape: {df.shape}")
        print(f"📋 Available columns: {len(df.columns)}")

        # Check which metadata columns we actually have
        available_metadata = [col for col in CONFIG["metadata_columns"] if col in df.columns]
        missing_metadata = [col for col in CONFIG["metadata_columns"] if col not in df.columns]

        print(f"✅ Available metadata columns ({len(available_metadata)}): {available_metadata}")
        if missing_metadata:
            print(f"⚠️ Missing metadata columns ({len(missing_metadata)}): {missing_metadata}")

        # Check for text columns we can use for matching
        text_columns = [col for col in ['full_text', 'tweet_text', 'summary', 'text'] if col in df.columns]
        print(f"📝 Available text columns for matching: {text_columns}")

        return df

    except Exception as e:
        # Best-effort loader for notebook use: report and signal failure.
        print(f"❌ Error loading {file_path}: {e}")
        return None

# Read the metadata CSV; the loader returns None on failure
metadata_df = load_metadata_file(CONFIG["metadata_file"])

if metadata_df is not None:
    print(f"📊 Ready to match with {len(metadata_df)} metadata records")
else:
    print("❌ Failed to load metadata dataset")

## Text Preprocessing and Matching Functions

In [None]:
def clean_text_for_matching(text):
    """Normalise a text value before it is compared against other texts.

    Missing (``pd.isna``) or empty input yields ``""``. Any other value is
    converted to ``str`` and lower-cased; URLs, ``@mentions`` / ``#hashtags``
    and surplus whitespace are then removed, in that order.
    """
    if pd.isna(text) or text == "":
        return ""

    substitutions = (
        # URLs
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ''),
        # Mentions and hashtags
        (r'[@#]\w+', ''),
        # Runs of whitespace collapse to a single space
        (r'\s+', ' '),
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def extract_tweet_from_combined_text(combined_text):
    """Return the tweet portion of a ``'tweet [SEP] note'`` string.

    Missing input yields ``""``. When the ``[SEP]`` marker is present, the text
    before its first occurrence is returned with surrounding whitespace
    stripped; otherwise the stringified input is returned unchanged.
    """
    if pd.isna(combined_text):
        return ""

    as_text = str(combined_text)
    tweet_part, marker, _note_part = as_text.partition('[SEP]')
    return tweet_part.strip() if marker else as_text

def text_similarity_matching(rel_typ_df, metadata_df, text_col_metadata='full_text'):
    """Pair rel_typ rows with metadata rows via TF-IDF cosine similarity.

    The tweet part of every ``rel_typ_df['text']`` value and every value of
    ``metadata_df[text_col_metadata]`` is cleaned; only cleaned texts longer
    than 10 characters take part. Both sides share one TF-IDF vocabulary, and
    each rel_typ text keeps its single most similar metadata text when the
    score reaches ``CONFIG["similarity_threshold"]``.

    Returns:
        A ``DataFrame`` with the columns ``rel_typ_idx`` and ``metadata_idx``
        (row positions), ``similarity_score``, and 100-character previews
        ``rel_typ_text`` / ``metadata_text``. It is empty when the metadata
        column is missing, no usable text exists, or nothing clears the
        threshold.
    """
    print("\n🔍 Performing text similarity matching...")

    # Prepare texts
    rel_typ_texts = [
        clean_text_for_matching(extract_tweet_from_combined_text(raw))
        for raw in rel_typ_df['text']
    ]

    if text_col_metadata not in metadata_df.columns:
        print(f"❌ Column {text_col_metadata} not found in metadata")
        return pd.DataFrame()

    metadata_texts = list(map(clean_text_for_matching, metadata_df[text_col_metadata]))

    def usable_positions(texts):
        # Row positions whose cleaned text is non-empty and longer than 10 chars
        return [pos for pos, body in enumerate(texts) if body and len(body) > 10]

    rel_positions = usable_positions(rel_typ_texts)
    meta_positions = usable_positions(metadata_texts)

    if not (rel_positions and meta_positions):
        print("❌ No valid texts for matching")
        return pd.DataFrame()

    print(f"📊 Matching {len(rel_positions)} rel_typ texts with {len(meta_positions)} metadata texts")

    # One vocabulary for both sides: rel_typ texts first, metadata texts after
    corpus = [rel_typ_texts[pos] for pos in rel_positions]
    corpus += [metadata_texts[pos] for pos in meta_positions]

    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform(corpus)

    n_rel = len(rel_positions)
    similarities = cosine_similarity(tfidf_matrix[:n_rel], tfidf_matrix[n_rel:])

    threshold = CONFIG["similarity_threshold"]
    best_columns = similarities.argmax(axis=1)

    matches = []
    for row, (rel_idx, best_col) in enumerate(zip(rel_positions, best_columns)):
        best_similarity = similarities[row, best_col]
        if best_similarity >= threshold:
            metadata_idx = meta_positions[best_col]
            matches.append({
                'rel_typ_idx': rel_idx,
                'metadata_idx': metadata_idx,
                'similarity_score': best_similarity,
                'rel_typ_text': rel_typ_texts[rel_idx][:100] + "...",
                'metadata_text': metadata_texts[metadata_idx][:100] + "..."
            })

    print(f"✅ Found {len(matches)} matches above threshold {CONFIG['similarity_threshold']}")

    return pd.DataFrame(matches)

def fuzzy_matching(rel_typ_df, metadata_df, text_col_metadata='full_text'):
    """Pair rel_typ rows with metadata rows via fuzzy string matching.

    The tweet part of every ``rel_typ_df['text']`` value and every value of
    ``metadata_df[text_col_metadata]`` is cleaned. Each rel_typ text of at
    least 10 characters is compared (``fuzz.ratio``) against the metadata
    texts longer than 10 characters and keeps its best candidate when the
    score reaches ``CONFIG["fuzzy_threshold"]``.

    Returns:
        A ``DataFrame`` with the columns ``rel_typ_idx`` and ``metadata_idx``
        (row positions), ``fuzzy_score``, and 100-character previews
        ``rel_typ_text`` / ``metadata_text``. It is empty when the metadata
        column is missing or nothing clears the threshold.
    """
    print("\n🔍 Performing fuzzy string matching...")

    rel_typ_texts = [clean_text_for_matching(extract_tweet_from_combined_text(text))
                    for text in rel_typ_df['text']]

    if text_col_metadata not in metadata_df.columns:
        print(f"❌ Column {text_col_metadata} not found in metadata")
        return pd.DataFrame()

    metadata_texts = [clean_text_for_matching(text)
                     for text in metadata_df[text_col_metadata]]

    # The candidate set does not depend on the rel_typ row, so build it once.
    # Mapping each distinct text to its first row position keeps the earlier
    # "first row with that text wins" result while making the lookup O(1) and
    # sparing the scorer from re-scoring duplicate texts.
    first_index_by_text = {}
    for j, text in enumerate(metadata_texts):
        if text and len(text) > 10:
            first_index_by_text.setdefault(text, j)
    candidates = list(first_index_by_text)

    threshold = CONFIG["fuzzy_threshold"]
    matches = []

    if candidates:
        for i, rel_text in enumerate(rel_typ_texts):
            if not rel_text or len(rel_text) < 10:
                continue

            # Find best fuzzy match; for a list of choices the result is a
            # (matched_text, score) pair
            best_match = process.extractOne(rel_text, candidates, scorer=fuzz.ratio)

            if best_match and best_match[1] >= threshold:
                matched_text = best_match[0]
                matches.append({
                    'rel_typ_idx': i,
                    'metadata_idx': first_index_by_text[matched_text],
                    'fuzzy_score': best_match[1],
                    'rel_typ_text': rel_text[:100] + "...",
                    'metadata_text': matched_text[:100] + "..."
                })

    print(f"✅ Found {len(matches)} fuzzy matches above threshold {CONFIG['fuzzy_threshold']}")

    return pd.DataFrame(matches)

def hybrid_matching(rel_typ_df, metadata_df):
    """Run similarity and fuzzy matching on every available text column.

    For each of ``full_text``, ``summary``, ``tweet_text`` and ``text`` that
    exists in ``metadata_df``, both matchers are run. Their results are tagged
    with ``match_method`` and a 0-1 ``match_score`` (fuzzy scores are divided
    by 100), pooled, and reduced to the highest-scoring row per
    ``rel_typ_idx``.

    Returns:
        The de-duplicated matches, or an empty ``DataFrame`` when no method
        produced a match.
    """
    print("\n🔍 Performing hybrid matching...")

    collected = []

    # Try different text columns in metadata
    for text_col in ('full_text', 'summary', 'tweet_text', 'text'):
        if text_col not in metadata_df.columns:
            continue

        print(f"\n📝 Trying matching with column: {text_col}")

        # Text similarity matching
        by_similarity = text_similarity_matching(rel_typ_df, metadata_df, text_col)
        if not by_similarity.empty:
            by_similarity['match_method'] = f'similarity_{text_col}'
            by_similarity['match_score'] = by_similarity['similarity_score']
            collected.append(by_similarity)

        # Fuzzy matching
        by_fuzzy = fuzzy_matching(rel_typ_df, metadata_df, text_col)
        if not by_fuzzy.empty:
            by_fuzzy['match_method'] = f'fuzzy_{text_col}'
            by_fuzzy['match_score'] = by_fuzzy['fuzzy_score'] / 100  # Normalize to 0-1
            collected.append(by_fuzzy)

    if not collected:
        print("❌ No matches found with any method")
        return pd.DataFrame()

    # Pool every method's output, then keep the top-scoring row per rel_typ_idx
    combined_matches = pd.concat(collected, ignore_index=True)
    winning_rows = combined_matches.groupby('rel_typ_idx')['match_score'].idxmax()
    best_matches = combined_matches.loc[winning_rows]

    print(f"✅ Final hybrid matching: {len(best_matches)} unique matches")

    return best_matches

## Perform Dataset Matching

In [None]:
# Run the matcher selected by CONFIG["matching_method"]
if rel_typ_df is None or metadata_df is None:
    print("❌ Cannot perform matching - datasets not loaded")
    matches_df = pd.DataFrame()
else:
    print(f"\n🔍 Starting {CONFIG['matching_method']} matching...")

    # Method name -> matcher function
    _matchers = {
        "hybrid": hybrid_matching,
        "text_similarity": text_similarity_matching,
        "fuzzy_match": fuzzy_matching,
    }
    _matcher = _matchers.get(CONFIG["matching_method"])

    if _matcher is None:
        print(f"❌ Unknown matching method: {CONFIG['matching_method']}")
        matches_df = pd.DataFrame()
    else:
        matches_df = _matcher(rel_typ_df, metadata_df)

    if not matches_df.empty:
        print(f"✅ Found {len(matches_df)} matches for dataset enhancement")
    else:
        print("❌ No matches found - cannot create enhanced dataset")

## Merge Datasets and Add Metadata

In [None]:
def merge_datasets(rel_typ_df, metadata_df, matches_df):
    """Attach metadata values to rel_typ rows according to ``matches_df``.

    Every row of ``matches_df`` contributes one output row: the rel_typ row at
    position ``rel_typ_idx`` plus a ``meta_<col>`` value for each configured
    column present in ``metadata_df``, taken from the metadata row at position
    ``metadata_idx``. rel_typ rows without a match follow, with NaN in the
    ``meta_`` columns. All rows also get ``match_method``, ``match_score`` and
    ``has_metadata``.

    Returns:
        The enhanced ``DataFrame`` (matched rows first), or ``None`` when
        ``matches_df`` is empty.
    """
    print("\n🔗 Merging datasets...")

    if matches_df.empty:
        print("❌ No matches to merge")
        return None

    # Configured columns that the metadata frame really has, classification
    # columns first and the additional columns after them
    all_cols_to_merge = [
        col
        for config_key in ("metadata_columns", "additional_columns")
        for col in CONFIG[config_key]
        if col in metadata_df.columns
    ]

    print(f"📊 Merging {len(all_cols_to_merge)} metadata columns: {all_cols_to_merge}")

    def build_row(rel_position, meta_values, method, score, has_metadata):
        # Copy one rel_typ row and append the meta_* and match-info fields
        row = rel_typ_df.iloc[rel_position].copy()
        for col in all_cols_to_merge:
            row[f'meta_{col}'] = np.nan if meta_values is None else meta_values[col]
        row['match_method'] = method
        row['match_score'] = score
        row['has_metadata'] = has_metadata
        return row

    # Matched rows
    enhanced_data = [
        build_row(
            int(match['rel_typ_idx']),
            metadata_df.iloc[int(match['metadata_idx'])],
            match['match_method'],
            match['match_score'],
            True,
        )
        for _, match in matches_df.iterrows()
    ]

    # Unmatched rows
    matched_indices = set(matches_df['rel_typ_idx'].astype(int))
    unmatched_indices = set(range(len(rel_typ_df))) - matched_indices

    print(f"📊 Matched: {len(matched_indices)}, Unmatched: {len(unmatched_indices)}")

    enhanced_data.extend(
        build_row(idx, None, 'no_match', 0.0, False) for idx in unmatched_indices
    )

    enhanced_df = pd.DataFrame(enhanced_data)
    rows_with_metadata = enhanced_df['has_metadata'].sum()

    print(f"✅ Enhanced dataset created: {len(enhanced_df)} rows")
    print(f"📊 Rows with metadata: {rows_with_metadata}")
    print(f"📊 Coverage: {rows_with_metadata / len(enhanced_df) * 100:.1f}%")

    return enhanced_df

# Build the enhanced dataset when there is at least one match
if matches_df.empty:
    enhanced_df = None
    print("❌ Cannot create enhanced dataset - no matches found")
else:
    enhanced_df = merge_datasets(rel_typ_df, metadata_df, matches_df)

    if enhanced_df is not None:
        print(f"🎯 Enhanced dataset ready with {enhanced_df['has_metadata'].sum()} samples containing metadata")

## Analyze Metadata Patterns by Label

In [None]:
def analyze_metadata_patterns(enhanced_df):
    """Summarise the numeric ``meta_`` columns separately for each label.

    Only rows with ``has_metadata == True`` are used. A ``meta_`` column is
    treated as numeric when its dtype is bool/int64/float64 or all of its
    values are 0/1/True/False.

    Returns:
        ``(analysis_results, numeric_metadata)``. ``analysis_results`` maps
        each label to a dict holding ``label``, ``count`` and, per numeric
        column, a dict with ``mean``, ``sum``, ``count_true`` and
        ``percentage``. For columns containing only 0/1, ``count_true`` and
        ``percentage`` describe the share of ones; otherwise they are the sum
        and ``mean * 100``. ``(None, None)`` is returned when there is nothing
        to analyse.
    """
    print("\n📊 Analyzing metadata patterns by label...")

    if enhanced_df is None or enhanced_df.empty:
        return None, None

    # Filter to rows with metadata
    with_metadata = enhanced_df[enhanced_df['has_metadata'] == True].copy()

    if len(with_metadata) == 0:
        print("❌ No rows with metadata to analyze")
        return None, None

    print(f"📊 Analyzing {len(with_metadata)} rows with metadata")

    def looks_numeric(series):
        # Numeric dtype, or every value is one of 0/1/True/False
        return series.dtype in ['bool', 'int64', 'float64'] or series.isin([0, 1, True, False]).all()

    numeric_metadata = [
        col for col in with_metadata.columns
        if col.startswith('meta_') and looks_numeric(with_metadata[col])
    ]

    print(f"📊 Found {len(numeric_metadata)} numeric metadata columns")

    if not numeric_metadata:
        print("⚠️ No numeric metadata columns found")
        return None, None

    def summarise(series):
        # Coerce to numbers and drop anything unparseable before aggregating
        values = pd.to_numeric(series, errors='coerce').dropna()
        if len(values) == 0:
            return {'mean': 0, 'sum': 0, 'count_true': 0, 'percentage': 0}

        if values.isin([0, 1]).all():
            ones = values == 1
            count_true = ones.sum()
            percentage = ones.mean() * 100
        else:
            count_true = values.sum()
            percentage = values.mean() * 100

        return {
            'mean': values.mean(),
            'sum': values.sum(),
            'count_true': count_true,
            'percentage': percentage
        }

    # Analyze by label
    analysis_results = {}
    for label in with_metadata['label'].unique():
        label_data = with_metadata[with_metadata['label'] == label]
        label_analysis = {'label': label, 'count': len(label_data)}
        for col in numeric_metadata:
            label_analysis[col] = summarise(label_data[col])
        analysis_results[label] = label_analysis

    return analysis_results, numeric_metadata

# Summarise metadata per label when an enhanced dataset exists
if enhanced_df is None:
    analysis_results, numeric_metadata = None, None
else:
    analysis_results, numeric_metadata = analyze_metadata_patterns(enhanced_df)

    if not analysis_results:
        print("⚠️ No metadata analysis results")
    else:
        print("✅ Metadata pattern analysis complete")
        print(f"📊 Analyzed {len(numeric_metadata)} metadata features across {len(analysis_results)} labels")

## Create Visualizations

In [None]:
def create_metadata_analysis_plots(enhanced_df, analysis_results, numeric_metadata):
    """Plot, per metadata feature, the ``percentage`` value of every label.

    Only columns whose name contains one of the keywords ``misleading``,
    ``trustworthy``, ``factual``, ``harmful`` or ``believable`` are plotted,
    capped at the first 12. Each one gets a bar chart in a grid of up to four
    columns. The figure is saved under ``CONFIG['output_dir']/plots`` and
    shown.

    Args:
        enhanced_df: Not used by this function; kept for call compatibility.
        analysis_results: Mapping of label to per-column statistics, each with
            a ``percentage`` entry.
        numeric_metadata: Names of the candidate ``meta_`` columns.

    Returns:
        The path of the saved PNG, or ``None`` when there is nothing to plot.
    """
    if not analysis_results or not numeric_metadata:
        print("❌ No data to plot")
        return None

    # Filter to most important metadata columns for plotting
    keywords = ('misleading', 'trustworthy', 'factual', 'harmful', 'believable')
    important_cols = [col for col in numeric_metadata
                      if any(keyword in col.lower() for keyword in keywords)]
    important_cols = important_cols[:12]  # Limit for readability

    # Without this guard the grid would have zero columns and the row
    # computation below would divide by zero.
    if not important_cols:
        print("❌ No data to plot")
        return None

    n_cols = min(4, len(important_cols))
    n_rows = (len(important_cols) + n_cols - 1) // n_cols

    # squeeze=False always yields a 2-D array of axes, so a single flatten
    # handles every grid shape, including one row with several columns.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows), squeeze=False)
    axes = axes.flatten()

    labels = list(analysis_results.keys())

    for ax, col in zip(axes, important_cols):
        # Get percentages for each label (0 when the label lacks this column)
        percentages = [
            analysis_results[label][col]['percentage'] if col in analysis_results[label] else 0
            for label in labels
        ]

        # Create bar plot
        bars = ax.bar(labels, percentages, alpha=0.7)
        ax.set_title(col.replace('meta_', '').replace('_', ' ').title())
        ax.set_ylabel('Percentage (%)')
        ax.tick_params(axis='x', rotation=45)

        # Add value labels
        for bar, pct in zip(bars, percentages):
            height = bar.get_height()
            if height > 0:
                ax.text(bar.get_x() + bar.get_width()/2., height + 1,
                        f'{pct:.1f}%', ha='center', va='bottom', fontsize=8)

    # Hide empty subplots
    for unused_ax in axes[len(important_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()

    plot_path = f"{CONFIG['output_dir']}/plots/metadata_analysis_by_label.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    return plot_path

def create_coverage_analysis_plot(enhanced_df, matches_df):
    """Draw overall and per-label metadata coverage side by side.

    The left panel is a pie chart of rows with and without metadata. The right
    panel is a bar chart of the coverage percentage for each label, annotated
    with ``matched/total`` counts. ``matches_df`` is not used by this function.

    Returns:
        The path of the PNG saved under ``CONFIG['output_dir']/plots``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Plot 1: Overall coverage
    has_meta = enhanced_df['has_metadata']
    slice_labels = ['With Metadata', 'Without Metadata']
    slice_sizes = [has_meta.sum(), (~has_meta).sum()]
    ax1.pie(slice_sizes, labels=slice_labels, autopct='%1.1f%%',
            colors=['lightgreen', 'lightcoral'], startangle=90)
    ax1.set_title('Metadata Coverage')

    # Plot 2: Coverage by label
    per_label = enhanced_df.groupby('label')['has_metadata'].agg(['sum', 'count'])
    per_label['percentage'] = per_label['sum'] / per_label['count'] * 100

    bars = ax2.bar(per_label.index, per_label['percentage'], alpha=0.7)
    ax2.set_title('Metadata Coverage by Label')
    ax2.set_ylabel('Coverage Percentage (%)')
    ax2.tick_params(axis='x', rotation=45)

    # Add count labels
    for bar, matched, total in zip(bars, per_label['sum'], per_label['count']):
        bar_top = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., bar_top + 1,
                 f'{matched}/{total}', ha='center', va='bottom', fontsize=8)

    plt.tight_layout()

    plot_path = f"{CONFIG['output_dir']}/plots/metadata_coverage_analysis.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    return plot_path

# Produce both figures when there is an enhanced dataset with analysis results
if enhanced_df is None or not analysis_results:
    print("⚠️ Skipping visualizations - no enhanced dataset available")
else:
    print("\n📊 Creating visualizations...")

    # Coverage analysis plot
    coverage_plot = create_coverage_analysis_plot(enhanced_df, matches_df)
    if coverage_plot:
        print(f"📊 Coverage analysis plot saved: {coverage_plot}")

    # Metadata analysis plot
    metadata_plot = create_metadata_analysis_plots(enhanced_df, analysis_results, numeric_metadata)
    if metadata_plot:
        print(f"📊 Metadata analysis plot saved: {metadata_plot}")

## Generate Enhancement Report


In [None]:
def create_enhancement_report(enhanced_df, matches_df, analysis_results, numeric_metadata):
    """Write a Markdown report describing the enhancement run.

    The report covers dataset size and coverage, match counts per method,
    coverage per label, per-label metadata percentages, high-percentage
    features per label, and recommendations derived from the coverage and from
    the spread of each feature's percentage across labels.

    Args:
        enhanced_df: Enhanced dataset with ``label`` and ``has_metadata``
            columns, or ``None``.
        matches_df: Match table with ``match_method`` and ``match_score``
            columns; may be ``None`` or empty.
        analysis_results: Mapping of label to per-column statistics; may be
            ``None``.
        numeric_metadata: Names of the analysed ``meta_`` columns; may be
            ``None``.

    Returns:
        The path of the report written under ``CONFIG['output_dir']/reports``.
    """
    # Treat a missing column list as empty so the loops below need no guards
    numeric_metadata = numeric_metadata or []

    report = []
    report.append("# 🔗 Enhanced Dataset Creation Report")
    report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("")

    # Dataset Summary
    report.append("## 📊 Dataset Enhancement Summary")
    # Guard the length as well as the sum: the sections below already accept
    # enhanced_df being None, so the summary must not crash on it first.
    original_count = len(enhanced_df) if enhanced_df is not None else 0
    with_metadata_count = enhanced_df['has_metadata'].sum() if enhanced_df is not None else 0
    coverage_pct = with_metadata_count / original_count * 100 if original_count > 0 else 0

    report.append(f"- **Original dataset size**: {original_count}")
    report.append(f"- **Rows with metadata**: {with_metadata_count}")
    report.append(f"- **Coverage**: {coverage_pct:.1f}%")
    report.append(f"- **Metadata columns added**: {len(numeric_metadata)}")
    report.append("")

    # Matching Results
    if matches_df is not None and not matches_df.empty:
        report.append("## 🔍 Matching Results")
        match_methods = matches_df['match_method'].value_counts()

        for method, count in match_methods.items():
            report.append(f"- **{method}**: {count} matches")

        avg_score = matches_df['match_score'].mean()
        report.append(f"- **Average match score**: {avg_score:.3f}")
        report.append("")

    # Coverage by Label
    if enhanced_df is not None:
        report.append("## 🏷️ Coverage by Label")
        coverage_by_label = enhanced_df.groupby('label')['has_metadata'].agg(['sum', 'count'])
        coverage_by_label['percentage'] = coverage_by_label['sum'] / coverage_by_label['count'] * 100

        for label in coverage_by_label.index:
            row = coverage_by_label.loc[label]
            # A row of this frame is float-typed because of the percentage
            # column; cast the counts back so they print as "3/5", not "3.0/5.0"
            report.append(
                f"- **{label}**: {int(row['sum'])}/{int(row['count'])} ({row['percentage']:.1f}%)"
            )
        report.append("")

    # Metadata Analysis
    if analysis_results and numeric_metadata:
        report.append("## 📈 Metadata Insights by Label")

        # Focus on most discriminative metadata
        key_metadata = [col for col in numeric_metadata if any(keyword in col.lower() for keyword in
                       ['misleading', 'trustworthy', 'factual', 'unverified', 'harmful'])]

        for col in key_metadata[:10]:  # Top 10 most relevant
            col_name = col.replace('meta_', '').replace('_', ' ').title()
            report.append(f"### {col_name}")

            for label, data in analysis_results.items():
                if col in data:
                    pct = data[col]['percentage']
                    count = data[col]['count_true']
                    total = data['count']
                    report.append(f"- **{label}**: {pct:.1f}% ({count}/{total})")
            report.append("")

    # Feature Importance for Each Label
    if analysis_results:
        report.append("## 🎯 Key Metadata Features by Label")

        for label, data in analysis_results.items():
            report.append(f"### {label}")

            # Find metadata with highest percentages for this label
            label_features = []
            for col in numeric_metadata:
                if col in data and isinstance(data[col], dict):
                    pct = data[col]['percentage']
                    if pct > 20:  # Only show features with >20% presence
                        label_features.append((col.replace('meta_', ''), pct))

            # Sort by percentage
            label_features.sort(key=lambda x: x[1], reverse=True)

            if label_features:
                report.append("**High-signal metadata features:**")
                for feature, pct in label_features[:5]:  # Top 5
                    report.append(f"- {feature.replace('_', ' ').title()}: {pct:.1f}%")
            else:
                report.append("**No strongly predictive metadata features found**")

            report.append("")

    # Recommendations
    report.append("## 💡 Recommendations")

    if coverage_pct >= 80:
        report.append("1. **Excellent coverage** - metadata is available for most samples")
    elif coverage_pct >= 50:
        report.append("1. **Good coverage** - metadata available for majority of samples")
    elif coverage_pct >= 20:
        report.append("1. **Moderate coverage** - consider improving matching algorithms")
    else:
        report.append("1. **Poor coverage** - may need different matching strategy")

    if analysis_results:
        # Find most discriminative features: largest max-min spread of the
        # percentage across labels
        discriminative_features = []
        for col in numeric_metadata:
            percentages = [
                label_data[col]['percentage']
                for label_data in analysis_results.values()
                if col in label_data and isinstance(label_data[col], dict)
            ]

            if len(percentages) > 1:
                range_pct = max(percentages) - min(percentages)
                if range_pct > 30:  # High variance across labels
                    discriminative_features.append((col.replace('meta_', ''), range_pct))

        discriminative_features.sort(key=lambda x: x[1], reverse=True)

        if discriminative_features:
            report.append("2. **Most discriminative metadata features**:")
            for feature, variance in discriminative_features[:5]:
                report.append(f"   - {feature.replace('_', ' ').title()} (variance: {variance:.1f}%)")

        report.append("3. **Use enhanced dataset for training** - metadata should improve classification")

    report.append("4. **Combine with original text features** - metadata complements text analysis")

    report.append("")
    report.append("---")
    report.append("*Report generated by Enhanced Dataset Creation*")

    # Save report. The text contains emoji, so the encoding is pinned: the
    # platform default (e.g. cp1252) cannot encode them and would raise.
    report_path = f"{CONFIG['output_dir']}/reports/enhancement_report.md"
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    return report_path

# Write the Markdown report when an enhanced dataset exists
if enhanced_df is None:
    print("⚠️ Cannot generate report - no enhanced dataset available")
else:
    print("\n📝 Generating enhancement report...")
    report_path = create_enhancement_report(enhanced_df, matches_df, analysis_results, numeric_metadata)
    if report_path:
        print(f"📝 Enhancement report saved: {report_path}")

## Prepare Training-Ready Dataset

In [None]:
def create_training_ready_dataset(enhanced_df, output_path=None):
    """Prepare the enhanced dataset for training and save it as CSV.

    Works on a copy of ``enhanced_df``. ``meta_`` columns of object or bool
    dtype are coerced to numbers (unparseable values become missing), and all
    missing ``meta_`` values are then filled with 0. Rows without a label are
    dropped, labels are stripped and lower-cased, and rows labelled
    ``irrelevant`` or ``probative`` are removed.

    Args:
        enhanced_df: Enhanced dataset with a ``label`` column, or ``None``.
        output_path: Destination CSV; defaults to
            ``CONFIG['output_dir']/training_ready_dataset.csv``.

    Returns:
        The prepared ``DataFrame``, or ``None`` when ``enhanced_df`` is
        ``None``.
    """
    if enhanced_df is None:
        return None

    print("\n🔧 Preparing training-ready dataset...")

    # Create a copy for training
    training_df = enhanced_df.copy()

    # Fill missing metadata with 0 (assumes binary features)
    metadata_cols = [col for col in training_df.columns if col.startswith('meta_')]

    for col in metadata_cols:
        if training_df[col].dtype in ['object', 'bool']:
            # Convert boolean/object to numeric
            training_df[col] = pd.to_numeric(training_df[col], errors='coerce')

        # Fill missing values with 0
        training_df[col] = training_df[col].fillna(0)

    # Drop missing labels *before* the string cast: astype(str) turns NaN into
    # the text "nan", after which dropna has nothing left to remove.
    training_df = training_df.dropna(subset=["label"]).copy()

    # Clean label column
    training_df['label'] = training_df['label'].astype(str).str.strip().str.lower()

    # Remove excluded labels, plus the placeholders a missing label takes once
    # it has already been stringified upstream ("nan") or was blank ("")
    excluded_labels = ["irrelevant", "probative"]
    missing_label_markers = ["", "nan"]
    training_df = training_df[~training_df["label"].isin(excluded_labels + missing_label_markers)]

    print(f"✅ Training-ready dataset: {len(training_df)} samples")
    print(f"🏷️ Labels: {training_df['label'].value_counts().to_dict()}")
    print(f"🔧 Metadata features: {len(metadata_cols)}")

    if output_path is None:
        output_path = f"{CONFIG['output_dir']}/training_ready_dataset.csv"

    training_df.to_csv(output_path, index=False)
    print(f"💾 Saved training-ready dataset to: {output_path}")

    return training_df

# Derive the training-ready CSV when an enhanced dataset exists
if enhanced_df is None:
    training_ready = None
    print("⚠️ Cannot create training-ready dataset - no enhanced dataset available")
else:
    training_ready = create_training_ready_dataset(enhanced_df)

## Save All Results

In [None]:
# Persist the enhanced dataset and the match table, then print a run summary
if enhanced_df is None:
    print("❌ No enhanced dataset created - check input files and matching parameters")
else:
    print("\n💾 Saving all results...")

    # Save enhanced dataset
    enhanced_dataset_path = f"{CONFIG['output_dir']}/enhanced_dataset.csv"
    enhanced_df.to_csv(enhanced_dataset_path, index=False)
    print(f"📊 Enhanced dataset saved: {enhanced_dataset_path}")

    # Save matches for inspection
    if not matches_df.empty:
        matches_path = f"{CONFIG['output_dir']}/matching_results.csv"
        matches_df.to_csv(matches_path, index=False)
        print(f"🔗 Matching results saved: {matches_path}")

    # Print final summary
    _banner = '=' * 80
    _rows_with_metadata = enhanced_df['has_metadata'].sum()
    _total_rows = len(enhanced_df)

    print(f"\n{_banner}")
    print("✅ ENHANCED DATASET CREATION COMPLETE!")
    print(f"{_banner}")
    print(f"📊 **Original dataset**: {len(rel_typ_df)} samples")
    print(f"🔗 **Successful matches**: {0 if matches_df.empty else len(matches_df)}")
    print(f"📈 **Enhanced dataset**: {_total_rows} samples")
    print(f"🎯 **Coverage**: {_rows_with_metadata}/{_total_rows} ({_rows_with_metadata/_total_rows*100:.1f}%)")

    if numeric_metadata:
        print(f"🔧 **Metadata features added**: {len(numeric_metadata)}")

        # Show most promising features: widest max-min spread of the
        # percentage across labels, looking at the first 10 columns only
        if analysis_results:
            print("\n🎯 **Most promising metadata features**:")
            feature_variances = []
            for col in numeric_metadata[:10]:
                percentages = [
                    label_data[col]['percentage']
                    for label_data in analysis_results.values()
                    if col in label_data and isinstance(label_data[col], dict)
                ]

                if len(percentages) > 1:
                    feature_variances.append(
                        (col.replace('meta_', ''), max(percentages) - min(percentages))
                    )

            for feature, variance in sorted(feature_variances, key=lambda x: x[1], reverse=True)[:5]:
                print(f"   - {feature.replace('_', ' ').title()}: {variance:.1f}% variance across labels")

    print("\n📁 **Files created**:")
    print(f"   - Enhanced dataset: {enhanced_dataset_path}")
    if not matches_df.empty:
        print(f"   - Matching results: {CONFIG['output_dir']}/matching_results.csv")
    if training_ready is not None:
        print(f"   - Training-ready dataset: {CONFIG['output_dir']}/training_ready_dataset.csv")
    print(f"   - Enhancement report: {CONFIG['output_dir']}/reports/enhancement_report.md")
    print(f"   - Analysis plots: {CONFIG['output_dir']}/plots/")

    print("\n🚀 **Dataset ready for enhanced classification experiments**")