In [47]:
#!/usr/bin/env python3
"""
Gemini Pro 1.5 Omni Dashboard - HTML-Based Complete Analysis
Creates comprehensive HTML dashboard with ML metrics for Gemini 1.5 Pro evaluations
Similar to your enhanced Pro 2.5 analysis but for existing 1.5 results
"""

import pandas as pd
import numpy as np
from google.cloud import bigquery
import json
from datetime import datetime
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, classification_report
import traceback

# Configuration - Using your existing Gemini 1.5 Pro tables
PROJECT_ID = "scope3-dev"
DATASET_ID = "research_bs_monitoring"
META_RESULTS_TABLE = "BA_Meta_Gemini_Pro_Judge_Results"
WEB_RESULTS_TABLE = "BA_Web_Gemini_Pro_Judge_Results"

class GeminiPro15OmniAnalysis:
    """Complete analysis class for Gemini Pro 1.5 with HTML dashboard generation"""
    
    def __init__(self):
        self.client = self.initialize_bigquery_client()
        
    def initialize_bigquery_client(self):
        """Initialize BigQuery client"""
        try:
            client = bigquery.Client(project=PROJECT_ID)
            print(f"✅ BigQuery client initialized for project: {PROJECT_ID}")
            return client
        except Exception as e:
            print(f"❌ BigQuery initialization failed: {e}")
            return None
    
    def load_gemini15_results(self, table_name: str) -> pd.DataFrame:
        """Load Gemini 1.5 Pro results from BigQuery"""
        
        query = f"""
        SELECT 
            artifact_id,
            data_source,
            source,
            -- Flash results (ground truth)
            flash_classification,
            flash_reasoning,
            model_prompt,
            
            -- Pro 1.5 results (judge evaluations)
            pro_judge_agreement,
            pro_verdict,
            pro_confidence,
            pro_would_reach_same_conclusion,
            pro_reasoning,
            flash_vs_pro_analysis,
            improvements,
            api_call_time,
            batch_number,
            created_at,
            model_used,
            error_message,
            
            -- ML Analysis Fields
            CASE 
                WHEN flash_classification = 1 THEN 1 
                WHEN flash_classification = 0 THEN 0 
                ELSE flash_classification
            END as flash_binary,
            
            CASE 
                WHEN pro_verdict = 'Aligned' THEN 1 
                WHEN pro_verdict = 'Not-Aligned' THEN 0 
                ELSE NULL
            END as pro_binary,
            
            -- Agreement analysis
            CASE 
                WHEN (flash_classification = 1 AND pro_verdict = 'Aligned') OR
                     (flash_classification = 0 AND pro_verdict = 'Not-Aligned') 
                THEN TRUE 
                ELSE FALSE 
            END as classification_agreement
            
        FROM `{PROJECT_ID}.{DATASET_ID}.{table_name}`
        WHERE error_message IS NULL  -- Only successful evaluations
        ORDER BY created_at DESC
        """
        
        try:
            print(f"📥 Loading data from {table_name}...")
            df = self.client.query(query).to_dataframe()
            
            if not df.empty:
                df['created_at'] = pd.to_datetime(df['created_at'])
                # Clean data for ML analysis
                df = df.dropna(subset=['flash_binary', 'pro_binary'])
                print(f"✅ Loaded {len(df)} records from {table_name}")
                return df
            else:
                print(f"⚠️ No data found in {table_name}")
                return pd.DataFrame()
                
        except Exception as e:
            print(f"❌ Error loading from {table_name}: {e}")
            return pd.DataFrame()
    
    def calculate_comprehensive_metrics(self, df: pd.DataFrame, dataset_name: str) -> dict:
        """Calculate comprehensive ML metrics similar to your Pro 2.5 analysis"""
        
        if df.empty:
            return {}
        
        print(f"\n🎯 CALCULATING ML METRICS FOR {dataset_name.upper()} DATASET")
        print("=" * 60)
        
        # Flash = ground truth, Pro 1.5 = predictions being evaluated
        flash_truth = df['flash_binary'].astype(int)
        pro_pred = df['pro_binary'].astype(int)
        total = len(df)
        
        # Basic accuracy
        accuracy = (flash_truth == pro_pred).mean()
        
        # Class distribution
        flash_pos = (flash_truth == 1).sum()
        flash_neg = (flash_truth == 0).sum()
        pro_pos = (pro_pred == 1).sum()
        pro_neg = (pro_pred == 0).sum()
        
        print(f"📊 Class Distribution ({dataset_name}):")
        print(f"   Flash Positive (Aligned): {flash_pos} ({flash_pos/total:.1%})")
        print(f"   Flash Negative (Not-Aligned): {flash_neg} ({flash_neg/total:.1%})")
        print(f"   Pro 1.5 Positive: {pro_pos} ({pro_pos/total:.1%})")
        print(f"   Pro 1.5 Negative: {pro_neg} ({pro_neg/total:.1%})")
        
        # Full ML metrics calculation
        try:
            # Check if we have both classes in the ground truth
            unique_classes = np.unique(flash_truth)
            if len(unique_classes) < 2:
                print(f"⚠️ Warning: Only one class found in Flash ground truth for {dataset_name}")
                print(f"   Available classes: {unique_classes}")
                print(f"   Cannot calculate full binary classification metrics")
                
                # Return basic metrics only
                return {
                    'dataset_name': dataset_name,
                    'total_evaluations': total,
                    'accuracy': round(accuracy, 3),
                    'class_distribution': {
                        'flash_positive': {'count': int(flash_pos), 'percentage': round(flash_pos/total, 3)},
                        'flash_negative': {'count': int(flash_neg), 'percentage': round(flash_neg/total, 3)},
                        'pro_positive': {'count': int(pro_pos), 'percentage': round(pro_pos/total, 3)},
                        'pro_negative': {'count': int(pro_neg), 'percentage': round(pro_neg/total, 3)}
                    },
                    'error': 'Single class in ground truth - limited metrics available'
                }
            
            precision = precision_score(flash_truth, pro_pred, zero_division=0, average='binary')
            recall = recall_score(flash_truth, pro_pred, zero_division=0, average='binary')
            f1 = f1_score(flash_truth, pro_pred, zero_division=0, average='binary')
            
            # Confusion matrix
            cm = confusion_matrix(flash_truth, pro_pred)
            if cm.shape == (2, 2):
                tn, fp, fn, tp = cm.ravel()
            else:
                tn, fp, fn, tp = 0, 0, 0, total
            
            # Additional metrics
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
            sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
            fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
            fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
            npv = tn / (tn + fn) if (tn + fn) > 0 else 0
            ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
            
            # Balanced accuracy
            balanced_accuracy = (sensitivity + specificity) / 2
            
            # Matthews Correlation Coefficient
            mcc_num = (tp * tn) - (fp * fn)
            mcc_den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) if (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) > 0 else 1
            mcc = mcc_num / mcc_den if mcc_den > 0 else 0
            
            print(f"\n🎯 CORE METRICS ({dataset_name}):")
            print(f"   Accuracy: {accuracy:.3f} ({accuracy:.1%})")
            print(f"   Balanced Accuracy: {balanced_accuracy:.3f} ({balanced_accuracy:.1%})")
            print(f"   F1-Score: {f1:.3f}")
            print(f"   Precision: {precision:.3f}")
            print(f"   Recall (Sensitivity): {recall:.3f}")
            print(f"   Specificity: {specificity:.3f}")
            print(f"   Matthews Correlation: {mcc:.3f}")
            
            print(f"\n📊 CONFUSION MATRIX ({dataset_name}):")
            print(f"                    Flash Truth")
            print(f"                Not-Aligned  Aligned")
            print(f"Pro 1.5  Not       {tn:3d}      {fn:3d}   = {tn+fn}")
            print(f"         Aligned    {fp:3d}      {tp:3d}   = {fp+tp}")
            print(f"                    ___      ___")
            print(f"                    {tn+fp:3d}      {fn+tp:3d}   = {total}")
            
            print(f"\n📈 ERROR ANALYSIS ({dataset_name}):")
            print(f"   False Positive Rate: {fpr:.3f} (Pro 1.5 more liberal than Flash)")
            print(f"   False Negative Rate: {fnr:.3f} (Pro 1.5 more conservative than Flash)")
            
            # Agreement scenario analysis (matching your Pro 2.5 logic)
            process_agreement = df['pro_judge_agreement'].mean()
            classification_agreement = df['classification_agreement'].mean()
            would_reach_same = df['pro_would_reach_same_conclusion'].mean() if 'pro_would_reach_same_conclusion' in df.columns else 0
            
            # Calculate agreement scenarios
            scenario_counts = df.groupby(['pro_judge_agreement', 'classification_agreement']).size()
            both_agree = scenario_counts.get((True, True), 0)
            process_only = scenario_counts.get((True, False), 0)
            classification_only = scenario_counts.get((False, True), 0)
            both_disagree = scenario_counts.get((False, False), 0)
            
            print(f"\n⚖️ PRO 1.5 AGREEMENT ANALYSIS ({dataset_name}):")
            print(f"   Process Agreement: {process_agreement:.1%}")
            print(f"   Classification Agreement: {classification_agreement:.1%}")
            print(f"   Would Reach Same: {would_reach_same:.1%}")
            
            print(f"\n🔍 AGREEMENT SCENARIOS ({dataset_name}):")
            print(f"   ✅ Perfect Alignment: {both_agree} ({both_agree/total:.1%})")
            print(f"   🟡 Good Reasoning, Wrong Verdict: {process_only} ({process_only/total:.1%})")
            print(f"   🟠 Right Verdict, Poor Reasoning: {classification_only} ({classification_only/total:.1%})")
            print(f"   ❌ Complete Disagreement: {both_disagree} ({both_disagree/total:.1%})")
            
            # Performance assessment
            if f1 >= 0.8:
                f1_assessment = "🟢 EXCELLENT"
            elif f1 >= 0.7:
                f1_assessment = "🟡 GOOD"
            elif f1 >= 0.6:
                f1_assessment = "🟠 MODERATE"
            else:
                f1_assessment = "🔴 POOR"
            
            if accuracy >= 0.8:
                acc_assessment = "🟢 EXCELLENT"
            elif accuracy >= 0.7:
                acc_assessment = "🟡 GOOD"
            elif accuracy >= 0.6:
                acc_assessment = "🟠 MODERATE"
            else:
                acc_assessment = "🔴 POOR"
            
            print(f"\n🎯 PERFORMANCE ASSESSMENT ({dataset_name}):")
            print(f"   F1-Score: {f1_assessment} ({f1:.3f})")
            print(f"   Accuracy: {acc_assessment} ({accuracy:.3f})")
            print(f"   Primary Issue: {'Pro 1.5 more liberal than Flash' if fp > fn else 'Pro 1.5 more conservative than Flash' if fn > fp else 'Balanced disagreement'}")
            
            return {
                'dataset_name': dataset_name,
                'total_evaluations': total,
                'accuracy': round(accuracy, 3),
                'balanced_accuracy': round(balanced_accuracy, 3),
                'f1_score': round(f1, 3),
                'precision': round(precision, 3),
                'recall': round(recall, 3),
                'specificity': round(specificity, 3),
                'sensitivity': round(sensitivity, 3),
                'false_positive_rate': round(fpr, 3),
                'false_negative_rate': round(fnr, 3),
                'positive_predictive_value': round(ppv, 3),
                'negative_predictive_value': round(npv, 3),
                'matthews_correlation_coefficient': round(mcc, 3),
                'confusion_matrix': {
                    'true_positives': int(tp),
                    'false_positives': int(fp), 
                    'true_negatives': int(tn),
                    'false_negatives': int(fn),
                    'total': int(total)
                },
                'class_distribution': {
                    'flash_positive': {'count': int(flash_pos), 'percentage': round(flash_pos/total, 3)},
                    'flash_negative': {'count': int(flash_neg), 'percentage': round(flash_neg/total, 3)},
                    'pro_positive': {'count': int(pro_pos), 'percentage': round(pro_pos/total, 3)},
                    'pro_negative': {'count': int(pro_neg), 'percentage': round(pro_neg/total, 3)}
                },
                'pro_15_agreement': {
                    'process_agreement_rate': round(process_agreement, 3),
                    'classification_agreement_rate': round(classification_agreement, 3),
                    'would_reach_same_rate': round(would_reach_same, 3)
                },
                'scenario_breakdown': {
                    'both_agree': {'count': int(both_agree), 'percentage': round(both_agree/total, 3)},
                    'process_only': {'count': int(process_only), 'percentage': round(process_only/total, 3)},
                    'classification_only': {'count': int(classification_only), 'percentage': round(classification_only/total, 3)},
                    'both_disagree': {'count': int(both_disagree), 'percentage': round(both_disagree/total, 3)}
                },
                'performance_assessment': {
                    'f1_category': f1_assessment,
                    'accuracy_category': acc_assessment,
                    'primary_issue': 'Pro 1.5 more liberal than Flash' if fp > fn else 'Pro 1.5 more conservative than Flash' if fn > fp else 'Balanced disagreement',
                    'recommendation': 'Pro 1.5 tends to be more permissive' if fp > fn else 'Pro 1.5 tends to be more strict' if fn > fp else 'Review edge cases where models disagree'
                },
                'confidence_analysis': {
                    'average_confidence': round(df['pro_confidence'].mean(), 3),
                    'high_confidence_agreement': round(df[df['pro_confidence'] > 0.8]['pro_judge_agreement'].mean() if len(df[df['pro_confidence'] > 0.8]) > 0 else 0, 3),
                    'low_confidence_agreement': round(df[df['pro_confidence'] <= 0.7]['pro_judge_agreement'].mean() if len(df[df['pro_confidence'] <= 0.7]) > 0 else 0, 3)
                }
            }
            
        except Exception as e:
            print(f"❌ Error calculating metrics for {dataset_name}: {e}")
            return {
                'dataset_name': dataset_name,
                'total_evaluations': total,
                'accuracy': round(accuracy, 3),
                'class_distribution': {
                    'flash_positive': {'count': int(flash_pos), 'percentage': round(flash_pos/total, 3)},
                    'flash_negative': {'count': int(flash_neg), 'percentage': round(flash_neg/total, 3)},
                    'pro_positive': {'count': int(pro_pos), 'percentage': round(pro_pos/total, 3)},
                    'pro_negative': {'count': int(pro_neg), 'percentage': round(pro_neg/total, 3)}
                },
                'error': f'Calculation failed: {str(e)}'
            }
    
    def create_omni_html_dashboard(self, meta_results: dict, web_results: dict, combined_results: dict):
        """Create comprehensive HTML dashboard matching your enhanced analysis style"""
        
        # Determine which results are available
        datasets_available = []
        if meta_results: datasets_available.append('Meta')
        if web_results: datasets_available.append('Web') 
        if combined_results: datasets_available.append('Combined')
        
        if not datasets_available:
            print("❌ No results available for dashboard creation")
            return
        
        html_content = f"""<!DOCTYPE html>
<html>
<head>
    <title>Gemini Pro 1.5 Omni Dashboard - Complete ML Analysis</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
    <style>
        body {{ 
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 
            margin: 0; 
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); 
            min-height: 100vh; 
        }}
        .container {{ 
            max-width: 1600px; 
            margin: 0 auto; 
            padding: 20px; 
        }}
        .header {{ 
            background: rgba(255,255,255,0.95); 
            padding: 40px; 
            border-radius: 20px; 
            text-align: center; 
            margin-bottom: 30px; 
            backdrop-filter: blur(10px); 
            box-shadow: 0 20px 40px rgba(0,0,0,0.1);
        }}
        .dataset-tabs {{
            display: flex;
            justify-content: center;
            margin-bottom: 30px;
            gap: 10px;
        }}
        .tab-button {{
            padding: 15px 30px;
            border: none;
            border-radius: 25px;
            font-weight: bold;
            cursor: pointer;
            transition: all 0.3s ease;
            font-size: 16px;
        }}
        .tab-button.meta {{ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; }}
        .tab-button.web {{ background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); color: white; }}
        .tab-button.combined {{ background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%); color: white; }}
        .tab-button:hover {{ transform: translateY(-2px); box-shadow: 0 10px 20px rgba(0,0,0,0.2); }}
        .dataset-content {{ display: none; }}
        .dataset-content.active {{ display: block; }}
        .metrics-grid {{ 
            display: grid; 
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); 
            gap: 20px; 
            margin: 20px 0; 
        }}
        .metric-card {{ 
            background: rgba(255,255,255,0.95); 
            padding: 25px; 
            border-radius: 15px; 
            box-shadow: 0 10px 30px rgba(0,0,0,0.15); 
            text-align: center; 
            backdrop-filter: blur(10px); 
            transition: transform 0.3s ease;
        }}
        .metric-card:hover {{ transform: translateY(-5px); }}
        .metric-value {{ 
            font-size: 2.8em; 
            font-weight: bold; 
            margin: 15px 0; 
        }}
        .metric-label {{ 
            color: #666; 
            font-weight: 600; 
            font-size: 1.1em;
        }}
        .excellent {{ color: #10B981; }}
        .good {{ color: #F59E0B; }}
        .moderate {{ color: #FF6B35; }}
        .poor {{ color: #EF4444; }}
        .section {{ 
            background: rgba(255,255,255,0.95); 
            padding: 30px; 
            border-radius: 20px; 
            margin: 20px 0; 
            backdrop-filter: blur(10px); 
            box-shadow: 0 15px 35px rgba(0,0,0,0.1);
        }}
        .two-column {{ 
            display: grid; 
            grid-template-columns: 1fr 1fr; 
            gap: 30px; 
        }}
        .three-column {{ 
            display: grid; 
            grid-template-columns: repeat(3, 1fr); 
            gap: 20px; 
        }}
        .confusion-matrix {{ 
            display: grid; 
            grid-template-columns: repeat(3, 1fr); 
            gap: 10px; 
            margin: 20px 0; 
            text-align: center; 
        }}
        .cm-cell {{ 
            padding: 20px; 
            border-radius: 12px; 
            font-weight: bold; 
            transition: transform 0.2s ease;
        }}
        .cm-cell:hover {{ transform: scale(1.05); }}
        .cm-header {{ background: linear-gradient(135deg, #f8f9fa, #e9ecef); color: #333; }}
        .cm-tp {{ background: linear-gradient(135deg, #d4edda, #c3e6cb); color: #155724; }}
        .cm-fp {{ background: linear-gradient(135deg, #f8d7da, #f1b0b7); color: #721c24; }}
        .cm-fn {{ background: linear-gradient(135deg, #fff3cd, #fce4a6); color: #856404; }}
        .cm-tn {{ background: linear-gradient(135deg, #d1ecf1, #bee5eb); color: #0c5460; }}
        .breakdown-grid {{ 
            display: grid; 
            grid-template-columns: repeat(2, 1fr); 
            gap: 20px; 
        }}
        .breakdown-item {{ 
            padding: 25px; 
            border-radius: 15px; 
            text-align: center; 
            transition: transform 0.3s ease;
        }}
        .breakdown-item:hover {{ transform: translateY(-3px); }}
        .both-agree {{ background: linear-gradient(135deg, #d4edda, #c3e6cb); }}
        .process-only {{ background: linear-gradient(135deg, #fff3cd, #fce4a6); }}
        .classification-only {{ background: linear-gradient(135deg, #e2e3e5, #d1d2d4); }}
        .both-disagree {{ background: linear-gradient(135deg, #f8d7da, #f1b0b7); }}
        .insight-box {{ 
            background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%); 
            padding: 30px; 
            border-radius: 20px; 
            margin: 25px 0; 
            border: 3px solid rgba(255,255,255,0.4);
            box-shadow: 0 15px 35px rgba(0,0,0,0.1);
        }}
        .dataset-badge {{ 
            display: inline-block; 
            padding: 8px 18px; 
            border-radius: 25px; 
            font-weight: bold; 
            margin: 8px; 
            color: white; 
            font-size: 0.95em;
            box-shadow: 0 4px 15px rgba(0,0,0,0.2);
        }}
        .comparison-section {{
            background: linear-gradient(135deg, #f1f3f4 0%, #e8eaed 100%);
            padding: 30px;
            border-radius: 20px;
            margin: 25px 0;
            border: 2px solid rgba(255,255,255,0.6);
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>🧠 Gemini Pro 1.5 Omni Dashboard</h1>
            <h2>Complete ML Analysis Suite - Flash vs Pro 1.5 Judge Evaluation</h2>
            <p>Comprehensive analysis of Pro 1.5's performance in judging Flash classifications</p>
            <p><strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | <strong>Datasets Available:</strong> {', '.join(datasets_available)}</p>
        </div>
        
        <div class="dataset-tabs">"""
        
        # Add tab buttons for available datasets
        for dataset in datasets_available:
            tab_class = dataset.lower()
            html_content += f'<button class="tab-button {tab_class}" onclick="showDataset(\'{dataset.lower()}\')">{dataset} Dataset</button>'
        
        html_content += """
        </div>"""
        
            # Generate content for each available dataset
        for results, dataset_key in [(meta_results, 'meta'), (web_results, 'web'), (combined_results, 'combined')]:
            if not results or 'total_evaluations' not in results:
                continue
                
            dataset_name = results['dataset_name']
            cm = results.get('confusion_matrix', {})
            
            # Handle cases where full ML metrics aren't available
            if 'error' in results:
                # Generate simplified dashboard for single-class data
                html_content += f"""
        
        <div id="{dataset_key}" class="dataset-content">
            <div class="section">
                <h2>⚠️ Limited Analysis - {dataset_name} Dataset</h2>
                <p><em>Single class detected in ground truth data - limited metrics available</em></p>
                <div class="insight-box">
                    <h3>🔍 Data Analysis Issue</h3>
                    <p><strong>Problem:</strong> {results.get('error', 'Unknown error')}</p>
                    <p><strong>Flash Positive Cases:</strong> {results.get('class_distribution', {}).get('flash_positive', {}).get('count', 0)}</p>
                    <p><strong>Flash Negative Cases:</strong> {results.get('class_distribution', {}).get('flash_negative', {}).get('count', 0)}</p>
                    <p><strong>Total Evaluations:</strong> {results.get('total_evaluations', 0):,}</p>
                    <p><strong>Basic Agreement Rate:</strong> {results.get('accuracy', 0):.1%}</p>
                    <p><strong>Recommendation:</strong> This dataset appears to have only one class in the Flash ground truth, making binary classification analysis impossible. Please check your data source.</p>
                </div>
            </div>
        </div>"""
                continue
            
            # Full dashboard for complete data
            ml = results
            html_content += f"""
        
        <div id="{dataset_key}" class="dataset-content">
            <div class="section">
                <h2>🎯 Core ML Performance Metrics - {dataset_name} Dataset</h2>
                <p><em>Pro 1.5 performance in agreeing with Flash classifications (Flash = Ground Truth)</em></p>
                <div class="metrics-grid">
                    <div class="metric-card">
                        <div class="metric-value {'excellent' if ml.get('f1_score', 0) >= 0.8 else 'good' if ml.get('f1_score', 0) >= 0.7 else 'moderate' if ml.get('f1_score', 0) >= 0.6 else 'poor'}">{ml.get('f1_score', 0):.3f}</div>
                        <div class="metric-label">F1-Score</div>
                        <small>Harmonic Mean of Precision & Recall</small>
                    </div>
                    <div class="metric-card">
                        <div class="metric-value {'excellent' if ml.get('accuracy', 0) >= 0.8 else 'good' if ml.get('accuracy', 0) >= 0.7 else 'moderate' if ml.get('accuracy', 0) >= 0.6 else 'poor'}">{ml.get('accuracy', 0):.3f}</div>
                        <div class="metric-label">Agreement Rate</div>
                        <small>Pro 1.5 vs Flash Overall</small>
                    </div>
                    <div class="metric-card">
                        <div class="metric-value {'excellent' if ml.get('balanced_accuracy', 0) >= 0.8 else 'good' if ml.get('balanced_accuracy', 0) >= 0.7 else 'moderate' if ml.get('balanced_accuracy', 0) >= 0.6 else 'poor'}">{ml.get('balanced_accuracy', 0):.3f}</div>
                        <div class="metric-label">Balanced Accuracy</div>
                        <small>Handles Class Imbalance</small>
                    </div>
                    <div class="metric-card">
                        <div class="metric-value {'excellent' if ml.get('precision', 0) >= 0.8 else 'good' if ml.get('precision', 0) >= 0.7 else 'moderate' if ml.get('precision', 0) >= 0.6 else 'poor'}">{ml.get('precision', 0):.3f}</div>
                        <div class="metric-label">Precision</div>
                        <small>TP / (TP + FP)</small>
                    </div>
                    <div class="metric-card">
                        <div class="metric-value {'excellent' if ml.get('recall', 0) >= 0.8 else 'good' if ml.get('recall', 0) >= 0.7 else 'moderate' if ml.get('recall', 0) >= 0.6 else 'poor'}">{ml.get('recall', 0):.3f}</div>
                        <div class="metric-label">Recall (TPR)</div>
                        <small>TP / (TP + FN)</small>
                    </div>
                    <div class="metric-card">
                        <div class="metric-value {'excellent' if ml.get('confidence_analysis', {}).get('average_confidence', 0) >= 0.9 else 'good' if ml.get('confidence_analysis', {}).get('average_confidence', 0) >= 0.8 else 'moderate' if ml.get('confidence_analysis', {}).get('average_confidence', 0) >= 0.7 else 'poor'}">{ml.get('confidence_analysis', {}).get('average_confidence', 0):.3f}</div>
                        <div class="metric-label">Avg Confidence</div>
                        <small>Pro 1.5 Confidence Score</small>
                    </div>
                    <div class="metric-card">
                        <div class="metric-value {'excellent' if ml.get('false_positive_rate', 1) <= 0.1 else 'good' if ml.get('false_positive_rate', 1) <= 0.2 else 'moderate' if ml.get('false_positive_rate', 1) <= 0.3 else 'poor'}">{ml.get('false_positive_rate', 0):.3f}</div>
                        <div class="metric-label">False Positive Rate</div>
                        <small>Pro 1.5 More Liberal</small>
                    </div>
                    <div class="metric-card">
                        <div class="metric-value {'excellent' if ml.get('false_negative_rate', 1) <= 0.1 else 'good' if ml.get('false_negative_rate', 1) <= 0.2 else 'moderate' if ml.get('false_negative_rate', 1) <= 0.3 else 'poor'}">{ml.get('false_negative_rate', 0):.3f}</div>
                        <div class="metric-label">False Negative Rate</div>
                        <small>Pro 1.5 More Conservative</small>
                    </div>
                </div>
            </div>
            
            <div class="two-column">
                <div class="section">
                    <h2>📊 Confusion Matrix - {dataset_name}</h2>
                    <p><em>Flash (Ground Truth) vs Pro 1.5 (Judge Predictions)</em></p>
                    <div class="confusion-matrix">
                        <div class="cm-cell cm-header"></div>
                        <div class="cm-cell cm-header">Flash: Not-Aligned</div>
                        <div class="cm-cell cm-header">Flash: Aligned</div>
                        
                        <div class="cm-cell cm-header">Pro: Not-Aligned</div>
                        <div class="cm-cell cm-tn">
                            <div style="font-size: 1.8em;">{cm.get('true_negatives', 0)}</div>
                            <small>True Negatives<br/>Both say Not-Aligned</small>
                        </div>
                        <div class="cm-cell cm-fn">
                            <div style="font-size: 1.8em;">{cm.get('false_negatives', 0)}</div>
                            <small>False Negatives<br/>Flash=Aligned, Pro=Not</small>
                        </div>
                        
                        <div class="cm-cell cm-header">Pro: Aligned</div>
                        <div class="cm-cell cm-fp">
                            <div style="font-size: 1.8em;">{cm.get('false_positives', 0)}</div>
                            <small>False Positives<br/>Flash=Not, Pro=Aligned</small>
                        </div>
                        <div class="cm-cell cm-tp">
                            <div style="font-size: 1.8em;">{cm.get('true_positives', 0)}</div>
                            <small>True Positives<br/>Both say Aligned</small>
                        </div>
                    </div>
                    
                    <div style="margin-top: 25px;">
                        <h4>📈 Matrix Interpretation:</h4>
                        <p><strong>✅ Pro 1.5 Agreement:</strong> {cm.get('true_positives', 0) + cm.get('true_negatives', 0)} ({((cm.get('true_positives', 0) + cm.get('true_negatives', 0)) / cm.get('total', 1)):.1%})</p>
                        <p><strong>❌ Pro 1.5 Disagreement:</strong> {cm.get('false_positives', 0) + cm.get('false_negatives', 0)} ({((cm.get('false_positives', 0) + cm.get('false_negatives', 0)) / cm.get('total', 1)):.1%})</p>
                        <p><strong>⚠️ Error Pattern:</strong> {ml.get('performance_assessment', {}).get('primary_issue', 'Unknown')}</p>
                    </div>
                </div>
                
                <div class="section">
                    <h2>⚖️ Pro 1.5 Agreement Analysis - {dataset_name}</h2>
                    <div class="breakdown-grid">
                        <div class="breakdown-item both-agree">
                            <h3>✅ Perfect Alignment</h3>
                            <div style="font-size: 2.2em; font-weight: bold; margin: 15px 0;">{ml.get('scenario_breakdown', {}).get('both_agree', {}).get('count', 0)}</div>
                            <p>{ml.get('scenario_breakdown', {}).get('both_agree', {}).get('percentage', 0):.1%} of cases</p>
                            <small>Process + Classification Agreement</small>
                        </div>
                        <div class="breakdown-item process-only">
                            <h3>🟡 Good Reasoning, Wrong Verdict</h3>
                            <div style="font-size: 2.2em; font-weight: bold; margin: 15px 0;">{ml.get('scenario_breakdown', {}).get('process_only', {}).get('count', 0)}</div>
                            <p>{ml.get('scenario_breakdown', {}).get('process_only', {}).get('percentage', 0):.1%} of cases</p>
                            <small>Pro likes reasoning but disagrees with classification</small>
                        </div>
                        <div class="breakdown-item classification-only">
                            <h3>🟠 Right Verdict, Poor Reasoning</h3>
                            <div style="font-size: 2.2em; font-weight: bold; margin: 15px 0;">{ml.get('scenario_breakdown', {}).get('classification_only', {}).get('count', 0)}</div>
                            <p>{ml.get('scenario_breakdown', {}).get('classification_only', {}).get('percentage', 0):.1%} of cases</p>
                            <small>Right classification but poor reasoning process</small>
                        </div>
                        <div class="breakdown-item both-disagree">
                            <h3>❌ Complete Disagreement</h3>
                            <div style="font-size: 2.2em; font-weight: bold; margin: 15px 0;">{ml.get('scenario_breakdown', {}).get('both_disagree', {}).get('count', 0)}</div>
                            <p>{ml.get('scenario_breakdown', {}).get('both_disagree', {}).get('percentage', 0):.1%} of cases</p>
                            <small>Pro 1.5 disagrees with both reasoning and verdict</small>
                        </div>
                    </div>
                    
                    <div style="margin-top: 25px;">
                        <h4>Agreement Rate Summary:</h4>
                        <p><strong>Process Agreement:</strong> {ml.get('pro_15_agreement', {}).get('process_agreement_rate', 0):.1%}</p>
                        <p><strong>Classification Agreement:</strong> {ml.get('pro_15_agreement', {}).get('classification_agreement_rate', 0):.1%}</p>
                        <p><strong>Would Reach Same Conclusion:</strong> {ml.get('pro_15_agreement', {}).get('would_reach_same_rate', 0):.1%}</p>
                    </div>
                </div>
            </div>
            
            <div class="insight-box">
                <h2>💡 Performance Assessment & Recommendations - {dataset_name}</h2>
                <div class="three-column">
                    <div>
                        <h3>🎯 Overall Performance</h3>
                        <p><strong>F1-Score:</strong> {ml.get('f1_score', 0):.3f} ({ml.get('performance_assessment', {}).get('f1_category', 'N/A')})</p>
                        <p><strong>Pro 1.5 Agreement:</strong> {ml.get('accuracy', 0):.1%}</p>
                        <p><strong>Primary Issue:</strong> {ml.get('performance_assessment', {}).get('primary_issue', 'N/A')}</p>
                    </div>
                    <div>
                        <h3>🎯 Key Recommendation</h3>
                        <p><strong>Pattern:</strong> {ml.get('performance_assessment', {}).get('recommendation', 'N/A')}</p>
                        <p><strong>Focus:</strong> {ml.get('scenario_breakdown', {}).get('process_only', {}).get('count', 0)} "Good Reasoning" cases represent improvement opportunities</p>
                        <p><strong>Confidence:</strong> {"High confidence in judgments" if ml.get('confidence_analysis', {}).get('average_confidence', 0) > 0.8 else "Moderate confidence in judgments"}</p>
                    </div>
                    <div>
                        <h3>📈 Success Metrics</h3>
                        <p><strong>Current F1:</strong> {ml.get('f1_score', 0):.3f}</p>
                        <p><strong>Current Agreement:</strong> {ml.get('accuracy', 0):.1%}</p>
                        <p><strong>Perfect Alignment:</strong> {ml.get('scenario_breakdown', {}).get('both_agree', {}).get('percentage', 0):.1%}</p>
                    </div>
                </div>
            </div>
            
            <div class="section">
                <h2>📋 Technical Summary - {dataset_name} Dataset</h2>
                <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 25px;">
                    <div>
                        <h4>🎯 Classification Performance</h4>
                        <p><strong>F1-Score:</strong> {ml.get('f1_score', 0):.3f}</p>
                        <p><strong>Precision:</strong> {ml.get('precision', 0):.3f}</p>
                        <p><strong>Recall:</strong> {ml.get('recall', 0):.3f}</p>
                        <p><strong>Accuracy:</strong> {ml.get('accuracy', 0):.3f}</p>
                        <p><strong>Balanced Accuracy:</strong> {ml.get('balanced_accuracy', 0):.3f}</p>
                    </div>
                    <div>
                        <h4>⚖️ Pro 1.5 Analysis</h4>
                        <p><strong>Process Agreement:</strong> {ml.get('pro_15_agreement', {}).get('process_agreement_rate', 0):.1%}</p>
                        <p><strong>Classification Agreement:</strong> {ml.get('pro_15_agreement', {}).get('classification_agreement_rate', 0):.1%}</p>
                        <p><strong>Average Confidence:</strong> {ml.get('confidence_analysis', {}).get('average_confidence', 0):.3f}</p>
                        <p><strong>High Conf Agreement:</strong> {ml.get('confidence_analysis', {}).get('high_confidence_agreement', 0):.1%}</p>
                    </div>
                    <div>
                        <h4>📊 Error Analysis</h4>
                        <p><strong>False Positive Rate:</strong> {ml.get('false_positive_rate', 0):.3f}</p>
                        <p><strong>False Negative Rate:</strong> {ml.get('false_negative_rate', 0):.3f}</p>
                        <p><strong>Type I Errors:</strong> {cm.get('false_positives', 0)} (Pro too liberal)</p>
                        <p><strong>Type II Errors:</strong> {cm.get('false_negatives', 0)} (Pro too conservative)</p>
                    </div>
                    <div>
                        <h4>📈 Data Quality</h4>
                        <p><strong>Total Evaluations:</strong> {ml.get('total_evaluations', 0):,}</p>
                        <p><strong>Flash Positive Rate:</strong> {ml.get('class_distribution', {}).get('flash_positive', {}).get('percentage', 0):.1%}</p>
                        <p><strong>Pro Positive Rate:</strong> {ml.get('class_distribution', {}).get('pro_positive', {}).get('percentage', 0):.1%}</p>
                        <p><strong>Matthews Correlation:</strong> {ml.get('matthews_correlation_coefficient', 0):.3f}</p>
                    </div>
                </div>
            </div>
        </div>"""
        
        # Add dataset comparison if multiple datasets available
        if len(datasets_available) > 1:
            html_content += f"""
        
        <div class="comparison-section">
            <h2>🔄 Dataset Comparison Summary</h2>
            <div class="three-column">"""
            
            for results in [meta_results, web_results, combined_results]:
                if results:
                    dataset_name = results['dataset_name']
                    html_content += f"""
                <div>
                    <h3>{dataset_name} Dataset</h3>
                    <p><strong>Total Evaluations:</strong> {results.get('total_evaluations', 0):,}</p>
                    <p><strong>F1-Score:</strong> {results.get('f1_score', 0):.3f}</p>
                    <p><strong>Agreement Rate:</strong> {results.get('accuracy', 0):.1%}</p>
                    <p><strong>Perfect Alignment:</strong> {results.get('scenario_breakdown', {}).get('both_agree', {}).get('percentage', 0):.1%}</p>
                    <p><strong>Primary Issue:</strong> {results.get('performance_assessment', {}).get('primary_issue', 'N/A')}</p>
                </div>"""
            
            html_content += """
            </div>
        </div>"""
        
        html_content += f"""
    </div>
    
    <script>
        function showDataset(dataset) {{
            // Hide all dataset content
            const contents = document.querySelectorAll('.dataset-content');
            contents.forEach(content => content.classList.remove('active'));
            
            // Show selected dataset
            document.getElementById(dataset).classList.add('active');
            
            // Update button states
            const buttons = document.querySelectorAll('.tab-button');
            buttons.forEach(button => button.style.opacity = '0.7');
            event.target.style.opacity = '1';
        }}
        
        // Show first available dataset by default
        document.addEventListener('DOMContentLoaded', function() {{
            const firstDataset = '{datasets_available[0].lower()}';
            showDataset(firstDataset);
            
            // Highlight first button
            const firstButton = document.querySelector('.tab-button.{datasets_available[0].lower()}');
            if (firstButton) firstButton.style.opacity = '1';
        }});
        
        console.log('Gemini Pro 1.5 Omni Dashboard Loaded - Datasets: {", ".join(datasets_available)}');
    </script>
</body>
</html>"""
        
        # Save the dashboard
        filename = f"gemini_pro_15_omni_dashboard_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
        with open(filename, "w", encoding='utf-8') as f:
            f.write(html_content)
        
        print(f"✅ Created Gemini Pro 1.5 Omni Dashboard: {filename}")
        return filename
    
    def run_complete_analysis(self):
        """Run complete analysis on both datasets and generate omni dashboard"""
        
        print("🚀 STARTING GEMINI PRO 1.5 OMNI ANALYSIS")
        print("=" * 80)
        
        results = {}
        
        # Analyze Meta dataset
        print("\n📰 ANALYZING META DATASET...")
        meta_df = self.load_gemini15_results(META_RESULTS_TABLE)
        if not meta_df.empty:
            results['meta'] = self.calculate_comprehensive_metrics(meta_df, "Meta")
        else:
            results['meta'] = None
            print("⚠️ No Meta dataset results available")
        
        # Analyze Web dataset  
        print("\n🌐 ANALYZING WEB DATASET...")
        web_df = self.load_gemini15_results(WEB_RESULTS_TABLE)
        if not web_df.empty:
            results['web'] = self.calculate_comprehensive_metrics(web_df, "Web")
        else:
            results['web'] = None
            print("⚠️ No Web dataset results available")
        
        # Analyze Combined dataset
        print("\n🔄 ANALYZING COMBINED DATASET...")
        if not meta_df.empty and not web_df.empty:
            combined_df = pd.concat([meta_df, web_df], ignore_index=True)
            results['combined'] = self.calculate_comprehensive_metrics(combined_df, "Combined")
        elif not meta_df.empty:
            results['combined'] = self.calculate_comprehensive_metrics(meta_df, "Combined (Meta Only)")
        elif not web_df.empty:
            results['combined'] = self.calculate_comprehensive_metrics(web_df, "Combined (Web Only)")
        else:
            results['combined'] = None
            print("⚠️ No data available for combined analysis")
        
        # Create summary JSON
        summary = {
            'timestamp': datetime.now().isoformat(),
            'analysis_type': 'gemini_pro_15_omni_analysis',
            'datasets_analyzed': [k for k, v in results.items() if v is not None],
            'results': results
        }
        
        # Save summary
        summary_filename = f"gemini_pro_15_omni_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(summary_filename, 'w') as f:
            json.dump(summary, f, indent=2, default=str)
        
        print(f"\n✅ Summary saved to: {summary_filename}")
        
        # Create HTML dashboard
        print(f"\n🎨 CREATING OMNI HTML DASHBOARD...")
        dashboard_filename = self.create_omni_html_dashboard(
            results['meta'], 
            results['web'], 
            results['combined']
        )
        
        # Final summary
        print(f"\n🎉 GEMINI PRO 1.5 OMNI ANALYSIS COMPLETED!")
        print(f"📊 Datasets analyzed: {[k for k, v in results.items() if v is not None]}")
        if results['meta'] and 'total_evaluations' in results['meta']:
            f1_score = results['meta'].get('f1_score', 'N/A')
            print(f"📰 Meta: {results['meta']['total_evaluations']:,} evaluations, F1: {f1_score}")
        if results['web'] and 'total_evaluations' in results['web']:
            f1_score = results['web'].get('f1_score', 'N/A')
            print(f"🌐 Web: {results['web']['total_evaluations']:,} evaluations, F1: {f1_score}")
        if results['combined'] and 'total_evaluations' in results['combined']:
            f1_score = results['combined'].get('f1_score', 'N/A')
            print(f"🔄 Combined: {results['combined']['total_evaluations']:,} evaluations, F1: {f1_score}")
        
        print(f"📄 Dashboard: {dashboard_filename}")
        print(f"📊 Summary: {summary_filename}")
        
        return results, dashboard_filename

def main():
    """Main execution function"""
    try:
        analyzer = GeminiPro15OmniAnalysis()
        results, dashboard_file = analyzer.run_complete_analysis()
        
        print(f"\n✅ Analysis completed successfully!")
        print(f"🌐 Open the dashboard: {dashboard_file}")
        
        return results
        
    except Exception as e:
        print(f"❌ Analysis failed: {type(e).__name__}: {e}")
        traceback.print_exc()
        return None

if __name__ == "__main__":
    main()

✅ BigQuery client initialized for project: scope3-dev
🚀 STARTING GEMINI PRO 1.5 OMNI ANALYSIS

📰 ANALYZING META DATASET...
📥 Loading data from BA_Meta_Gemini_Pro_Judge_Results...
✅ Loaded 299 records from BA_Meta_Gemini_Pro_Judge_Results

🎯 CALCULATING ML METRICS FOR META DATASET
📊 Class Distribution (Meta):
   Flash Positive (Aligned): 0 (0.0%)
   Flash Negative (Not-Aligned): 112 (37.5%)
   Pro 1.5 Positive: 138 (46.2%)
   Pro 1.5 Negative: 161 (53.8%)
❌ Error calculating metrics for Meta: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

🌐 ANALYZING WEB DATASET...
📥 Loading data from BA_Web_Gemini_Pro_Judge_Results...
✅ Loaded 294 records from BA_Web_Gemini_Pro_Judge_Results

🎯 CALCULATING ML METRICS FOR WEB DATASET
📊 Class Distribution (Web):
   Flash Positive (Aligned): 0 (0.0%)
   Flash Negative (Not-Aligned): 42 (14.3%)
   Pro 1.5 Positive: 226 (76.9%)
   Pro 1.5 Negative: 68 (23.1%)
❌ Error calculatin

In [48]:
!pip install plotly



In [49]:
#!/usr/bin/env python3
"""
Gemini 2.5 Pro Classification Metrics Dashboard - Real Results vs Ground Truth
Calculates traditional ML metrics (F1, Precision, Recall, Confusion Matrix) for 2.5 Pro decisions
"""

import pandas as pd
import json
from google.cloud import bigquery
from datetime import datetime
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
import numpy as np
from typing import Dict, List, Any, Optional

PROJECT_ID = "scope3-dev"
DATASET_ID = "research_bs_monitoring"
META_TABLE = "BA_Meta_Ground_Truth"
RESULTS_TABLE = "BA_Meta_Gemini_25_Pro_Judge_Results"

class Gemini25ProClassificationDashboard:
    def __init__(self):
        self.client = bigquery.Client(project=PROJECT_ID)
        
    def load_results_with_ground_truth(self) -> pd.DataFrame:
        """Load 2.5 Pro results joined with Meta ground truth"""
        print("📥 Loading Gemini 2.5 Pro results with ground truth...")
        
        query = f"""
        SELECT 
            r.artifact_id,
            r.flash_classification,
            r.flash_reasoning,
            r.pro_judge_agreement,
            r.pro_verdict,
            r.pro_confidence,
            r.pro_would_reach_same_conclusion,
            r.pro_reasoning,
            r.api_call_time,
            r.created_at,
            r.error_message,
            -- Ground truth from Meta
            m.correct_classification as ground_truth,
            m.correct_reasoning as ground_truth_reasoning,
            m.source as data_source
        FROM `{PROJECT_ID}.{DATASET_ID}.{RESULTS_TABLE}` r
        JOIN `{PROJECT_ID}.{DATASET_ID}.{META_TABLE}` m
        ON r.artifact_id = m.artifact_id
        WHERE (r.error_message IS NULL OR r.error_message = '')
        AND r.pro_judge_agreement IS NOT NULL
        AND r.pro_would_reach_same_conclusion IS NOT NULL
        ORDER BY r.created_at DESC
        """
        
        try:
            df = self.client.query(query).to_dataframe()
            print(f"✅ Loaded {len(df)} records with ground truth")
            
            if len(df) == 0:
                print("❌ No valid joined data found")
                return pd.DataFrame()
            
            # Convert ground truth to binary (100 = Aligned = 1, 0 = Not-Aligned = 0)
            df['ground_truth_binary'] = (df['ground_truth'] == 100).astype(int)
            
            # Convert Flash decisions to binary (100 = Aligned = 1, 0 = Not-Aligned = 0)
            # Flash should have same encoding as ground truth: 100 for aligned, 0 for not-aligned
            df['flash_binary'] = (df['flash_classification'] == 100).astype(int)
            
            # Convert 2.5 Pro "would reach same conclusion" to predictions
            # If 2.5 Pro would reach same conclusion as Flash, use Flash's decision
            # If not, use opposite of Flash's decision
            df['pro_25_prediction'] = np.where(
                df['pro_would_reach_same_conclusion'] == True,
                df['flash_binary'],  # Same as Flash
                1 - df['flash_binary']  # Opposite of Flash
            )
            
            # Clean up any invalid values in predictions
            df['pro_25_prediction'] = df['pro_25_prediction'].astype(int)
            df['flash_binary'] = df['flash_binary'].astype(int)
            df['ground_truth_binary'] = df['ground_truth_binary'].astype(int)
            
            print(f"📊 Data distribution:")
            print(f"   Ground truth: {df['ground_truth_binary'].value_counts().to_dict()}")
            print(f"   Flash predictions: {df['flash_binary'].value_counts().to_dict()}")
            print(f"   2.5 Pro predictions: {df['pro_25_prediction'].value_counts().to_dict()}")
            print(f"   Agreement rate: {df['pro_judge_agreement'].mean():.1%}")
            
            # Validate that all predictions are binary
            if not all(df['ground_truth_binary'].isin([0, 1])):
                print("⚠️ Warning: Ground truth contains non-binary values")
                df = df[df['ground_truth_binary'].isin([0, 1])]
            
            if not all(df['flash_binary'].isin([0, 1])):
                print("⚠️ Warning: Flash predictions contain non-binary values")
                df = df[df['flash_binary'].isin([0, 1])]
                
            if not all(df['pro_25_prediction'].isin([0, 1])):
                print("⚠️ Warning: 2.5 Pro predictions contain non-binary values")
                df = df[df['pro_25_prediction'].isin([0, 1])]
            
            print(f"📊 Final clean dataset: {len(df)} records")
        
        # Debug: Show sample comparisons
        print(f"\n🔍 DEBUG - Sample comparisons (first 5 records):")
        for i in range(min(5, len(df))):
            print(f"   Record {i+1}: Ground Truth={df.iloc[i]['ground_truth_binary']}, Flash={df.iloc[i]['flash_binary']}, 2.5Pro={df.iloc[i]['pro_25_prediction']}")
            
            return df
            
        except Exception as e:
            print(f"❌ Error loading data: {e}")
            raise
    
    def calculate_classification_metrics(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Calculate comprehensive classification metrics for both models"""
        print("📊 Calculating classification metrics...")
        
        if len(df) == 0:
            return {}
        
        y_true = df['ground_truth_binary'].values
        flash_pred = df['flash_binary'].values  
        pro_pred = df['pro_25_prediction'].values
        
        # Flash metrics - Compare Flash predictions with ground truth
        flash_metrics = self._calculate_model_metrics(y_true, flash_pred, "Flash")
        
        # 2.5 Pro metrics - Compare 2.5 Pro predictions with ground truth  
        pro_metrics = self._calculate_model_metrics(y_true, pro_pred, "Gemini 2.5 Pro")
        
        # Agreement analysis
        flash_vs_ground_truth_agreement = (flash_pred == y_true).mean()
        pro_vs_ground_truth_agreement = (pro_pred == y_true).mean()
        judge_agreement_rate = df['pro_judge_agreement'].mean()
        
        # Cross-analysis: When do they agree/disagree with each other and ground truth?
        both_correct = ((flash_pred == y_true) & (pro_pred == y_true)).sum()
        both_wrong = ((flash_pred != y_true) & (pro_pred != y_true)).sum()
        flash_right_pro_wrong = ((flash_pred == y_true) & (pro_pred != y_true)).sum()
        pro_right_flash_wrong = ((flash_pred != y_true) & (pro_pred == y_true)).sum()
        
        cross_analysis = {
            'both_correct': int(both_correct),
            'both_wrong': int(both_wrong), 
            'flash_right_pro_wrong': int(flash_right_pro_wrong),
            'pro_right_flash_wrong': int(pro_right_flash_wrong),
            'total': len(df)
        }
        
        print(f"✅ Metrics calculated:")
        print(f"   Flash F1: {flash_metrics['f1_score']:.3f}")
        print(f"   2.5 Pro F1: {pro_metrics['f1_score']:.3f}")
        print(f"   Judge agreement: {judge_agreement_rate:.1%}")
        
        return {
            'dataset_info': {
                'total_samples': len(df),
                'positive_samples': int((y_true == 1).sum()),
                'negative_samples': int((y_true == 0).sum()),
                'class_balance': float((y_true == 1).mean())
            },
            'flash_metrics': flash_metrics,
            'pro_25_metrics': pro_metrics,
            'agreement_analysis': {
                'flash_vs_ground_truth': flash_vs_ground_truth_agreement,
                'pro_vs_ground_truth': pro_vs_ground_truth_agreement,
                'pro_judge_agreement_with_flash': judge_agreement_rate,
                'cross_analysis': cross_analysis
            },
            'model_comparison': {
                'f1_difference': pro_metrics['f1_score'] - flash_metrics['f1_score'],
                'accuracy_difference': pro_metrics['accuracy'] - flash_metrics['accuracy'],
                'precision_difference': pro_metrics['precision'] - flash_metrics['precision'],
                'recall_difference': pro_metrics['recall'] - flash_metrics['recall'],
                'better_f1': 'Gemini 2.5 Pro' if pro_metrics['f1_score'] > flash_metrics['f1_score'] else 'Flash'
            }
        }
    
    def _calculate_model_metrics(self, y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> Dict[str, Any]:
        """Calculate detailed metrics for a single model"""
        
        # Ensure binary values
        y_true = np.array(y_true).astype(int)
        y_pred = np.array(y_pred).astype(int)
        
        # Validate binary values
        if not (set(np.unique(y_true)) <= {0, 1} and set(np.unique(y_pred)) <= {0, 1}):
            print(f"⚠️ Warning: Non-binary values detected in {model_name}")
            print(f"   y_true unique: {np.unique(y_true)}")
            print(f"   y_pred unique: {np.unique(y_pred)}")
            # Force to binary
            y_true = np.clip(y_true, 0, 1)
            y_pred = np.clip(y_pred, 0, 1)
        
        accuracy = accuracy_score(y_true, y_pred)
        
        # Handle edge cases where only one class is predicted
        unique_pred = np.unique(y_pred)
        unique_true = np.unique(y_true)
        
        if len(unique_pred) == 1 or len(unique_true) == 1:
            # Use macro average for edge cases
            precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
            recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        else:
            precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
            recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
        
        # Confusion matrix - ensure we get 2x2 matrix
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()
        elif cm.shape == (1, 1):
            # Handle case where only one class exists
            if unique_true[0] == 0 and unique_pred[0] == 0:
                tn, fp, fn, tp = cm[0,0], 0, 0, 0
            elif unique_true[0] == 1 and unique_pred[0] == 1:
                tn, fp, fn, tp = 0, 0, 0, cm[0,0]
            else:
                tn, fp, fn, tp = 0, cm[0,0], 0, 0
        else:
            print(f"⚠️ Unexpected confusion matrix shape for {model_name}: {cm.shape}")
            tn = fp = fn = tp = 0
        
        # Additional metrics
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # Same as recall
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0  # Negative Predictive Value
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0  # Positive Predictive Value (same as precision)
        
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # False Negative Rate
        
        balanced_accuracy = (sensitivity + specificity) / 2
        
        # Matthews Correlation Coefficient
        mcc_num = (tp * tn) - (fp * fn)
        mcc_den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        mcc = mcc_num / mcc_den if mcc_den > 0 else 0
        
        return {
            'model_name': model_name,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'specificity': specificity,
            'sensitivity': sensitivity,
            'balanced_accuracy': balanced_accuracy,
            'negative_predictive_value': npv,
            'positive_predictive_value': ppv,
            'false_positive_rate': fpr,
            'false_negative_rate': fnr,
            'matthews_correlation': mcc,
            'confusion_matrix': {
                'true_positives': int(tp),
                'false_positives': int(fp), 
                'true_negatives': int(tn),
                'false_negatives': int(fn)
            }
        }
    
    def create_classification_dashboard(self, metrics: Dict[str, Any], df: pd.DataFrame) -> str:
        """Create comprehensive classification metrics dashboard"""
        
        if not metrics:
            return "<html><body><h1>No valid metrics to display</h1></body></html>"
        
        flash_metrics = metrics['flash_metrics']
        pro_metrics = metrics['pro_25_metrics']
        dataset_info = metrics['dataset_info']
        agreement = metrics['agreement_analysis']
        comparison = metrics['model_comparison']
        
        flash_cm = flash_metrics['confusion_matrix']
        pro_cm = pro_metrics['confusion_matrix']
        
        # Performance assessment
        if pro_metrics['f1_score'] >= 0.8:
            pro_assessment = "🟢 EXCELLENT"
            pro_color = "#10B981"
        elif pro_metrics['f1_score'] >= 0.7:
            pro_assessment = "🟡 GOOD" 
            pro_color = "#F59E0B"
        elif pro_metrics['f1_score'] >= 0.6:
            pro_assessment = "🟠 MODERATE"
            pro_color = "#FF6B35"
        else:
            pro_assessment = "🔴 POOR"
            pro_color = "#EF4444"
        
        html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Gemini 2.5 Pro Classification Metrics Dashboard</title>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/4.4.0/chart.min.js"></script>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: 'Inter', sans-serif; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: #333; line-height: 1.6; min-height: 100vh; }}
        .dashboard {{ max-width: 1800px; margin: 0 auto; padding: 20px; }}
        .header {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 25px; text-align: center; margin-bottom: 30px; backdrop-filter: blur(15px); box-shadow: 0 20px 60px rgba(0,0,0,0.1); }}
        .header h1 {{ font-size: 3rem; margin-bottom: 15px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; }}
        .header h2 {{ font-size: 1.5rem; color: #666; margin-bottom: 20px; }}
        .model-badge {{ display: inline-block; padding: 10px 25px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 30px; font-weight: bold; font-size: 1.1rem; margin: 5px; }}
        .metrics-overview {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 20px; margin-bottom: 40px; }}
        .metric-card {{ background: rgba(255,255,255,0.95); padding: 25px; border-radius: 20px; text-align: center; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); transition: transform 0.3s ease; }}
        .metric-card:hover {{ transform: translateY(-5px); }}
        .metric-value {{ font-size: 2.5rem; font-weight: bold; margin-bottom: 10px; }}
        .metric-label {{ font-size: 1rem; color: #666; font-weight: 600; margin-bottom: 5px; }}
        .metric-description {{ font-size: 0.85rem; color: #888; }}
        .excellent {{ color: #10B981; }}
        .good {{ color: #F59E0B; }}
        .moderate {{ color: #FF6B35; }}
        .poor {{ color: #EF4444; }}
        .dashboard-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(450px, 1fr)); gap: 25px; margin-bottom: 40px; }}
        .chart-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); }}
        .chart-title {{ font-size: 1.3rem; font-weight: bold; margin-bottom: 20px; color: #333; text-align: center; }}
        .chart-container {{ position: relative; height: 300px; margin-bottom: 15px; }}
        .confusion-matrix {{ display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin: 20px 0; text-align: center; font-size: 0.9rem; }}
        .cm-cell {{ padding: 15px; border-radius: 12px; font-weight: bold; display: flex; flex-direction: column; justify-content: center; align-items: center; }}
        .cm-header {{ background: linear-gradient(135deg, #f8f9fa, #e9ecef); color: #495057; }}
        .cm-tp {{ background: linear-gradient(135deg, #d4edda, #c3e6cb); color: #155724; font-size: 1.8rem; }}
        .cm-fp {{ background: linear-gradient(135deg, #f8d7da, #f1b0b7); color: #721c24; font-size: 1.8rem; }}
        .cm-fn {{ background: linear-gradient(135deg, #fff3cd, #fce4a6); color: #856404; font-size: 1.8rem; }}
        .cm-tn {{ background: linear-gradient(135deg, #d1ecf1, #bee5eb); color: #0c5460; font-size: 1.8rem; }}
        .cm-label {{ font-size: 0.75rem; margin-top: 5px; opacity: 0.8; }}
        .performance-summary {{ background: linear-gradient(135deg, #e3f2fd, #bbdefb); padding: 30px; border-radius: 20px; margin: 30px 0; text-align: center; }}
        .performance-score {{ font-size: 3.5rem; font-weight: bold; color: {pro_color}; margin-bottom: 10px; }}
        .performance-label {{ font-size: 1.3rem; color: #37474f; margin-bottom: 8px; }}
        .performance-description {{ font-size: 1rem; color: #546e7a; }}
        .comparison-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); margin-bottom: 30px; }}
        .comparison-grid {{ display: grid; grid-template-columns: 1fr 1fr; gap: 30px; margin-top: 20px; }}
        .model-column {{ padding: 20px; border-radius: 15px; }}
        .flash-column {{ background: linear-gradient(135deg, #e8f5e8, #d4edda); }}
        .pro-column {{ background: linear-gradient(135deg, #e3f2fd, #bbdefb); }}
        .model-title {{ font-size: 1.5rem; font-weight: bold; text-align: center; margin-bottom: 20px; }}
        .metric-row {{ display: flex; justify-content: space-between; margin: 8px 0; padding: 8px; background: rgba(255,255,255,0.5); border-radius: 8px; }}
    </style>
</head>
<body>
    <div class="dashboard">
        <div class="header">
            <h1>🎯 Classification Metrics Dashboard</h1>
            <h2>Gemini Models vs Meta Ground Truth</h2>
            <div>
                <span class="model-badge">Flash Model</span>
                <span class="model-badge">Gemini 2.5 Pro</span>
            </div>
            <p style="margin-top: 20px; color: #666;">
                Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | 
                Samples: {dataset_info['total_samples']:,} | 
                Positive: {dataset_info['positive_samples']} | Negative: {dataset_info['negative_samples']}
            </p>
        </div>

        <div class="performance-summary">
            <div class="performance-score">{pro_metrics['f1_score']:.3f}</div>
            <div class="performance-label">{pro_assessment} F1-Score (Gemini 2.5 Pro)</div>
            <div class="performance-description">
                2.5 Pro achieves {pro_metrics['accuracy']:.1%} accuracy vs {flash_metrics['accuracy']:.1%} for Flash
                <br>Judge Agreement: {agreement['pro_judge_agreement_with_flash']:.1%} | Better Model: {comparison['better_f1']}
            </div>
        </div>

        <div class="comparison-section">
            <div class="chart-title">📊 Model Performance Comparison</div>
            <div class="comparison-grid">
                <div class="model-column flash-column">
                    <div class="model-title">⚡ Flash Model</div>
                    <div class="metric-row"><span>F1-Score:</span><span class="{'excellent' if flash_metrics['f1_score'] >= 0.8 else 'good' if flash_metrics['f1_score'] >= 0.7 else 'moderate' if flash_metrics['f1_score'] >= 0.6 else 'poor'}">{flash_metrics['f1_score']:.3f}</span></div>
                    <div class="metric-row"><span>Accuracy:</span><span>{flash_metrics['accuracy']:.3f}</span></div>
                    <div class="metric-row"><span>Precision:</span><span>{flash_metrics['precision']:.3f}</span></div>
                    <div class="metric-row"><span>Recall:</span><span>{flash_metrics['recall']:.3f}</span></div>
                    <div class="metric-row"><span>Specificity:</span><span>{flash_metrics['specificity']:.3f}</span></div>
                </div>
                <div class="model-column pro-column">
                    <div class="model-title">🧠 Gemini 2.5 Pro</div>
                    <div class="metric-row"><span>F1-Score:</span><span class="{'excellent' if pro_metrics['f1_score'] >= 0.8 else 'good' if pro_metrics['f1_score'] >= 0.7 else 'moderate' if pro_metrics['f1_score'] >= 0.6 else 'poor'}">{pro_metrics['f1_score']:.3f}</span></div>
                    <div class="metric-row"><span>Accuracy:</span><span>{pro_metrics['accuracy']:.3f}</span></div>
                    <div class="metric-row"><span>Precision:</span><span>{pro_metrics['precision']:.3f}</span></div>
                    <div class="metric-row"><span>Recall:</span><span>{pro_metrics['recall']:.3f}</span></div>
                    <div class="metric-row"><span>Specificity:</span><span>{pro_metrics['specificity']:.3f}</span></div>
                </div>
            </div>
        </div>

        <div class="dashboard-grid">
            <div class="chart-section">
                <div class="chart-title">📊 F1-Score Comparison</div>
                <div class="chart-container">
                    <canvas id="f1Chart"></canvas>
                </div>
            </div>

            <div class="chart-section">
                <div class="chart-title">🎯 Precision vs Recall</div>
                <div class="chart-container">
                    <canvas id="precisionRecallChart"></canvas>
                </div>
            </div>

            <div class="chart-section">
                <div class="chart-title">🔍 Flash Confusion Matrix</div>
                <div class="confusion-matrix">
                    <div class="cm-cell cm-header"></div>
                    <div class="cm-cell cm-header">Ground Truth: 0</div>
                    <div class="cm-cell cm-header">Ground Truth: 1</div>
                    
                    <div class="cm-cell cm-header">Flash: 0</div>
                    <div class="cm-cell cm-tn">
                        {flash_cm['true_negatives']}
                        <div class="cm-label">True Negatives</div>
                    </div>
                    <div class="cm-cell cm-fn">
                        {flash_cm['false_negatives']}
                        <div class="cm-label">False Negatives</div>
                    </div>
                    
                    <div class="cm-cell cm-header">Flash: 1</div>
                    <div class="cm-cell cm-fp">
                        {flash_cm['false_positives']}
                        <div class="cm-label">False Positives</div>
                    </div>
                    <div class="cm-cell cm-tp">
                        {flash_cm['true_positives']}
                        <div class="cm-label">True Positives</div>
                    </div>
                </div>
            </div>

            <div class="chart-section">
                <div class="chart-title">🧠 Gemini 2.5 Pro Confusion Matrix</div>
                <div class="confusion-matrix">
                    <div class="cm-cell cm-header"></div>
                    <div class="cm-cell cm-header">Ground Truth: 0</div>
                    <div class="cm-cell cm-header">Ground Truth: 1</div>
                    
                    <div class="cm-cell cm-header">2.5 Pro: 0</div>
                    <div class="cm-cell cm-tn">
                        {pro_cm['true_negatives']}
                        <div class="cm-label">True Negatives</div>
                    </div>
                    <div class="cm-cell cm-fn">
                        {pro_cm['false_negatives']}
                        <div class="cm-label">False Negatives</div>
                    </div>
                    
                    <div class="cm-cell cm-header">2.5 Pro: 1</div>
                    <div class="cm-cell cm-fp">
                        {pro_cm['false_positives']}
                        <div class="cm-label">False Positives</div>
                    </div>
                    <div class="cm-cell cm-tp">
                        {pro_cm['true_positives']}
                        <div class="cm-label">True Positives</div>
                    </div>
                </div>
            </div>

            <div class="chart-section">
                <div class="chart-title">⚖️ Agreement Analysis</div>
                <div class="chart-container">
                    <canvas id="agreementChart"></canvas>
                </div>
            </div>

            <div class="chart-section">
                <div class="chart-title">📈 Performance Radar</div>
                <div class="chart-container">
                    <canvas id="radarChart"></canvas>
                </div>
            </div>
        </div>
    </div>

    <script>
        window.addEventListener('load', function() {{
            setTimeout(function() {{
                // F1-Score Comparison Chart
                const f1Ctx = document.getElementById('f1Chart');
                if (f1Ctx && window.Chart) {{
                    new Chart(f1Ctx.getContext('2d'), {{
                        type: 'bar',
                        data: {{
                            labels: ['Flash', 'Gemini 2.5 Pro'],
                            datasets: [{{
                                label: 'F1-Score',
                                data: [{flash_metrics['f1_score']:.3f}, {pro_metrics['f1_score']:.3f}],
                                backgroundColor: ['#FF6B6B', '#4ECDC4'],
                                borderRadius: 8
                            }}]
                        }},
                        options: {{ 
                            responsive: true, 
                            maintainAspectRatio: false,
                            scales: {{ y: {{ beginAtZero: true, max: 1 }} }},
                            plugins: {{ legend: {{ position: 'bottom' }} }}
                        }}
                    }});
                }}

                // Precision vs Recall Chart
                const prCtx = document.getElementById('precisionRecallChart');
                if (prCtx && window.Chart) {{
                    new Chart(prCtx.getContext('2d'), {{
                        type: 'scatter',
                        data: {{
                            datasets: [{{
                                label: 'Flash',
                                data: [{{x: {flash_metrics['recall']:.3f}, y: {flash_metrics['precision']:.3f}}}],
                                backgroundColor: '#FF6B6B',
                                pointRadius: 12
                            }}, {{
                                label: 'Gemini 2.5 Pro',
                                data: [{{x: {pro_metrics['recall']:.3f}, y: {pro_metrics['precision']:.3f}}}],
                                backgroundColor: '#4ECDC4',
                                pointRadius: 12
                            }}]
                        }},
                        options: {{ 
                            responsive: true, 
                            maintainAspectRatio: false,
                            scales: {{ 
                                x: {{ title: {{ display: true, text: 'Recall' }}, min: 0, max: 1 }},
                                y: {{ title: {{ display: true, text: 'Precision' }}, min: 0, max: 1 }}
                            }},
                            plugins: {{ legend: {{ position: 'bottom' }} }}
                        }}
                    }});
                }}

                // Agreement Analysis Chart
                const agreeCtx = document.getElementById('agreementChart');
                if (agreeCtx && window.Chart) {{
                    new Chart(agreeCtx.getContext('2d'), {{
                        type: 'bar',
                        data: {{
                            labels: ['Flash vs Ground Truth', '2.5 Pro vs Ground Truth', '2.5 Pro Judge Agreement'],
                            datasets: [{{
                                label: 'Agreement Rate',
                                data: [{agreement['flash_vs_ground_truth']:.3f}, {agreement['pro_vs_ground_truth']:.3f}, {agreement['pro_judge_agreement_with_flash']:.3f}],
                                backgroundColor: ['#FF6B6B', '#4ECDC4', '#FFD93D'],
                                borderRadius: 8
                            }}]
                        }},
                        options: {{ 
                            responsive: true, 
                            maintainAspectRatio: false,
                            scales: {{ y: {{ beginAtZero: true, max: 1 }} }},
                            plugins: {{ legend: {{ position: 'bottom' }} }}
                        }}
                    }});
                }}

                // Performance Radar Chart
                const radarCtx = document.getElementById('radarChart');
                if (radarCtx && window.Chart) {{
                    new Chart(radarCtx.getContext('2d'), {{
                        type: 'radar',
                        data: {{
                            labels: ['F1-Score', 'Precision', 'Recall', 'Accuracy', 'Specificity'],
                            datasets: [{{
                                label: 'Flash',
                                data: [{flash_metrics['f1_score']:.3f}, {flash_metrics['precision']:.3f}, {flash_metrics['recall']:.3f}, {flash_metrics['accuracy']:.3f}, {flash_metrics['specificity']:.3f}],
                                backgroundColor: 'rgba(255, 107, 107, 0.2)',
                                borderColor: '#FF6B6B',
                                borderWidth: 2
                            }}, {{
                                label: 'Gemini 2.5 Pro',
                                data: [{pro_metrics['f1_score']:.3f}, {pro_metrics['precision']:.3f}, {pro_metrics['recall']:.3f}, {pro_metrics['accuracy']:.3f}, {pro_metrics['specificity']:.3f}],
                                backgroundColor: 'rgba(78, 205, 196, 0.2)',
                                borderColor: '#4ECDC4',
                                borderWidth: 2
                            }}]
                        }},
                        options: {{ 
                            responsive: true, 
                            maintainAspectRatio: false,
                            scales: {{ r: {{ beginAtZero: true, max: 1 }} }},
                            plugins: {{ legend: {{ position: 'bottom' }} }}
                        }}
                    }});
                }}

                console.log('Classification Dashboard Loaded');
            }}, 1000);
        }});
    </script>
</body>
</html>"""
        
        return html_content
    
    def generate_classification_dashboard(self):
        """Main function to generate the classification dashboard"""
        print("🚀 STARTING CLASSIFICATION METRICS DASHBOARD GENERATION")
        print("=" * 70)
        
        try:
            # Load results with ground truth
            df = self.load_results_with_ground_truth()
            
            if df.empty:
                print("❌ No data available for dashboard")
                return None
            
            # Calculate metrics
            metrics = self.calculate_classification_metrics(df)
            
            if not metrics:
                print("❌ Could not calculate metrics")
                return None
            
            # Create dashboard
            print("\n📊 Creating Classification Dashboard...")
            dashboard_html = self.create_classification_dashboard(metrics, df)
            
            # Save files
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            dashboard_filename = f"gemini_classification_dashboard_{timestamp}.html"
            metrics_filename = f"classification_metrics_{timestamp}.json"
            
            with open(dashboard_filename, "w", encoding='utf-8') as f:
                f.write(dashboard_html)
            
            with open(metrics_filename, "w") as f:
                json.dump(metrics, f, indent=2, default=str)
            
            print(f"✅ Dashboard saved: {dashboard_filename}")
            print(f"✅ Metrics saved: {metrics_filename}")
            
            # Summary
            flash_metrics = metrics['flash_metrics']
            pro_metrics = metrics['pro_25_metrics']
            agreement = metrics['agreement_analysis']
            
            print(f"\n📊 CLASSIFICATION METRICS SUMMARY:")
            print(f"   📊 Flash Model:")
            print(f"      F1-Score: {flash_metrics['f1_score']:.3f}")
            print(f"      Precision: {flash_metrics['precision']:.3f}")
            print(f"      Recall: {flash_metrics['recall']:.3f}")
            print(f"      Accuracy: {flash_metrics['accuracy']:.3f}")
            print(f"      TP: {flash_metrics['confusion_matrix']['true_positives']}, FP: {flash_metrics['confusion_matrix']['false_positives']}")
            print(f"      TN: {flash_metrics['confusion_matrix']['true_negatives']}, FN: {flash_metrics['confusion_matrix']['false_negatives']}")
            
            print(f"\n   🧠 Gemini 2.5 Pro:")
            print(f"      F1-Score: {pro_metrics['f1_score']:.3f}")
            print(f"      Precision: {pro_metrics['precision']:.3f}")
            print(f"      Recall: {pro_metrics['recall']:.3f}") 
            print(f"      Accuracy: {pro_metrics['accuracy']:.3f}")
            print(f"      TP: {pro_metrics['confusion_matrix']['true_positives']}, FP: {pro_metrics['confusion_matrix']['false_positives']}")
            print(f"      TN: {pro_metrics['confusion_matrix']['true_negatives']}, FN: {pro_metrics['confusion_matrix']['false_negatives']}")
            
            print(f"\n   ⚖️ Agreement Rates:")
            print(f"      Flash vs Ground Truth: {agreement['flash_vs_ground_truth']:.1%}")
            print(f"      2.5 Pro vs Ground Truth: {agreement['pro_vs_ground_truth']:.1%}")
            print(f"      2.5 Pro Judge Agreement: {agreement['pro_judge_agreement_with_flash']:.1%}")
            
            return dashboard_filename, metrics_filename
            
        except Exception as e:
            print(f"❌ Dashboard generation failed: {e}")
            import traceback
            traceback.print_exc()
            return None

def main():
    """Run the classification dashboard generation"""
    dashboard_generator = Gemini25ProClassificationDashboard()
    result = dashboard_generator.generate_classification_dashboard()
    
    if result:
        dashboard_file, metrics_file = result
        print(f"\n🎉 SUCCESS!")
        print(f"📁 Files generated:")
        print(f"   1. {dashboard_file}")
        print(f"   2. {metrics_file}")
        print(f"\n💡 Open {dashboard_file} in your browser to view the dashboard!")
    else:
        print(f"\n❌ Dashboard generation failed")

if __name__ == "__main__":
    main()

SyntaxError: expected 'except' or 'finally' block (475267135.py, line 105)

In [50]:
#!/usr/bin/env python3
"""
Gemini 2.5 Pro Classification Metrics Dashboard - Real Results vs Ground Truth
Calculates traditional ML metrics (F1, Precision, Recall, Confusion Matrix) for 2.5 Pro decisions
"""

import pandas as pd
import json
from google.cloud import bigquery
from datetime import datetime
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
import numpy as np
from typing import Dict, List, Any, Optional

PROJECT_ID = "scope3-dev"
DATASET_ID = "research_bs_monitoring"
META_TABLE = "BA_Meta_Ground_Truth"
RESULTS_TABLE = "BA_Meta_Gemini_25_Pro_Judge_Results"

class Gemini25ProClassificationDashboard:
    def __init__(self):
        self.client = bigquery.Client(project=PROJECT_ID)
        
    def load_results_with_ground_truth(self) -> pd.DataFrame:
        """Load 2.5 Pro results with real ground truth from Meta table"""
        print("📥 Loading Gemini 2.5 Pro results with ground truth...")
        
        # First, let's get the results data
        results_query = f"""
        SELECT 
            artifact_id,
            flash_classification,
            flash_reasoning,
            pro_judge_agreement,
            pro_would_reach_same_conclusion,
            pro_confidence,
            pro_reasoning
        FROM `{PROJECT_ID}.{DATASET_ID}.{RESULTS_TABLE}`
        WHERE pro_judge_agreement IS NOT NULL
        AND pro_would_reach_same_conclusion IS NOT NULL
        """
        
        # Get real ground truth from Meta table
        meta_query = f"""
        SELECT 
            artifact_id,
            correct_classification as ground_truth,
            correct_reasoning as ground_truth_reasoning,
            source
        FROM `{PROJECT_ID}.{DATASET_ID}.{META_TABLE}`
        """
        
        try:
            print("📊 Loading results data...")
            results_df = self.client.query(results_query).to_dataframe()
            print(f"   Found {len(results_df)} result records")
            
            print("📊 Loading ground truth data...")
            meta_df = self.client.query(meta_query).to_dataframe()
            print(f"   Found {len(meta_df)} ground truth records")
            
            # Debug: Check ground truth distribution
            if len(meta_df) > 0:
                gt_dist = meta_df['ground_truth'].value_counts().to_dict()
                print(f"   Ground truth distribution from Meta table: {gt_dist}")
            
            # Join results with ground truth
            df = results_df.merge(meta_df, on='artifact_id', how='inner')
            print(f"✅ Merged dataset: {len(df)} records")
            
            if len(df) == 0:
                print("❌ No matching records found after merge")
                return pd.DataFrame()
            
            # Debug: Check what we actually got from the merge
            print(f"   Sample artifact_ids from results: {results_df['artifact_id'].head().tolist()}")
            print(f"   Sample artifact_ids from meta: {meta_df['artifact_id'].head().tolist()}")
            print(f"   Sample flash_classification: {results_df['flash_classification'].head().tolist()}")
            print(f"   Sample ground_truth after merge: {df['ground_truth'].head().tolist()}")
            
            # Check if flash_classification and ground_truth are identical BEFORE conversion
            identical_before_conversion = (df['flash_classification'] == df['ground_truth']).sum()
            print(f"   Flash vs Ground Truth identical BEFORE conversion: {identical_before_conversion}/{len(df)}")
            
            if identical_before_conversion == len(df):
                print("   🚨 CRITICAL ERROR: flash_classification is identical to ground_truth!")
                print("   This means your ground truth data IS the Flash predictions, not human annotations.")
                print("   Check your data pipeline - Meta table might contain Flash results instead of human labels.")
                
                # Let's check a few individual records to confirm
                print("   📋 Sample records to verify:")
                for i in range(min(3, len(df))):
                    row = df.iloc[i]
                    print(f"      Record {i+1}: artifact_id={row['artifact_id']}, flash={row['flash_classification']}, ground_truth={row['ground_truth']}")
            
            # Convert to binary format
            df['ground_truth_binary'] = (df['ground_truth'] == 100).astype(int)
            df['flash_binary'] = (df['flash_classification'] == 100).astype(int)
            
            # Convert 2.5 Pro predictions based on "would reach same conclusion"
            df['pro_25_prediction'] = np.where(
                df['pro_would_reach_same_conclusion'] == True,
                df['flash_binary'],  # Same as Flash
                1 - df['flash_binary']  # Opposite of Flash
            ).astype(int)
            
            print(f"📊 Data distribution:")
            print(f"   Ground truth: {df['ground_truth_binary'].value_counts().to_dict()}")
            print(f"   Flash predictions: {df['flash_binary'].value_counts().to_dict()}")
            print(f"   2.5 Pro predictions: {df['pro_25_prediction'].value_counts().to_dict()}")
            print(f"   Judge agreement rate: {df['pro_judge_agreement'].mean():.1%}")
            
            # Validate binary data
            for col, name in [('ground_truth_binary', 'Ground truth'), 
                            ('flash_binary', 'Flash'), 
                            ('pro_25_prediction', '2.5 Pro')]:
                if not all(df[col].isin([0, 1])):
                    print(f"⚠️ Warning: {name} contains non-binary values")
                    df = df[df[col].isin([0, 1])]
            
            print(f"📊 Final clean dataset: {len(df)} records")
            
            # Debug sample comparisons
            print(f"\n🔍 DEBUG - Sample comparisons (first 5 records):")
            print(f"   Columns available: {df.columns.tolist()}")
            for i in range(min(5, len(df))):
                row = df.iloc[i]
                print(f"   Record {i+1}:")
                print(f"      artifact_id: {row['artifact_id']}")
                print(f"      ground_truth (from Meta): {row['ground_truth']} -> binary: {row['ground_truth_binary']}")
                print(f"      flash_classification: {row['flash_classification']} -> binary: {row['flash_binary']}")
                print(f"      pro_would_reach_same_conclusion: {row['pro_would_reach_same_conclusion']}")
                print(f"      pro_25_prediction: {row['pro_25_prediction']}")
                print(f"      Flash vs Ground Truth: {'✅ MATCH' if row['flash_binary'] == row['ground_truth_binary'] else '❌ DIFFER'}")
                print(f"      ---")
            
            # Check if Flash predictions are identical to ground truth
            flash_matches_gt = (df['flash_binary'] == df['ground_truth_binary']).sum()
            print(f"\n🚨 CRITICAL CHECK:")
            print(f"   Flash predictions matching ground truth: {flash_matches_gt}/{len(df)} ({flash_matches_gt/len(df):.1%})")
            
            if flash_matches_gt == len(df):
                print("   🚨 ERROR: Flash predictions are 100% identical to ground truth!")
                print("   This suggests the ground truth data is actually Flash's predictions, not human annotations.")
                print("   Check your data pipeline - the 'ground_truth' field might be populated with Flash results.")
            elif flash_matches_gt > len(df) * 0.95:
                print("   ⚠️ WARNING: Flash predictions are suspiciously similar to ground truth (>95% match)")
                print("   This might indicate data contamination.")
            else:
                print("   ✅ Good: Flash predictions differ from ground truth as expected.")
            
            return df
            
        except Exception as e:
            print(f"❌ Error loading data: {e}")
            import traceback
            traceback.print_exc()
            return pd.DataFrame()
    
    def calculate_classification_metrics(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Calculate comprehensive classification metrics for both models"""
        print("📊 Calculating classification metrics...")
        
        if len(df) == 0:
            return {}
        
        y_true = df['ground_truth_binary'].values
        flash_pred = df['flash_binary'].values  
        pro_pred = df['pro_25_prediction'].values
        
        # Flash metrics - Compare Flash predictions with ground truth
        flash_metrics = self._calculate_model_metrics(y_true, flash_pred, "Flash")
        
        # 2.5 Pro metrics - Compare 2.5 Pro predictions with ground truth  
        pro_metrics = self._calculate_model_metrics(y_true, pro_pred, "Gemini 2.5 Pro")
        
        # Agreement analysis
        flash_vs_ground_truth_agreement = (flash_pred == y_true).mean()
        pro_vs_ground_truth_agreement = (pro_pred == y_true).mean()
        judge_agreement_rate = df['pro_judge_agreement'].mean()
        
        # Cross-analysis: When do they agree/disagree with each other and ground truth?
        both_correct = ((flash_pred == y_true) & (pro_pred == y_true)).sum()
        both_wrong = ((flash_pred != y_true) & (pro_pred != y_true)).sum()
        flash_right_pro_wrong = ((flash_pred == y_true) & (pro_pred != y_true)).sum()
        pro_right_flash_wrong = ((flash_pred != y_true) & (pro_pred == y_true)).sum()
        
        cross_analysis = {
            'both_correct': int(both_correct),
            'both_wrong': int(both_wrong), 
            'flash_right_pro_wrong': int(flash_right_pro_wrong),
            'pro_right_flash_wrong': int(pro_right_flash_wrong),
            'total': len(df)
        }
        
        print(f"✅ Metrics calculated:")
        print(f"   Flash F1: {flash_metrics['f1_score']:.3f}")
        print(f"   2.5 Pro F1: {pro_metrics['f1_score']:.3f}")
        print(f"   Judge agreement: {judge_agreement_rate:.1%}")
        
        return {
            'dataset_info': {
                'total_samples': len(df),
                'positive_samples': int((y_true == 1).sum()),
                'negative_samples': int((y_true == 0).sum()),
                'class_balance': float((y_true == 1).mean())
            },
            'flash_metrics': flash_metrics,
            'pro_25_metrics': pro_metrics,
            'agreement_analysis': {
                'flash_vs_ground_truth': flash_vs_ground_truth_agreement,
                'pro_vs_ground_truth': pro_vs_ground_truth_agreement,
                'pro_judge_agreement_with_flash': judge_agreement_rate,
                'cross_analysis': cross_analysis
            },
            'model_comparison': {
                'f1_difference': pro_metrics['f1_score'] - flash_metrics['f1_score'],
                'accuracy_difference': pro_metrics['accuracy'] - flash_metrics['accuracy'],
                'precision_difference': pro_metrics['precision'] - flash_metrics['precision'],
                'recall_difference': pro_metrics['recall'] - flash_metrics['recall'],
                'better_f1': 'Gemini 2.5 Pro' if pro_metrics['f1_score'] > flash_metrics['f1_score'] else 'Flash'
            }
        }
    
    def _calculate_model_metrics(self, y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> Dict[str, Any]:
        """Calculate detailed metrics for a single model"""
        
        # Ensure binary values
        y_true = np.array(y_true).astype(int)
        y_pred = np.array(y_pred).astype(int)
        
        # Validate binary values
        if not (set(np.unique(y_true)) <= {0, 1} and set(np.unique(y_pred)) <= {0, 1}):
            print(f"⚠️ Warning: Non-binary values detected in {model_name}")
            print(f"   y_true unique: {np.unique(y_true)}")
            print(f"   y_pred unique: {np.unique(y_pred)}")
            # Force to binary
            y_true = np.clip(y_true, 0, 1)
            y_pred = np.clip(y_pred, 0, 1)
        
        accuracy = accuracy_score(y_true, y_pred)
        
        # Handle edge cases where only one class is predicted
        unique_pred = np.unique(y_pred)
        unique_true = np.unique(y_true)
        
        if len(unique_pred) == 1 or len(unique_true) == 1:
            # Use macro average for edge cases
            precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
            recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        else:
            precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
            recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
        
        # Confusion matrix - ensure we get 2x2 matrix
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()
        elif cm.shape == (1, 1):
            # Handle case where only one class exists
            if unique_true[0] == 0 and unique_pred[0] == 0:
                tn, fp, fn, tp = cm[0,0], 0, 0, 0
            elif unique_true[0] == 1 and unique_pred[0] == 1:
                tn, fp, fn, tp = 0, 0, 0, cm[0,0]
            else:
                tn, fp, fn, tp = 0, cm[0,0], 0, 0
        else:
            print(f"⚠️ Unexpected confusion matrix shape for {model_name}: {cm.shape}")
            tn = fp = fn = tp = 0
        
        # Additional metrics
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # Same as recall
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0  # Negative Predictive Value
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0  # Positive Predictive Value (same as precision)
        
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # False Negative Rate
        
        balanced_accuracy = (sensitivity + specificity) / 2
        
        # Matthews Correlation Coefficient
        mcc_num = (tp * tn) - (fp * fn)
        mcc_den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        mcc = mcc_num / mcc_den if mcc_den > 0 else 0
        
        return {
            'model_name': model_name,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'specificity': specificity,
            'sensitivity': sensitivity,
            'balanced_accuracy': balanced_accuracy,
            'negative_predictive_value': npv,
            'positive_predictive_value': ppv,
            'false_positive_rate': fpr,
            'false_negative_rate': fnr,
            'matthews_correlation': mcc,
            'confusion_matrix': {
                'true_positives': int(tp),
                'false_positives': int(fp), 
                'true_negatives': int(tn),
                'false_negatives': int(fn)
            }
        }
    
    def create_classification_dashboard(self, metrics: Dict[str, Any], df: pd.DataFrame) -> str:
        """Create comprehensive classification metrics dashboard"""
        
        if not metrics:
            return "<html><body><h1>No valid metrics to display</h1></body></html>"
        
        flash_metrics = metrics['flash_metrics']
        pro_metrics = metrics['pro_25_metrics']
        dataset_info = metrics['dataset_info']
        agreement = metrics['agreement_analysis']
        comparison = metrics['model_comparison']
        
        flash_cm = flash_metrics['confusion_matrix']
        pro_cm = pro_metrics['confusion_matrix']
        
        # Performance assessment
        if pro_metrics['f1_score'] >= 0.8:
            pro_assessment = "🟢 EXCELLENT"
            pro_color = "#10B981"
        elif pro_metrics['f1_score'] >= 0.7:
            pro_assessment = "🟡 GOOD" 
            pro_color = "#F59E0B"
        elif pro_metrics['f1_score'] >= 0.6:
            pro_assessment = "🟠 MODERATE"
            pro_color = "#FF6B35"
        else:
            pro_assessment = "🔴 POOR"
            pro_color = "#EF4444"
        
        html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Gemini 2.5 Pro Classification Metrics Dashboard</title>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/4.4.0/chart.min.js"></script>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: 'Inter', sans-serif; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: #333; line-height: 1.6; min-height: 100vh; }}
        .dashboard {{ max-width: 1800px; margin: 0 auto; padding: 20px; }}
        .header {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 25px; text-align: center; margin-bottom: 30px; backdrop-filter: blur(15px); box-shadow: 0 20px 60px rgba(0,0,0,0.1); }}
        .header h1 {{ font-size: 3rem; margin-bottom: 15px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; }}
        .header h2 {{ font-size: 1.5rem; color: #666; margin-bottom: 20px; }}
        .model-badge {{ display: inline-block; padding: 10px 25px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 30px; font-weight: bold; font-size: 1.1rem; margin: 5px; }}
        .metrics-overview {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 20px; margin-bottom: 40px; }}
        .metric-card {{ background: rgba(255,255,255,0.95); padding: 25px; border-radius: 20px; text-align: center; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); transition: transform 0.3s ease; }}
        .metric-card:hover {{ transform: translateY(-5px); }}
        .metric-value {{ font-size: 2.5rem; font-weight: bold; margin-bottom: 10px; }}
        .metric-label {{ font-size: 1rem; color: #666; font-weight: 600; margin-bottom: 5px; }}
        .metric-description {{ font-size: 0.85rem; color: #888; }}
        .excellent {{ color: #10B981; }}
        .good {{ color: #F59E0B; }}
        .moderate {{ color: #FF6B35; }}
        .poor {{ color: #EF4444; }}
        .dashboard-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(450px, 1fr)); gap: 25px; margin-bottom: 40px; }}
        .chart-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); }}
        .chart-title {{ font-size: 1.3rem; font-weight: bold; margin-bottom: 20px; color: #333; text-align: center; }}
        .chart-container {{ position: relative; height: 300px; margin-bottom: 15px; }}
        .confusion-matrix {{ display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin: 20px 0; text-align: center; font-size: 0.9rem; }}
        .cm-cell {{ padding: 15px; border-radius: 12px; font-weight: bold; display: flex; flex-direction: column; justify-content: center; align-items: center; }}
        .cm-header {{ background: linear-gradient(135deg, #f8f9fa, #e9ecef); color: #495057; }}
        .cm-tp {{ background: linear-gradient(135deg, #d4edda, #c3e6cb); color: #155724; font-size: 1.8rem; }}
        .cm-fp {{ background: linear-gradient(135deg, #f8d7da, #f1b0b7); color: #721c24; font-size: 1.8rem; }}
        .cm-fn {{ background: linear-gradient(135deg, #fff3cd, #fce4a6); color: #856404; font-size: 1.8rem; }}
        .cm-tn {{ background: linear-gradient(135deg, #d1ecf1, #bee5eb); color: #0c5460; font-size: 1.8rem; }}
        .cm-label {{ font-size: 0.75rem; margin-top: 5px; opacity: 0.8; }}
        .performance-summary {{ background: linear-gradient(135deg, #e3f2fd, #bbdefb); padding: 30px; border-radius: 20px; margin: 30px 0; text-align: center; }}
        .performance-score {{ font-size: 3.5rem; font-weight: bold; color: {pro_color}; margin-bottom: 10px; }}
        .performance-label {{ font-size: 1.3rem; color: #37474f; margin-bottom: 8px; }}
        .performance-description {{ font-size: 1rem; color: #546e7a; }}
        .comparison-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); margin-bottom: 30px; }}
        .comparison-grid {{ display: grid; grid-template-columns: 1fr 1fr; gap: 30px; margin-top: 20px; }}
        .model-column {{ padding: 20px; border-radius: 15px; }}
        .flash-column {{ background: linear-gradient(135deg, #e8f5e8, #d4edda); }}
        .pro-column {{ background: linear-gradient(135deg, #e3f2fd, #bbdefb); }}
        .model-title {{ font-size: 1.5rem; font-weight: bold; text-align: center; margin-bottom: 20px; }}
        .metric-row {{ display: flex; justify-content: space-between; margin: 8px 0; padding: 8px; background: rgba(255,255,255,0.5); border-radius: 8px; }}
    </style>
</head>
<body>
    <div class="dashboard">
        <div class="header">
            <h1>🎯 Classification Metrics Dashboard</h1>
            <h2>Gemini Models vs Meta Ground Truth</h2>
            <div>
                <span class="model-badge">Flash Model</span>
                <span class="model-badge">Gemini 2.5 Pro</span>
            </div>
            <p style="margin-top: 20px; color: #666;">
                Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | 
                Samples: {dataset_info['total_samples']:,} | 
                Positive: {dataset_info['positive_samples']} | Negative: {dataset_info['negative_samples']}
            </p>
        </div>

        <div class="performance-summary">
            <div class="performance-score">{pro_metrics['f1_score']:.3f}</div>
            <div class="performance-label">{pro_assessment} F1-Score (Gemini 2.5 Pro)</div>
            <div class="performance-description">
                2.5 Pro achieves {pro_metrics['accuracy']:.1%} accuracy with {pro_metrics['precision']:.1%} precision
                <br>Judge Agreement with Flash: {agreement['pro_judge_agreement_with_flash']:.1%}
            </div>
        </div>

        <div class="metrics-overview">
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['f1_score'] >= 0.8 else 'good' if pro_metrics['f1_score'] >= 0.7 else 'moderate' if pro_metrics['f1_score'] >= 0.6 else 'poor'}">{pro_metrics['f1_score']:.3f}</div>
                <div class="metric-label">2.5 Pro F1-Score</div>
                <div class="metric-description">Harmonic Mean P&R</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['recall'] >= 0.8 else 'good' if pro_metrics['recall'] >= 0.7 else 'moderate' if pro_metrics['recall'] >= 0.6 else 'poor'}">{pro_metrics['recall']:.3f}</div>
                <div class="metric-label">2.5 Pro TPR</div>
                <div class="metric-description">True Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['specificity'] >= 0.8 else 'good' if pro_metrics['specificity'] >= 0.7 else 'moderate' if pro_metrics['specificity'] >= 0.6 else 'poor'}">{pro_metrics['specificity']:.3f}</div>
                <div class="metric-label">2.5 Pro TNR</div>
                <div class="metric-description">True Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['false_positive_rate'] <= 0.2 else 'good' if pro_metrics['false_positive_rate'] <= 0.3 else 'moderate' if pro_metrics['false_positive_rate'] <= 0.4 else 'poor'}">{pro_metrics['false_positive_rate']:.3f}</div>
                <div class="metric-label">2.5 Pro FPR</div>
                <div class="metric-description">False Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['false_negative_rate'] <= 0.2 else 'good' if pro_metrics['false_negative_rate'] <= 0.3 else 'moderate' if pro_metrics['false_negative_rate'] <= 0.4 else 'poor'}">{pro_metrics['false_negative_rate']:.3f}</div>
                <div class="metric-label">2.5 Pro FNR</div>
                <div class="metric-description">False Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['accuracy'] >= 0.9 else 'good' if pro_metrics['accuracy'] >= 0.8 else 'moderate' if pro_metrics['accuracy'] >= 0.7 else 'poor'}">{pro_metrics['accuracy']:.3f}</div>
                <div class="metric-label">2.5 Pro Accuracy</div>
                <div class="metric-description">Overall Correctness</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['precision'] >= 0.8 else 'good' if pro_metrics['precision'] >= 0.7 else 'moderate' if pro_metrics['precision'] >= 0.6 else 'poor'}">{pro_metrics['precision']:.3f}</div>
                <div class="metric-label">2.5 Pro Precision</div>
                <div class="metric-description">Positive Predictive Value</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if agreement['pro_judge_agreement_with_flash'] >= 0.8 else 'good' if agreement['pro_judge_agreement_with_flash'] >= 0.7 else 'moderate' if agreement['pro_judge_agreement_with_flash'] >= 0.6 else 'poor'}">{agreement['pro_judge_agreement_with_flash']:.1%}</div>
                <div class="metric-label">Judge Agreement</div>
                <div class="metric-description">2.5 Pro agrees with Flash</div>
            </div>
        </div>

        <div class="comparison-section">
            <div class="chart-title">📊 Gemini 2.5 Pro Performance Metrics</div>
            <div style="max-width: 600px; margin: 0 auto;">
                <div class="model-column pro-column">
                    <div class="model-title">🧠 Gemini 2.5 Pro vs Ground Truth</div>
                    <div class="metric-row"><span>F1-Score:</span><span class="{'excellent' if pro_metrics['f1_score'] >= 0.8 else 'good' if pro_metrics['f1_score'] >= 0.7 else 'moderate' if pro_metrics['f1_score'] >= 0.6 else 'poor'}">{pro_metrics['f1_score']:.3f}</span></div>
                    <div class="metric-row"><span>Accuracy:</span><span>{pro_metrics['accuracy']:.3f}</span></div>
                    <div class="metric-row"><span>Precision:</span><span>{pro_metrics['precision']:.3f}</span></div>
                    <div class="metric-row"><span>Recall (TPR):</span><span>{pro_metrics['recall']:.3f}</span></div>
                    <div class="metric-row"><span>Specificity (TNR):</span><span>{pro_metrics['specificity']:.3f}</span></div>
                    <div class="metric-row"><span>False Positive Rate:</span><span>{pro_metrics['false_positive_rate']:.3f}</span></div>
                    <div class="metric-row"><span>False Negative Rate:</span><span>{pro_metrics['false_negative_rate']:.3f}</span></div>
                    <div class="metric-row"><span>Judge Agreement with Flash:</span><span>{agreement['pro_judge_agreement_with_flash']:.1%}</span></div>
                </div>
            </div>
        </div>

        <div class="dashboard-grid">
            <div class="chart-section">
                <div class="chart-title">🧠 Gemini 2.5 Pro Confusion Matrix</div>
                <div class="confusion-matrix">
                    <div class="cm-cell cm-header"></div>
                    <div class="cm-cell cm-header">Ground Truth: 0</div>
                    <div class="cm-cell cm-header">Ground Truth: 1</div>
                    
                    <div class="cm-cell cm-header">2.5 Pro: 0</div>
                    <div class="cm-cell cm-tn">
                        {pro_cm['true_negatives']}
                        <div class="cm-label">True Negatives</div>
                    </div>
                    <div class="cm-cell cm-fn">
                        {pro_cm['false_negatives']}
                        <div class="cm-label">False Negatives</div>
                    </div>
                    
                    <div class="cm-cell cm-header">2.5 Pro: 1</div>
                    <div class="cm-cell cm-fp">
                        {pro_cm['false_positives']}
                        <div class="cm-label">False Positives</div>
                    </div>
                    <div class="cm-cell cm-tp">
                        {pro_cm['true_positives']}
                        <div class="cm-label">True Positives</div>
                    </div>
                </div>
                <div style="text-align: center; margin-top: 15px;">
                    <p><strong>TP:</strong> {pro_cm['true_positives']} | <strong>FP:</strong> {pro_cm['false_positives']} | <strong>TN:</strong> {pro_cm['true_negatives']} | <strong>FN:</strong> {pro_cm['false_negatives']}</p>
                </div>
            </div>
        </div>
    </div>

    <script>
        window.addEventListener('load', function() {{
            console.log('Gemini 2.5 Pro Classification Dashboard Loaded');
        }});
    </script>
</body>
</html>"""
        
        return html_content
    
    def generate_classification_dashboard(self):
        """Main function to generate the classification dashboard"""
        print("🚀 STARTING CLASSIFICATION METRICS DASHBOARD GENERATION")
        print("=" * 70)
        
        try:
            # Load results with ground truth
            df = self.load_results_with_ground_truth()
            
            if df.empty:
                print("❌ No data available for dashboard")
                return None
            
            # Calculate metrics
            metrics = self.calculate_classification_metrics(df)
            
            if not metrics:
                print("❌ Could not calculate metrics")
                return None
            
            # Create dashboard
            print("\n📊 Creating Classification Dashboard...")
            dashboard_html = self.create_classification_dashboard(metrics, df)
            
            # Save files
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            dashboard_filename = f"gemini_classification_dashboard_{timestamp}.html"
            metrics_filename = f"classification_metrics_{timestamp}.json"
            
            with open(dashboard_filename, "w", encoding='utf-8') as f:
                f.write(dashboard_html)
            
            with open(metrics_filename, "w") as f:
                json.dump(metrics, f, indent=2, default=str)
            
            print(f"✅ Dashboard saved: {dashboard_filename}")
            print(f"✅ Metrics saved: {metrics_filename}")
            
            # Summary
            flash_metrics = metrics['flash_metrics']
            pro_metrics = metrics['pro_25_metrics']
            agreement = metrics['agreement_analysis']
            
            print(f"\n📊 CLASSIFICATION METRICS SUMMARY:")
            print(f"   📊 Flash Model:")
            print(f"      F1-Score: {flash_metrics['f1_score']:.3f}")
            print(f"      Precision: {flash_metrics['precision']:.3f}")
            print(f"      Recall: {flash_metrics['recall']:.3f}")
            print(f"      Accuracy: {flash_metrics['accuracy']:.3f}")
            print(f"      TP: {flash_metrics['confusion_matrix']['true_positives']}, FP: {flash_metrics['confusion_matrix']['false_positives']}")
            print(f"      TN: {flash_metrics['confusion_matrix']['true_negatives']}, FN: {flash_metrics['confusion_matrix']['false_negatives']}")
            
            print(f"\n   🧠 Gemini 2.5 Pro:")
            print(f"      F1-Score: {pro_metrics['f1_score']:.3f}")
            print(f"      Precision: {pro_metrics['precision']:.3f}")
            print(f"      Recall: {pro_metrics['recall']:.3f}") 
            print(f"      Accuracy: {pro_metrics['accuracy']:.3f}")
            print(f"      TP: {pro_metrics['confusion_matrix']['true_positives']}, FP: {pro_metrics['confusion_matrix']['false_positives']}")
            print(f"      TN: {pro_metrics['confusion_matrix']['true_negatives']}, FN: {pro_metrics['confusion_matrix']['false_negatives']}")
            
            print(f"\n   ⚖️ Agreement Rates:")
            print(f"      Flash vs Ground Truth: {agreement['flash_vs_ground_truth']:.1%}")
            print(f"      2.5 Pro vs Ground Truth: {agreement['pro_vs_ground_truth']:.1%}")
            print(f"      2.5 Pro Judge Agreement: {agreement['pro_judge_agreement_with_flash']:.1%}")
            
            return dashboard_filename, metrics_filename
            
        except Exception as e:
            print(f"❌ Dashboard generation failed: {e}")
            import traceback
            traceback.print_exc()
            return None

def main():
    """Run the classification dashboard generation"""
    dashboard_generator = Gemini25ProClassificationDashboard()
    result = dashboard_generator.generate_classification_dashboard()
    
    if result:
        dashboard_file, metrics_file = result
        print(f"\n🎉 SUCCESS!")
        print(f"📁 Files generated:")
        print(f"   1. {dashboard_file}")
        print(f"   2. {metrics_file}")
        print(f"\n💡 Open {dashboard_file} in your browser to view the dashboard!")
    else:
        print(f"\n❌ Dashboard generation failed")

if __name__ == "__main__":
    main()

🚀 STARTING CLASSIFICATION METRICS DASHBOARD GENERATION
📥 Loading Gemini 2.5 Pro results with ground truth...
📊 Loading results data...
   Found 299 result records
📊 Loading ground truth data...
   Found 299 ground truth records
   Ground truth distribution from Meta table: {100: 187, 0: 112}
✅ Merged dataset: 299 records
   Sample artifact_ids from results: ['meta:aWdfbWVkaWFfM3B2OjE3OTUxMjQ0NTg3OTc2MTk3', 'meta:aWdfbWVkaWFfM3B2OjE4MDQ1MjA0Njk4MjMzMDYz', 'meta:aWdfbWVkaWFfM3B2OjE4MDQ5OTgyNjc1MTMzMjM5', 'meta:993741799587060', 'meta:aWdfbWVkaWFfM3B2OjE4MDYwMjIyMDk0OTgxNzk5']
   Sample artifact_ids from meta: ['meta:122140809938790694', 'meta:4050755845195196', 'meta:1173810271458869', 'meta:1297387721748313', 'meta:1138085658345605']
   Sample flash_classification: [0, 0, 0, 0, 0]
   Sample ground_truth after merge: [0, 0, 0, 0, 0]
   Flash vs Ground Truth identical BEFORE conversion: 299/299
   🚨 CRITICAL ERROR: flash_classification is identical to ground_truth!
   This means your grou

In [51]:
#!/usr/bin/env python3
"""
Gemini 1.5 Pro Classification Metrics Dashboard - Real Results vs Ground Truth
Calculates traditional ML metrics (F1, Precision, Recall, Confusion Matrix) for 1.5 Pro decisions
"""

import pandas as pd
import json
from google.cloud import bigquery
from datetime import datetime
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
import numpy as np
from typing import Dict, List, Any, Optional

PROJECT_ID = "scope3-dev"
DATASET_ID = "research_bs_monitoring"
META_TABLE = "BA_Meta_Ground_Truth"
RESULTS_TABLE = "BA_Meta_Gemini_Pro_Judge_Results"  # Gemini 1.5 Pro results table

class Gemini15ProClassificationDashboard:
    def __init__(self):
        self.client = bigquery.Client(project=PROJECT_ID)
        
    def load_results_with_ground_truth(self) -> pd.DataFrame:
        """Load 1.5 Pro results with real ground truth from Meta table"""
        print("📥 Loading Gemini 1.5 Pro results with ground truth...")
        
        # First, let's get the results data
        results_query = f"""
        SELECT 
            artifact_id,
            flash_classification,
            flash_reasoning,
            model_prompt,
            pro_judge_agreement,
            pro_verdict,
            pro_confidence,
            pro_would_reach_same_conclusion,
            pro_reasoning,
            flash_vs_pro_analysis,
            improvements,
            api_call_time,
            model_used,
            error_message
        FROM `{PROJECT_ID}.{DATASET_ID}.{RESULTS_TABLE}`
        WHERE pro_judge_agreement IS NOT NULL
        AND pro_would_reach_same_conclusion IS NOT NULL
        AND (error_message IS NULL OR error_message = '')
        """
        
        # Get real ground truth from Meta table
        meta_query = f"""
        SELECT 
            artifact_id,
            correct_classification as ground_truth,
            correct_reasoning as ground_truth_reasoning,
            source
        FROM `{PROJECT_ID}.{DATASET_ID}.{META_TABLE}`
        """
        
        try:
            print("📊 Loading Gemini 1.5 Pro results data...")
            results_df = self.client.query(results_query).to_dataframe()
            print(f"   Found {len(results_df)} result records")
            
            print("📊 Loading ground truth data...")
            meta_df = self.client.query(meta_query).to_dataframe()
            print(f"   Found {len(meta_df)} ground truth records")
            
            # Debug: Check ground truth distribution
            if len(meta_df) > 0:
                gt_dist = meta_df['ground_truth'].value_counts().to_dict()
                print(f"   Ground truth distribution from Meta table: {gt_dist}")
            
            # Join results with ground truth
            df = results_df.merge(meta_df, on='artifact_id', how='inner')
            print(f"✅ Merged dataset: {len(df)} records")
            
            if len(df) == 0:
                print("❌ No matching records found after merge")
                return pd.DataFrame()
            
            # Debug: Check what we actually got from the merge
            print(f"   Sample artifact_ids from results: {results_df['artifact_id'].head().tolist()}")
            print(f"   Sample artifact_ids from meta: {meta_df['artifact_id'].head().tolist()}")
            print(f"   Sample flash_classification: {results_df['flash_classification'].head().tolist()}")
            print(f"   Sample ground_truth after merge: {df['ground_truth'].head().tolist()}")
            
            # Check if flash_classification and ground_truth are identical BEFORE conversion
            identical_before_conversion = (df['flash_classification'] == df['ground_truth']).sum()
            print(f"   Flash vs Ground Truth identical BEFORE conversion: {identical_before_conversion}/{len(df)}")
            
            if identical_before_conversion == len(df):
                print("   🚨 CRITICAL ERROR: flash_classification is identical to ground_truth!")
                print("   This means your ground truth data IS the Flash predictions, not human annotations.")
                print("   Check your data pipeline - Meta table might contain Flash results instead of human labels.")
                
                # Let's check a few individual records to confirm
                print("   📋 Sample records to verify:")
                for i in range(min(3, len(df))):
                    row = df.iloc[i]
                    print(f"      Record {i+1}: artifact_id={row['artifact_id']}, flash={row['flash_classification']}, ground_truth={row['ground_truth']}")
            
            # Convert to binary format
            df['ground_truth_binary'] = (df['ground_truth'] == 100).astype(int)
            df['flash_binary'] = (df['flash_classification'] == 100).astype(int)
            
            # Convert 1.5 Pro "would reach same conclusion" to predictions
            df['pro_15_prediction'] = np.where(
                df['pro_would_reach_same_conclusion'] == True,
                df['flash_binary'],  # Same as Flash
                1 - df['flash_binary']  # Opposite of Flash
            ).astype(int)
            
            print(f"📊 Data distribution:")
            print(f"   Ground truth: {df['ground_truth_binary'].value_counts().to_dict()}")
            print(f"   Flash predictions: {df['flash_binary'].value_counts().to_dict()}")
            print(f"   1.5 Pro predictions: {df['pro_15_prediction'].value_counts().to_dict()}")
            print(f"   Judge agreement rate: {df['pro_judge_agreement'].mean():.1%}")
            
            # Validate binary data
            for col, name in [('ground_truth_binary', 'Ground truth'), 
                            ('flash_binary', 'Flash'), 
                            ('pro_15_prediction', '1.5 Pro')]:
                if not all(df[col].isin([0, 1])):
                    print(f"⚠️ Warning: {name} contains non-binary values")
                    df = df[df[col].isin([0, 1])]
            
            print(f"📊 Final clean dataset: {len(df)} records")
            
            # Debug sample comparisons
            print(f"\n🔍 DEBUG - Sample comparisons (first 5 records):")
            print(f"   Columns available: {df.columns.tolist()}")
            for i in range(min(5, len(df))):
                row = df.iloc[i]
                print(f"   Record {i+1}:")
                print(f"      artifact_id: {row['artifact_id']}")
                print(f"      ground_truth (from Meta): {row['ground_truth']} -> binary: {row['ground_truth_binary']}")
                print(f"      flash_classification: {row['flash_classification']} -> binary: {row['flash_binary']}")
                print(f"      pro_would_reach_same_conclusion: {row['pro_would_reach_same_conclusion']}")
                print(f"      pro_15_prediction: {row['pro_15_prediction']}")
                print(f"      Flash vs Ground Truth: {'✅ MATCH' if row['flash_binary'] == row['ground_truth_binary'] else '❌ DIFFER'}")
                print(f"      ---")
            
            # Check if Flash predictions are identical to ground truth
            flash_matches_gt = (df['flash_binary'] == df['ground_truth_binary']).sum()
            print(f"\n🚨 CRITICAL CHECK:")
            print(f"   Flash predictions matching ground truth: {flash_matches_gt}/{len(df)} ({flash_matches_gt/len(df):.1%})")
            
            if flash_matches_gt == len(df):
                print("   🚨 ERROR: Flash predictions are 100% identical to ground truth!")
                print("   This suggests the ground truth data is actually Flash's predictions, not human annotations.")
                print("   Check your data pipeline - the 'ground_truth' field might be populated with Flash results.")
            elif flash_matches_gt > len(df) * 0.95:
                print("   ⚠️ WARNING: Flash predictions are suspiciously similar to ground truth (>95% match)")
                print("   This might indicate data contamination.")
            else:
                print("   ✅ Good: Flash predictions differ from ground truth as expected.")
            
            return df
            
        except Exception as e:
            print(f"❌ Error loading data: {e}")
            import traceback
            traceback.print_exc()
            return pd.DataFrame()
    
    def calculate_classification_metrics(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Calculate comprehensive classification metrics for 1.5 Pro"""
        print("📊 Calculating classification metrics...")
        
        if len(df) == 0:
            return {}
        
        y_true = df['ground_truth_binary'].values
        pro_pred = df['pro_15_prediction'].values
        
        # 1.5 Pro metrics - Compare 1.5 Pro predictions with ground truth  
        pro_metrics = self._calculate_model_metrics(y_true, pro_pred, "Gemini 1.5 Pro")
        
        # Agreement analysis
        pro_vs_ground_truth_agreement = (pro_pred == y_true).mean()
        judge_agreement_rate = df['pro_judge_agreement'].mean()
        
        print(f"✅ Metrics calculated:")
        print(f"   1.5 Pro F1: {pro_metrics['f1_score']:.3f}")
        print(f"   Judge agreement: {judge_agreement_rate:.1%}")
        
        return {
            'dataset_info': {
                'total_samples': len(df),
                'positive_samples': int((y_true == 1).sum()),
                'negative_samples': int((y_true == 0).sum()),
                'class_balance': float((y_true == 1).mean())
            },
            'pro_15_metrics': pro_metrics,
            'agreement_analysis': {
                'pro_vs_ground_truth': pro_vs_ground_truth_agreement,
                'pro_judge_agreement_with_flash': judge_agreement_rate
            }
        }
    
    def _calculate_model_metrics(self, y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> Dict[str, Any]:
        """Calculate detailed metrics for a single model"""
        
        # Ensure binary values
        y_true = np.array(y_true).astype(int)
        y_pred = np.array(y_pred).astype(int)
        
        # Validate binary values
        if not (set(np.unique(y_true)) <= {0, 1} and set(np.unique(y_pred)) <= {0, 1}):
            print(f"⚠️ Warning: Non-binary values detected in {model_name}")
            print(f"   y_true unique: {np.unique(y_true)}")
            print(f"   y_pred unique: {np.unique(y_pred)}")
            # Force to binary
            y_true = np.clip(y_true, 0, 1)
            y_pred = np.clip(y_pred, 0, 1)
        
        accuracy = accuracy_score(y_true, y_pred)
        
        # Handle edge cases where only one class is predicted
        unique_pred = np.unique(y_pred)
        unique_true = np.unique(y_true)
        
        if len(unique_pred) == 1 or len(unique_true) == 1:
            # Use macro average for edge cases
            precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
            recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        else:
            precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
            recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
        
        # Confusion matrix - ensure we get 2x2 matrix
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()
        elif cm.shape == (1, 1):
            # Handle case where only one class exists
            if unique_true[0] == 0 and unique_pred[0] == 0:
                tn, fp, fn, tp = cm[0,0], 0, 0, 0
            elif unique_true[0] == 1 and unique_pred[0] == 1:
                tn, fp, fn, tp = 0, 0, 0, cm[0,0]
            else:
                tn, fp, fn, tp = 0, cm[0,0], 0, 0
        else:
            print(f"⚠️ Unexpected confusion matrix shape for {model_name}: {cm.shape}")
            tn = fp = fn = tp = 0
        
        # Additional metrics
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # Same as recall
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0  # Negative Predictive Value
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0  # Positive Predictive Value (same as precision)
        
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # False Negative Rate
        
        balanced_accuracy = (sensitivity + specificity) / 2
        
        # Matthews Correlation Coefficient
        mcc_num = (tp * tn) - (fp * fn)
        mcc_den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        mcc = mcc_num / mcc_den if mcc_den > 0 else 0
        
        return {
            'model_name': model_name,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'specificity': specificity,
            'sensitivity': sensitivity,
            'balanced_accuracy': balanced_accuracy,
            'negative_predictive_value': npv,
            'positive_predictive_value': ppv,
            'false_positive_rate': fpr,
            'false_negative_rate': fnr,
            'matthews_correlation': mcc,
            'confusion_matrix': {
                'true_positives': int(tp),
                'false_positives': int(fp), 
                'true_negatives': int(tn),
                'false_negatives': int(fn)
            }
        }
    
    def create_classification_dashboard(self, metrics: Dict[str, Any], df: pd.DataFrame) -> str:
        """Create comprehensive classification metrics dashboard"""
        
        if not metrics:
            return "<html><body><h1>No valid metrics to display</h1></body></html>"
        
        pro_metrics = metrics['pro_15_metrics']
        dataset_info = metrics['dataset_info']
        agreement = metrics['agreement_analysis']
        
        pro_cm = pro_metrics['confusion_matrix']
        
        # Performance assessment
        if pro_metrics['f1_score'] >= 0.8:
            pro_assessment = "🟢 EXCELLENT"
            pro_color = "#10B981"
        elif pro_metrics['f1_score'] >= 0.7:
            pro_assessment = "🟡 GOOD" 
            pro_color = "#F59E0B"
        elif pro_metrics['f1_score'] >= 0.6:
            pro_assessment = "🟠 MODERATE"
            pro_color = "#FF6B35"
        else:
            pro_assessment = "🔴 POOR"
            pro_color = "#EF4444"
        
        html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Gemini 1.5 Pro Classification Metrics Dashboard</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: 'Inter', sans-serif; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: #333; line-height: 1.6; min-height: 100vh; }}
        .dashboard {{ max-width: 1800px; margin: 0 auto; padding: 20px; }}
        .header {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 25px; text-align: center; margin-bottom: 30px; backdrop-filter: blur(15px); box-shadow: 0 20px 60px rgba(0,0,0,0.1); }}
        .header h1 {{ font-size: 3rem; margin-bottom: 15px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; }}
        .header h2 {{ font-size: 1.5rem; color: #666; margin-bottom: 20px; }}
        .model-badge {{ display: inline-block; padding: 10px 25px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 30px; font-weight: bold; font-size: 1.1rem; margin: 5px; }}
        .metrics-overview {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 20px; margin-bottom: 40px; }}
        .metric-card {{ background: rgba(255,255,255,0.95); padding: 25px; border-radius: 20px; text-align: center; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); transition: transform 0.3s ease; }}
        .metric-card:hover {{ transform: translateY(-5px); }}
        .metric-value {{ font-size: 2.5rem; font-weight: bold; margin-bottom: 10px; }}
        .metric-label {{ font-size: 1rem; color: #666; font-weight: 600; margin-bottom: 5px; }}
        .metric-description {{ font-size: 0.85rem; color: #888; }}
        .excellent {{ color: #10B981; }}
        .good {{ color: #F59E0B; }}
        .moderate {{ color: #FF6B35; }}
        .poor {{ color: #EF4444; }}
        .dashboard-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(450px, 1fr)); gap: 25px; margin-bottom: 40px; }}
        .chart-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); }}
        .chart-title {{ font-size: 1.3rem; font-weight: bold; margin-bottom: 20px; color: #333; text-align: center; }}
        .confusion-matrix {{ display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin: 20px 0; text-align: center; font-size: 0.9rem; }}
        .cm-cell {{ padding: 15px; border-radius: 12px; font-weight: bold; display: flex; flex-direction: column; justify-content: center; align-items: center; }}
        .cm-header {{ background: linear-gradient(135deg, #f8f9fa, #e9ecef); color: #495057; }}
        .cm-tp {{ background: linear-gradient(135deg, #d4edda, #c3e6cb); color: #155724; font-size: 1.8rem; }}
        .cm-fp {{ background: linear-gradient(135deg, #f8d7da, #f1b0b7); color: #721c24; font-size: 1.8rem; }}
        .cm-fn {{ background: linear-gradient(135deg, #fff3cd, #fce4a6); color: #856404; font-size: 1.8rem; }}
        .cm-tn {{ background: linear-gradient(135deg, #d1ecf1, #bee5eb); color: #0c5460; font-size: 1.8rem; }}
        .cm-label {{ font-size: 0.75rem; margin-top: 5px; opacity: 0.8; }}
        .performance-summary {{ background: linear-gradient(135deg, #e3f2fd, #bbdefb); padding: 30px; border-radius: 20px; margin: 30px 0; text-align: center; }}
        .performance-score {{ font-size: 3.5rem; font-weight: bold; color: {pro_color}; margin-bottom: 10px; }}
        .performance-label {{ font-size: 1.3rem; color: #37474f; margin-bottom: 8px; }}
        .performance-description {{ font-size: 1rem; color: #546e7a; }}
        .comparison-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); margin-bottom: 30px; }}
        .model-column {{ padding: 20px; border-radius: 15px; }}
        .pro-column {{ background: linear-gradient(135deg, #e3f2fd, #bbdefb); }}
        .model-title {{ font-size: 1.5rem; font-weight: bold; text-align: center; margin-bottom: 20px; }}
        .metric-row {{ display: flex; justify-content: space-between; margin: 8px 0; padding: 8px; background: rgba(255,255,255,0.5); border-radius: 8px; }}
    </style>
</head>
<body>
    <div class="dashboard">
        <div class="header">
            <h1>🎯 Gemini 1.5 Pro Classification Metrics</h1>
            <h2>Performance vs Meta Ground Truth</h2>
            <div>
                <span class="model-badge">Gemini 1.5 Pro</span>
            </div>
            <p style="margin-top: 20px; color: #666;">
                Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | 
                Samples: {dataset_info['total_samples']:,} | 
                Positive: {dataset_info['positive_samples']} | Negative: {dataset_info['negative_samples']}
            </p>
        </div>

        <div class="performance-summary">
            <div class="performance-score">{pro_metrics['f1_score']:.3f}</div>
            <div class="performance-label">{pro_assessment} F1-Score (Gemini 1.5 Pro)</div>
            <div class="performance-description">
                1.5 Pro achieves {pro_metrics['accuracy']:.1%} accuracy with {pro_metrics['precision']:.1%} precision
                <br>Judge Agreement with Flash: {agreement['pro_judge_agreement_with_flash']:.1%}
            </div>
        </div>

        <div class="metrics-overview">
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['f1_score'] >= 0.8 else 'good' if pro_metrics['f1_score'] >= 0.7 else 'moderate' if pro_metrics['f1_score'] >= 0.6 else 'poor'}">{pro_metrics['f1_score']:.3f}</div>
                <div class="metric-label">1.5 Pro F1-Score</div>
                <div class="metric-description">Harmonic Mean P&R</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['recall'] >= 0.8 else 'good' if pro_metrics['recall'] >= 0.7 else 'moderate' if pro_metrics['recall'] >= 0.6 else 'poor'}">{pro_metrics['recall']:.3f}</div>
                <div class="metric-label">1.5 Pro TPR</div>
                <div class="metric-description">True Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['specificity'] >= 0.8 else 'good' if pro_metrics['specificity'] >= 0.7 else 'moderate' if pro_metrics['specificity'] >= 0.6 else 'poor'}">{pro_metrics['specificity']:.3f}</div>
                <div class="metric-label">1.5 Pro TNR</div>
                <div class="metric-description">True Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['false_positive_rate'] <= 0.2 else 'good' if pro_metrics['false_positive_rate'] <= 0.3 else 'moderate' if pro_metrics['false_positive_rate'] <= 0.4 else 'poor'}">{pro_metrics['false_positive_rate']:.3f}</div>
                <div class="metric-label">1.5 Pro FPR</div>
                <div class="metric-description">False Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['false_negative_rate'] <= 0.2 else 'good' if pro_metrics['false_negative_rate'] <= 0.3 else 'moderate' if pro_metrics['false_negative_rate'] <= 0.4 else 'poor'}">{pro_metrics['false_negative_rate']:.3f}</div>
                <div class="metric-label">1.5 Pro FNR</div>
                <div class="metric-description">False Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['accuracy'] >= 0.9 else 'good' if pro_metrics['accuracy'] >= 0.8 else 'moderate' if pro_metrics['accuracy'] >= 0.7 else 'poor'}">{pro_metrics['accuracy']:.3f}</div>
                <div class="metric-label">1.5 Pro Accuracy</div>
                <div class="metric-description">Overall Correctness</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if pro_metrics['precision'] >= 0.8 else 'good' if pro_metrics['precision'] >= 0.7 else 'moderate' if pro_metrics['precision'] >= 0.6 else 'poor'}">{pro_metrics['precision']:.3f}</div>
                <div class="metric-label">1.5 Pro Precision</div>
                <div class="metric-description">Positive Predictive Value</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if agreement['pro_judge_agreement_with_flash'] >= 0.8 else 'good' if agreement['pro_judge_agreement_with_flash'] >= 0.7 else 'moderate' if agreement['pro_judge_agreement_with_flash'] >= 0.6 else 'poor'}">{agreement['pro_judge_agreement_with_flash']:.1%}</div>
                <div class="metric-label">Judge Agreement</div>
                <div class="metric-description">1.5 Pro agrees with Flash</div>
            </div>
        </div>

        <div class="comparison-section">
            <div class="chart-title">📊 Gemini 1.5 Pro Performance Metrics</div>
            <div style="max-width: 600px; margin: 0 auto;">
                <div class="model-column pro-column">
                    <div class="model-title">🧠 Gemini 1.5 Pro vs Ground Truth</div>
                    <div class="metric-row"><span>F1-Score:</span><span class="{'excellent' if pro_metrics['f1_score'] >= 0.8 else 'good' if pro_metrics['f1_score'] >= 0.7 else 'moderate' if pro_metrics['f1_score'] >= 0.6 else 'poor'}">{pro_metrics['f1_score']:.3f}</span></div>
                    <div class="metric-row"><span>Accuracy:</span><span>{pro_metrics['accuracy']:.3f}</span></div>
                    <div class="metric-row"><span>Precision:</span><span>{pro_metrics['precision']:.3f}</span></div>
                    <div class="metric-row"><span>Recall (TPR):</span><span>{pro_metrics['recall']:.3f}</span></div>
                    <div class="metric-row"><span>Specificity (TNR):</span><span>{pro_metrics['specificity']:.3f}</span></div>
                    <div class="metric-row"><span>False Positive Rate:</span><span>{pro_metrics['false_positive_rate']:.3f}</span></div>
                    <div class="metric-row"><span>False Negative Rate:</span><span>{pro_metrics['false_negative_rate']:.3f}</span></div>
                    <div class="metric-row"><span>Judge Agreement with Flash:</span><span>{agreement['pro_judge_agreement_with_flash']:.1%}</span></div>
                </div>
            </div>
        </div>

        <div class="dashboard-grid">
            <div class="chart-section">
                <div class="chart-title">🧠 Gemini 1.5 Pro Confusion Matrix</div>
                <div class="confusion-matrix">
                    <div class="cm-cell cm-header"></div>
                    <div class="cm-cell cm-header">Ground Truth: 0</div>
                    <div class="cm-cell cm-header">Ground Truth: 1</div>
                    
                    <div class="cm-cell cm-header">1.5 Pro: 0</div>
                    <div class="cm-cell cm-tn">
                        {pro_cm['true_negatives']}
                        <div class="cm-label">True Negatives</div>
                    </div>
                    <div class="cm-cell cm-fn">
                        {pro_cm['false_negatives']}
                        <div class="cm-label">False Negatives</div>
                    </div>
                    
                    <div class="cm-cell cm-header">1.5 Pro: 1</div>
                    <div class="cm-cell cm-fp">
                        {pro_cm['false_positives']}
                        <div class="cm-label">False Positives</div>
                    </div>
                    <div class="cm-cell cm-tp">
                        {pro_cm['true_positives']}
                        <div class="cm-label">True Positives</div>
                    </div>
                </div>
                <div style="text-align: center; margin-top: 15px;">
                    <p><strong>TP:</strong> {pro_cm['true_positives']} | <strong>FP:</strong> {pro_cm['false_positives']} | <strong>TN:</strong> {pro_cm['true_negatives']} | <strong>FN:</strong> {pro_cm['false_negatives']}</p>
                </div>
            </div>
        </div>
    </div>

    <script>
        window.addEventListener('load', function() {{
            console.log('Gemini 1.5 Pro Classification Dashboard Loaded');
        }});
    </script>
</body>
</html>"""
        
        return html_content
    
    def generate_classification_dashboard(self):
        """Main function to generate the classification dashboard"""
        print("🚀 STARTING GEMINI 1.5 PRO CLASSIFICATION DASHBOARD GENERATION")
        print("=" * 70)
        
        try:
            # Load results with ground truth
            df = self.load_results_with_ground_truth()
            
            if df.empty:
                print("❌ No data available for dashboard")
                return None
            
            # Calculate metrics
            metrics = self.calculate_classification_metrics(df)
            
            if not metrics:
                print("❌ Could not calculate metrics")
                return None
            
            # Create dashboard
            print("\n📊 Creating Gemini 1.5 Pro Classification Dashboard...")
            dashboard_html = self.create_classification_dashboard(metrics, df)
            
            # Save files
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            dashboard_filename = f"gemini_15_pro_classification_dashboard_{timestamp}.html"
            metrics_filename = f"gemini_15_pro_classification_metrics_{timestamp}.json"
            
            with open(dashboard_filename, "w", encoding='utf-8') as f:
                f.write(dashboard_html)
            
            with open(metrics_filename, "w") as f:
                json.dump(metrics, f, indent=2, default=str)
            
            print(f"✅ Dashboard saved: {dashboard_filename}")
            print(f"✅ Metrics saved: {metrics_filename}")
            
            # Summary
            pro_metrics = metrics['pro_15_metrics']
            agreement = metrics['agreement_analysis']
            
            print(f"\n📊 GEMINI 1.5 PRO CLASSIFICATION METRICS SUMMARY:")
            print(f"   🧠 Gemini 1.5 Pro:")
            print(f"      F1-Score: {pro_metrics['f1_score']:.3f}")
            print(f"      Precision: {pro_metrics['precision']:.3f}")
            print(f"      Recall: {pro_metrics['recall']:.3f}") 
            print(f"      Accuracy: {pro_metrics['accuracy']:.3f}")
            print(f"      TP: {pro_metrics['confusion_matrix']['true_positives']}, FP: {pro_metrics['confusion_matrix']['false_positives']}")
            print(f"      TN: {pro_metrics['confusion_matrix']['true_negatives']}, FN: {pro_metrics['confusion_matrix']['false_negatives']}")
            
            print(f"\n   ⚖️ Agreement Rates:")
            print(f"      1.5 Pro vs Ground Truth: {agreement['pro_vs_ground_truth']:.1%}")
            print(f"      1.5 Pro Judge Agreement: {agreement['pro_judge_agreement_with_flash']:.1%}")
            
            return dashboard_filename, metrics_filename
            
        except Exception as e:
            print(f"❌ Dashboard generation failed: {e}")
            import traceback
            traceback.print_exc()
            return None

def main():
    """Run the Gemini 1.5 Pro classification dashboard generation"""
    dashboard_generator = Gemini15ProClassificationDashboard()
    result = dashboard_generator.generate_classification_dashboard()
    
    if result:
        dashboard_file, metrics_file = result
        print(f"\n🎉 SUCCESS!")
        print(f"📁 Files generated:")
        print(f"   1. {dashboard_file}")
        print(f"   2. {metrics_file}")
        print(f"\n💡 Open {dashboard_file} in your browser to view the dashboard!")
    else:
        print(f"\n❌ Dashboard generation failed")

if __name__ == "__main__":
    main()

🚀 STARTING GEMINI 1.5 PRO CLASSIFICATION DASHBOARD GENERATION
📥 Loading Gemini 1.5 Pro results with ground truth...
📊 Loading Gemini 1.5 Pro results data...
   Found 299 result records
📊 Loading ground truth data...
   Found 299 ground truth records
   Ground truth distribution from Meta table: {100: 187, 0: 112}
✅ Merged dataset: 299 records
   Sample artifact_ids from results: ['meta:1174568814711709', 'meta:1192660369325206', 'meta:1216373337197968', 'meta:122131923230709520', 'meta:1795720740984128']
   Sample artifact_ids from meta: ['meta:122140809938790694', 'meta:4050755845195196', 'meta:1173810271458869', 'meta:1297387721748313', 'meta:1138085658345605']
   Sample flash_classification: [0, 100, 100, 100, 0]
   Sample ground_truth after merge: [0, 100, 100, 100, 0]
   Flash vs Ground Truth identical BEFORE conversion: 299/299
   🚨 CRITICAL ERROR: flash_classification is identical to ground_truth!
   This means your ground truth data IS the Flash predictions, not human annotatio

In [52]:
#!/usr/bin/env python3
"""
Web Gemini Pro Classification Metrics Dashboard - Real Results vs Web Ground Truth
Calculates traditional ML metrics (F1, Precision, Recall, Confusion Matrix) for Gemini Pro decisions on Web content
"""

import pandas as pd
import json
from google.cloud import bigquery
from datetime import datetime
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
import numpy as np
from typing import Dict, List, Any, Optional

PROJECT_ID = "scope3-dev"
DATASET_ID = "research_bs_monitoring"
WEB_GROUND_TRUTH_TABLE = "BA_Web_Ground_Truth"  # Web ground truth table
WEB_RESULTS_TABLE = "BA_Web_Gemini_Pro_Judge_Results"  # Web Gemini Pro results table

class WebGeminiProClassificationDashboard:
    def __init__(self):
        self.client = bigquery.Client(project=PROJECT_ID)
        
    def load_results_with_ground_truth(self) -> pd.DataFrame:
        """Load Web Gemini Pro results with real ground truth from Web Ground Truth table"""
        print("📥 Loading Web Gemini Pro results with ground truth...")
        
        # First, let's get the results data
        results_query = f"""
        SELECT 
            artifact_id,
            flash_classification,
            flash_reasoning,
            model_prompt,
            pro_judge_agreement,
            pro_verdict,
            pro_confidence,
            pro_would_reach_same_conclusion,
            pro_reasoning,
            flash_vs_pro_analysis,
            improvements,
            api_call_time,
            model_used,
            error_message
        FROM `{PROJECT_ID}.{DATASET_ID}.{WEB_RESULTS_TABLE}`
        WHERE pro_judge_agreement IS NOT NULL
        AND pro_would_reach_same_conclusion IS NOT NULL
        AND (error_message IS NULL OR error_message = '')
        """
        
        # Get real ground truth from Web Ground Truth table
        web_gt_query = f"""
        SELECT 
            artifact_id,
            correct_classification as ground_truth,
            correct_reasoning as ground_truth_reasoning,
            source
        FROM `{PROJECT_ID}.{DATASET_ID}.{WEB_GROUND_TRUTH_TABLE}`
        """
        
        try:
            print("📊 Loading Web Gemini Pro results data...")
            results_df = self.client.query(results_query).to_dataframe()
            print(f"   Found {len(results_df)} result records")
            
            print("📊 Loading Web ground truth data...")
            web_gt_df = self.client.query(web_gt_query).to_dataframe()
            print(f"   Found {len(web_gt_df)} web ground truth records")
            
            # Debug: Check ground truth distribution
            if len(web_gt_df) > 0:
                gt_dist = web_gt_df['ground_truth'].value_counts().to_dict()
                print(f"   Web ground truth distribution: {gt_dist}")
            
            # Join results with ground truth
            df = results_df.merge(web_gt_df, on='artifact_id', how='inner')
            print(f"✅ Merged dataset: {len(df)} records")
            
            if len(df) == 0:
                print("❌ No matching records found after merge")
                return pd.DataFrame()
            
            # Debug: Check what we actually got from the merge
            print(f"   Sample artifact_ids from results: {results_df['artifact_id'].head().tolist()}")
            print(f"   Sample artifact_ids from web GT: {web_gt_df['artifact_id'].head().tolist()}")
            print(f"   Sample flash_classification: {results_df['flash_classification'].head().tolist()}")
            print(f"   Sample web ground_truth after merge: {df['ground_truth'].head().tolist()}")
            
            # Check if flash_classification and ground_truth are identical BEFORE conversion
            identical_before_conversion = (df['flash_classification'] == df['ground_truth']).sum()
            print(f"   Flash vs Web Ground Truth identical BEFORE conversion: {identical_before_conversion}/{len(df)}")
            
            if identical_before_conversion == len(df):
                print("   🚨 CRITICAL ERROR: flash_classification is identical to web ground_truth!")
                print("   This means your web ground truth data IS the Flash predictions, not human annotations.")
                print("   Check your data pipeline - Web GT table might contain Flash results instead of human labels.")
                
                # Let's check a few individual records to confirm
                print("   📋 Sample records to verify:")
                for i in range(min(3, len(df))):
                    row = df.iloc[i]
                    print(f"      Record {i+1}: artifact_id={row['artifact_id']}, flash={row['flash_classification']}, web_gt={row['ground_truth']}")
            
            # Convert to binary format
            df['ground_truth_binary'] = (df['ground_truth'] == 100).astype(int)
            df['flash_binary'] = (df['flash_classification'] == 100).astype(int)
            
            # Convert Gemini 1.5 "would reach same conclusion" to predictions
            df['gemini_15_web_prediction'] = np.where(
                df['pro_would_reach_same_conclusion'] == True,
                df['flash_binary'],  # Same as Flash
                1 - df['flash_binary']  # Opposite of Flash
            ).astype(int)
            
            print(f"📊 Data distribution:")
            print(f"   Web ground truth: {df['ground_truth_binary'].value_counts().to_dict()}")
            print(f"   Flash predictions: {df['flash_binary'].value_counts().to_dict()}")
            print(f"   Web Gemini 1.5 predictions: {df['gemini_15_web_prediction'].value_counts().to_dict()}")
            print(f"   Judge agreement rate: {df['pro_judge_agreement'].mean():.1%}")
            
            # Validate binary data
            for col, name in [('ground_truth_binary', 'Web ground truth'), 
                            ('flash_binary', 'Flash'), 
                            ('gemini_15_web_prediction', 'Web Gemini 1.5')]:
                if not all(df[col].isin([0, 1])):
                    print(f"⚠️ Warning: {name} contains non-binary values")
                    df = df[df[col].isin([0, 1])]
            
            print(f"📊 Final clean dataset: {len(df)} records")
            
            # Debug sample comparisons
            print(f"\n🔍 DEBUG - Sample comparisons (first 5 records):")
            print(f"   Columns available: {df.columns.tolist()}")
            for i in range(min(5, len(df))):
                row = df.iloc[i]
                print(f"   Record {i+1}:")
                print(f"      artifact_id: {row['artifact_id']}")
                print(f"      web_ground_truth (from Web GT): {row['ground_truth']} -> binary: {row['ground_truth_binary']}")
                print(f"      flash_classification: {row['flash_classification']} -> binary: {row['flash_binary']}")
                print(f"      pro_would_reach_same_conclusion: {row['pro_would_reach_same_conclusion']}")
                print(f"      gemini_15_web_prediction: {row['gemini_15_web_prediction']}")
                print(f"      Flash vs Web GT: {'✅ MATCH' if row['flash_binary'] == row['ground_truth_binary'] else '❌ DIFFER'}")
                print(f"      ---")
            
            # Check if Flash predictions are identical to ground truth
            flash_matches_gt = (df['flash_binary'] == df['ground_truth_binary']).sum()
            print(f"\n🚨 CRITICAL CHECK:")
            print(f"   Flash predictions matching web ground truth: {flash_matches_gt}/{len(df)} ({flash_matches_gt/len(df):.1%})")
            
            if flash_matches_gt == len(df):
                print("   🚨 ERROR: Flash predictions are 100% identical to web ground truth!")
                print("   This suggests the web ground truth data is actually Flash's predictions, not human annotations.")
                print("   Check your data pipeline - the 'ground_truth' field might be populated with Flash results.")
            elif flash_matches_gt > len(df) * 0.95:
                print("   ⚠️ WARNING: Flash predictions are suspiciously similar to web ground truth (>95% match)")
                print("   This might indicate data contamination.")
            else:
                print("   ✅ Good: Flash predictions differ from web ground truth as expected.")
            
            return df
            
        except Exception as e:
            print(f"❌ Error loading data: {e}")
            import traceback
            traceback.print_exc()
            return pd.DataFrame()
    
    def calculate_classification_metrics(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Calculate comprehensive classification metrics for Web Gemini Pro"""
        print("📊 Calculating classification metrics...")
        
        if len(df) == 0:
            return {}
        
        y_true = df['ground_truth_binary'].values
        gemini_15_pred = df['gemini_15_web_prediction'].values
        
        # Web Gemini 1.5 metrics - Compare Web Gemini 1.5 predictions with web ground truth  
        gemini_15_metrics = self._calculate_model_metrics(y_true, gemini_15_pred, "Web Gemini 1.5")
        
        # Agreement analysis
        gemini_15_vs_ground_truth_agreement = (gemini_15_pred == y_true).mean()
        judge_agreement_rate = df['pro_judge_agreement'].mean()
        
        print(f"✅ Metrics calculated:")
        print(f"   Web Gemini 1.5 F1: {gemini_15_metrics['f1_score']:.3f}")
        print(f"   Judge agreement: {judge_agreement_rate:.1%}")
        
        return {
            'dataset_info': {
                'total_samples': len(df),
                'positive_samples': int((y_true == 1).sum()),
                'negative_samples': int((y_true == 0).sum()),
                'class_balance': float((y_true == 1).mean())
            },
            'gemini_15_web_metrics': gemini_15_metrics,
            'agreement_analysis': {
                'gemini_15_vs_ground_truth': gemini_15_vs_ground_truth_agreement,
                'gemini_15_judge_agreement_with_flash': judge_agreement_rate
            }
        }
    
    def _calculate_model_metrics(self, y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> Dict[str, Any]:
        """Calculate detailed metrics for a single model"""
        
        # Ensure binary values
        y_true = np.array(y_true).astype(int)
        y_pred = np.array(y_pred).astype(int)
        
        # Validate binary values
        if not (set(np.unique(y_true)) <= {0, 1} and set(np.unique(y_pred)) <= {0, 1}):
            print(f"⚠️ Warning: Non-binary values detected in {model_name}")
            print(f"   y_true unique: {np.unique(y_true)}")
            print(f"   y_pred unique: {np.unique(y_pred)}")
            # Force to binary
            y_true = np.clip(y_true, 0, 1)
            y_pred = np.clip(y_pred, 0, 1)
        
        accuracy = accuracy_score(y_true, y_pred)
        
        # Handle edge cases where only one class is predicted
        unique_pred = np.unique(y_pred)
        unique_true = np.unique(y_true)
        
        if len(unique_pred) == 1 or len(unique_true) == 1:
            # Use macro average for edge cases
            precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
            recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        else:
            precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
            recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
        
        # Confusion matrix - ensure we get 2x2 matrix
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()
        elif cm.shape == (1, 1):
            # Handle case where only one class exists
            if unique_true[0] == 0 and unique_pred[0] == 0:
                tn, fp, fn, tp = cm[0,0], 0, 0, 0
            elif unique_true[0] == 1 and unique_pred[0] == 1:
                tn, fp, fn, tp = 0, 0, 0, cm[0,0]
            else:
                tn, fp, fn, tp = 0, cm[0,0], 0, 0
        else:
            print(f"⚠️ Unexpected confusion matrix shape for {model_name}: {cm.shape}")
            tn = fp = fn = tp = 0
        
        # Additional metrics
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # Same as recall
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0  # Negative Predictive Value
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0  # Positive Predictive Value (same as precision)
        
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # False Negative Rate
        
        balanced_accuracy = (sensitivity + specificity) / 2
        
        # Matthews Correlation Coefficient
        mcc_num = (tp * tn) - (fp * fn)
        mcc_den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        mcc = mcc_num / mcc_den if mcc_den > 0 else 0
        
        return {
            'model_name': model_name,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'specificity': specificity,
            'sensitivity': sensitivity,
            'balanced_accuracy': balanced_accuracy,
            'negative_predictive_value': npv,
            'positive_predictive_value': ppv,
            'false_positive_rate': fpr,
            'false_negative_rate': fnr,
            'matthews_correlation': mcc,
            'confusion_matrix': {
                'true_positives': int(tp),
                'false_positives': int(fp), 
                'true_negatives': int(tn),
                'false_negatives': int(fn)
            }
        }
    
    def create_classification_dashboard(self, metrics: Dict[str, Any], df: pd.DataFrame) -> str:
        """Create comprehensive classification metrics dashboard"""
        
        if not metrics:
            return "<html><body><h1>No valid metrics to display</h1></body></html>"
        
        gemini_15_metrics = metrics['gemini_15_web_metrics']
        dataset_info = metrics['dataset_info']
        agreement = metrics['agreement_analysis']
        
        gemini_15_cm = gemini_15_metrics['confusion_matrix']
        
        # Performance assessment
        if gemini_15_metrics['f1_score'] >= 0.8:
            gemini_15_assessment = "🟢 EXCELLENT"
            gemini_15_color = "#10B981"
        elif gemini_15_metrics['f1_score'] >= 0.7:
            gemini_15_assessment = "🟡 GOOD" 
            gemini_15_color = "#F59E0B"
        elif gemini_15_metrics['f1_score'] >= 0.6:
            gemini_15_assessment = "🟠 MODERATE"
            gemini_15_color = "#FF6B35"
        else:
            gemini_15_assessment = "🔴 POOR"
            gemini_15_color = "#EF4444"
        
        html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Web Gemini 1.5 Classification Metrics Dashboard</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: 'Inter', sans-serif; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: #333; line-height: 1.6; min-height: 100vh; }}
        .dashboard {{ max-width: 1800px; margin: 0 auto; padding: 20px; }}
        .header {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 25px; text-align: center; margin-bottom: 30px; backdrop-filter: blur(15px); box-shadow: 0 20px 60px rgba(0,0,0,0.1); }}
        .header h1 {{ font-size: 3rem; margin-bottom: 15px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; }}
        .header h2 {{ font-size: 1.5rem; color: #666; margin-bottom: 20px; }}
        .model-badge {{ display: inline-block; padding: 10px 25px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 30px; font-weight: bold; font-size: 1.1rem; margin: 5px; }}
        .metrics-overview {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 20px; margin-bottom: 40px; }}
        .metric-card {{ background: rgba(255,255,255,0.95); padding: 25px; border-radius: 20px; text-align: center; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); transition: transform 0.3s ease; }}
        .metric-card:hover {{ transform: translateY(-5px); }}
        .metric-value {{ font-size: 2.5rem; font-weight: bold; margin-bottom: 10px; }}
        .metric-label {{ font-size: 1rem; color: #666; font-weight: 600; margin-bottom: 5px; }}
        .metric-description {{ font-size: 0.85rem; color: #888; }}
        .excellent {{ color: #10B981; }}
        .good {{ color: #F59E0B; }}
        .moderate {{ color: #FF6B35; }}
        .poor {{ color: #EF4444; }}
        .dashboard-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(450px, 1fr)); gap: 25px; margin-bottom: 40px; }}
        .chart-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); }}
        .chart-title {{ font-size: 1.3rem; font-weight: bold; margin-bottom: 20px; color: #333; text-align: center; }}
        .confusion-matrix {{ display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin: 20px 0; text-align: center; font-size: 0.9rem; }}
        .cm-cell {{ padding: 15px; border-radius: 12px; font-weight: bold; display: flex; flex-direction: column; justify-content: center; align-items: center; }}
        .cm-header {{ background: linear-gradient(135deg, #f8f9fa, #e9ecef); color: #495057; }}
        .cm-tp {{ background: linear-gradient(135deg, #d4edda, #c3e6cb); color: #155724; font-size: 1.8rem; }}
        .cm-fp {{ background: linear-gradient(135deg, #f8d7da, #f1b0b7); color: #721c24; font-size: 1.8rem; }}
        .cm-fn {{ background: linear-gradient(135deg, #fff3cd, #fce4a6); color: #856404; font-size: 1.8rem; }}
        .cm-tn {{ background: linear-gradient(135deg, #d1ecf1, #bee5eb); color: #0c5460; font-size: 1.8rem; }}
        .cm-label {{ font-size: 0.75rem; margin-top: 5px; opacity: 0.8; }}
        .performance-summary {{ background: linear-gradient(135deg, #e3f2fd, #bbdefb); padding: 30px; border-radius: 20px; margin: 30px 0; text-align: center; }}
        .performance-score {{ font-size: 3.5rem; font-weight: bold; color: {gemini_15_color}; margin-bottom: 10px; }}
        .performance-label {{ font-size: 1.3rem; color: #37474f; margin-bottom: 8px; }}
        .performance-description {{ font-size: 1rem; color: #546e7a; }}
        .comparison-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); margin-bottom: 30px; }}
        .model-column {{ padding: 20px; border-radius: 15px; }}
        .pro-column {{ background: linear-gradient(135deg, #e3f2fd, #bbdefb); }}
        .model-title {{ font-size: 1.5rem; font-weight: bold; text-align: center; margin-bottom: 20px; }}
        .metric-row {{ display: flex; justify-content: space-between; margin: 8px 0; padding: 8px; background: rgba(255,255,255,0.5); border-radius: 8px; }}
    </style>
</head>
<body>
    <div class="dashboard">
        <div class="header">
            <h1>🌐 Web Gemini 1.5 Classification Metrics</h1>
            <h2>Performance vs Web Ground Truth</h2>
            <div>
                <span class="model-badge">Web Gemini 1.5</span>
            </div>
            <p style="margin-top: 20px; color: #666;">
                Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | 
                Samples: {dataset_info['total_samples']:,} | 
                Positive: {dataset_info['positive_samples']} | Negative: {dataset_info['negative_samples']}
            </p>
        </div>

        <div class="performance-summary">
            <div class="performance-score">{gemini_15_metrics['f1_score']:.3f}</div>
            <div class="performance-label">{gemini_15_assessment} F1-Score (Web Gemini 1.5)</div>
            <div class="performance-description">
                Web Gemini 1.5 achieves {gemini_15_metrics['accuracy']:.1%} accuracy with {gemini_15_metrics['precision']:.1%} precision
                <br>Judge Agreement with Flash: {agreement['gemini_15_judge_agreement_with_flash']:.1%}
            </div>
        </div>

        <div class="metrics-overview">
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_15_metrics['f1_score'] >= 0.8 else 'good' if gemini_15_metrics['f1_score'] >= 0.7 else 'moderate' if gemini_15_metrics['f1_score'] >= 0.6 else 'poor'}">{gemini_15_metrics['f1_score']:.3f}</div>
                <div class="metric-label">Web 1.5 F1-Score</div>
                <div class="metric-description">Harmonic Mean P&R</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_15_metrics['recall'] >= 0.8 else 'good' if gemini_15_metrics['recall'] >= 0.7 else 'moderate' if gemini_15_metrics['recall'] >= 0.6 else 'poor'}">{gemini_15_metrics['recall']:.3f}</div>
                <div class="metric-label">Web 1.5 TPR</div>
                <div class="metric-description">True Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_15_metrics['specificity'] >= 0.8 else 'good' if gemini_15_metrics['specificity'] >= 0.7 else 'moderate' if gemini_15_metrics['specificity'] >= 0.6 else 'poor'}">{gemini_15_metrics['specificity']:.3f}</div>
                <div class="metric-label">Web 1.5 TNR</div>
                <div class="metric-description">True Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_15_metrics['false_positive_rate'] <= 0.2 else 'good' if gemini_15_metrics['false_positive_rate'] <= 0.3 else 'moderate' if gemini_15_metrics['false_positive_rate'] <= 0.4 else 'poor'}">{gemini_15_metrics['false_positive_rate']:.3f}</div>
                <div class="metric-label">Web 1.5 FPR</div>
                <div class="metric-description">False Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_15_metrics['false_negative_rate'] <= 0.2 else 'good' if gemini_15_metrics['false_negative_rate'] <= 0.3 else 'moderate' if gemini_15_metrics['false_negative_rate'] <= 0.4 else 'poor'}">{gemini_15_metrics['false_negative_rate']:.3f}</div>
                <div class="metric-label">Web 1.5 FNR</div>
                <div class="metric-description">False Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_15_metrics['accuracy'] >= 0.9 else 'good' if gemini_15_metrics['accuracy'] >= 0.8 else 'moderate' if gemini_15_metrics['accuracy'] >= 0.7 else 'poor'}">{gemini_15_metrics['accuracy']:.3f}</div>
                <div class="metric-label">Web 1.5 Accuracy</div>
                <div class="metric-description">Overall Correctness</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_15_metrics['precision'] >= 0.8 else 'good' if gemini_15_metrics['precision'] >= 0.7 else 'moderate' if gemini_15_metrics['precision'] >= 0.6 else 'poor'}">{gemini_15_metrics['precision']:.3f}</div>
                <div class="metric-label">Web 1.5 Precision</div>
                <div class="metric-description">Positive Predictive Value</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if agreement['gemini_15_judge_agreement_with_flash'] >= 0.8 else 'good' if agreement['gemini_15_judge_agreement_with_flash'] >= 0.7 else 'moderate' if agreement['gemini_15_judge_agreement_with_flash'] >= 0.6 else 'poor'}">{agreement['gemini_15_judge_agreement_with_flash']:.1%}</div>
                <div class="metric-label">Judge Agreement</div>
                <div class="metric-description">Web 1.5 agrees with Flash</div>
            </div>
        </div>

        <div class="comparison-section">
            <div class="chart-title">📊 Web Gemini 1.5 Performance Metrics</div>
            <div style="max-width: 600px; margin: 0 auto;">
                <div class="model-column pro-column">
                    <div class="model-title">🌐 Web Gemini 1.5 vs Web Ground Truth</div>
                    <div class="metric-row"><span>F1-Score:</span><span class="{'excellent' if gemini_15_metrics['f1_score'] >= 0.8 else 'good' if gemini_15_metrics['f1_score'] >= 0.7 else 'moderate' if gemini_15_metrics['f1_score'] >= 0.6 else 'poor'}">{gemini_15_metrics['f1_score']:.3f}</span></div>
                    <div class="metric-row"><span>Accuracy:</span><span>{gemini_15_metrics['accuracy']:.3f}</span></div>
                    <div class="metric-row"><span>Precision:</span><span>{gemini_15_metrics['precision']:.3f}</span></div>
                    <div class="metric-row"><span>Recall (TPR):</span><span>{gemini_15_metrics['recall']:.3f}</span></div>
                    <div class="metric-row"><span>Specificity (TNR):</span><span>{gemini_15_metrics['specificity']:.3f}</span></div>
                    <div class="metric-row"><span>False Positive Rate:</span><span>{gemini_15_metrics['false_positive_rate']:.3f}</span></div>
                    <div class="metric-row"><span>False Negative Rate:</span><span>{gemini_15_metrics['false_negative_rate']:.3f}</span></div>
                    <div class="metric-row"><span>Judge Agreement with Flash:</span><span>{agreement['gemini_15_judge_agreement_with_flash']:.1%}</span></div>
                </div>
            </div>
        </div>

        <div class="dashboard-grid">
            <div class="chart-section">
                <div class="chart-title">🌐 Web Gemini 1.5 Confusion Matrix</div>
                <div class="confusion-matrix">
                    <div class="cm-cell cm-header"></div>
                    <div class="cm-cell cm-header">Web GT: 0</div>
                    <div class="cm-cell cm-header">Web GT: 1</div>
                    
                    <div class="cm-cell cm-header">Web 1.5: 0</div>
                    <div class="cm-cell cm-tn">
                        {gemini_15_cm['true_negatives']}
                        <div class="cm-label">True Negatives</div>
                    </div>
                    <div class="cm-cell cm-fn">
                        {gemini_15_cm['false_negatives']}
                        <div class="cm-label">False Negatives</div>
                    </div>
                    
                    <div class="cm-cell cm-header">Web 1.5: 1</div>
                    <div class="cm-cell cm-fp">
                        {gemini_15_cm['false_positives']}
                        <div class="cm-label">False Positives</div>
                    </div>
                    <div class="cm-cell cm-tp">
                        {gemini_15_cm['true_positives']}
                        <div class="cm-label">True Positives</div>
                    </div>
                </div>
                <div style="text-align: center; margin-top: 15px;">
                    <p><strong>TP:</strong> {gemini_15_cm['true_positives']} | <strong>FP:</strong> {gemini_15_cm['false_positives']} | <strong>TN:</strong> {gemini_15_cm['true_negatives']} | <strong>FN:</strong> {gemini_15_cm['false_negatives']}</p>
                </div>
            </div>
        </div>
    </div>

    <script>
        window.addEventListener('load', function() {{
            console.log('Web Gemini 1.5 Classification Dashboard Loaded');
        }});
    </script>
</body>
</html>"""
        
        return html_content
    
    def generate_classification_dashboard(self):
        """Main function to generate the classification dashboard"""
        print("🚀 STARTING WEB GEMINI PRO CLASSIFICATION DASHBOARD GENERATION")
        print("=" * 70)
        
        try:
            # Load results with ground truth
            df = self.load_results_with_ground_truth()
            
            if df.empty:
                print("❌ No data available for dashboard")
                return None
            
            # Calculate metrics
            metrics = self.calculate_classification_metrics(df)
            
            if not metrics:
                print("❌ Could not calculate metrics")
                return None
            
            # Create dashboard
            print("\n📊 Creating Web Gemini Pro Classification Dashboard...")
            dashboard_html = self.create_classification_dashboard(metrics, df)
            
            # Save files
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            dashboard_filename = f"web_gemini_pro_classification_dashboard_{timestamp}.html"
            metrics_filename = f"web_gemini_pro_classification_metrics_{timestamp}.json"
            
            with open(dashboard_filename, "w", encoding='utf-8') as f:
                f.write(dashboard_html)
            
            with open(metrics_filename, "w") as f:
                json.dump(metrics, f, indent=2, default=str)
            
            print(f"✅ Dashboard saved: {dashboard_filename}")
            print(f"✅ Metrics saved: {metrics_filename}")
            
            # Summary
            gemini_15_metrics = metrics['gemini_15_web_metrics']
            agreement = metrics['agreement_analysis']
            
            print(f"\n📊 WEB GEMINI 1.5 CLASSIFICATION METRICS SUMMARY:")
            print(f"   🌐 Web Gemini 1.5:")
            print(f"      F1-Score: {gemini_15_metrics['f1_score']:.3f}")
            print(f"      Precision: {gemini_15_metrics['precision']:.3f}")
            print(f"      Recall: {gemini_15_metrics['recall']:.3f}") 
            print(f"      Accuracy: {gemini_15_metrics['accuracy']:.3f}")
            print(f"      TP: {gemini_15_metrics['confusion_matrix']['true_positives']}, FP: {gemini_15_metrics['confusion_matrix']['false_positives']}")
            print(f"      TN: {gemini_15_metrics['confusion_matrix']['true_negatives']}, FN: {gemini_15_metrics['confusion_matrix']['false_negatives']}")
            
            print(f"\n   ⚖️ Agreement Rates:")
            print(f"      Web 1.5 vs Web Ground Truth: {agreement['gemini_15_vs_ground_truth']:.1%}")
            print(f"      Web 1.5 Judge Agreement: {agreement['gemini_15_judge_agreement_with_flash']:.1%}")
            
            return dashboard_filename, metrics_filename
            
        except Exception as e:
            print(f"❌ Dashboard generation failed: {e}")
            import traceback
            traceback.print_exc()
            return None

def main():
    """Run the Web Gemini Pro classification dashboard generation"""
    dashboard_generator = WebGeminiProClassificationDashboard()
    result = dashboard_generator.generate_classification_dashboard()
    
    if result:
        dashboard_file, metrics_file = result
        print(f"\n🎉 SUCCESS!")
        print(f"📁 Files generated:")
        print(f"   1. {dashboard_file}")
        print(f"   2. {metrics_file}")
        print(f"\n💡 Open {dashboard_file} in your browser to view the dashboard!")
    else:
        print(f"\n❌ Dashboard generation failed")

if __name__ == "__main__":
    main()

🚀 STARTING WEB GEMINI PRO CLASSIFICATION DASHBOARD GENERATION
📥 Loading Web Gemini Pro results with ground truth...
📊 Loading Web Gemini Pro results data...
   Found 294 result records
📊 Loading Web ground truth data...
   Found 294 web ground truth records
   Web ground truth distribution: {100: 252, 0: 42}
✅ Merged dataset: 294 records
   Sample artifact_ids from results: ['news.de/gesundheit/855915949/corona-zahlen-kreisfreie-stadt-solingen-heute-aktuell-27-06-2025-coronavirus-news-zu-rki-fallzahlen-tote-in-nordrhein-westfalen-intensivbetten-auslastung-und-neue-covid-19-variante-nimbus/1', 'medicinenet.com/how_do_i_know_if_i_have_damaged_my_rotator_cuff/article.htm', 'michigansthumb.com/news/article/Gagetown-Elementary-hosts-wax-museum-7356734.php', 'linuxiac.com/incus-6-13-container-and-virtual-machine-manager-released', 'liveworksheets.com/node/6694649']
   Sample artifact_ids from web GT: ['news.de/gesundheit/855915949/corona-zahlen-kreisfreie-stadt-solingen-heute-aktuell-27-06-2

In [53]:
#!/usr/bin/env python3
"""
Web Gemini 2.5 Pro Classification Metrics Dashboard - Real Results vs Web Ground Truth
Calculates traditional ML metrics (F1, Precision, Recall, Confusion Matrix) for Gemini 2.5 Pro decisions on Web content
"""

import pandas as pd
import json
from google.cloud import bigquery
from datetime import datetime
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
import numpy as np
from typing import Dict, List, Any, Optional

PROJECT_ID = "scope3-dev"
DATASET_ID = "research_bs_monitoring"
WEB_GROUND_TRUTH_TABLE = "BA_Web_Ground_Truth"  # Web ground truth table
WEB_RESULTS_TABLE = "BA_Web_Gemini_25_Pro_Judge_Results"  # Web Gemini 2.5 Pro results table

class WebGemini25ProClassificationDashboard:
    def __init__(self):
        self.client = bigquery.Client(project=PROJECT_ID)
        
    def load_results_with_ground_truth(self) -> pd.DataFrame:
        """Load Web Gemini 2.5 Pro results with real ground truth from Web Ground Truth table"""
        print("📥 Loading Web Gemini 2.5 Pro results with ground truth...")
        
        # First, let's get the results data
        results_query = f"""
        SELECT 
            artifact_id,
            flash_classification,
            flash_reasoning,
            model_prompt,
            pro_judge_agreement,
            pro_verdict,
            pro_confidence,
            pro_would_reach_same_conclusion,
            pro_reasoning,
            flash_vs_25pro_analysis,
            model_version_insights,
            improvements,
            api_call_time,
            model_used,
            error_message
        FROM `{PROJECT_ID}.{DATASET_ID}.{WEB_RESULTS_TABLE}`
        WHERE pro_judge_agreement IS NOT NULL
        AND pro_would_reach_same_conclusion IS NOT NULL
        AND (error_message IS NULL OR error_message = '')
        """
        
        # Get real ground truth from Web Ground Truth table
        web_gt_query = f"""
        SELECT 
            artifact_id,
            correct_classification as ground_truth,
            correct_reasoning as ground_truth_reasoning,
            source
        FROM `{PROJECT_ID}.{DATASET_ID}.{WEB_GROUND_TRUTH_TABLE}`
        """
        
        try:
            print("📊 Loading Web Gemini 2.5 Pro results data...")
            results_df = self.client.query(results_query).to_dataframe()
            print(f"   Found {len(results_df)} result records")
            
            print("📊 Loading Web ground truth data...")
            web_gt_df = self.client.query(web_gt_query).to_dataframe()
            print(f"   Found {len(web_gt_df)} web ground truth records")
            
            # Debug: Check ground truth distribution
            if len(web_gt_df) > 0:
                gt_dist = web_gt_df['ground_truth'].value_counts().to_dict()
                print(f"   Web ground truth distribution: {gt_dist}")
            
            # Join results with ground truth
            df = results_df.merge(web_gt_df, on='artifact_id', how='inner')
            print(f"✅ Merged dataset: {len(df)} records")
            
            if len(df) == 0:
                print("❌ No matching records found after merge")
                return pd.DataFrame()
            
            # Debug: Check what we actually got from the merge
            print(f"   Sample artifact_ids from results: {results_df['artifact_id'].head().tolist()}")
            print(f"   Sample artifact_ids from web GT: {web_gt_df['artifact_id'].head().tolist()}")
            print(f"   Sample flash_classification: {results_df['flash_classification'].head().tolist()}")
            print(f"   Sample web ground_truth after merge: {df['ground_truth'].head().tolist()}")
            
            # Check if flash_classification and ground_truth are identical BEFORE conversion
            identical_before_conversion = (df['flash_classification'] == df['ground_truth']).sum()
            print(f"   Flash vs Web Ground Truth identical BEFORE conversion: {identical_before_conversion}/{len(df)}")
            
            if identical_before_conversion == len(df):
                print("   🚨 CRITICAL ERROR: flash_classification is identical to web ground_truth!")
                print("   This means your web ground truth data IS the Flash predictions, not human annotations.")
                print("   Check your data pipeline - Web GT table might contain Flash results instead of human labels.")
                
                # Let's check a few individual records to confirm
                print("   📋 Sample records to verify:")
                for i in range(min(3, len(df))):
                    row = df.iloc[i]
                    print(f"      Record {i+1}: artifact_id={row['artifact_id']}, flash={row['flash_classification']}, web_gt={row['ground_truth']}")
            
            # Convert to binary format
            df['ground_truth_binary'] = (df['ground_truth'] == 100).astype(int)
            df['flash_binary'] = (df['flash_classification'] == 100).astype(int)
            
            # Convert Gemini 2.5 Pro "would reach same conclusion" to predictions
            df['gemini_25_web_prediction'] = np.where(
                df['pro_would_reach_same_conclusion'] == True,
                df['flash_binary'],  # Same as Flash
                1 - df['flash_binary']  # Opposite of Flash
            ).astype(int)
            
            print(f"📊 Data distribution:")
            print(f"   Web ground truth: {df['ground_truth_binary'].value_counts().to_dict()}")
            print(f"   Flash predictions: {df['flash_binary'].value_counts().to_dict()}")
            print(f"   Web Gemini 2.5 Pro predictions: {df['gemini_25_web_prediction'].value_counts().to_dict()}")
            print(f"   Judge agreement rate: {df['pro_judge_agreement'].mean():.1%}")
            
            # Validate binary data
            for col, name in [('ground_truth_binary', 'Web ground truth'), 
                            ('flash_binary', 'Flash'), 
                            ('gemini_25_web_prediction', 'Web Gemini 2.5 Pro')]:
                if not all(df[col].isin([0, 1])):
                    print(f"⚠️ Warning: {name} contains non-binary values")
                    df = df[df[col].isin([0, 1])]
            
            print(f"📊 Final clean dataset: {len(df)} records")
            
            # Debug sample comparisons
            print(f"\n🔍 DEBUG - Sample comparisons (first 5 records):")
            print(f"   Columns available: {df.columns.tolist()}")
            for i in range(min(5, len(df))):
                row = df.iloc[i]
                print(f"   Record {i+1}:")
                print(f"      artifact_id: {row['artifact_id']}")
                print(f"      web_ground_truth (from Web GT): {row['ground_truth']} -> binary: {row['ground_truth_binary']}")
                print(f"      flash_classification: {row['flash_classification']} -> binary: {row['flash_binary']}")
                print(f"      pro_would_reach_same_conclusion: {row['pro_would_reach_same_conclusion']}")
                print(f"      gemini_25_web_prediction: {row['gemini_25_web_prediction']}")
                print(f"      Flash vs Web GT: {'✅ MATCH' if row['flash_binary'] == row['ground_truth_binary'] else '❌ DIFFER'}")
                print(f"      ---")
            
            # Check if Flash predictions are identical to ground truth
            flash_matches_gt = (df['flash_binary'] == df['ground_truth_binary']).sum()
            print(f"\n🚨 CRITICAL CHECK:")
            print(f"   Flash predictions matching web ground truth: {flash_matches_gt}/{len(df)} ({flash_matches_gt/len(df):.1%})")
            
            if flash_matches_gt == len(df):
                print("   🚨 ERROR: Flash predictions are 100% identical to web ground truth!")
                print("   This suggests the web ground truth data is actually Flash's predictions, not human annotations.")
                print("   Check your data pipeline - the 'ground_truth' field might be populated with Flash results.")
            elif flash_matches_gt > len(df) * 0.95:
                print("   ⚠️ WARNING: Flash predictions are suspiciously similar to web ground truth (>95% match)")
                print("   This might indicate data contamination.")
            else:
                print("   ✅ Good: Flash predictions differ from web ground truth as expected.")
            
            return df
            
        except Exception as e:
            print(f"❌ Error loading data: {e}")
            import traceback
            traceback.print_exc()
            return pd.DataFrame()
    
    def calculate_classification_metrics(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Calculate comprehensive classification metrics for Web Gemini 2.5 Pro"""
        print("📊 Calculating classification metrics...")
        
        if len(df) == 0:
            return {}
        
        y_true = df['ground_truth_binary'].values
        gemini_25_pred = df['gemini_25_web_prediction'].values
        
        # Web Gemini 2.5 Pro metrics - Compare Web Gemini 2.5 Pro predictions with web ground truth  
        gemini_25_metrics = self._calculate_model_metrics(y_true, gemini_25_pred, "Web Gemini 2.5 Pro")
        
        # Agreement analysis
        gemini_25_vs_ground_truth_agreement = (gemini_25_pred == y_true).mean()
        judge_agreement_rate = df['pro_judge_agreement'].mean()
        
        print(f"✅ Metrics calculated:")
        print(f"   Web Gemini 2.5 Pro F1: {gemini_25_metrics['f1_score']:.3f}")
        print(f"   Judge agreement: {judge_agreement_rate:.1%}")
        
        return {
            'dataset_info': {
                'total_samples': len(df),
                'positive_samples': int((y_true == 1).sum()),
                'negative_samples': int((y_true == 0).sum()),
                'class_balance': float((y_true == 1).mean())
            },
            'gemini_25_web_metrics': gemini_25_metrics,
            'agreement_analysis': {
                'gemini_25_vs_ground_truth': gemini_25_vs_ground_truth_agreement,
                'gemini_25_judge_agreement_with_flash': judge_agreement_rate
            }
        }
    
    def _calculate_model_metrics(self, y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> Dict[str, Any]:
        """Calculate detailed metrics for a single model"""
        
        # Ensure binary values
        y_true = np.array(y_true).astype(int)
        y_pred = np.array(y_pred).astype(int)
        
        # Validate binary values
        if not (set(np.unique(y_true)) <= {0, 1} and set(np.unique(y_pred)) <= {0, 1}):
            print(f"⚠️ Warning: Non-binary values detected in {model_name}")
            print(f"   y_true unique: {np.unique(y_true)}")
            print(f"   y_pred unique: {np.unique(y_pred)}")
            # Force to binary
            y_true = np.clip(y_true, 0, 1)
            y_pred = np.clip(y_pred, 0, 1)
        
        accuracy = accuracy_score(y_true, y_pred)
        
        # Handle edge cases where only one class is predicted
        unique_pred = np.unique(y_pred)
        unique_true = np.unique(y_true)
        
        if len(unique_pred) == 1 or len(unique_true) == 1:
            # Use macro average for edge cases
            precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
            recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        else:
            precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
            recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
        
        # Confusion matrix - ensure we get 2x2 matrix
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()
        elif cm.shape == (1, 1):
            # Handle case where only one class exists
            if unique_true[0] == 0 and unique_pred[0] == 0:
                tn, fp, fn, tp = cm[0,0], 0, 0, 0
            elif unique_true[0] == 1 and unique_pred[0] == 1:
                tn, fp, fn, tp = 0, 0, 0, cm[0,0]
            else:
                tn, fp, fn, tp = 0, cm[0,0], 0, 0
        else:
            print(f"⚠️ Unexpected confusion matrix shape for {model_name}: {cm.shape}")
            tn = fp = fn = tp = 0
        
        # Additional metrics
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # Same as recall
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0  # Negative Predictive Value
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0  # Positive Predictive Value (same as precision)
        
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # False Negative Rate
        
        balanced_accuracy = (sensitivity + specificity) / 2
        
        # Matthews Correlation Coefficient
        mcc_num = (tp * tn) - (fp * fn)
        mcc_den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        mcc = mcc_num / mcc_den if mcc_den > 0 else 0
        
        return {
            'model_name': model_name,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'specificity': specificity,
            'sensitivity': sensitivity,
            'balanced_accuracy': balanced_accuracy,
            'negative_predictive_value': npv,
            'positive_predictive_value': ppv,
            'false_positive_rate': fpr,
            'false_negative_rate': fnr,
            'matthews_correlation': mcc,
            'confusion_matrix': {
                'true_positives': int(tp),
                'false_positives': int(fp), 
                'true_negatives': int(tn),
                'false_negatives': int(fn)
            }
        }
    
    def create_classification_dashboard(self, metrics: Dict[str, Any], df: pd.DataFrame) -> str:
        """Create comprehensive classification metrics dashboard"""
        
        if not metrics:
            return "<html><body><h1>No valid metrics to display</h1></body></html>"
        
        gemini_25_metrics = metrics['gemini_25_web_metrics']
        dataset_info = metrics['dataset_info']
        agreement = metrics['agreement_analysis']
        
        gemini_25_cm = gemini_25_metrics['confusion_matrix']
        
        # Performance assessment
        if gemini_25_metrics['f1_score'] >= 0.8:
            gemini_25_assessment = "🟢 EXCELLENT"
            gemini_25_color = "#10B981"
        elif gemini_25_metrics['f1_score'] >= 0.7:
            gemini_25_assessment = "🟡 GOOD" 
            gemini_25_color = "#F59E0B"
        elif gemini_25_metrics['f1_score'] >= 0.6:
            gemini_25_assessment = "🟠 MODERATE"
            gemini_25_color = "#FF6B35"
        else:
            gemini_25_assessment = "🔴 POOR"
            gemini_25_color = "#EF4444"
        
        html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Web Gemini 2.5 Pro Classification Metrics Dashboard</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: 'Inter', sans-serif; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: #333; line-height: 1.6; min-height: 100vh; }}
        .dashboard {{ max-width: 1800px; margin: 0 auto; padding: 20px; }}
        .header {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 25px; text-align: center; margin-bottom: 30px; backdrop-filter: blur(15px); box-shadow: 0 20px 60px rgba(0,0,0,0.1); }}
        .header h1 {{ font-size: 3rem; margin-bottom: 15px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; }}
        .header h2 {{ font-size: 1.5rem; color: #666; margin-bottom: 20px; }}
        .model-badge {{ display: inline-block; padding: 10px 25px; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 30px; font-weight: bold; font-size: 1.1rem; margin: 5px; }}
        .metrics-overview {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 20px; margin-bottom: 40px; }}
        .metric-card {{ background: rgba(255,255,255,0.95); padding: 25px; border-radius: 20px; text-align: center; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); transition: transform 0.3s ease; }}
        .metric-card:hover {{ transform: translateY(-5px); }}
        .metric-value {{ font-size: 2.5rem; font-weight: bold; margin-bottom: 10px; }}
        .metric-label {{ font-size: 1rem; color: #666; font-weight: 600; margin-bottom: 5px; }}
        .metric-description {{ font-size: 0.85rem; color: #888; }}
        .excellent {{ color: #10B981; }}
        .good {{ color: #F59E0B; }}
        .moderate {{ color: #FF6B35; }}
        .poor {{ color: #EF4444; }}
        .dashboard-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(450px, 1fr)); gap: 25px; margin-bottom: 40px; }}
        .chart-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); }}
        .chart-title {{ font-size: 1.3rem; font-weight: bold; margin-bottom: 20px; color: #333; text-align: center; }}
        .confusion-matrix {{ display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin: 20px 0; text-align: center; font-size: 0.9rem; }}
        .cm-cell {{ padding: 15px; border-radius: 12px; font-weight: bold; display: flex; flex-direction: column; justify-content: center; align-items: center; }}
        .cm-header {{ background: linear-gradient(135deg, #f8f9fa, #e9ecef); color: #495057; }}
        .cm-tp {{ background: linear-gradient(135deg, #d4edda, #c3e6cb); color: #155724; font-size: 1.8rem; }}
        .cm-fp {{ background: linear-gradient(135deg, #f8d7da, #f1b0b7); color: #721c24; font-size: 1.8rem; }}
        .cm-fn {{ background: linear-gradient(135deg, #fff3cd, #fce4a6); color: #856404; font-size: 1.8rem; }}
        .cm-tn {{ background: linear-gradient(135deg, #d1ecf1, #bee5eb); color: #0c5460; font-size: 1.8rem; }}
        .cm-label {{ font-size: 0.75rem; margin-top: 5px; opacity: 0.8; }}
        .performance-summary {{ background: linear-gradient(135deg, #e3f2fd, #bbdefb); padding: 30px; border-radius: 20px; margin: 30px 0; text-align: center; }}
        .performance-score {{ font-size: 3.5rem; font-weight: bold; color: {gemini_25_color}; margin-bottom: 10px; }}
        .performance-label {{ font-size: 1.3rem; color: #37474f; margin-bottom: 8px; }}
        .performance-description {{ font-size: 1rem; color: #546e7a; }}
        .comparison-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); margin-bottom: 30px; }}
        .model-column {{ padding: 20px; border-radius: 15px; }}
        .pro-column {{ background: linear-gradient(135deg, #e3f2fd, #bbdefb); }}
        .model-title {{ font-size: 1.5rem; font-weight: bold; text-align: center; margin-bottom: 20px; }}
        .metric-row {{ display: flex; justify-content: space-between; margin: 8px 0; padding: 8px; background: rgba(255,255,255,0.5); border-radius: 8px; }}
    </style>
</head>
<body>
    <div class="dashboard">
        <div class="header">
            <h1>🌐 Web Gemini 2.5 Pro Classification Metrics</h1>
            <h2>Performance vs Web Ground Truth</h2>
            <div>
                <span class="model-badge">Web Gemini 2.5 Pro</span>
            </div>
            <p style="margin-top: 20px; color: #666;">
                Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | 
                Samples: {dataset_info['total_samples']:,} | 
                Positive: {dataset_info['positive_samples']} | Negative: {dataset_info['negative_samples']}
            </p>
        </div>

        <div class="performance-summary">
            <div class="performance-score">{gemini_25_metrics['f1_score']:.3f}</div>
            <div class="performance-label">{gemini_25_assessment} F1-Score (Web Gemini 2.5 Pro)</div>
            <div class="performance-description">
                Web Gemini 2.5 Pro achieves {gemini_25_metrics['accuracy']:.1%} accuracy with {gemini_25_metrics['precision']:.1%} precision
                <br>Judge Agreement with Flash: {agreement['gemini_25_judge_agreement_with_flash']:.1%}
            </div>
        </div>

        <div class="metrics-overview">
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_25_metrics['f1_score'] >= 0.8 else 'good' if gemini_25_metrics['f1_score'] >= 0.7 else 'moderate' if gemini_25_metrics['f1_score'] >= 0.6 else 'poor'}">{gemini_25_metrics['f1_score']:.3f}</div>
                <div class="metric-label">Web 2.5 Pro F1-Score</div>
                <div class="metric-description">Harmonic Mean P&R</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_25_metrics['recall'] >= 0.8 else 'good' if gemini_25_metrics['recall'] >= 0.7 else 'moderate' if gemini_25_metrics['recall'] >= 0.6 else 'poor'}">{gemini_25_metrics['recall']:.3f}</div>
                <div class="metric-label">Web 2.5 Pro TPR</div>
                <div class="metric-description">True Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_25_metrics['specificity'] >= 0.8 else 'good' if gemini_25_metrics['specificity'] >= 0.7 else 'moderate' if gemini_25_metrics['specificity'] >= 0.6 else 'poor'}">{gemini_25_metrics['specificity']:.3f}</div>
                <div class="metric-label">Web 2.5 Pro TNR</div>
                <div class="metric-description">True Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_25_metrics['false_positive_rate'] <= 0.2 else 'good' if gemini_25_metrics['false_positive_rate'] <= 0.3 else 'moderate' if gemini_25_metrics['false_positive_rate'] <= 0.4 else 'poor'}">{gemini_25_metrics['false_positive_rate']:.3f}</div>
                <div class="metric-label">Web 2.5 Pro FPR</div>
                <div class="metric-description">False Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_25_metrics['false_negative_rate'] <= 0.2 else 'good' if gemini_25_metrics['false_negative_rate'] <= 0.3 else 'moderate' if gemini_25_metrics['false_negative_rate'] <= 0.4 else 'poor'}">{gemini_25_metrics['false_negative_rate']:.3f}</div>
                <div class="metric-label">Web 2.5 Pro FNR</div>
                <div class="metric-description">False Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_25_metrics['accuracy'] >= 0.9 else 'good' if gemini_25_metrics['accuracy'] >= 0.8 else 'moderate' if gemini_25_metrics['accuracy'] >= 0.7 else 'poor'}">{gemini_25_metrics['accuracy']:.3f}</div>
                <div class="metric-label">Web 2.5 Pro Accuracy</div>
                <div class="metric-description">Overall Correctness</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if gemini_25_metrics['precision'] >= 0.8 else 'good' if gemini_25_metrics['precision'] >= 0.7 else 'moderate' if gemini_25_metrics['precision'] >= 0.6 else 'poor'}">{gemini_25_metrics['precision']:.3f}</div>
                <div class="metric-label">Web 2.5 Pro Precision</div>
                <div class="metric-description">Positive Predictive Value</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if agreement['gemini_25_judge_agreement_with_flash'] >= 0.8 else 'good' if agreement['gemini_25_judge_agreement_with_flash'] >= 0.7 else 'moderate' if agreement['gemini_25_judge_agreement_with_flash'] >= 0.6 else 'poor'}">{agreement['gemini_25_judge_agreement_with_flash']:.1%}</div>
                <div class="metric-label">Judge Agreement</div>
                <div class="metric-description">Web 2.5 Pro agrees with Flash</div>
            </div>
        </div>

        <div class="comparison-section">
            <div class="chart-title">📊 Web Gemini 2.5 Pro Performance Metrics</div>
            <div style="max-width: 600px; margin: 0 auto;">
                <div class="model-column pro-column">
                    <div class="model-title">🌐 Web Gemini 2.5 Pro vs Web Ground Truth</div>
                    <div class="metric-row"><span>F1-Score:</span><span class="{'excellent' if gemini_25_metrics['f1_score'] >= 0.8 else 'good' if gemini_25_metrics['f1_score'] >= 0.7 else 'moderate' if gemini_25_metrics['f1_score'] >= 0.6 else 'poor'}">{gemini_25_metrics['f1_score']:.3f}</span></div>
                    <div class="metric-row"><span>Accuracy:</span><span>{gemini_25_metrics['accuracy']:.3f}</span></div>
                    <div class="metric-row"><span>Precision:</span><span>{gemini_25_metrics['precision']:.3f}</span></div>
                    <div class="metric-row"><span>Recall (TPR):</span><span>{gemini_25_metrics['recall']:.3f}</span></div>
                    <div class="metric-row"><span>Specificity (TNR):</span><span>{gemini_25_metrics['specificity']:.3f}</span></div>
                    <div class="metric-row"><span>False Positive Rate:</span><span>{gemini_25_metrics['false_positive_rate']:.3f}</span></div>
                    <div class="metric-row"><span>False Negative Rate:</span><span>{gemini_25_metrics['false_negative_rate']:.3f}</span></div>
                    <div class="metric-row"><span>Judge Agreement with Flash:</span><span>{agreement['gemini_25_judge_agreement_with_flash']:.1%}</span></div>
                </div>
            </div>
        </div>

        <div class="dashboard-grid">
            <div class="chart-section">
                <div class="chart-title">🌐 Web Gemini 2.5 Pro Confusion Matrix</div>
                <div class="confusion-matrix">
                    <div class="cm-cell cm-header"></div>
                    <div class="cm-cell cm-header">Web GT: 0</div>
                    <div class="cm-cell cm-header">Web GT: 1</div>
                    
                    <div class="cm-cell cm-header">Web 2.5 Pro: 0</div>
                    <div class="cm-cell cm-tn">
                        {gemini_25_cm['true_negatives']}
                        <div class="cm-label">True Negatives</div>
                    </div>
                    <div class="cm-cell cm-fn">
                        {gemini_25_cm['false_negatives']}
                        <div class="cm-label">False Negatives</div>
                    </div>
                    
                    <div class="cm-cell cm-header">Web 2.5 Pro: 1</div>
                    <div class="cm-cell cm-fp">
                        {gemini_25_cm['false_positives']}
                        <div class="cm-label">False Positives</div>
                    </div>
                    <div class="cm-cell cm-tp">
                        {gemini_25_cm['true_positives']}
                        <div class="cm-label">True Positives</div>
                    </div>
                </div>
                <div style="text-align: center; margin-top: 15px;">
                    <p><strong>TP:</strong> {gemini_25_cm['true_positives']} | <strong>FP:</strong> {gemini_25_cm['false_positives']} | <strong>TN:</strong> {gemini_25_cm['true_negatives']} | <strong>FN:</strong> {gemini_25_cm['false_negatives']}</p>
                </div>
            </div>
        </div>
    </div>

    <script>
        window.addEventListener('load', function() {{
            console.log('Web Gemini 2.5 Pro Classification Dashboard Loaded');
        }});
    </script>
</body>
</html>"""
        
        return html_content
    
    def generate_classification_dashboard(self):
        """Main function to generate the classification dashboard"""
        print("🚀 STARTING WEB GEMINI 2.5 PRO CLASSIFICATION DASHBOARD GENERATION")
        print("=" * 70)
        
        try:
            # Load results with ground truth
            df = self.load_results_with_ground_truth()
            
            if df.empty:
                print("❌ No data available for dashboard")
                return None
            
            # Calculate metrics
            metrics = self.calculate_classification_metrics(df)
            
            if not metrics:
                print("❌ Could not calculate metrics")
                return None
            
            # Create dashboard
            print("\n📊 Creating Web Gemini 2.5 Pro Classification Dashboard...")
            dashboard_html = self.create_classification_dashboard(metrics, df)
            
            # Save files
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            dashboard_filename = f"web_gemini_25_pro_classification_dashboard_{timestamp}.html"
            metrics_filename = f"web_gemini_25_pro_classification_metrics_{timestamp}.json"
            
            with open(dashboard_filename, "w", encoding='utf-8') as f:
                f.write(dashboard_html)
            
            with open(metrics_filename, "w") as f:
                json.dump(metrics, f, indent=2, default=str)
            
            print(f"✅ Dashboard saved: {dashboard_filename}")
            print(f"✅ Metrics saved: {metrics_filename}")
            
            # Summary
            gemini_25_metrics = metrics['gemini_25_web_metrics']
            agreement = metrics['agreement_analysis']
            
            print(f"\n📊 WEB GEMINI 2.5 PRO CLASSIFICATION METRICS SUMMARY:")
            print(f"   🌐 Web Gemini 2.5 Pro:")
            print(f"      F1-Score: {gemini_25_metrics['f1_score']:.3f}")
            print(f"      Precision: {gemini_25_metrics['precision']:.3f}")
            print(f"      Recall: {gemini_25_metrics['recall']:.3f}") 
            print(f"      Accuracy: {gemini_25_metrics['accuracy']:.3f}")
            print(f"      TP: {gemini_25_metrics['confusion_matrix']['true_positives']}, FP: {gemini_25_metrics['confusion_matrix']['false_positives']}")
            print(f"      TN: {gemini_25_metrics['confusion_matrix']['true_negatives']}, FN: {gemini_25_metrics['confusion_matrix']['false_negatives']}")
            
            print(f"\n   ⚖️ Agreement Rates:")
            print(f"      Web 2.5 Pro vs Web Ground Truth: {agreement['gemini_25_vs_ground_truth']:.1%}")
            print(f"      Web 2.5 Pro Judge Agreement: {agreement['gemini_25_judge_agreement_with_flash']:.1%}")
            
            return dashboard_filename, metrics_filename
            
        except Exception as e:
            print(f"❌ Dashboard generation failed: {e}")
            import traceback
            traceback.print_exc()
            return None

def main():
    """Run the Web Gemini 2.5 Pro classification dashboard generation"""
    dashboard_generator = WebGemini25ProClassificationDashboard()
    result = dashboard_generator.generate_classification_dashboard()
    
    if result:
        dashboard_file, metrics_file = result
        print(f"\n🎉 SUCCESS!")
        print(f"📁 Files generated:")
        print(f"   1. {dashboard_file}")
        print(f"   2. {metrics_file}")
        print(f"\n💡 Open {dashboard_file} in your browser to view the dashboard!")
    else:
        print(f"\n❌ Dashboard generation failed")

if __name__ == "__main__":
    main()

🚀 STARTING WEB GEMINI 2.5 PRO CLASSIFICATION DASHBOARD GENERATION
📥 Loading Web Gemini 2.5 Pro results with ground truth...
📊 Loading Web Gemini 2.5 Pro results data...
   Found 293 result records
📊 Loading Web ground truth data...
   Found 294 web ground truth records
   Web ground truth distribution: {100: 252, 0: 42}
✅ Merged dataset: 293 records
   Sample artifact_ids from results: ['comicsands.com/message-wrong-person-reddit', 'cracked.com/article_22086_5-self-righteous-critics-who-were-total-hypocrites.html', 'daringgourmet.com/best-buttermilk-biscuits/comment-page-2', 'dictionary.com/e/acronyms/fw', 'deltiasgaming.com/dead-rails-a-beginners-guide']
   Sample artifact_ids from web GT: ['news.de/gesundheit/855915949/corona-zahlen-kreisfreie-stadt-solingen-heute-aktuell-27-06-2025-coronavirus-news-zu-rki-fallzahlen-tote-in-nordrhein-westfalen-intensivbetten-auslastung-und-neue-covid-19-variante-nimbus/1', 'bizjournals.com/sanfrancisco/news/2025/06/27/genentech-ulcerative-colitis-vi

In [54]:
#!/usr/bin/env python3
"""
Flash Meta Classification Metrics Dashboard - Flash Results vs Meta Ground Truth
Calculates traditional ML metrics (F1, Precision, Recall, Confusion Matrix) for Flash decisions on Meta content
"""

import pandas as pd
import json
from google.cloud import bigquery
from datetime import datetime
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
import numpy as np
from typing import Dict, List, Any, Optional

PROJECT_ID = "scope3-dev"
DATASET_ID = "research_bs_monitoring"
META_GROUND_TRUTH_TABLE = "BA_Meta_Ground_Truth"  # Meta ground truth table
META_RESULTS_TABLE = "BA_Meta_Gemini_Pro_Judge_Results"  # Contains flash_classification

class FlashMetaClassificationDashboard:
    def __init__(self):
        self.client = bigquery.Client(project=PROJECT_ID)
        
    def load_results_with_ground_truth(self) -> pd.DataFrame:
        """Load Flash results with real ground truth from Meta Ground Truth table"""
        print("📥 Loading Flash Meta results with ground truth...")
        
        # Get Flash classifications from the results table
        results_query = f"""
        SELECT 
            artifact_id,
            flash_classification,
            flash_reasoning,
            model_prompt,
            source,
            data_source
        FROM `{PROJECT_ID}.{DATASET_ID}.{META_RESULTS_TABLE}`
        WHERE flash_classification IS NOT NULL
        """
        
        # Get real ground truth from Meta Ground Truth table
        meta_gt_query = f"""
        SELECT 
            artifact_id,
            correct_classification as ground_truth,
            correct_reasoning as ground_truth_reasoning,
            source as gt_source
        FROM `{PROJECT_ID}.{DATASET_ID}.{META_GROUND_TRUTH_TABLE}`
        """
        
        try:
            print("📊 Loading Flash Meta results data...")
            results_df = self.client.query(results_query).to_dataframe()
            print(f"   Found {len(results_df)} Flash result records")
            
            print("📊 Loading Meta ground truth data...")
            meta_gt_df = self.client.query(meta_gt_query).to_dataframe()
            print(f"   Found {len(meta_gt_df)} meta ground truth records")
            
            # Debug: Check ground truth distribution
            if len(meta_gt_df) > 0:
                gt_dist = meta_gt_df['ground_truth'].value_counts().to_dict()
                print(f"   Meta ground truth distribution: {gt_dist}")
            
            # Join results with ground truth
            df = results_df.merge(meta_gt_df, on='artifact_id', how='inner')
            print(f"✅ Merged dataset: {len(df)} records")
            
            if len(df) == 0:
                print("❌ No matching records found after merge")
                return pd.DataFrame()
            
            # Debug: Check what we actually got from the merge
            print(f"   Sample artifact_ids from results: {results_df['artifact_id'].head().tolist()}")
            print(f"   Sample artifact_ids from meta GT: {meta_gt_df['artifact_id'].head().tolist()}")
            print(f"   Sample flash_classification: {results_df['flash_classification'].head().tolist()}")
            print(f"   Sample meta ground_truth after merge: {df['ground_truth'].head().tolist()}")
            
            # Check if flash_classification and ground_truth are identical BEFORE conversion
            identical_before_conversion = (df['flash_classification'] == df['ground_truth']).sum()
            print(f"   Flash vs Meta Ground Truth identical BEFORE conversion: {identical_before_conversion}/{len(df)}")
            
            if identical_before_conversion == len(df):
                print("   🚨 CRITICAL ERROR: flash_classification is identical to meta ground_truth!")
                print("   This means your meta ground truth data IS the Flash predictions, not human annotations.")
                print("   Check your data pipeline - Meta GT table might contain Flash results instead of human labels.")
                
                # Let's check a few individual records to confirm
                print("   📋 Sample records to verify:")
                for i in range(min(3, len(df))):
                    row = df.iloc[i]
                    print(f"      Record {i+1}: artifact_id={row['artifact_id']}, flash={row['flash_classification']}, meta_gt={row['ground_truth']}")
            
            # Convert to binary format
            df['ground_truth_binary'] = (df['ground_truth'] == 100).astype(int)
            df['flash_binary'] = (df['flash_classification'] == 100).astype(int)
            
            print(f"📊 Data distribution:")
            print(f"   Meta ground truth: {df['ground_truth_binary'].value_counts().to_dict()}")
            print(f"   Flash predictions: {df['flash_binary'].value_counts().to_dict()}")
            
            # Validate binary data
            for col, name in [('ground_truth_binary', 'Meta ground truth'), 
                            ('flash_binary', 'Flash')]:
                if not all(df[col].isin([0, 1])):
                    print(f"⚠️ Warning: {name} contains non-binary values")
                    df = df[df[col].isin([0, 1])]
            
            print(f"📊 Final clean dataset: {len(df)} records")
            
            # Debug sample comparisons
            print(f"\n🔍 DEBUG - Sample comparisons (first 5 records):")
            print(f"   Columns available: {df.columns.tolist()}")
            for i in range(min(5, len(df))):
                row = df.iloc[i]
                print(f"   Record {i+1}:")
                print(f"      artifact_id: {row['artifact_id']}")
                print(f"      meta_ground_truth (from Meta GT): {row['ground_truth']} -> binary: {row['ground_truth_binary']}")
                print(f"      flash_classification: {row['flash_classification']} -> binary: {row['flash_binary']}")
                print(f"      Flash vs Meta GT: {'✅ MATCH' if row['flash_binary'] == row['ground_truth_binary'] else '❌ DIFFER'}")
                print(f"      ---")
            
            # Check if Flash predictions are identical to ground truth
            flash_matches_gt = (df['flash_binary'] == df['ground_truth_binary']).sum()
            print(f"\n🚨 CRITICAL CHECK:")
            print(f"   Flash predictions matching meta ground truth: {flash_matches_gt}/{len(df)} ({flash_matches_gt/len(df):.1%})")
            
            if flash_matches_gt == len(df):
                print("   🚨 ERROR: Flash predictions are 100% identical to meta ground truth!")
                print("   This suggests the meta ground truth data is actually Flash's predictions, not human annotations.")
                print("   Check your data pipeline - the 'ground_truth' field might be populated with Flash results.")
            elif flash_matches_gt > len(df) * 0.95:
                print("   ⚠️ WARNING: Flash predictions are suspiciously similar to meta ground truth (>95% match)")
                print("   This might indicate data contamination.")
            else:
                print("   ✅ Good: Flash predictions differ from meta ground truth as expected.")
            
            return df
            
        except Exception as e:
            print(f"❌ Error loading data: {e}")
            import traceback
            traceback.print_exc()
            return pd.DataFrame()
    
    def calculate_classification_metrics(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Calculate comprehensive classification metrics for Flash"""
        print("📊 Calculating Flash classification metrics...")
        
        if len(df) == 0:
            return {}
        
        y_true = df['ground_truth_binary'].values
        flash_pred = df['flash_binary'].values
        
        # Flash metrics - Compare Flash predictions with meta ground truth  
        flash_metrics = self._calculate_model_metrics(y_true, flash_pred, "Flash")
        
        # Flash agreement with ground truth
        flash_vs_ground_truth_agreement = (flash_pred == y_true).mean()
        
        print(f"✅ Metrics calculated:")
        print(f"   Flash F1: {flash_metrics['f1_score']:.3f}")
        print(f"   Flash vs GT agreement: {flash_vs_ground_truth_agreement:.1%}")
        
        return {
            'dataset_info': {
                'total_samples': len(df),
                'positive_samples': int((y_true == 1).sum()),
                'negative_samples': int((y_true == 0).sum()),
                'class_balance': float((y_true == 1).mean())
            },
            'flash_meta_metrics': flash_metrics,
            'agreement_analysis': {
                'flash_vs_ground_truth': flash_vs_ground_truth_agreement
            }
        }
    
    def _calculate_model_metrics(self, y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> Dict[str, Any]:
        """Calculate detailed metrics for a single model"""
        
        # Ensure binary values
        y_true = np.array(y_true).astype(int)
        y_pred = np.array(y_pred).astype(int)
        
        # Validate binary values
        if not (set(np.unique(y_true)) <= {0, 1} and set(np.unique(y_pred)) <= {0, 1}):
            print(f"⚠️ Warning: Non-binary values detected in {model_name}")
            print(f"   y_true unique: {np.unique(y_true)}")
            print(f"   y_pred unique: {np.unique(y_pred)}")
            # Force to binary
            y_true = np.clip(y_true, 0, 1)
            y_pred = np.clip(y_pred, 0, 1)
        
        accuracy = accuracy_score(y_true, y_pred)
        
        # Handle edge cases where only one class is predicted
        unique_pred = np.unique(y_pred)
        unique_true = np.unique(y_true)
        
        if len(unique_pred) == 1 or len(unique_true) == 1:
            # Use macro average for edge cases
            precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
            recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        else:
            precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
            recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
        
        # Confusion matrix - ensure we get 2x2 matrix
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()
        elif cm.shape == (1, 1):
            # Handle case where only one class exists
            if unique_true[0] == 0 and unique_pred[0] == 0:
                tn, fp, fn, tp = cm[0,0], 0, 0, 0
            elif unique_true[0] == 1 and unique_pred[0] == 1:
                tn, fp, fn, tp = 0, 0, 0, cm[0,0]
            else:
                tn, fp, fn, tp = 0, cm[0,0], 0, 0
        else:
            print(f"⚠️ Unexpected confusion matrix shape for {model_name}: {cm.shape}")
            tn = fp = fn = tp = 0
        
        # Additional metrics
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # Same as recall
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0  # Negative Predictive Value
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0  # Positive Predictive Value (same as precision)
        
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # False Negative Rate
        
        balanced_accuracy = (sensitivity + specificity) / 2
        
        # Matthews Correlation Coefficient
        mcc_num = (tp * tn) - (fp * fn)
        mcc_den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        mcc = mcc_num / mcc_den if mcc_den > 0 else 0
        
        return {
            'model_name': model_name,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'specificity': specificity,
            'sensitivity': sensitivity,
            'balanced_accuracy': balanced_accuracy,
            'negative_predictive_value': npv,
            'positive_predictive_value': ppv,
            'false_positive_rate': fpr,
            'false_negative_rate': fnr,
            'matthews_correlation': mcc,
            'confusion_matrix': {
                'true_positives': int(tp),
                'false_positives': int(fp), 
                'true_negatives': int(tn),
                'false_negatives': int(fn)
            }
        }
    
    def create_classification_dashboard(self, metrics: Dict[str, Any], df: pd.DataFrame) -> str:
        """Create comprehensive classification metrics dashboard"""
        
        if not metrics:
            return "<html><body><h1>No valid metrics to display</h1></body></html>"
        
        flash_metrics = metrics['flash_meta_metrics']
        dataset_info = metrics['dataset_info']
        agreement = metrics['agreement_analysis']
        
        flash_cm = flash_metrics['confusion_matrix']
        
        # Performance assessment
        if flash_metrics['f1_score'] >= 0.8:
            flash_assessment = "🟢 EXCELLENT"
            flash_color = "#10B981"
        elif flash_metrics['f1_score'] >= 0.7:
            flash_assessment = "🟡 GOOD" 
            flash_color = "#F59E0B"
        elif flash_metrics['f1_score'] >= 0.6:
            flash_assessment = "🟠 MODERATE"
            flash_color = "#FF6B35"
        else:
            flash_assessment = "🔴 POOR"
            flash_color = "#EF4444"
        
        html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Flash Meta Classification Metrics Dashboard</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: 'Inter', sans-serif; background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); color: #333; line-height: 1.6; min-height: 100vh; }}
        .dashboard {{ max-width: 1800px; margin: 0 auto; padding: 20px; }}
        .header {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 25px; text-align: center; margin-bottom: 30px; backdrop-filter: blur(15px); box-shadow: 0 20px 60px rgba(0,0,0,0.1); }}
        .header h1 {{ font-size: 3rem; margin-bottom: 15px; background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; }}
        .header h2 {{ font-size: 1.5rem; color: #666; margin-bottom: 20px; }}
        .model-badge {{ display: inline-block; padding: 10px 25px; background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); color: white; border-radius: 30px; font-weight: bold; font-size: 1.1rem; margin: 5px; }}
        .metrics-overview {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 20px; margin-bottom: 40px; }}
        .metric-card {{ background: rgba(255,255,255,0.95); padding: 25px; border-radius: 20px; text-align: center; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); transition: transform 0.3s ease; }}
        .metric-card:hover {{ transform: translateY(-5px); }}
        .metric-value {{ font-size: 2.5rem; font-weight: bold; margin-bottom: 10px; }}
        .metric-label {{ font-size: 1rem; color: #666; font-weight: 600; margin-bottom: 5px; }}
        .metric-description {{ font-size: 0.85rem; color: #888; }}
        .excellent {{ color: #10B981; }}
        .good {{ color: #F59E0B; }}
        .moderate {{ color: #FF6B35; }}
        .poor {{ color: #EF4444; }}
        .dashboard-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(450px, 1fr)); gap: 25px; margin-bottom: 40px; }}
        .chart-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); }}
        .chart-title {{ font-size: 1.3rem; font-weight: bold; margin-bottom: 20px; color: #333; text-align: center; }}
        .confusion-matrix {{ display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin: 20px 0; text-align: center; font-size: 0.9rem; }}
        .cm-cell {{ padding: 15px; border-radius: 12px; font-weight: bold; display: flex; flex-direction: column; justify-content: center; align-items: center; }}
        .cm-header {{ background: linear-gradient(135deg, #f8f9fa, #e9ecef); color: #495057; }}
        .cm-tp {{ background: linear-gradient(135deg, #d4edda, #c3e6cb); color: #155724; font-size: 1.8rem; }}
        .cm-fp {{ background: linear-gradient(135deg, #f8d7da, #f1b0b7); color: #721c24; font-size: 1.8rem; }}
        .cm-fn {{ background: linear-gradient(135deg, #fff3cd, #fce4a6); color: #856404; font-size: 1.8rem; }}
        .cm-tn {{ background: linear-gradient(135deg, #d1ecf1, #bee5eb); color: #0c5460; font-size: 1.8rem; }}
        .cm-label {{ font-size: 0.75rem; margin-top: 5px; opacity: 0.8; }}
        .performance-summary {{ background: linear-gradient(135deg, #ffe6e6, #ffcccc); padding: 30px; border-radius: 20px; margin: 30px 0; text-align: center; }}
        .performance-score {{ font-size: 3.5rem; font-weight: bold; color: {flash_color}; margin-bottom: 10px; }}
        .performance-label {{ font-size: 1.3rem; color: #37474f; margin-bottom: 8px; }}
        .performance-description {{ font-size: 1rem; color: #546e7a; }}
        .comparison-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); margin-bottom: 30px; }}
        .model-column {{ padding: 20px; border-radius: 15px; }}
        .flash-column {{ background: linear-gradient(135deg, #ffe6e6, #ffcccc); }}
        .model-title {{ font-size: 1.5rem; font-weight: bold; text-align: center; margin-bottom: 20px; }}
        .metric-row {{ display: flex; justify-content: space-between; margin: 8px 0; padding: 8px; background: rgba(255,255,255,0.5); border-radius: 8px; }}
    </style>
</head>
<body>
    <div class="dashboard">
        <div class="header">
            <h1>⚡ Flash Meta Classification Metrics</h1>
            <h2>Performance vs Meta Ground Truth</h2>
            <div>
                <span class="model-badge">Flash Model</span>
            </div>
            <p style="margin-top: 20px; color: #666;">
                Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | 
                Samples: {dataset_info['total_samples']:,} | 
                Positive: {dataset_info['positive_samples']} | Negative: {dataset_info['negative_samples']}
            </p>
        </div>

        <div class="performance-summary">
            <div class="performance-score">{flash_metrics['f1_score']:.3f}</div>
            <div class="performance-label">{flash_assessment} F1-Score (Flash Model)</div>
            <div class="performance-description">
                Flash achieves {flash_metrics['accuracy']:.1%} accuracy with {flash_metrics['precision']:.1%} precision
                <br>Agreement with Meta Ground Truth: {agreement['flash_vs_ground_truth']:.1%}
            </div>
        </div>

        <div class="metrics-overview">
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['f1_score'] >= 0.8 else 'good' if flash_metrics['f1_score'] >= 0.7 else 'moderate' if flash_metrics['f1_score'] >= 0.6 else 'poor'}">{flash_metrics['f1_score']:.3f}</div>
                <div class="metric-label">Flash F1-Score</div>
                <div class="metric-description">Harmonic Mean P&R</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['recall'] >= 0.8 else 'good' if flash_metrics['recall'] >= 0.7 else 'moderate' if flash_metrics['recall'] >= 0.6 else 'poor'}">{flash_metrics['recall']:.3f}</div>
                <div class="metric-label">Flash TPR</div>
                <div class="metric-description">True Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['specificity'] >= 0.8 else 'good' if flash_metrics['specificity'] >= 0.7 else 'moderate' if flash_metrics['specificity'] >= 0.6 else 'poor'}">{flash_metrics['specificity']:.3f}</div>
                <div class="metric-label">Flash TNR</div>
                <div class="metric-description">True Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['false_positive_rate'] <= 0.2 else 'good' if flash_metrics['false_positive_rate'] <= 0.3 else 'moderate' if flash_metrics['false_positive_rate'] <= 0.4 else 'poor'}">{flash_metrics['false_positive_rate']:.3f}</div>
                <div class="metric-label">Flash FPR</div>
                <div class="metric-description">False Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['false_negative_rate'] <= 0.2 else 'good' if flash_metrics['false_negative_rate'] <= 0.3 else 'moderate' if flash_metrics['false_negative_rate'] <= 0.4 else 'poor'}">{flash_metrics['false_negative_rate']:.3f}</div>
                <div class="metric-label">Flash FNR</div>
                <div class="metric-description">False Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['accuracy'] >= 0.9 else 'good' if flash_metrics['accuracy'] >= 0.8 else 'moderate' if flash_metrics['accuracy'] >= 0.7 else 'poor'}">{flash_metrics['accuracy']:.3f}</div>
                <div class="metric-label">Flash Accuracy</div>
                <div class="metric-description">Overall Correctness</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['precision'] >= 0.8 else 'good' if flash_metrics['precision'] >= 0.7 else 'moderate' if flash_metrics['precision'] >= 0.6 else 'poor'}">{flash_metrics['precision']:.3f}</div>
                <div class="metric-label">Flash Precision</div>
                <div class="metric-description">Positive Predictive Value</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if agreement['flash_vs_ground_truth'] >= 0.8 else 'good' if agreement['flash_vs_ground_truth'] >= 0.7 else 'moderate' if agreement['flash_vs_ground_truth'] >= 0.6 else 'poor'}">{agreement['flash_vs_ground_truth']:.1%}</div>
                <div class="metric-label">Ground Truth Agreement</div>
                <div class="metric-description">Flash matches human labels</div>
            </div>
        </div>

        <div class="comparison-section">
            <div class="chart-title">📊 Flash Model Performance Metrics</div>
            <div style="max-width: 600px; margin: 0 auto;">
                <div class="model-column flash-column">
                    <div class="model-title">⚡ Flash vs Meta Ground Truth</div>
                    <div class="metric-row"><span>F1-Score:</span><span class="{'excellent' if flash_metrics['f1_score'] >= 0.8 else 'good' if flash_metrics['f1_score'] >= 0.7 else 'moderate' if flash_metrics['f1_score'] >= 0.6 else 'poor'}">{flash_metrics['f1_score']:.3f}</span></div>
                    <div class="metric-row"><span>Accuracy:</span><span>{flash_metrics['accuracy']:.3f}</span></div>
                    <div class="metric-row"><span>Precision:</span><span>{flash_metrics['precision']:.3f}</span></div>
                    <div class="metric-row"><span>Recall (TPR):</span><span>{flash_metrics['recall']:.3f}</span></div>
                    <div class="metric-row"><span>Specificity (TNR):</span><span>{flash_metrics['specificity']:.3f}</span></div>
                    <div class="metric-row"><span>False Positive Rate:</span><span>{flash_metrics['false_positive_rate']:.3f}</span></div>
                    <div class="metric-row"><span>False Negative Rate:</span><span>{flash_metrics['false_negative_rate']:.3f}</span></div>
                    <div class="metric-row"><span>Ground Truth Agreement:</span><span>{agreement['flash_vs_ground_truth']:.1%}</span></div>
                </div>
            </div>
        </div>

        <div class="dashboard-grid">
            <div class="chart-section">
                <div class="chart-title">⚡ Flash Confusion Matrix</div>
                <div class="confusion-matrix">
                    <div class="cm-cell cm-header"></div>
                    <div class="cm-cell cm-header">Meta GT: 0</div>
                    <div class="cm-cell cm-header">Meta GT: 1</div>
                    
                    <div class="cm-cell cm-header">Flash: 0</div>
                    <div class="cm-cell cm-tn">
                        {flash_cm['true_negatives']}
                        <div class="cm-label">True Negatives</div>
                    </div>
                    <div class="cm-cell cm-fn">
                        {flash_cm['false_negatives']}
                        <div class="cm-label">False Negatives</div>
                    </div>
                    
                    <div class="cm-cell cm-header">Flash: 1</div>
                    <div class="cm-cell cm-fp">
                        {flash_cm['false_positives']}
                        <div class="cm-label">False Positives</div>
                    </div>
                    <div class="cm-cell cm-tp">
                        {flash_cm['true_positives']}
                        <div class="cm-label">True Positives</div>
                    </div>
                </div>
                <div style="text-align: center; margin-top: 15px;">
                    <p><strong>TP:</strong> {flash_cm['true_positives']} | <strong>FP:</strong> {flash_cm['false_positives']} | <strong>TN:</strong> {flash_cm['true_negatives']} | <strong>FN:</strong> {flash_cm['false_negatives']}</p>
                </div>
            </div>
        </div>
    </div>

    <script>
        window.addEventListener('load', function() {{
            console.log('Flash Meta Classification Dashboard Loaded');
        }});
    </script>
</body>
</html>"""
        
        return html_content
    
    def generate_classification_dashboard(self):
        """Main function to generate the classification dashboard"""
        print("🚀 STARTING FLASH META CLASSIFICATION DASHBOARD GENERATION")
        print("=" * 70)
        
        try:
            # Load results with ground truth
            df = self.load_results_with_ground_truth()
            
            if df.empty:
                print("❌ No data available for dashboard")
                return None
            
            # Calculate metrics
            metrics = self.calculate_classification_metrics(df)
            
            if not metrics:
                print("❌ Could not calculate metrics")
                return None
            
            # Create dashboard
            print("\n📊 Creating Flash Meta Classification Dashboard...")
            dashboard_html = self.create_classification_dashboard(metrics, df)
            
            # Save files
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            dashboard_filename = f"flash_meta_classification_dashboard_{timestamp}.html"
            metrics_filename = f"flash_meta_classification_metrics_{timestamp}.json"
            
            with open(dashboard_filename, "w", encoding='utf-8') as f:
                f.write(dashboard_html)
            
            with open(metrics_filename, "w") as f:
                json.dump(metrics, f, indent=2, default=str)
            
            print(f"✅ Dashboard saved: {dashboard_filename}")
            print(f"✅ Metrics saved: {metrics_filename}")
            
            # Summary
            flash_metrics = metrics['flash_meta_metrics']
            agreement = metrics['agreement_analysis']
            
            print(f"\n📊 FLASH META CLASSIFICATION METRICS SUMMARY:")
            print(f"   ⚡ Flash Model:")
            print(f"      F1-Score: {flash_metrics['f1_score']:.3f}")
            print(f"      Precision: {flash_metrics['precision']:.3f}")
            print(f"      Recall: {flash_metrics['recall']:.3f}") 
            print(f"      Accuracy: {flash_metrics['accuracy']:.3f}")
            print(f"      TP: {flash_metrics['confusion_matrix']['true_positives']}, FP: {flash_metrics['confusion_matrix']['false_positives']}")
            print(f"      TN: {flash_metrics['confusion_matrix']['true_negatives']}, FN: {flash_metrics['confusion_matrix']['false_negatives']}")
            
            print(f"\n   ⚖️ Agreement Rates:")
            print(f"      Flash vs Meta Ground Truth: {agreement['flash_vs_ground_truth']:.1%}")
            
            return dashboard_filename, metrics_filename
            
        except Exception as e:
            print(f"❌ Dashboard generation failed: {e}")
            import traceback
            traceback.print_exc()
            return None

def main():
    """Run the Flash Meta classification dashboard generation"""
    dashboard_generator = FlashMetaClassificationDashboard()
    result = dashboard_generator.generate_classification_dashboard()
    
    if result:
        dashboard_file, metrics_file = result
        print(f"\n🎉 SUCCESS!")
        print(f"📁 Files generated:")
        print(f"   1. {dashboard_file}")
        print(f"   2. {metrics_file}")
        print(f"\n💡 Open {dashboard_file} in your browser to view the dashboard!")
    else:
        print(f"\n❌ Dashboard generation failed")

if __name__ == "__main__":
    main()

🚀 STARTING FLASH META CLASSIFICATION DASHBOARD GENERATION
📥 Loading Flash Meta results with ground truth...
📊 Loading Flash Meta results data...
   Found 299 Flash result records
📊 Loading Meta ground truth data...
   Found 299 meta ground truth records
   Meta ground truth distribution: {100: 187, 0: 112}
✅ Merged dataset: 299 records
   Sample artifact_ids from results: ['meta:993741799587060', 'meta:9940080572776247', 'meta:996072466011266', 'meta:aWdfbWVkaWFfM3B2OjE3ODQ4Mjc1NzkyNDkyODM4', 'meta:aWdfbWVkaWFfM3B2OjE3ODkzMzQxMTE3MjUyODY4']
   Sample artifact_ids from meta GT: ['meta:122140809938790694', 'meta:4050755845195196', 'meta:1173810271458869', 'meta:1297387721748313', 'meta:1138085658345605']
   Sample flash_classification: [0, 100, 100, 100, 100]
   Sample meta ground_truth after merge: [0, 100, 100, 100, 100]
   Flash vs Meta Ground Truth identical BEFORE conversion: 299/299
   🚨 CRITICAL ERROR: flash_classification is identical to meta ground_truth!
   This means your meta

In [55]:
#!/usr/bin/env python3
"""
Flash Web Classification Metrics Dashboard - Flash Results vs Web Ground Truth
Calculates traditional ML metrics (F1, Precision, Recall, Confusion Matrix) for Flash decisions on Web content
"""

import pandas as pd
import json
from google.cloud import bigquery
from datetime import datetime
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
import numpy as np
from typing import Dict, List, Any, Optional

PROJECT_ID = "scope3-dev"
DATASET_ID = "research_bs_monitoring"
WEB_GROUND_TRUTH_TABLE = "BA_Web_Ground_Truth"  # Web ground truth table
WEB_RESULTS_TABLE = "BA_Web_Gemini_Pro_Judge_Results"  # Contains flash_classification for web

class FlashWebClassificationDashboard:
    def __init__(self):
        self.client = bigquery.Client(project=PROJECT_ID)
        
    def load_results_with_ground_truth(self) -> pd.DataFrame:
        """Load Flash results with real ground truth from Web Ground Truth table"""
        print("📥 Loading Flash Web results with ground truth...")
        
        # Get Flash classifications from the web results table
        results_query = f"""
        SELECT 
            artifact_id,
            flash_classification,
            flash_reasoning,
            model_prompt,
            source,
            data_source
        FROM `{PROJECT_ID}.{DATASET_ID}.{WEB_RESULTS_TABLE}`
        WHERE flash_classification IS NOT NULL
        """
        
        # Get real ground truth from Web Ground Truth table
        web_gt_query = f"""
        SELECT 
            artifact_id,
            correct_classification as ground_truth,
            correct_reasoning as ground_truth_reasoning,
            source as gt_source
        FROM `{PROJECT_ID}.{DATASET_ID}.{WEB_GROUND_TRUTH_TABLE}`
        """
        
        try:
            print("📊 Loading Flash Web results data...")
            results_df = self.client.query(results_query).to_dataframe()
            print(f"   Found {len(results_df)} Flash result records")
            
            print("📊 Loading Web ground truth data...")
            web_gt_df = self.client.query(web_gt_query).to_dataframe()
            print(f"   Found {len(web_gt_df)} web ground truth records")
            
            # Debug: Check ground truth distribution
            if len(web_gt_df) > 0:
                gt_dist = web_gt_df['ground_truth'].value_counts().to_dict()
                print(f"   Web ground truth distribution: {gt_dist}")
            
            # Join results with ground truth
            df = results_df.merge(web_gt_df, on='artifact_id', how='inner')
            print(f"✅ Merged dataset: {len(df)} records")
            
            if len(df) == 0:
                print("❌ No matching records found after merge")
                return pd.DataFrame()
            
            # Debug: Check what we actually got from the merge
            print(f"   Sample artifact_ids from results: {results_df['artifact_id'].head().tolist()}")
            print(f"   Sample artifact_ids from web GT: {web_gt_df['artifact_id'].head().tolist()}")
            print(f"   Sample flash_classification: {results_df['flash_classification'].head().tolist()}")
            print(f"   Sample web ground_truth after merge: {df['ground_truth'].head().tolist()}")
            
            # Check if flash_classification and ground_truth are identical BEFORE conversion
            identical_before_conversion = (df['flash_classification'] == df['ground_truth']).sum()
            print(f"   Flash vs Web Ground Truth identical BEFORE conversion: {identical_before_conversion}/{len(df)}")
            
            if identical_before_conversion == len(df):
                print("   🚨 CRITICAL ERROR: flash_classification is identical to web ground_truth!")
                print("   This means your web ground truth data IS the Flash predictions, not human annotations.")
                print("   Check your data pipeline - Web GT table might contain Flash results instead of human labels.")
                
                # Let's check a few individual records to confirm
                print("   📋 Sample records to verify:")
                for i in range(min(3, len(df))):
                    row = df.iloc[i]
                    print(f"      Record {i+1}: artifact_id={row['artifact_id']}, flash={row['flash_classification']}, web_gt={row['ground_truth']}")
            
            # Convert to binary format
            df['ground_truth_binary'] = (df['ground_truth'] == 100).astype(int)
            df['flash_binary'] = (df['flash_classification'] == 100).astype(int)
            
            print(f"📊 Data distribution:")
            print(f"   Web ground truth: {df['ground_truth_binary'].value_counts().to_dict()}")
            print(f"   Flash predictions: {df['flash_binary'].value_counts().to_dict()}")
            
            # Validate binary data
            for col, name in [('ground_truth_binary', 'Web ground truth'), 
                            ('flash_binary', 'Flash')]:
                if not all(df[col].isin([0, 1])):
                    print(f"⚠️ Warning: {name} contains non-binary values")
                    df = df[df[col].isin([0, 1])]
            
            print(f"📊 Final clean dataset: {len(df)} records")
            
            # Debug sample comparisons
            print(f"\n🔍 DEBUG - Sample comparisons (first 5 records):")
            print(f"   Columns available: {df.columns.tolist()}")
            for i in range(min(5, len(df))):
                row = df.iloc[i]
                print(f"   Record {i+1}:")
                print(f"      artifact_id: {row['artifact_id']}")
                print(f"      web_ground_truth (from Web GT): {row['ground_truth']} -> binary: {row['ground_truth_binary']}")
                print(f"      flash_classification: {row['flash_classification']} -> binary: {row['flash_binary']}")
                print(f"      Flash vs Web GT: {'✅ MATCH' if row['flash_binary'] == row['ground_truth_binary'] else '❌ DIFFER'}")
                print(f"      ---")
            
            # Check if Flash predictions are identical to ground truth
            flash_matches_gt = (df['flash_binary'] == df['ground_truth_binary']).sum()
            print(f"\n🚨 CRITICAL CHECK:")
            print(f"   Flash predictions matching web ground truth: {flash_matches_gt}/{len(df)} ({flash_matches_gt/len(df):.1%})")
            
            if flash_matches_gt == len(df):
                print("   🚨 ERROR: Flash predictions are 100% identical to web ground truth!")
                print("   This suggests the web ground truth data is actually Flash's predictions, not human annotations.")
                print("   Check your data pipeline - the 'ground_truth' field might be populated with Flash results.")
            elif flash_matches_gt > len(df) * 0.95:
                print("   ⚠️ WARNING: Flash predictions are suspiciously similar to web ground truth (>95% match)")
                print("   This might indicate data contamination.")
            else:
                print("   ✅ Good: Flash predictions differ from web ground truth as expected.")
            
            return df
            
        except Exception as e:
            print(f"❌ Error loading data: {e}")
            import traceback
            traceback.print_exc()
            return pd.DataFrame()
    
    def calculate_classification_metrics(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Calculate comprehensive classification metrics for Flash"""
        print("📊 Calculating Flash classification metrics...")
        
        if len(df) == 0:
            return {}
        
        y_true = df['ground_truth_binary'].values
        flash_pred = df['flash_binary'].values
        
        # Flash metrics - Compare Flash predictions with web ground truth  
        flash_metrics = self._calculate_model_metrics(y_true, flash_pred, "Flash")
        
        # Flash agreement with ground truth
        flash_vs_ground_truth_agreement = (flash_pred == y_true).mean()
        
        print(f"✅ Metrics calculated:")
        print(f"   Flash F1: {flash_metrics['f1_score']:.3f}")
        print(f"   Flash vs GT agreement: {flash_vs_ground_truth_agreement:.1%}")
        
        return {
            'dataset_info': {
                'total_samples': len(df),
                'positive_samples': int((y_true == 1).sum()),
                'negative_samples': int((y_true == 0).sum()),
                'class_balance': float((y_true == 1).mean())
            },
            'flash_web_metrics': flash_metrics,
            'agreement_analysis': {
                'flash_vs_ground_truth': flash_vs_ground_truth_agreement
            }
        }
    
    def _calculate_model_metrics(self, y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> Dict[str, Any]:
        """Calculate detailed metrics for a single model"""
        
        # Ensure binary values
        y_true = np.array(y_true).astype(int)
        y_pred = np.array(y_pred).astype(int)
        
        # Validate binary values
        if not (set(np.unique(y_true)) <= {0, 1} and set(np.unique(y_pred)) <= {0, 1}):
            print(f"⚠️ Warning: Non-binary values detected in {model_name}")
            print(f"   y_true unique: {np.unique(y_true)}")
            print(f"   y_pred unique: {np.unique(y_pred)}")
            # Force to binary
            y_true = np.clip(y_true, 0, 1)
            y_pred = np.clip(y_pred, 0, 1)
        
        accuracy = accuracy_score(y_true, y_pred)
        
        # Handle edge cases where only one class is predicted
        unique_pred = np.unique(y_pred)
        unique_true = np.unique(y_true)
        
        if len(unique_pred) == 1 or len(unique_true) == 1:
            # Use macro average for edge cases
            precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
            recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        else:
            precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
            recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
        
        # Confusion matrix - ensure we get 2x2 matrix
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()
        elif cm.shape == (1, 1):
            # Handle case where only one class exists
            if unique_true[0] == 0 and unique_pred[0] == 0:
                tn, fp, fn, tp = cm[0,0], 0, 0, 0
            elif unique_true[0] == 1 and unique_pred[0] == 1:
                tn, fp, fn, tp = 0, 0, 0, cm[0,0]
            else:
                tn, fp, fn, tp = 0, cm[0,0], 0, 0
        else:
            print(f"⚠️ Unexpected confusion matrix shape for {model_name}: {cm.shape}")
            tn = fp = fn = tp = 0
        
        # Additional metrics
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # Same as recall
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0  # Negative Predictive Value
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0  # Positive Predictive Value (same as precision)
        
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # False Negative Rate
        
        balanced_accuracy = (sensitivity + specificity) / 2
        
        # Matthews Correlation Coefficient
        mcc_num = (tp * tn) - (fp * fn)
        mcc_den = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        mcc = mcc_num / mcc_den if mcc_den > 0 else 0
        
        return {
            'model_name': model_name,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'specificity': specificity,
            'sensitivity': sensitivity,
            'balanced_accuracy': balanced_accuracy,
            'negative_predictive_value': npv,
            'positive_predictive_value': ppv,
            'false_positive_rate': fpr,
            'false_negative_rate': fnr,
            'matthews_correlation': mcc,
            'confusion_matrix': {
                'true_positives': int(tp),
                'false_positives': int(fp), 
                'true_negatives': int(tn),
                'false_negatives': int(fn)
            }
        }
    
    def create_classification_dashboard(self, metrics: Dict[str, Any], df: pd.DataFrame) -> str:
        """Create comprehensive classification metrics dashboard"""
        
        if not metrics:
            return "<html><body><h1>No valid metrics to display</h1></body></html>"
        
        flash_metrics = metrics['flash_web_metrics']
        dataset_info = metrics['dataset_info']
        agreement = metrics['agreement_analysis']
        
        flash_cm = flash_metrics['confusion_matrix']
        
        # Performance assessment
        if flash_metrics['f1_score'] >= 0.8:
            flash_assessment = "🟢 EXCELLENT"
            flash_color = "#10B981"
        elif flash_metrics['f1_score'] >= 0.7:
            flash_assessment = "🟡 GOOD" 
            flash_color = "#F59E0B"
        elif flash_metrics['f1_score'] >= 0.6:
            flash_assessment = "🟠 MODERATE"
            flash_color = "#FF6B35"
        else:
            flash_assessment = "🔴 POOR"
            flash_color = "#EF4444"
        
        html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Flash Web Classification Metrics Dashboard</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: 'Inter', sans-serif; background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); color: #333; line-height: 1.6; min-height: 100vh; }}
        .dashboard {{ max-width: 1800px; margin: 0 auto; padding: 20px; }}
        .header {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 25px; text-align: center; margin-bottom: 30px; backdrop-filter: blur(15px); box-shadow: 0 20px 60px rgba(0,0,0,0.1); }}
        .header h1 {{ font-size: 3rem; margin-bottom: 15px; background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; }}
        .header h2 {{ font-size: 1.5rem; color: #666; margin-bottom: 20px; }}
        .model-badge {{ display: inline-block; padding: 10px 25px; background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); color: white; border-radius: 30px; font-weight: bold; font-size: 1.1rem; margin: 5px; }}
        .metrics-overview {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 20px; margin-bottom: 40px; }}
        .metric-card {{ background: rgba(255,255,255,0.95); padding: 25px; border-radius: 20px; text-align: center; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); transition: transform 0.3s ease; }}
        .metric-card:hover {{ transform: translateY(-5px); }}
        .metric-value {{ font-size: 2.5rem; font-weight: bold; margin-bottom: 10px; }}
        .metric-label {{ font-size: 1rem; color: #666; font-weight: 600; margin-bottom: 5px; }}
        .metric-description {{ font-size: 0.85rem; color: #888; }}
        .excellent {{ color: #10B981; }}
        .good {{ color: #F59E0B; }}
        .moderate {{ color: #FF6B35; }}
        .poor {{ color: #EF4444; }}
        .dashboard-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(450px, 1fr)); gap: 25px; margin-bottom: 40px; }}
        .chart-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); }}
        .chart-title {{ font-size: 1.3rem; font-weight: bold; margin-bottom: 20px; color: #333; text-align: center; }}
        .confusion-matrix {{ display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin: 20px 0; text-align: center; font-size: 0.9rem; }}
        .cm-cell {{ padding: 15px; border-radius: 12px; font-weight: bold; display: flex; flex-direction: column; justify-content: center; align-items: center; }}
        .cm-header {{ background: linear-gradient(135deg, #f8f9fa, #e9ecef); color: #495057; }}
        .cm-tp {{ background: linear-gradient(135deg, #d4edda, #c3e6cb); color: #155724; font-size: 1.8rem; }}
        .cm-fp {{ background: linear-gradient(135deg, #f8d7da, #f1b0b7); color: #721c24; font-size: 1.8rem; }}
        .cm-fn {{ background: linear-gradient(135deg, #fff3cd, #fce4a6); color: #856404; font-size: 1.8rem; }}
        .cm-tn {{ background: linear-gradient(135deg, #d1ecf1, #bee5eb); color: #0c5460; font-size: 1.8rem; }}
        .cm-label {{ font-size: 0.75rem; margin-top: 5px; opacity: 0.8; }}
        .performance-summary {{ background: linear-gradient(135deg, #ffe6e6, #ffcccc); padding: 30px; border-radius: 20px; margin: 30px 0; text-align: center; }}
        .performance-score {{ font-size: 3.5rem; font-weight: bold; color: {flash_color}; margin-bottom: 10px; }}
        .performance-label {{ font-size: 1.3rem; color: #37474f; margin-bottom: 8px; }}
        .performance-description {{ font-size: 1rem; color: #546e7a; }}
        .comparison-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); margin-bottom: 30px; }}
        .model-column {{ padding: 20px; border-radius: 15px; }}
        .flash-column {{ background: linear-gradient(135deg, #ffe6e6, #ffcccc); }}
        .model-title {{ font-size: 1.5rem; font-weight: bold; text-align: center; margin-bottom: 20px; }}
        .metric-row {{ display: flex; justify-content: space-between; margin: 8px 0; padding: 8px; background: rgba(255,255,255,0.5); border-radius: 8px; }}
    </style>
</head>
<body>
    <div class="dashboard">
        <div class="header">
            <h1>⚡ Flash Web Classification Metrics</h1>
            <h2>Performance vs Web Ground Truth</h2>
            <div>
                <span class="model-badge">Flash Web Model</span>
            </div>
            <p style="margin-top: 20px; color: #666;">
                Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | 
                Samples: {dataset_info['total_samples']:,} | 
                Positive: {dataset_info['positive_samples']} | Negative: {dataset_info['negative_samples']}
            </p>
        </div>

        <div class="performance-summary">
            <div class="performance-score">{flash_metrics['f1_score']:.3f}</div>
            <div class="performance-label">{flash_assessment} F1-Score (Flash Web Model)</div>
            <div class="performance-description">
                Flash achieves {flash_metrics['accuracy']:.1%} accuracy with {flash_metrics['precision']:.1%} precision on web content
                <br>Agreement with Web Ground Truth: {agreement['flash_vs_ground_truth']:.1%}
            </div>
        </div>

        <div class="metrics-overview">
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['f1_score'] >= 0.8 else 'good' if flash_metrics['f1_score'] >= 0.7 else 'moderate' if flash_metrics['f1_score'] >= 0.6 else 'poor'}">{flash_metrics['f1_score']:.3f}</div>
                <div class="metric-label">Flash F1-Score</div>
                <div class="metric-description">Harmonic Mean P&R</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['recall'] >= 0.8 else 'good' if flash_metrics['recall'] >= 0.7 else 'moderate' if flash_metrics['recall'] >= 0.6 else 'poor'}">{flash_metrics['recall']:.3f}</div>
                <div class="metric-label">Flash TPR</div>
                <div class="metric-description">True Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['specificity'] >= 0.8 else 'good' if flash_metrics['specificity'] >= 0.7 else 'moderate' if flash_metrics['specificity'] >= 0.6 else 'poor'}">{flash_metrics['specificity']:.3f}</div>
                <div class="metric-label">Flash TNR</div>
                <div class="metric-description">True Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['false_positive_rate'] <= 0.2 else 'good' if flash_metrics['false_positive_rate'] <= 0.3 else 'moderate' if flash_metrics['false_positive_rate'] <= 0.4 else 'poor'}">{flash_metrics['false_positive_rate']:.3f}</div>
                <div class="metric-label">Flash FPR</div>
                <div class="metric-description">False Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['false_negative_rate'] <= 0.2 else 'good' if flash_metrics['false_negative_rate'] <= 0.3 else 'moderate' if flash_metrics['false_negative_rate'] <= 0.4 else 'poor'}">{flash_metrics['false_negative_rate']:.3f}</div>
                <div class="metric-label">Flash FNR</div>
                <div class="metric-description">False Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['accuracy'] >= 0.9 else 'good' if flash_metrics['accuracy'] >= 0.8 else 'moderate' if flash_metrics['accuracy'] >= 0.7 else 'poor'}">{flash_metrics['accuracy']:.3f}</div>
                <div class="metric-label">Flash Accuracy</div>
                <div class="metric-description">Overall Correctness</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['precision'] >= 0.8 else 'good' if flash_metrics['precision'] >= 0.7 else 'moderate' if flash_metrics['precision'] >= 0.6 else 'poor'}">{flash_metrics['precision']:.3f}</div>
                <div class="metric-label">Flash Precision</div>
                <div class="metric-description">Positive Predictive Value</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if agreement['flash_vs_ground_truth'] >= 0.8 else 'good' if agreement['flash_vs_ground_truth'] >= 0.7 else 'moderate' if agreement['flash_vs_ground_truth'] >= 0.6 else 'poor'}">{agreement['flash_vs_ground_truth']:.1%}</div>
                <div class="metric-label">Ground Truth Agreement</div>
                <div class="metric-description">Flash matches human web labels</div>
            </div>
        </div>

        <div class="comparison-section">
            <div class="chart-title">📊 Flash Web Model Performance Metrics</div>
            <div style="max-width: 600px; margin: 0 auto;">
                <div class="model-column flash-column">
                    <div class="model-title">⚡ Flash vs Web Ground Truth</div>
                    <div class="metric-row"><span>F1-Score:</span><span class="{'excellent' if flash_metrics['f1_score'] >= 0.8 else 'good' if flash_metrics['f1_score'] >= 0.7 else 'moderate' if flash_metrics['f1_score'] >= 0.6 else 'poor'}">{flash_metrics['f1_score']:.3f}</span></div>
                    <div class="metric-row"><span>Accuracy:</span><span>{flash_metrics['accuracy']:.3f}</span></div>
                    <div class="metric-row"><span>Precision:</span><span>{flash_metrics['precision']:.3f}</span></div>
                    <div class="metric-row"><span>Recall (TPR):</span><span>{flash_metrics['recall']:.3f}</span></div>
                    <div class="metric-row"><span>Specificity (TNR):</span><span>{flash_metrics['specificity']:.3f}</span></div>
                    <div class="metric-row"><span>False Positive Rate:</span><span>{flash_metrics['false_positive_rate']:.3f}</span></div>
                    <div class="metric-row"><span>False Negative Rate:</span><span>{flash_metrics['false_negative_rate']:.3f}</span></div>
                    <div class="metric-row"><span>Web Ground Truth Agreement:</span><span>{agreement['flash_vs_ground_truth']:.1%}</span></div>
                </div>
            </div>
        </div>

        <div class="dashboard-grid">
            <div class="chart-section">
                <div class="chart-title">⚡ Flash Web Confusion Matrix</div>
                <div class="confusion-matrix">
                    <div class="cm-cell cm-header"></div>
                    <div class="cm-cell cm-header">Web GT: 0</div>
                    <div class="cm-cell cm-header">Web GT: 1</div>
                    
                    <div class="cm-cell cm-header">Flash: 0</div>
                    <div class="cm-cell cm-tn">
                        {flash_cm['true_negatives']}
                        <div class="cm-label">True Negatives</div>
                    </div>
                    <div class="cm-cell cm-fn">
                        {flash_cm['false_negatives']}
                        <div class="cm-label">False Negatives</div>
                    </div>
                    
                    <div class="cm-cell cm-header">Flash: 1</div>
                    <div class="cm-cell cm-fp">
                        {flash_cm['false_positives']}
                        <div class="cm-label">False Positives</div>
                    </div>
                    <div class="cm-cell cm-tp">
                        {flash_cm['true_positives']}
                        <div class="cm-label">True Positives</div>
                    </div>
                </div>
                <div style="text-align: center; margin-top: 15px;">
                    <p><strong>TP:</strong> {flash_cm['true_positives']} | <strong>FP:</strong> {flash_cm['false_positives']} | <strong>TN:</strong> {flash_cm['true_negatives']} | <strong>FN:</strong> {flash_cm['false_negatives']}</p>
                </div>
            </div>
        </div>
    </div>

    <script>
        window.addEventListener('load', function() {{
            console.log('Flash Web Classification Dashboard Loaded');
        }});
    </script>
</body>
</html>"""
        
        return html_content
    
    def generate_classification_dashboard(self):
        """Main function to generate the classification dashboard"""
        print("🚀 STARTING FLASH WEB CLASSIFICATION DASHBOARD GENERATION")
        print("=" * 70)
        
        try:
            # Load results with ground truth
            df = self.load_results_with_ground_truth()
            
            if df.empty:
                print("❌ No data available for dashboard")
                return None
            
            # Calculate metrics
            metrics = self.calculate_classification_metrics(df)
            
            if not metrics:
                print("❌ Could not calculate metrics")
                return None
            
            # Create dashboard
            print("\n📊 Creating Flash Web Classification Dashboard...")
            dashboard_html = self.create_classification_dashboard(metrics, df)
            
            # Save files
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            dashboard_filename = f"flash_web_classification_dashboard_{timestamp}.html"
            metrics_filename = f"flash_web_classification_metrics_{timestamp}.json"
            
            with open(dashboard_filename, "w", encoding='utf-8') as f:
                f.write(dashboard_html)
            
            with open(metrics_filename, "w") as f:
                json.dump(metrics, f, indent=2, default=str)
            
            print(f"✅ Dashboard saved: {dashboard_filename}")
            print(f"✅ Metrics saved: {metrics_filename}")
            
            # Summary
            flash_metrics = metrics['flash_web_metrics']
            agreement = metrics['agreement_analysis']
            
            print(f"\n📊 FLASH WEB CLASSIFICATION METRICS SUMMARY:")
            print(f"   ⚡ Flash Web Model:")
            print(f"      F1-Score: {flash_metrics['f1_score']:.3f}")
            print(f"      Precision: {flash_metrics['precision']:.3f}")
            print(f"      Recall: {flash_metrics['recall']:.3f}") 
            print(f"      Accuracy: {flash_metrics['accuracy']:.3f}")
            print(f"      TP: {flash_metrics['confusion_matrix']['true_positives']}, FP: {flash_metrics['confusion_matrix']['false_positives']}")
            print(f"      TN: {flash_metrics['confusion_matrix']['true_negatives']}, FN: {flash_metrics['confusion_matrix']['false_negatives']}")
            
            print(f"\n   ⚖️ Agreement Rates:")
            print(f"      Flash vs Web Ground Truth: {agreement['flash_vs_ground_truth']:.1%}")
            
            return dashboard_filename, metrics_filename
            
        except Exception as e:
            print(f"❌ Dashboard generation failed: {e}")
            import traceback
            traceback.print_exc()
            return None

def main():
    """Run the Flash Web classification dashboard generation"""
    dashboard_generator = FlashWebClassificationDashboard()
    result = dashboard_generator.generate_classification_dashboard()
    
    if result:
        dashboard_file, metrics_file = result
        print(f"\n🎉 SUCCESS!")
        print(f"📁 Files generated:")
        print(f"   1. {dashboard_file}")
        print(f"   2. {metrics_file}")
        print(f"\n💡 Open {dashboard_file} in your browser to view the dashboard!")
    else:
        print(f"\n❌ Dashboard generation failed")

if __name__ == "__main__":
    main()

🚀 STARTING FLASH WEB CLASSIFICATION DASHBOARD GENERATION
📥 Loading Flash Web results with ground truth...
📊 Loading Flash Web results data...
   Found 294 Flash result records
📊 Loading Web ground truth data...
   Found 294 web ground truth records
   Web ground truth distribution: {100: 252, 0: 42}
✅ Merged dataset: 294 records
   Sample artifact_ids from results: ['frag-mutti.de/fenchelsalat-fuer-fenchelhasser-a29249', 'hd.se/2025-05-24/gavan-som-kostar-kommunen-26-miljoner-kronor', 'forbes.com/sites/krisholt/2025/06/26/nyt-mini-crossword-hints-for-friday-june-27-clues-and-answers-for-todays-game-beach-feed-fillers', 'fr.de/rhein-main/euro-nach-hamburg-11640349.html', 'frag-mutti.de/bratensauce-verlaengern-das-einfachste-rezept-a62664']
   Sample artifact_ids from web GT: ['news.de/gesundheit/855915949/corona-zahlen-kreisfreie-stadt-solingen-heute-aktuell-27-06-2025-coronavirus-news-zu-rki-fallzahlen-tote-in-nordrhein-westfalen-intensivbetten-auslastung-und-neue-covid-19-variante-nim

In [56]:
#!/usr/bin/env python3
"""
Flash Meta Classification Metrics Dashboard - Fixed ID Matching
Enhanced version with better artifact_id matching and debugging
"""

import pandas as pd
import json
from google.cloud import bigquery
from datetime import datetime
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
import numpy as np
from typing import Dict, List, Any, Optional

PROJECT_ID = "scope3-dev"
DATASET_ID = "research_bs_monitoring"
META_GROUND_TRUTH_TABLE = "BA_Meta_Ground_Truth"

class FlashMetaCSVClassificationDashboard:
    def __init__(self, id_column_name=None, csv_file_path=None):
        self.client = bigquery.Client(project=PROJECT_ID)
        self.csv_file_path = csv_file_path or "texts_images_classification_results_meta_multimodal_dataset_300.csv"
        self.id_column_name = id_column_name  # Allow manual specification
        
    def load_csv_file(self) -> pd.DataFrame:
        """Load Flash results from the specific CSV file with improved ID handling"""
        print(f"📥 Loading Flash results from CSV: {self.csv_file_path}")
        
        try:
            df = pd.read_csv(self.csv_file_path)
            
            print(f"✅ Loaded {len(df)} Flash results from CSV")
            print(f"📊 CSV columns: {df.columns.tolist()}")
            
            # Auto-detect ID column - try different possible names
            if self.id_column_name:
                # Use manually specified column name
                if self.id_column_name in df.columns:
                    id_column = self.id_column_name
                    print(f"🎯 Using manually specified ID column: '{id_column}'")
                else:
                    raise ValueError(f"Specified ID column '{self.id_column_name}' not found in CSV")
            else:
                # Auto-detect from common names
                id_column_candidates = ['artifact_id', 'artifact', 'id', 'sample_id', 'record_id']
                id_column = None
                
                for candidate in id_column_candidates:
                    if candidate in df.columns:
                        id_column = candidate
                        print(f"🎯 Auto-detected ID column: '{id_column}'")
                        break
            
            if id_column is None:
                print(f"❌ No ID column found. Available columns: {df.columns.tolist()}")
                print(f"   Looking for any of: {id_column_candidates}")
                raise ValueError(f"No ID column found in CSV")
            
            # Validate required columns
            required_columns = [id_column, 'correct_classification']
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")
            
            # Standardize ID column name to 'artifact_id'
            if id_column != 'artifact_id':
                df['artifact_id'] = df[id_column]
                print(f"🔄 Standardized '{id_column}' → 'artifact_id'")
            
            # Clean and standardize artifact_ids
            print(f"🧹 Cleaning artifact_ids...")
            original_count = len(df)
            
            # Convert to string and strip whitespace
            df['artifact_id_original'] = df['artifact_id'].copy()
            df['artifact_id'] = df['artifact_id'].astype(str).str.strip()
            
            # Remove any rows with null/empty artifact_ids
            df = df[df['artifact_id'].notna() & (df['artifact_id'] != '') & (df['artifact_id'] != 'nan')]
            
            print(f"   Cleaned {original_count} → {len(df)} records")
            print(f"   Sample artifact_ids: {df['artifact_id'].head().tolist()}")
            
            # Show Flash classification distribution
            flash_dist = df['correct_classification'].value_counts().to_dict()
            print(f"📊 Flash CSV classification distribution: {flash_dist}")
            
            return df
            
        except Exception as e:
            print(f"❌ Error loading CSV file: {e}")
            raise
    
    def load_ground_truth_with_flexible_matching(self) -> pd.DataFrame:
        """Load ground truth with multiple ID matching strategies"""
        print("📊 Loading Meta ground truth data with flexible matching...")
        
        meta_gt_query = f"""
        SELECT 
            artifact_id,
            correct_classification as ground_truth,
            correct_reasoning as ground_truth_reasoning,
            source as gt_source
        FROM `{PROJECT_ID}.{DATASET_ID}.{META_GROUND_TRUTH_TABLE}`
        """
        
        try:
            meta_gt_df = self.client.query(meta_gt_query).to_dataframe()
            print(f"   Found {len(meta_gt_df)} meta ground truth records")
            
            if len(meta_gt_df) == 0:
                print("❌ No ground truth data found!")
                return pd.DataFrame()
            
            # Clean ground truth artifact_ids
            print(f"🧹 Cleaning ground truth artifact_ids...")
            original_gt_count = len(meta_gt_df)
            
            # Store original for debugging
            meta_gt_df['artifact_id_original'] = meta_gt_df['artifact_id'].copy()
            
            # Convert to string and strip
            meta_gt_df['artifact_id'] = meta_gt_df['artifact_id'].astype(str).str.strip()
            
            # Remove null/empty IDs
            meta_gt_df = meta_gt_df[meta_gt_df['artifact_id'].notna() & 
                                  (meta_gt_df['artifact_id'] != '') & 
                                  (meta_gt_df['artifact_id'] != 'nan')]
            
            print(f"   Cleaned {original_gt_count} → {len(meta_gt_df)} ground truth records")
            print(f"   Sample GT artifact_ids: {meta_gt_df['artifact_id'].head().tolist()}")
            
            # Check ground truth distribution
            if len(meta_gt_df) > 0:
                gt_dist = meta_gt_df['ground_truth'].value_counts().to_dict()
                print(f"   Meta ground truth distribution: {gt_dist}")
            
            return meta_gt_df
            
        except Exception as e:
            print(f"❌ Error loading ground truth data: {e}")
            import traceback
            traceback.print_exc()
            return pd.DataFrame()
        
    def load_results_with_ground_truth(self) -> pd.DataFrame:
        """Load Flash CSV results with ground truth using improved matching"""
        print("📥 Loading Flash CSV results with Meta ground truth...")
        
        # Load Flash results from CSV
        flash_df = self.load_csv_file()
        if flash_df.empty:
            return pd.DataFrame()
        
        # Load ground truth
        meta_gt_df = self.load_ground_truth_with_flexible_matching()
        if meta_gt_df.empty:
            return pd.DataFrame()
        
        # Debug ID comparison before merge
        print(f"\n🔍 Pre-merge ID analysis:")
        csv_ids = set(flash_df['artifact_id'])
        gt_ids = set(meta_gt_df['artifact_id'])
        
        print(f"   CSV unique IDs: {len(csv_ids)}")
        print(f"   GT unique IDs: {len(gt_ids)}")
        
        # Check for direct overlap
        direct_overlap = csv_ids.intersection(gt_ids)
        print(f"   Direct overlap: {len(direct_overlap)} IDs")
        
        if len(direct_overlap) == 0:
            print(f"   ❌ No direct overlap found!")
            print(f"   CSV ID samples: {list(csv_ids)[:5]}")
            print(f"   GT ID samples: {list(gt_ids)[:5]}")
            
            # Try alternative matching strategies
            print(f"\n🔄 Trying alternative matching strategies...")
            
            # Strategy 1: Try numeric conversion
            try:
                csv_numeric = set(pd.to_numeric(flash_df['artifact_id'], errors='coerce').dropna().astype(int).astype(str))
                gt_numeric = set(pd.to_numeric(meta_gt_df['artifact_id'], errors='coerce').dropna().astype(int).astype(str))
                numeric_overlap = csv_numeric.intersection(gt_numeric)
                print(f"   Strategy 1 (numeric): {len(numeric_overlap)} matches")
                
                if len(numeric_overlap) > 0:
                    # Convert both dataframes to use numeric string format
                    flash_df['artifact_id'] = pd.to_numeric(flash_df['artifact_id'], errors='coerce').astype('Int64').astype(str)
                    meta_gt_df['artifact_id'] = pd.to_numeric(meta_gt_df['artifact_id'], errors='coerce').astype('Int64').astype(str)
                    print(f"   ✅ Using numeric conversion strategy")
            except Exception as e:
                print(f"   Strategy 1 failed: {e}")
            
            # Strategy 2: Try removing common prefixes/suffixes
            # (Add more strategies here if needed)
        
        # Perform the merge
        print(f"\n🔗 Performing merge...")
        df = flash_df.merge(meta_gt_df, on='artifact_id', how='inner')
        print(f"✅ Merged dataset: {len(df)} records")
        
        if len(df) == 0:
            print("❌ No matching records found after merge")
            print(f"   Final CSV artifact_ids: {flash_df['artifact_id'].nunique()} unique")
            print(f"   Final GT artifact_ids: {meta_gt_df['artifact_id'].nunique()} unique")
            
            # Show detailed debugging
            print(f"\n🔍 DEBUGGING INFO:")
            print(f"   CSV ID examples: {flash_df['artifact_id'].head().tolist()}")
            print(f"   GT ID examples: {meta_gt_df['artifact_id'].head().tolist()}")
            
            # Check if any partial matches exist
            print(f"\n   Checking for partial matches...")
            csv_sample = flash_df['artifact_id'].head(10).tolist()
            gt_sample = meta_gt_df['artifact_id'].head(10).tolist()
            
            for csv_id in csv_sample:
                for gt_id in gt_sample:
                    if str(csv_id) in str(gt_id) or str(gt_id) in str(csv_id):
                        print(f"   Partial match found: '{csv_id}' ↔ '{gt_id}'")
            
            return pd.DataFrame()
        
        # Rename columns for clarity
        df['flash_classification'] = df['correct_classification']
        df['flash_reasoning'] = df.get('correct_reasoning', '')
        
        # Check for data contamination
        identical_before_conversion = (df['flash_classification'] == df['ground_truth']).sum()
        print(f"   Flash CSV vs Meta Ground Truth identical: {identical_before_conversion}/{len(df)} ({identical_before_conversion/len(df):.1%})")
        
        if identical_before_conversion == len(df):
            print("   🚨 WARNING: Flash CSV results are identical to meta ground_truth!")
            print("   This suggests potential data contamination.")
        
        # Convert to binary format
        df['ground_truth_binary'] = (df['ground_truth'] == 100).astype(int)
        df['flash_binary'] = (df['flash_classification'] == 100).astype(int)
        
        print(f"📊 Final data distribution:")
        print(f"   Meta ground truth: {df['ground_truth_binary'].value_counts().to_dict()}")
        print(f"   Flash CSV predictions: {df['flash_binary'].value_counts().to_dict()}")
        
        # Validate binary data
        for col, name in [('ground_truth_binary', 'Meta ground truth'), 
                        ('flash_binary', 'Flash CSV')]:
            if not all(df[col].isin([0, 1])):
                print(f"⚠️ Warning: {name} contains non-binary values")
                df = df[df[col].isin([0, 1])]
        
        print(f"📊 Final clean dataset: {len(df)} records")
        
        return df
    
    def calculate_classification_metrics(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Calculate comprehensive classification metrics for Flash"""
        print("📊 Calculating Flash classification metrics...")
        
        if len(df) == 0:
            return {}
        
        y_true = df['ground_truth_binary'].values
        flash_pred = df['flash_binary'].values
        
        # Flash metrics
        flash_metrics = self._calculate_model_metrics(y_true, flash_pred, "Flash")
        
        # Agreement analysis
        flash_vs_ground_truth_agreement = (flash_pred == y_true).mean()
        
        print(f"✅ Metrics calculated:")
        print(f"   Flash F1: {flash_metrics['f1_score']:.3f}")
        print(f"   Flash vs GT agreement: {flash_vs_ground_truth_agreement:.1%}")
        
        return {
            'dataset_info': {
                'total_samples': len(df),
                'positive_samples': int((y_true == 1).sum()),
                'negative_samples': int((y_true == 0).sum()),
                'class_balance': float((y_true == 1).mean())
            },
            'flash_csv_metrics': flash_metrics,
            'agreement_analysis': {
                'flash_vs_ground_truth': flash_vs_ground_truth_agreement
            }
        }
    
    def _calculate_model_metrics(self, y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> Dict[str, Any]:
        """Calculate detailed metrics for a single model"""
        
        y_true = np.array(y_true).astype(int)
        y_pred = np.array(y_pred).astype(int)
        
        accuracy = accuracy_score(y_true, y_pred)
        
        # Handle edge cases
        unique_pred = np.unique(y_pred)
        unique_true = np.unique(y_true)
        
        if len(unique_pred) == 1 or len(unique_true) == 1:
            precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
            recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        else:
            precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
            recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
        
        # Confusion matrix
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()
        elif cm.shape == (1, 1):
            if unique_true[0] == 0 and unique_pred[0] == 0:
                tn, fp, fn, tp = cm[0,0], 0, 0, 0
            elif unique_true[0] == 1 and unique_pred[0] == 1:
                tn, fp, fn, tp = 0, 0, 0, cm[0,0]
            else:
                tn, fp, fn, tp = 0, cm[0,0], 0, 0
        else:
            tn = fp = fn = tp = 0
        
        # Additional metrics
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
        
        return {
            'model_name': model_name,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'specificity': specificity,
            'sensitivity': sensitivity,
            'false_positive_rate': fpr,
            'false_negative_rate': fnr,
            'confusion_matrix': {
                'true_positives': int(tp),
                'false_positives': int(fp), 
                'true_negatives': int(tn),
                'false_negatives': int(fn)
            }
        }
    
    def create_classification_dashboard(self, metrics: Dict[str, Any], df: pd.DataFrame) -> str:
        """Create comprehensive classification metrics dashboard"""
        
        if not metrics:
            return "<html><body><h1>No valid metrics to display</h1></body></html>"
        
        flash_metrics = metrics['flash_csv_metrics']
        dataset_info = metrics['dataset_info']
        agreement = metrics['agreement_analysis']
        
        flash_cm = flash_metrics['confusion_matrix']
        
        # Performance assessment
        if flash_metrics['f1_score'] >= 0.8:
            flash_assessment = "🟢 EXCELLENT"
            flash_color = "#10B981"
        elif flash_metrics['f1_score'] >= 0.7:
            flash_assessment = "🟡 GOOD" 
            flash_color = "#F59E0B"
        elif flash_metrics['f1_score'] >= 0.6:
            flash_assessment = "🟠 MODERATE"
            flash_color = "#FF6B35"
        else:
            flash_assessment = "🔴 POOR"
            flash_color = "#EF4444"
        
        html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Flash Meta Classification Metrics Dashboard</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: 'Inter', sans-serif; background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); color: #333; line-height: 1.6; min-height: 100vh; }}
        .dashboard {{ max-width: 1800px; margin: 0 auto; padding: 20px; }}
        .header {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 25px; text-align: center; margin-bottom: 30px; backdrop-filter: blur(15px); box-shadow: 0 20px 60px rgba(0,0,0,0.1); }}
        .header h1 {{ font-size: 3rem; margin-bottom: 15px; background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; }}
        .header h2 {{ font-size: 1.5rem; color: #666; margin-bottom: 20px; }}
        .model-badge {{ display: inline-block; padding: 10px 25px; background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); color: white; border-radius: 30px; font-weight: bold; font-size: 1.1rem; margin: 5px; }}
        .csv-info {{ background: rgba(255,255,255,0.8); padding: 15px; border-radius: 10px; margin-top: 15px; font-size: 0.9rem; color: #555; }}
        .performance-summary {{ background: linear-gradient(135deg, #ffe6e6, #ffcccc); padding: 30px; border-radius: 20px; margin: 30px 0; text-align: center; }}
        .performance-score {{ font-size: 3.5rem; font-weight: bold; color: {flash_color}; margin-bottom: 10px; }}
        .performance-label {{ font-size: 1.3rem; color: #37474f; margin-bottom: 8px; }}
        .performance-description {{ font-size: 1rem; color: #546e7a; }}
        .metrics-overview {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 20px; margin-bottom: 40px; }}
        .metric-card {{ background: rgba(255,255,255,0.95); padding: 25px; border-radius: 20px; text-align: center; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); transition: transform 0.3s ease; }}
        .metric-card:hover {{ transform: translateY(-5px); }}
        .metric-value {{ font-size: 2.5rem; font-weight: bold; margin-bottom: 10px; }}
        .metric-label {{ font-size: 1rem; color: #666; font-weight: 600; margin-bottom: 5px; }}
        .metric-description {{ font-size: 0.85rem; color: #888; }}
        .excellent {{ color: #10B981; }}
        .good {{ color: #F59E0B; }}
        .moderate {{ color: #FF6B35; }}
        .poor {{ color: #EF4444; }}
        .dashboard-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(450px, 1fr)); gap: 25px; margin-bottom: 40px; }}
        .chart-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); }}
        .chart-title {{ font-size: 1.3rem; font-weight: bold; margin-bottom: 20px; color: #333; text-align: center; }}
        .confusion-matrix {{ display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin: 20px 0; text-align: center; font-size: 0.9rem; }}
        .cm-cell {{ padding: 15px; border-radius: 12px; font-weight: bold; display: flex; flex-direction: column; justify-content: center; align-items: center; }}
        .cm-header {{ background: linear-gradient(135deg, #f8f9fa, #e9ecef); color: #495057; }}
        .cm-tp {{ background: linear-gradient(135deg, #d4edda, #c3e6cb); color: #155724; font-size: 1.8rem; }}
        .cm-fp {{ background: linear-gradient(135deg, #f8d7da, #f1b0b7); color: #721c24; font-size: 1.8rem; }}
        .cm-fn {{ background: linear-gradient(135deg, #fff3cd, #fce4a6); color: #856404; font-size: 1.8rem; }}
        .cm-tn {{ background: linear-gradient(135deg, #d1ecf1, #bee5eb); color: #0c5460; font-size: 1.8rem; }}
        .cm-label {{ font-size: 0.75rem; margin-top: 5px; opacity: 0.8; }}
    </style>
</head>
<body>
    <div class="dashboard">
        <div class="header">
            <h1>⚡ Flash CSV Meta Classification Metrics</h1>
            <h2>texts_images_classification_results_web_multimodal_dataset_300.csv vs Meta Ground Truth</h2>
            <div>
                <span class="model-badge">Flash CSV Model</span>
            </div>
            <div class="csv-info">
                📁 Source CSV: texts_images_classification_results_web_multimodal_dataset_300.csv<br>
                🎯 Successfully matched {dataset_info['total_samples']} artifact IDs with ground truth
            </div>
            <p style="margin-top: 20px; color: #666;">
                Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | 
                Samples: {dataset_info['total_samples']:,} | 
                Positive: {dataset_info['positive_samples']} | Negative: {dataset_info['negative_samples']}
            </p>
        </div>

        <div class="performance-summary">
            <div class="performance-score">{flash_metrics['f1_score']:.3f}</div>
            <div class="performance-label">{flash_assessment} F1-Score (Flash CSV Model)</div>
            <div class="performance-description">
                Flash CSV achieves {flash_metrics['accuracy']:.1%} accuracy with {flash_metrics['precision']:.1%} precision
                <br>Agreement with Meta Ground Truth: {agreement['flash_vs_ground_truth']:.1%}
            </div>
        </div>

        <div class="metrics-overview">
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['f1_score'] >= 0.8 else 'good' if flash_metrics['f1_score'] >= 0.7 else 'moderate' if flash_metrics['f1_score'] >= 0.6 else 'poor'}">{flash_metrics['f1_score']:.3f}</div>
                <div class="metric-label">Flash CSV F1-Score</div>
                <div class="metric-description">Harmonic Mean P&R</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['recall'] >= 0.8 else 'good' if flash_metrics['recall'] >= 0.7 else 'moderate' if flash_metrics['recall'] >= 0.6 else 'poor'}">{flash_metrics['recall']:.3f}</div>
                <div class="metric-label">Flash CSV TPR</div>
                <div class="metric-description">True Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['specificity'] >= 0.8 else 'good' if flash_metrics['specificity'] >= 0.7 else 'moderate' if flash_metrics['specificity'] >= 0.6 else 'poor'}">{flash_metrics['specificity']:.3f}</div>
                <div class="metric-label">Flash CSV TNR</div>
                <div class="metric-description">True Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['accuracy'] >= 0.9 else 'good' if flash_metrics['accuracy'] >= 0.8 else 'moderate' if flash_metrics['accuracy'] >= 0.7 else 'poor'}">{flash_metrics['accuracy']:.3f}</div>
                <div class="metric-label">Flash CSV Accuracy</div>
                <div class="metric-description">Overall Correctness</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if agreement['flash_vs_ground_truth'] >= 0.8 else 'good' if agreement['flash_vs_ground_truth'] >= 0.7 else 'moderate' if agreement['flash_vs_ground_truth'] >= 0.6 else 'poor'}">{agreement['flash_vs_ground_truth']:.1%}</div>
                <div class="metric-label">Ground Truth Agreement</div>
                <div class="metric-description">Flash CSV matches human labels</div>
            </div>
        </div>

        <div class="dashboard-grid">
            <div class="chart-section">
                <div class="chart-title">⚡ Flash CSV Confusion Matrix</div>
                <div class="confusion-matrix">
                    <div class="cm-cell cm-header"></div>
                    <div class="cm-cell cm-header">Meta GT: 0</div>
                    <div class="cm-cell cm-header">Meta GT: 1</div>
                    
                    <div class="cm-cell cm-header">Flash CSV: 0</div>
                    <div class="cm-cell cm-tn">
                        {flash_cm['true_negatives']}
                        <div class="cm-label">True Negatives</div>
                    </div>
                    <div class="cm-cell cm-fn">
                        {flash_cm['false_negatives']}
                        <div class="cm-label">False Negatives</div>
                    </div>
                    
                    <div class="cm-cell cm-header">Flash CSV: 1</div>
                    <div class="cm-cell cm-fp">
                        {flash_cm['false_positives']}
                        <div class="cm-label">False Positives</div>
                    </div>
                    <div class="cm-cell cm-tp">
                        {flash_cm['true_positives']}
                        <div class="cm-label">True Positives</div>
                    </div>
                </div>
                <div style="text-align: center; margin-top: 15px;">
                    <p><strong>TP:</strong> {flash_cm['true_positives']} | <strong>FP:</strong> {flash_cm['false_positives']} | <strong>TN:</strong> {flash_cm['true_negatives']} | <strong>FN:</strong> {flash_cm['false_negatives']}</p>
                </div>
            </div>
        </div>
    </div>
</body>
</html>"""
        
        return html_content
    
    def generate_classification_dashboard(self):
        """Main function to generate the classification dashboard"""
        print("🚀 STARTING FLASH CSV META CLASSIFICATION DASHBOARD GENERATION (FIXED)")
        print("=" * 70)
        print(f"📁 Using CSV file: {self.csv_file_path}")
        
        try:
            # Load results with ground truth
            df = self.load_results_with_ground_truth()
            
            if df.empty:
                print("❌ No data available for dashboard")
                return None
            
            # Calculate metrics
            metrics = self.calculate_classification_metrics(df)
            
            if not metrics:
                print("❌ Could not calculate metrics")
                return None
            
            # Create dashboard
            print("\n📊 Creating Flash CSV Meta Classification Dashboard...")
            dashboard_html = self.create_classification_dashboard(metrics, df)
            
            # Save files
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            dashboard_filename = f"flash_csv_meta_classification_dashboard_{timestamp}.html"
            metrics_filename = f"flash_csv_meta_classification_metrics_{timestamp}.json"
            
            with open(dashboard_filename, "w", encoding='utf-8') as f:
                f.write(dashboard_html)
            
            with open(metrics_filename, "w") as f:
                json.dump(metrics, f, indent=2, default=str)
            
            print(f"✅ Dashboard saved: {dashboard_filename}")
            print(f"✅ Metrics saved: {metrics_filename}")
            
            # Summary
            flash_metrics = metrics['flash_csv_metrics']
            agreement = metrics['agreement_analysis']
            
            print(f"\n📊 FLASH CSV META CLASSIFICATION METRICS SUMMARY:")
            print(f"   ⚡ Flash CSV Model:")
            print(f"      F1-Score: {flash_metrics['f1_score']:.3f}")
            print(f"      Precision: {flash_metrics['precision']:.3f}")
            print(f"      Recall: {flash_metrics['recall']:.3f}") 
            print(f"      Accuracy: {flash_metrics['accuracy']:.3f}")
            print(f"      TP: {flash_metrics['confusion_matrix']['true_positives']}, FP: {flash_metrics['confusion_matrix']['false_positives']}")
            print(f"      TN: {flash_metrics['confusion_matrix']['true_negatives']}, FN: {flash_metrics['confusion_matrix']['false_negatives']}")
            
            print(f"\n   ⚖️ Agreement Rates:")
            print(f"      Flash CSV vs Meta Ground Truth: {agreement['flash_vs_ground_truth']:.1%}")
            
            return dashboard_filename, metrics_filename
            
        except Exception as e:
            print(f"❌ Dashboard generation failed: {e}")
            import traceback
            traceback.print_exc()
            return None

def main():
    """Run the Flash CSV Meta classification dashboard generation"""
    # Option 1: Use a different CSV file with Meta IDs
    # dashboard_generator = FlashMetaCSVClassificationDashboard(csv_file_path="your_meta_ids_file.csv")
    
    # Option 2: Use a different column from the current file
    # dashboard_generator = FlashMetaCSVClassificationDashboard(id_column_name='source')
    
    # Option 3: Default behavior
    dashboard_generator = FlashMetaCSVClassificationDashboard()
    result = dashboard_generator.generate_classification_dashboard()
    
    if result:
        dashboard_file, metrics_file = result
        print(f"\n🎉 SUCCESS!")
        print(f"📁 Files generated:")
        print(f"   1. {dashboard_file}")
        print(f"   2. {metrics_file}")
        print(f"\n💡 Open {dashboard_file} in your browser to view the dashboard!")
    else:
        print(f"\n❌ Dashboard generation failed")

if __name__ == "__main__":
    main()

🚀 STARTING FLASH CSV META CLASSIFICATION DASHBOARD GENERATION (FIXED)
📁 Using CSV file: texts_images_classification_results_meta_multimodal_dataset_300.csv
📥 Loading Flash CSV results with Meta ground truth...
📥 Loading Flash results from CSV: texts_images_classification_results_meta_multimodal_dataset_300.csv
✅ Loaded 300 Flash results from CSV
📊 CSV columns: ['source', 'artifact_id', 'model_id', 'model_prompt', 'artifact', 'text_content', 'classified_at', 'correct_classification', 'correct_reasoning', 'artifact_json_gcs_url', 'asset_filepaths', 'detected_language', 'topics', 'text_length', 'asset_count']
🎯 Auto-detected ID column: 'artifact_id'
🧹 Cleaning artifact_ids...
   Cleaned 300 → 300 records
   Sample artifact_ids: ['meta:2247224725729245', 'meta:1044227981254015', 'meta:1790289251530852', 'meta:122172716408563103', 'meta:709838841794824']
📊 Flash CSV classification distribution: {100: 200, 0: 100}
📊 Loading Meta ground truth data with flexible matching...
   Found 299 meta g

In [58]:
#!/usr/bin/env python3
"""
Flash Meta Classification Metrics Dashboard - Fixed ID Matching
Enhanced version with better artifact_id matching and debugging
"""

import pandas as pd
import json
from google.cloud import bigquery
from datetime import datetime
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
import numpy as np
from typing import Dict, List, Any, Optional

PROJECT_ID = "scope3-dev"
DATASET_ID = "research_bs_monitoring"
META_GROUND_TRUTH_TABLE = "BA_Web_Ground_Truth"  # Changed to Web Ground Truth

class FlashMetaCSVClassificationDashboard:
    def __init__(self, id_column_name=None, csv_file_path=None):
        self.client = bigquery.Client(project=PROJECT_ID)
        self.csv_file_path = csv_file_path or "texts_images_classification_results_web_multimodal_dataset_300.csv"  # Back to web CSV
        self.id_column_name = id_column_name  # Allow manual specification
        
    def load_csv_file(self) -> pd.DataFrame:
        """Load Flash results from the specific CSV file with improved ID handling"""
        print(f"📥 Loading Flash results from CSV: {self.csv_file_path}")
        
        try:
            df = pd.read_csv(self.csv_file_path)
            
            print(f"✅ Loaded {len(df)} Flash results from CSV")
            print(f"📊 CSV columns: {df.columns.tolist()}")
            
            # Auto-detect ID column - try different possible names
            if self.id_column_name:
                # Use manually specified column name
                if self.id_column_name in df.columns:
                    id_column = self.id_column_name
                    print(f"🎯 Using manually specified ID column: '{id_column}'")
                else:
                    raise ValueError(f"Specified ID column '{self.id_column_name}' not found in CSV")
            else:
                # Auto-detect from common names
                id_column_candidates = ['artifact_id', 'artifact', 'id', 'sample_id', 'record_id']
                id_column = None
                
                for candidate in id_column_candidates:
                    if candidate in df.columns:
                        id_column = candidate
                        print(f"🎯 Auto-detected ID column: '{id_column}'")
                        break
            
            if id_column is None:
                print(f"❌ No ID column found. Available columns: {df.columns.tolist()}")
                print(f"   Looking for any of: {id_column_candidates}")
                raise ValueError(f"No ID column found in CSV")
            
            # Validate required columns
            required_columns = [id_column, 'correct_classification']
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"Missing required columns: {missing_columns}")
            
            # Standardize ID column name to 'artifact_id'
            if id_column != 'artifact_id':
                df['artifact_id'] = df[id_column]
                print(f"🔄 Standardized '{id_column}' → 'artifact_id'")
            
            # Clean and standardize artifact_ids
            print(f"🧹 Cleaning artifact_ids...")
            original_count = len(df)
            
            # Convert to string and strip whitespace
            df['artifact_id_original'] = df['artifact_id'].copy()
            df['artifact_id'] = df['artifact_id'].astype(str).str.strip()
            
            # Remove any rows with null/empty artifact_ids
            df = df[df['artifact_id'].notna() & (df['artifact_id'] != '') & (df['artifact_id'] != 'nan')]
            
            print(f"   Cleaned {original_count} → {len(df)} records")
            print(f"   Sample artifact_ids: {df['artifact_id'].head().tolist()}")
            
            # Show Flash classification distribution
            flash_dist = df['correct_classification'].value_counts().to_dict()
            print(f"📊 Flash CSV classification distribution: {flash_dist}")
            
            return df
            
        except Exception as e:
            print(f"❌ Error loading CSV file: {e}")
            raise
    
    def load_ground_truth_with_flexible_matching(self) -> pd.DataFrame:
        """Load ground truth with multiple ID matching strategies"""
        print("📊 Loading Web ground truth data with flexible matching...")
        
        meta_gt_query = f"""
        SELECT 
            artifact_id,
            correct_classification as ground_truth,
            correct_reasoning as ground_truth_reasoning,
            source as gt_source
        FROM `{PROJECT_ID}.{DATASET_ID}.{META_GROUND_TRUTH_TABLE}`
        """
        
        try:
            meta_gt_df = self.client.query(meta_gt_query).to_dataframe()
            print(f"   Found {len(meta_gt_df)} meta ground truth records")
            
            if len(meta_gt_df) == 0:
                print("❌ No ground truth data found!")
                return pd.DataFrame()
            
            # Clean ground truth artifact_ids
            print(f"🧹 Cleaning ground truth artifact_ids...")
            original_gt_count = len(meta_gt_df)
            
            # Store original for debugging
            meta_gt_df['artifact_id_original'] = meta_gt_df['artifact_id'].copy()
            
            # Convert to string and strip
            meta_gt_df['artifact_id'] = meta_gt_df['artifact_id'].astype(str).str.strip()
            
            # Remove null/empty IDs
            meta_gt_df = meta_gt_df[meta_gt_df['artifact_id'].notna() & 
                                  (meta_gt_df['artifact_id'] != '') & 
                                  (meta_gt_df['artifact_id'] != 'nan')]
            
            print(f"   Cleaned {original_gt_count} → {len(meta_gt_df)} ground truth records")
            print(f"   Sample GT artifact_ids: {meta_gt_df['artifact_id'].head().tolist()}")
            
            # Check ground truth distribution
            if len(meta_gt_df) > 0:
                gt_dist = meta_gt_df['ground_truth'].value_counts().to_dict()
                print(f"   Web ground truth distribution: {gt_dist}")
            
            return meta_gt_df
            
        except Exception as e:
            print(f"❌ Error loading ground truth data: {e}")
            import traceback
            traceback.print_exc()
            return pd.DataFrame()
        
    def load_results_with_ground_truth(self) -> pd.DataFrame:
        """Load Flash CSV results with ground truth using improved matching"""
        print("📥 Loading Flash CSV results with Web ground truth...")
        
        # Load Flash results from CSV
        flash_df = self.load_csv_file()
        if flash_df.empty:
            return pd.DataFrame()
        
        # Load ground truth
        meta_gt_df = self.load_ground_truth_with_flexible_matching()
        if meta_gt_df.empty:
            return pd.DataFrame()
        
        # Debug ID comparison before merge
        print(f"\n🔍 Pre-merge ID analysis:")
        csv_ids = set(flash_df['artifact_id'])
        gt_ids = set(meta_gt_df['artifact_id'])
        
        print(f"   CSV unique IDs: {len(csv_ids)}")
        print(f"   GT unique IDs: {len(gt_ids)}")
        
        # Check for direct overlap
        direct_overlap = csv_ids.intersection(gt_ids)
        print(f"   Direct overlap: {len(direct_overlap)} IDs")
        
        if len(direct_overlap) == 0:
            print(f"   ❌ No direct overlap found!")
            print(f"   CSV ID samples: {list(csv_ids)[:5]}")
            print(f"   GT ID samples: {list(gt_ids)[:5]}")
            
            # Try alternative matching strategies
            print(f"\n🔄 Trying alternative matching strategies...")
            
            # Strategy 1: Try numeric conversion
            try:
                csv_numeric = set(pd.to_numeric(flash_df['artifact_id'], errors='coerce').dropna().astype(int).astype(str))
                gt_numeric = set(pd.to_numeric(meta_gt_df['artifact_id'], errors='coerce').dropna().astype(int).astype(str))
                numeric_overlap = csv_numeric.intersection(gt_numeric)
                print(f"   Strategy 1 (numeric): {len(numeric_overlap)} matches")
                
                if len(numeric_overlap) > 0:
                    # Convert both dataframes to use numeric string format
                    flash_df['artifact_id'] = pd.to_numeric(flash_df['artifact_id'], errors='coerce').astype('Int64').astype(str)
                    meta_gt_df['artifact_id'] = pd.to_numeric(meta_gt_df['artifact_id'], errors='coerce').astype('Int64').astype(str)
                    print(f"   ✅ Using numeric conversion strategy")
            except Exception as e:
                print(f"   Strategy 1 failed: {e}")
            
            # Strategy 2: Try removing common prefixes/suffixes
            # (Add more strategies here if needed)
        
        # Perform the merge
        print(f"\n🔗 Performing merge...")
        df = flash_df.merge(meta_gt_df, on='artifact_id', how='inner')
        print(f"✅ Merged dataset: {len(df)} records")
        
        if len(df) == 0:
            print("❌ No matching records found after merge")
            print(f"   Final CSV artifact_ids: {flash_df['artifact_id'].nunique()} unique")
            print(f"   Final GT artifact_ids: {meta_gt_df['artifact_id'].nunique()} unique")
            
            # Show detailed debugging
            print(f"\n🔍 DEBUGGING INFO:")
            print(f"   CSV ID examples: {flash_df['artifact_id'].head().tolist()}")
            print(f"   GT ID examples: {meta_gt_df['artifact_id'].head().tolist()}")
            
            # Check if any partial matches exist
            print(f"\n   Checking for partial matches...")
            csv_sample = flash_df['artifact_id'].head(10).tolist()
            gt_sample = meta_gt_df['artifact_id'].head(10).tolist()
            
            for csv_id in csv_sample:
                for gt_id in gt_sample:
                    if str(csv_id) in str(gt_id) or str(gt_id) in str(csv_id):
                        print(f"   Partial match found: '{csv_id}' ↔ '{gt_id}'")
            
            return pd.DataFrame()
        
        # Rename columns for clarity
        df['flash_classification'] = df['correct_classification']
        df['flash_reasoning'] = df.get('correct_reasoning', '')
        
        # Check for data contamination
        identical_before_conversion = (df['flash_classification'] == df['ground_truth']).sum()
        print(f"   Flash CSV vs Web Ground Truth identical: {identical_before_conversion}/{len(df)} ({identical_before_conversion/len(df):.1%})")
        
        if identical_before_conversion == len(df):
            print("   🚨 WARNING: Flash CSV results are identical to web ground_truth!")
            print("   This suggests potential data contamination.")
        
        # Convert to binary format
        df['ground_truth_binary'] = (df['ground_truth'] == 100).astype(int)
        df['flash_binary'] = (df['flash_classification'] == 100).astype(int)
        
        print(f"📊 Final data distribution:")
        print(f"   Web ground truth: {df['ground_truth_binary'].value_counts().to_dict()}")
        print(f"   Flash CSV predictions: {df['flash_binary'].value_counts().to_dict()}")
        
        # Validate binary data
        for col, name in [('ground_truth_binary', 'Web ground truth'), 
                        ('flash_binary', 'Flash CSV')]:
            if not all(df[col].isin([0, 1])):
                print(f"⚠️ Warning: {name} contains non-binary values")
                df = df[df[col].isin([0, 1])]
        
        print(f"📊 Final clean dataset: {len(df)} records")
        
        return df
    
    def calculate_classification_metrics(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Calculate comprehensive classification metrics for Flash"""
        print("📊 Calculating Flash classification metrics...")
        
        if len(df) == 0:
            return {}
        
        y_true = df['ground_truth_binary'].values
        flash_pred = df['flash_binary'].values
        
        # Flash metrics
        flash_metrics = self._calculate_model_metrics(y_true, flash_pred, "Flash")
        
        # Agreement analysis
        flash_vs_ground_truth_agreement = (flash_pred == y_true).mean()
        
        print(f"✅ Metrics calculated:")
        print(f"   Flash F1: {flash_metrics['f1_score']:.3f}")
        print(f"   Flash vs GT agreement: {flash_vs_ground_truth_agreement:.1%}")
        
        return {
            'dataset_info': {
                'total_samples': len(df),
                'positive_samples': int((y_true == 1).sum()),
                'negative_samples': int((y_true == 0).sum()),
                'class_balance': float((y_true == 1).mean())
            },
            'flash_csv_metrics': flash_metrics,
            'agreement_analysis': {
                'flash_vs_ground_truth': flash_vs_ground_truth_agreement
            }
        }
    
    def _calculate_model_metrics(self, y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> Dict[str, Any]:
        """Calculate detailed metrics for a single model"""
        
        y_true = np.array(y_true).astype(int)
        y_pred = np.array(y_pred).astype(int)
        
        accuracy = accuracy_score(y_true, y_pred)
        
        # Handle edge cases
        unique_pred = np.unique(y_pred)
        unique_true = np.unique(y_true)
        
        if len(unique_pred) == 1 or len(unique_true) == 1:
            precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
            recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        else:
            precision = precision_score(y_true, y_pred, average='binary', zero_division=0)
            recall = recall_score(y_true, y_pred, average='binary', zero_division=0)
            f1 = f1_score(y_true, y_pred, average='binary', zero_division=0)
        
        # Confusion matrix
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()
        elif cm.shape == (1, 1):
            if unique_true[0] == 0 and unique_pred[0] == 0:
                tn, fp, fn, tp = cm[0,0], 0, 0, 0
            elif unique_true[0] == 1 and unique_pred[0] == 1:
                tn, fp, fn, tp = 0, 0, 0, cm[0,0]
            else:
                tn, fp, fn, tp = 0, cm[0,0], 0, 0
        else:
            tn = fp = fn = tp = 0
        
        # Additional metrics
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
        
        return {
            'model_name': model_name,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'specificity': specificity,
            'sensitivity': sensitivity,
            'false_positive_rate': fpr,
            'false_negative_rate': fnr,
            'confusion_matrix': {
                'true_positives': int(tp),
                'false_positives': int(fp), 
                'true_negatives': int(tn),
                'false_negatives': int(fn)
            }
        }
    
    def create_classification_dashboard(self, metrics: Dict[str, Any], df: pd.DataFrame) -> str:
        """Create comprehensive classification metrics dashboard"""
        
        if not metrics:
            return "<html><body><h1>No valid metrics to display</h1></body></html>"
        
        flash_metrics = metrics['flash_csv_metrics']
        dataset_info = metrics['dataset_info']
        agreement = metrics['agreement_analysis']
        
        flash_cm = flash_metrics['confusion_matrix']
        
        # Performance assessment
        if flash_metrics['f1_score'] >= 0.8:
            flash_assessment = "🟢 EXCELLENT"
            flash_color = "#10B981"
        elif flash_metrics['f1_score'] >= 0.7:
            flash_assessment = "🟡 GOOD" 
            flash_color = "#F59E0B"
        elif flash_metrics['f1_score'] >= 0.6:
            flash_assessment = "🟠 MODERATE"
            flash_color = "#FF6B35"
        else:
            flash_assessment = "🔴 POOR"
            flash_color = "#EF4444"
        
        html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Flash Meta Classification Metrics Dashboard</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: 'Inter', sans-serif; background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); color: #333; line-height: 1.6; min-height: 100vh; }}
        .dashboard {{ max-width: 1800px; margin: 0 auto; padding: 20px; }}
        .header {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 25px; text-align: center; margin-bottom: 30px; backdrop-filter: blur(15px); box-shadow: 0 20px 60px rgba(0,0,0,0.1); }}
        .header h1 {{ font-size: 3rem; margin-bottom: 15px; background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; }}
        .header h2 {{ font-size: 1.5rem; color: #666; margin-bottom: 20px; }}
        .model-badge {{ display: inline-block; padding: 10px 25px; background: linear-gradient(135deg, #ff6b6b 0%, #ee5a52 100%); color: white; border-radius: 30px; font-weight: bold; font-size: 1.1rem; margin: 5px; }}
        .csv-info {{ background: rgba(255,255,255,0.8); padding: 15px; border-radius: 10px; margin-top: 15px; font-size: 0.9rem; color: #555; }}
        .performance-summary {{ background: linear-gradient(135deg, #ffe6e6, #ffcccc); padding: 30px; border-radius: 20px; margin: 30px 0; text-align: center; }}
        .performance-score {{ font-size: 3.5rem; font-weight: bold; color: {flash_color}; margin-bottom: 10px; }}
        .performance-label {{ font-size: 1.3rem; color: #37474f; margin-bottom: 8px; }}
        .performance-description {{ font-size: 1rem; color: #546e7a; }}
        .metrics-overview {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); gap: 20px; margin-bottom: 40px; }}
        .metric-card {{ background: rgba(255,255,255,0.95); padding: 25px; border-radius: 20px; text-align: center; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); transition: transform 0.3s ease; }}
        .metric-card:hover {{ transform: translateY(-5px); }}
        .metric-value {{ font-size: 2.5rem; font-weight: bold; margin-bottom: 10px; }}
        .metric-label {{ font-size: 1rem; color: #666; font-weight: 600; margin-bottom: 5px; }}
        .metric-description {{ font-size: 0.85rem; color: #888; }}
        .excellent {{ color: #10B981; }}
        .good {{ color: #F59E0B; }}
        .moderate {{ color: #FF6B35; }}
        .poor {{ color: #EF4444; }}
        .dashboard-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(450px, 1fr)); gap: 25px; margin-bottom: 40px; }}
        .chart-section {{ background: rgba(255,255,255,0.95); padding: 30px; border-radius: 20px; backdrop-filter: blur(15px); box-shadow: 0 15px 40px rgba(0,0,0,0.1); }}
        .chart-title {{ font-size: 1.3rem; font-weight: bold; margin-bottom: 20px; color: #333; text-align: center; }}
        .confusion-matrix {{ display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin: 20px 0; text-align: center; font-size: 0.9rem; }}
        .cm-cell {{ padding: 15px; border-radius: 12px; font-weight: bold; display: flex; flex-direction: column; justify-content: center; align-items: center; }}
        .cm-header {{ background: linear-gradient(135deg, #f8f9fa, #e9ecef); color: #495057; }}
        .cm-tp {{ background: linear-gradient(135deg, #d4edda, #c3e6cb); color: #155724; font-size: 1.8rem; }}
        .cm-fp {{ background: linear-gradient(135deg, #f8d7da, #f1b0b7); color: #721c24; font-size: 1.8rem; }}
        .cm-fn {{ background: linear-gradient(135deg, #fff3cd, #fce4a6); color: #856404; font-size: 1.8rem; }}
        .cm-tn {{ background: linear-gradient(135deg, #d1ecf1, #bee5eb); color: #0c5460; font-size: 1.8rem; }}
        .cm-label {{ font-size: 0.75rem; margin-top: 5px; opacity: 0.8; }}
    </style>
</head>
<body>
    <div class="dashboard">
        <div class="header">
            <h1>⚡ Flash CSV Meta Classification Metrics</h1>
            <h2>texts_images_classification_results_web_multimodal_dataset_300.csv vs Meta Ground Truth</h2>
            <div>
                <span class="model-badge">Flash CSV Model</span>
            </div>
            <div class="csv-info">
                📁 Source CSV: texts_images_classification_results_web_multimodal_dataset_300.csv<br>
                🎯 Successfully matched {dataset_info['total_samples']} artifact IDs with ground truth
            </div>
            <p style="margin-top: 20px; color: #666;">
                Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | 
                Samples: {dataset_info['total_samples']:,} | 
                Positive: {dataset_info['positive_samples']} | Negative: {dataset_info['negative_samples']}
            </p>
        </div>

        <div class="performance-summary">
            <div class="performance-score">{flash_metrics['f1_score']:.3f}</div>
            <div class="performance-label">{flash_assessment} F1-Score (Flash CSV Model)</div>
            <div class="performance-description">
                Flash CSV achieves {flash_metrics['accuracy']:.1%} accuracy with {flash_metrics['precision']:.1%} precision
                <br>Agreement with Meta Ground Truth: {agreement['flash_vs_ground_truth']:.1%}
            </div>
        </div>

        <div class="metrics-overview">
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['f1_score'] >= 0.8 else 'good' if flash_metrics['f1_score'] >= 0.7 else 'moderate' if flash_metrics['f1_score'] >= 0.6 else 'poor'}">{flash_metrics['f1_score']:.3f}</div>
                <div class="metric-label">Flash CSV F1-Score</div>
                <div class="metric-description">Harmonic Mean P&R</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['recall'] >= 0.8 else 'good' if flash_metrics['recall'] >= 0.7 else 'moderate' if flash_metrics['recall'] >= 0.6 else 'poor'}">{flash_metrics['recall']:.3f}</div>
                <div class="metric-label">Flash CSV TPR</div>
                <div class="metric-description">True Positive Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['specificity'] >= 0.8 else 'good' if flash_metrics['specificity'] >= 0.7 else 'moderate' if flash_metrics['specificity'] >= 0.6 else 'poor'}">{flash_metrics['specificity']:.3f}</div>
                <div class="metric-label">Flash CSV TNR</div>
                <div class="metric-description">True Negative Rate</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if flash_metrics['accuracy'] >= 0.9 else 'good' if flash_metrics['accuracy'] >= 0.8 else 'moderate' if flash_metrics['accuracy'] >= 0.7 else 'poor'}">{flash_metrics['accuracy']:.3f}</div>
                <div class="metric-label">Flash CSV Accuracy</div>
                <div class="metric-description">Overall Correctness</div>
            </div>
            <div class="metric-card">
                <div class="metric-value {'excellent' if agreement['flash_vs_ground_truth'] >= 0.8 else 'good' if agreement['flash_vs_ground_truth'] >= 0.7 else 'moderate' if agreement['flash_vs_ground_truth'] >= 0.6 else 'poor'}">{agreement['flash_vs_ground_truth']:.1%}</div>
                <div class="metric-label">Ground Truth Agreement</div>
                <div class="metric-description">Flash CSV matches human labels</div>
            </div>
        </div>

        <div class="dashboard-grid">
            <div class="chart-section">
                <div class="chart-title">⚡ Flash CSV Confusion Matrix</div>
                <div class="confusion-matrix">
                    <div class="cm-cell cm-header"></div>
                    <div class="cm-cell cm-header">Meta GT: 0</div>
                    <div class="cm-cell cm-header">Meta GT: 1</div>
                    
                    <div class="cm-cell cm-header">Flash CSV: 0</div>
                    <div class="cm-cell cm-tn">
                        {flash_cm['true_negatives']}
                        <div class="cm-label">True Negatives</div>
                    </div>
                    <div class="cm-cell cm-fn">
                        {flash_cm['false_negatives']}
                        <div class="cm-label">False Negatives</div>
                    </div>
                    
                    <div class="cm-cell cm-header">Flash CSV: 1</div>
                    <div class="cm-cell cm-fp">
                        {flash_cm['false_positives']}
                        <div class="cm-label">False Positives</div>
                    </div>
                    <div class="cm-cell cm-tp">
                        {flash_cm['true_positives']}
                        <div class="cm-label">True Positives</div>
                    </div>
                </div>
                <div style="text-align: center; margin-top: 15px;">
                    <p><strong>TP:</strong> {flash_cm['true_positives']} | <strong>FP:</strong> {flash_cm['false_positives']} | <strong>TN:</strong> {flash_cm['true_negatives']} | <strong>FN:</strong> {flash_cm['false_negatives']}</p>
                </div>
            </div>
        </div>
    </div>
</body>
</html>"""
        
        return html_content
    
    def generate_classification_dashboard(self):
        """Main function to generate the classification dashboard"""
        print("🚀 STARTING FLASH CSV WEB CLASSIFICATION DASHBOARD GENERATION (FIXED)")
        print("=" * 70)
        print(f"📁 Using CSV file: {self.csv_file_path}")
        print(f"📊 Using BigQuery table: {PROJECT_ID}.{DATASET_ID}.{META_GROUND_TRUTH_TABLE}")
        
        try:
            # Load results with ground truth
            df = self.load_results_with_ground_truth()
            
            if df.empty:
                print("❌ No data available for dashboard")
                return None
            
            # Calculate metrics
            metrics = self.calculate_classification_metrics(df)
            
            if not metrics:
                print("❌ Could not calculate metrics")
                return None
            
            # Create dashboard
            print("\n📊 Creating Flash CSV Meta Classification Dashboard...")
            dashboard_html = self.create_classification_dashboard(metrics, df)
            
            # Save files
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            dashboard_filename = f"flash_csv_web_classification_dashboard_{timestamp}.html"
            metrics_filename = f"flash_csv_web_classification_metrics_{timestamp}.json"
            
            with open(dashboard_filename, "w", encoding='utf-8') as f:
                f.write(dashboard_html)
            
            with open(metrics_filename, "w") as f:
                json.dump(metrics, f, indent=2, default=str)
            
            print(f"✅ Dashboard saved: {dashboard_filename}")
            print(f"✅ Metrics saved: {metrics_filename}")
            
            # Summary
            flash_metrics = metrics['flash_csv_metrics']
            agreement = metrics['agreement_analysis']
            
            print(f"\n📊 FLASH CSV WEB CLASSIFICATION METRICS SUMMARY:")
            print(f"   ⚡ Flash CSV Model:")
            print(f"      F1-Score: {flash_metrics['f1_score']:.3f}")
            print(f"      Precision: {flash_metrics['precision']:.3f}")
            print(f"      Recall: {flash_metrics['recall']:.3f}") 
            print(f"      Accuracy: {flash_metrics['accuracy']:.3f}")
            print(f"      TP: {flash_metrics['confusion_matrix']['true_positives']}, FP: {flash_metrics['confusion_matrix']['false_positives']}")
            print(f"      TN: {flash_metrics['confusion_matrix']['true_negatives']}, FN: {flash_metrics['confusion_matrix']['false_negatives']}")
            
            print(f"\n   ⚖️ Agreement Rates:")
            print(f"      Flash CSV vs Web Ground Truth: {agreement['flash_vs_ground_truth']:.1%}")
            
            return dashboard_filename, metrics_filename
            
        except Exception as e:
            print(f"❌ Dashboard generation failed: {e}")
            import traceback
            traceback.print_exc()
            return None

def main():
    """Run the Flash CSV Meta classification dashboard generation"""
    # Option 1: Use a different CSV file with Meta IDs
    # dashboard_generator = FlashMetaCSVClassificationDashboard(csv_file_path="your_meta_ids_file.csv")
    
    # Option 2: Use a different column from the current file
    # dashboard_generator = FlashMetaCSVClassificationDashboard(id_column_name='source')
    
    # Option 3: Default behavior
    dashboard_generator = FlashMetaCSVClassificationDashboard()
    result = dashboard_generator.generate_classification_dashboard()
    
    if result:
        dashboard_file, metrics_file = result
        print(f"\n🎉 SUCCESS!")
        print(f"📁 Files generated:")
        print(f"   1. {dashboard_file}")
        print(f"   2. {metrics_file}")
        print(f"\n💡 Open {dashboard_file} in your browser to view the dashboard!")
    else:
        print(f"\n❌ Dashboard generation failed")

if __name__ == "__main__":
    main()

🚀 STARTING FLASH CSV WEB CLASSIFICATION DASHBOARD GENERATION (FIXED)
📁 Using CSV file: texts_images_classification_results_web_multimodal_dataset_300.csv
📊 Using BigQuery table: scope3-dev.research_bs_monitoring.BA_Web_Ground_Truth
📥 Loading Flash CSV results with Web ground truth...
📥 Loading Flash results from CSV: texts_images_classification_results_web_multimodal_dataset_300.csv
✅ Loaded 300 Flash results from CSV
📊 CSV columns: ['source', 'artifact_id', 'model_id', 'model_prompt', 'correct_classification', 'correct_reasoning', 'artifact_json_gcs_url', 'asset_filepaths', 'classified_at', 'detected_language', 'topics']
🎯 Auto-detected ID column: 'artifact_id'
🧹 Cleaning artifact_ids...
   Cleaned 300 → 300 records
   Sample artifact_ids: ['engadget.com/2020-02-15-ring-footage-might-not-help-catch-criminals.html', 'ebay.com/itm/187348622731', 'indiatimes.com/trending/gears-of-war-reloaded-how-to-access-the-beta-price-and-more-661063.html', 'menshealth.com/fitness/a28435487/tom-ellis-