# RFC 9460 Compliance - Exploratory Data Analysis

This notebook provides exploratory analysis of RFC 9460 compliance data.

In [None]:
import json
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add parent directory to path for imports
# ('..' is resolved against the kernel's working directory, so this presumably
# expects the notebook to be run from its own folder — TODO confirm layout)
sys.path.append('..')

# Project helpers used by the adoption-overview and ALPN cells further down
from src.analyzer import calculate_compliance_metrics, analyze_alpn_protocols

# Set up plotting style
# NOTE(review): the 'seaborn-v0_8-*' style names are only available in newer
# matplotlib releases — confirm against the pinned matplotlib version
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic: render figures inline in the cell output
%matplotlib inline

## Load Data

In [None]:
# Collect every exported compliance CSV; the newest one (by mtime) is analyzed
results_dir = Path('../results')
csv_files = list(results_dir.glob('rfc9460_compliance_*.csv'))


def _modified_at(path):
    """Return the file's last-modification time, used to rank the exports."""
    return path.stat().st_mtime


if not csv_files:
    # Nothing exported yet: fall back to an empty frame so later cells skip
    print("No CSV files found. Run the checker first.")
    df = pd.DataFrame()
else:
    latest_file = max(csv_files, key=_modified_at)
    print(f"Loading: {latest_file}")
    df = pd.read_csv(latest_file)
    print(f"Loaded {len(df)} records")

## Basic Statistics

In [None]:
if not df.empty:
    # Headline numbers for the loaded scan
    print("Dataset Overview:")
    print(f"- Total records: {len(df)}")
    unique_domains = df['domain'].nunique()
    print(f"- Unique domains: {unique_domains}")
    timestamps = df['timestamp']
    print(f"- Date range: {timestamps.min()} to {timestamps.max()}")

    # Per-column dtype table
    print("\nColumn types:")
    column_types = df.dtypes
    print(column_types)

    # Per-column count of missing cells
    print("\nMissing values:")
    missing_per_column = df.isna().sum()
    print(missing_per_column)

## RFC 9460 Adoption Overview

In [None]:
if not df.empty:
    # Metrics come from the project helper; only the 'adoption' and
    # 'features' entries are read in this cell
    metrics = calculate_compliance_metrics(df)
    adoption = metrics['adoption']
    feature_stats = metrics['features']

    # One row of three panels: overall pie, root-vs-www bars, feature bars
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    ax_overall, ax_subdomain, ax_features = axes

    # Panel 1: records with vs. without an HTTPS record
    overall_rate = adoption['overall_adoption']
    adoption_data = [overall_rate, 100 - overall_rate]
    ax_overall.pie(
        adoption_data,
        labels=['Has HTTPS', 'No HTTPS'],
        autopct='%1.1f%%',
        startangle=90,
    )
    ax_overall.set_title('Overall HTTPS Record Adoption')

    # Panel 2: adoption rate for the root name next to the www name
    subdomains = ['Root', 'WWW']
    adoption_rates = [adoption[key] for key in ('root_adoption', 'www_adoption')]
    ax_subdomain.bar(subdomains, adoption_rates)
    ax_subdomain.set_ylabel('Adoption Rate (%)')
    ax_subdomain.set_title('HTTPS Adoption by Subdomain')
    ax_subdomain.set_ylim(0, 100)

    # Panel 3: percentage reported for each feature, as horizontal bars
    features = list(feature_stats.keys())
    feature_pcts = [feature_stats[name]['percentage'] for name in features]
    ax_features.barh(features, feature_pcts)
    ax_features.set_xlabel('Percentage (%)')
    ax_features.set_title('RFC 9460 Feature Distribution')
    ax_features.set_xlim(0, 100)

    plt.tight_layout()
    plt.show()

## HTTP/3 Support Analysis

In [None]:
if not df.empty and 'has_http3' in df.columns:
    # Share of records with has_http3 set, per subdomain type, as a percentage
    http3_by_subdomain = df.groupby('subdomain')['has_http3'].mean() * 100

    # Explicit figure/axes so the styling below targets this plot only
    fig, ax = plt.subplots(figsize=(10, 6))
    http3_by_subdomain.plot(kind='bar', ax=ax, rot=0)
    ax.set_title('HTTP/3 Support by Subdomain Type')
    ax.set_ylabel('Percentage with HTTP/3 (%)')
    ax.set_xlabel('Subdomain Type')
    ax.set_ylim(0, 100)
    ax.grid(axis='y', alpha=0.3)
    plt.show()

    # Domains with HTTP/3
    http3_domains = df.loc[df['has_http3'] == True, 'domain'].unique()
    print(f"\nDomains with HTTP/3 support: {len(http3_domains)}")
    # List the names only when there is something to list and the list is
    # short; this matches the guard used by the ECH cell and avoids printing
    # a bare "Domains:" header when nothing supports HTTP/3
    if 0 < len(http3_domains) <= 20:
        print("Domains:")
        for domain in sorted(http3_domains):
            print(f"  - {domain}")

## ALPN Protocol Distribution

In [None]:
if not df.empty and 'alpn_protocols' in df.columns:
    # Mapping of ALPN protocol -> occurrence count from the project helper
    alpn_dist = analyze_alpn_protocols(df)

    if alpn_dist:
        # Rank explicitly so "Top 10" holds regardless of the order in which
        # the helper returns its entries. sorted() is stable, so a mapping
        # that is already in descending order keeps exactly the same order.
        top_alpn = sorted(
            alpn_dist.items(), key=lambda item: item[1], reverse=True
        )[:10]
        protocols = [item[0] for item in top_alpn]
        counts = [item[1] for item in top_alpn]

        plt.figure(figsize=(12, 6))
        plt.bar(protocols, counts)
        plt.title('Top ALPN Protocols in HTTPS Records')
        plt.xlabel('ALPN Protocol')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Same ten entries as the chart, in the same order
        print("\nALPN Protocol Statistics:")
        for protocol, count in top_alpn:
            print(f"  {protocol}: {count} occurrences")

## ECH (Encrypted Client Hello) Deployment

In [None]:
if not df.empty and 'ech_config' in df.columns:
    # Per-subdomain sum and mean of ech_config, plus the mean as a percentage
    ech_stats = (
        df.groupby('subdomain')['ech_config']
        .agg(['sum', 'mean'])
        .assign(percentage=lambda stats: stats['mean'] * 100)
    )

    print("ECH Configuration Statistics:")
    print(ech_stats)

    # Distinct domains with at least one record where ech_config is True
    ech_mask = df['ech_config'] == True
    ech_domains = df[ech_mask]['domain'].unique()
    ech_domain_total = len(ech_domains)
    print(f"\nDomains with ECH configuration: {ech_domain_total}")

    # Only enumerate the names when the list is non-empty and short
    if 0 < ech_domain_total <= 20:
        print("Domains with ECH:")
        for domain in sorted(ech_domains):
            print(f"  - {domain}")

## IP Hints Analysis

In [None]:
if not df.empty:
    # A hint counts as present when its cell is not null
    has_ipv4 = df['ipv4hint'].notna()
    has_ipv6 = df['ipv6hint'].notna()
    ipv4_count = has_ipv4.sum()
    ipv6_count = has_ipv6.sum()
    both_count = (has_ipv4 & has_ipv6).sum()

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: four-way split by which hints are present
    # (inclusion-exclusion gives the "Neither" slice)
    hint_slices = {
        'IPv4 Only': ipv4_count - both_count,
        'IPv6 Only': ipv6_count - both_count,
        'Both': both_count,
        'Neither': len(df) - ipv4_count - ipv6_count + both_count,
    }
    ip_data = list(hint_slices.keys())
    ip_counts = list(hint_slices.values())

    ax1.pie(ip_counts, labels=ip_data, autopct='%1.1f%%')
    ax1.set_title('IP Hint Distribution')

    def _hint_coverage(values):
        """Return the percentage of non-null entries in one group's column."""
        return values.notna().mean() * 100

    # Right panel: hint coverage per subdomain, one bar per hint type
    ip_by_subdomain = df.groupby('subdomain').agg(
        {'ipv4hint': _hint_coverage, 'ipv6hint': _hint_coverage}
    )

    ip_by_subdomain.plot(kind='bar', ax=ax2)
    ax2.set_title('IP Hints by Subdomain')
    ax2.set_ylabel('Percentage (%)')
    ax2.set_xlabel('Subdomain')
    ax2.legend(['IPv4 Hints', 'IPv6 Hints'])
    ax2.set_xticklabels(ax2.get_xticklabels(), rotation=0)

    plt.tight_layout()
    plt.show()

## Query Error Analysis

In [None]:
if not df.empty and 'query_error' in df.columns:
    # Frequency of each distinct error value (value_counts skips nulls)
    error_counts = df['query_error'].value_counts()

    if not error_counts.empty:
        # Horizontal bars for the ten most frequent error values
        plt.figure(figsize=(10, 6))
        error_axes = error_counts.head(10).plot(kind='barh')
        error_axes.set_title('Top Query Error Types')
        error_axes.set_xlabel('Count')
        error_axes.set_ylabel('Error Type')
        plt.tight_layout()
        plt.show()

        # A record counts as failed when its query_error cell is not null
        failed = df['query_error'].notna()
        print("\nQuery Error Statistics:")
        print(f"Total queries with errors: {failed.sum()}")
        print(f"Error rate: {failed.mean() * 100:.2f}%")
        print("\nTop error types:")
        for error, count in error_counts.head(5).items():
            print(f"  {error}: {count}")

## Compliance Score Distribution

In [None]:
def _has_value(value):
    """Return True when a cell holds a real, truthy value.

    Missing CSV cells are loaded as NaN, and ``bool(float('nan'))`` is True,
    so a plain truthiness test would score absent data as if it were present.
    Null scalars (None, NaN, pd.NA) are therefore rejected before the
    truthiness check.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    return bool(value)


def calculate_score(row):
    """Score one record from 0 to 100.

    An HTTPS record is worth 40 points and is a prerequisite for the rest:
    HTTP/3 adds 20, an ECH config 15, any IP hint (v4 or v6) 15, and a
    non-empty ALPN list 10. Records without an HTTPS record score 0.

    Args:
        row: a mapping-like record (e.g. a DataFrame row) read with ``.get``.

    Returns:
        The integer score for the record.
    """
    if not _has_value(row.get('has_https_record')):
        return 0

    score = 40
    if _has_value(row.get('has_http3')):
        score += 20
    if _has_value(row.get('ech_config')):
        score += 15
    if _has_value(row.get('ipv4hint')) or _has_value(row.get('ipv6hint')):
        score += 15
    if _has_value(row.get('alpn_protocols')):
        score += 10
    return score


if not df.empty:
    # Later cells read this column, so it is stored on df itself; the score
    # never reads compliance_score, so re-running the cell is idempotent
    df['compliance_score'] = df.apply(calculate_score, axis=1)
    mean_score = df['compliance_score'].mean()

    fig, (ax_hist, ax_bar) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: score histogram with the mean marked
    ax_hist.hist(df['compliance_score'], bins=20, edgecolor='black')
    ax_hist.set_title('Compliance Score Distribution')
    ax_hist.set_xlabel('Compliance Score (0-100)')
    ax_hist.set_ylabel('Count')
    ax_hist.axvline(mean_score, color='red',
                    linestyle='--', label=f'Mean: {mean_score:.1f}')
    ax_hist.legend()

    # Right panel: mean score per subdomain type
    subdomain_scores = df.groupby('subdomain')['compliance_score'].mean()
    subdomain_scores.plot(kind='bar', ax=ax_bar, rot=0)
    ax_bar.set_title('Average Compliance Score by Subdomain')
    ax_bar.set_ylabel('Average Score')
    ax_bar.set_xlabel('Subdomain')
    ax_bar.set_ylim(0, 100)

    plt.tight_layout()
    plt.show()

    print("\nCompliance Score Statistics:")
    print(df['compliance_score'].describe())

## Top Performing Domains

In [None]:
if not df.empty and 'compliance_score' in df.columns:
    # Mean score per domain across its records, best first
    mean_by_domain = df.groupby('domain')['compliance_score'].mean()
    domain_scores = mean_by_domain.sort_values(ascending=False)

    # Text ranking of the twenty best domains
    print("Top 20 RFC 9460 Compliant Domains:")
    print("=" * 50)
    ranked = enumerate(domain_scores.head(20).items(), start=1)
    for rank, (name, average) in ranked:
        print(f"{rank:2d}. {name:30s} {average:5.1f}/100")

    # Horizontal bar chart of the ten best, highest score at the top
    plt.figure(figsize=(12, 8))
    top_10 = domain_scores.head(10)
    bar_positions = range(len(top_10))
    chart = plt.gca()
    chart.barh(bar_positions, top_10.values)
    chart.set_yticks(bar_positions)
    chart.set_yticklabels(top_10.index)
    chart.set_xlabel('Compliance Score')
    chart.set_title('Top 10 RFC 9460 Compliant Domains')
    chart.set_xlim(0, 100)
    chart.invert_yaxis()
    plt.tight_layout()
    plt.show()

## Export Summary Report

In [None]:
if not df.empty:
    def _percent_true(column):
        """Return the mean of a flag column as a percentage.

        Falls back to 'N/A' when the column is missing from the loaded CSV,
        mirroring how scan_date handles a missing timestamp column.
        """
        if column not in df.columns:
            return 'N/A'
        return df[column].mean() * 100

    # Generate summary for export
    summary = {
        'total_domains': df['domain'].nunique(),
        'total_records': len(df),
        'adoption_rate': _percent_true('has_https_record'),
        'http3_rate': _percent_true('has_http3'),
        'ech_rate': _percent_true('ech_config'),
        # Reports 0 when the compliance-score cell has not been run
        'average_compliance': df.get('compliance_score', pd.Series([0])).mean(),
        'scan_date': df['timestamp'].max() if 'timestamp' in df else 'N/A'
    }

    print("\nSummary Report:")
    print("=" * 50)
    for key, value in summary.items():
        if isinstance(value, float):
            print(f"{key:20s}: {value:.2f}")
        else:
            print(f"{key:20s}: {value}")

    # Save summary next to the scan results; default=str covers any value
    # the json encoder cannot serialize natively
    summary_file = Path('../results/analysis_summary.json')
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, default=str)
    print(f"\nSummary saved to: {summary_file}")