In [13]:
import json
from collections import defaultdict

def analyze_zgrab_output(file_path):
    # Initialize counters for status codes and error messages
    status_counts = defaultdict(int)
    error_counts = defaultdict(int)
    
    # Dictionary to group domains by status and error
    domains_by_status = defaultdict(list)
    domains_by_error = defaultdict(list)
    
    # Read and process the file
    with open(file_path, 'r') as file:
        for line in file:
            try:
                # Parse the JSON line
                data = json.loads(line.strip())
                domain = data.get('domain')
                
                # Extract HTTP data
                http_data = data.get('data', {}).get('http', {})
                status = http_data.get('status', 'unknown')
                error = http_data.get('error', 'none')
                
                # Count status codes
                status_counts[status] += 1
                if status == "success":
                    # t = http_data.get('result', {}).get('status_code')
                    t = http_data['result']['response']['status_code']
                    if t < 300:
                        print(data['domain'])
                # Group domains by status
                domains_by_status[status].append(domain)
                
                # If there's an error, count and group by error type
                if error != 'none':
                    # Simplify error message for better grouping
                    simplified_error = simplify_error(error)
                    error_counts[simplified_error] += 1
                    domains_by_error[simplified_error].append(domain)
                    
            except json.JSONDecodeError:
                print(f"Failed to parse line: {line}")
                continue
    
    return status_counts, error_counts, domains_by_status, domains_by_error

def simplify_error(error):
    """Simplify error messages for better grouping"""
    # Common error patterns to group
    if "Client.Timeout" in error:
        return "Connection Timeout"
    if "no such host" in error:
        return "No Such Host"
    if "connection refused" in error:
        return "Connection Refused"
    if "missing ']' in host" in error:
        return "URL Parsing Error"
    # Default case - return the original error
    return error

def print_report(status_counts, error_counts, domains_by_status, domains_by_error):
    """Print a formatted report of the analysis"""
    print("\n=== STATUS CODE SUMMARY ===")
    total_domains = sum(status_counts.values())
    print(f"Total domains scanned: {total_domains}")
    
    for status, count in sorted(status_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / total_domains) * 100
        print(f"{status}: {count} ({percentage:.2f}%)")
    
    if error_counts:
        print("\n=== ERROR TYPE SUMMARY ===")
        for error, count in sorted(error_counts.items(), key=lambda x: x[1], reverse=True):
            percentage = (count / total_domains) * 100
            print(f"{error}: {count} ({percentage:.2f}%)")
    
    # Save domains by status to files
    print("\n=== SAVING DOMAIN GROUPS ===")
    for status, domains in domains_by_status.items():
        filename = f"domains_with_{status}.txt"
        with open(filename, 'w') as f:
            for domain in domains:
                f.write(f"{domain}\n")
        print(f"Saved {len(domains)} domains with status '{status}' to {filename}")
    
    # Save domains by error to files
    for error, domains in domains_by_error.items():
        sanitized_error = ''.join(c if c.isalnum() else '_' for c in error)[:30]  # Sanitize filename
        filename = f"domains_with_error_{sanitized_error}.txt"
        with open(filename, 'w') as f:
            for domain in domains:
                f.write(f"{domain}\n")
        print(f"Saved {len(domains)} domains with error '{error}' to {filename}")

if __name__ == "__main__":
    import sys
    
    if len(sys.argv) != 2:
        print("Usage: python analyze_zgrab.py <output_file>")
        sys.exit(1)
    
    file_path = "output.txt"
    status_counts, error_counts, domains_by_status, domains_by_error = analyze_zgrab_output(file_path)
    print_report(status_counts, error_counts, domains_by_status, domains_by_error)

accesspoint.vpce-0b1d23da317244353-i6qgl1fo-ap-south-1a.s3.ap-south-1.vpce.amazonaws.com
control.vpce-0b1d23da317244353-i6qgl1fo-ap-south-1a.s3.ap-south-1.vpce.amazonaws.com
control.vpce-0b1d23da317244353-i6qgl1fo.s3.ap-south-1.vpce.amazonaws.com
bucket.vpce-0b1d23da317244353-i6qgl1fo.s3.ap-south-1.vpce.amazonaws.com

=== STATUS CODE SUMMARY ===
Total domains scanned: 100766
success: 60120 (59.66%)
unknown-error: 24945 (24.76%)
connection-timeout: 15692 (15.57%)
io-timeout: 9 (0.01%)

=== ERROR TYPE SUMMARY ===
Connection Timeout: 17467 (17.33%)
301 response missing Location header: 7476 (7.42%)
No Such Host: 363 (0.36%)
dial tcp <nil>->54.222.52.114:80: i/o timeout: 13 (0.01%)
Connection Refused: 12 (0.01%)
dial tcp <nil>->54.222.116.43:80: i/o timeout: 10 (0.01%)
dial tcp <nil>->54.222.54.174:80: i/o timeout: 9 (0.01%)
dial tcp <nil>->54.222.54.133:80: i/o timeout: 8 (0.01%)
dial tcp <nil>->54.222.96.35:80: i/o timeout: 8 (0.01%)
dial tcp <nil>->54.222.116.19:80: i/o timeout: 7 (0.01