# intro
This notebook is an analysis of a web archive file.  Most of it was authored by ChatGPT - it's a great example of using ChatGPT to jump-start analysis or development.

In this first cell, I'd asket ChatGPT to analyze the page load requests of a web site and summarize calls to third-party web sites.  It asked me to upload a HAR file and then it gave these results:

In [3]:
import json

# Load the HAR file to analyze it
har_file_path = './ebay.har'

with open(har_file_path, 'r') as file:
    har_data = json.load(file)

# Extract the requests from the HAR file
requests = []
for entry in har_data.get('log', {}).get('entries', []):
    request = entry.get('request', {})
    if request:
        url = request.get('url', '')
        domain = url.split('/')[2] if '://' in url else url
        requests.append(domain)

# Count occurrences of each domain
from collections import Counter

domain_counts = Counter(requests)

# Identify third-party domains
primary_domain = 'ebay.com'
third_party_domains = {domain: count for domain, count in domain_counts.items() if primary_domain not in domain}

# Summarize third-party domains
third_party_summary = [{"domain": domain, "requests": count} for domain, count in third_party_domains.items()]
third_party_summary[:10]  # Show the top 10 for an initial check


[{'domain': 'ir.ebaystatic.com', 'requests': 28},
 {'domain': 'i.ebayimg.com', 'requests': 92},
 {'domain': 'secureir.ebaystatic.com', 'requests': 1},
 {'domain': 'srv.main.ebayrtm.com', 'requests': 2},
 {'domain': 'www.ebayadservices.com', 'requests': 1},
 {'domain': 's3.amazonaws.com', 'requests': 1},
 {'domain': 'dynamic.criteo.com', 'requests': 1},
 {'domain': 'sp.analytics.yahoo.com', 'requests': 1},
 {'domain': 's.pinimg.com', 'requests': 2},
 {'domain': 'connect.facebook.net', 'requests': 1}]

Fine-tuning a bit, I asked ChatGPT to include the likely purpose of each domain connection.  Note the array of domain purposes in the cell below - it's likely you'd need to augment and/or tweak these a bit:

In [None]:
# Summarize third-party requests by top-level domain (TLD)
from urllib.parse import urlparse

# Extract top-level domains and summarize with purposes
domain_purposes = {
    "google-analytics.com": "Website traffic tracking and analytics",
    "googletagmanager.com": "Tag management for marketing and analytics",
    "google.com": "Various services, e.g., reCAPTCHA, APIs",
    "bing.com": "Microsoft Bing Ads tracking and analytics",
    "facebook.net": "Facebook plugins and ad tracking",
    "facebook.com": "Facebook tracking pixels and widgets",
    "criteo.net": "Personalized retargeting ads",
}

# Summarize requests by top-level domain and annotate purposes
tld_summary = {}
for domain, count in third_party_domains.items():
    tld = ".".join(domain.split(".")[-2:])  # Extract TLD (e.g., "google.com")
    if tld not in tld_summary:
        tld_summary[tld] = {"requests": 0, "purpose": domain_purposes.get(tld, "Unknown")}
    tld_summary[tld]["requests"] += count

# Convert summary to a sorted list
tld_summary_list = [
    {"tld": tld, "requests": data["requests"], "purpose": data["purpose"]}
    for tld, data in tld_summary.items()
]
tld_summary_sorted = sorted(tld_summary_list, key=lambda x: x["requests"], reverse=True)

# Save the TLD summary as a JSON file for user download
tld_summary_file_path = './tld_summary.json'
with open(tld_summary_file_path, 'w') as output_file:
    json.dump(tld_summary_sorted, output_file, indent=2)

tld_summary_sorted[:10], tld_summary_file_path  # Preview the top 10 and provide file path


([{'tld': 'ebayimg.com', 'requests': 92, 'purpose': 'Unknown'},
  {'tld': 'ebaystatic.com', 'requests': 29, 'purpose': 'Unknown'},
  {'tld': 'doubleclick.net', 'requests': 10, 'purpose': 'Unknown'},
  {'tld': 'criteo.com', 'requests': 6, 'purpose': 'Unknown'},
  {'tld': 'pinterest.com', 'requests': 6, 'purpose': 'Unknown'},
  {'tld': 'google.com',
   'requests': 5,
   'purpose': 'Various services, e.g., reCAPTCHA, APIs'},
  {'tld': 'snapchat.com', 'requests': 5, 'purpose': 'Unknown'},
  {'tld': 'googletagmanager.com',
   'requests': 3,
   'purpose': 'Tag management for marketing and analytics'},
  {'tld': 'amplitude.com', 'requests': 3, 'purpose': 'Unknown'},
  {'tld': 'ebayrtm.com', 'requests': 2, 'purpose': 'Unknown'}],
 './tld_summary.json')

In this last cell, I asked to categorize small gif files as "tracking" in addition to the known domains identified earlier:

In [7]:
# Re-examine requests to identify if "unknown" domains are used for tracking (e.g., small GIF files)

# Collect request sizes and purposes
domain_request_details = {}
for entry in har_data.get('log', {}).get('entries', []):
    request = entry.get('request', {})
    response = entry.get('response', {})
    if request and response:
        url = request.get('url', '')
        domain = url.split('/')[2] if '://' in url else url
        size = response.get('content', {}).get('size', 0)
        if domain not in domain_request_details:
            domain_request_details[domain] = {"total_requests": 0, "gif_requests": 0}
        domain_request_details[domain]["total_requests"] += 1
        if url.endswith(".gif") and size < 1000:  # Small GIF threshold
            domain_request_details[domain]["gif_requests"] += 1

# Update purposes for domains with unknown purposes
for tld in tld_summary:
    if tld_summary[tld]["purpose"] == "Unknown":
        gif_requests = sum(
            details["gif_requests"]
            for domain, details in domain_request_details.items()
            if domain.endswith(tld)
        )
        total_requests = tld_summary[tld]["requests"]
        if gif_requests == total_requests:
            tld_summary[tld]["purpose"] = "Tracking"

# Convert the updated TLD summary to a sorted list
updated_tld_summary_list = [
    {"tld": tld, "requests": data["requests"], "purpose": data["purpose"]}
    for tld, data in tld_summary.items()
]
updated_tld_summary_sorted = sorted(
    updated_tld_summary_list, key=lambda x: x["requests"], reverse=True
)

# Save the updated TLD summary as a JSON file for user download
updated_tld_summary_file_path = './updated_tld_summary.json'
with open(updated_tld_summary_file_path, 'w') as output_file:
    json.dump(updated_tld_summary_sorted, output_file, indent=2)

updated_tld_summary_sorted[:10], updated_tld_summary_file_path  # Preview top 10 and file path


([{'tld': 'ebayimg.com', 'requests': 92, 'purpose': 'Unknown'},
  {'tld': 'ebaystatic.com', 'requests': 29, 'purpose': 'Unknown'},
  {'tld': 'doubleclick.net', 'requests': 10, 'purpose': 'Unknown'},
  {'tld': 'criteo.com', 'requests': 6, 'purpose': 'Unknown'},
  {'tld': 'pinterest.com', 'requests': 6, 'purpose': 'Unknown'},
  {'tld': 'google.com',
   'requests': 5,
   'purpose': 'Various services, e.g., reCAPTCHA, APIs'},
  {'tld': 'snapchat.com', 'requests': 5, 'purpose': 'Unknown'},
  {'tld': 'googletagmanager.com',
   'requests': 3,
   'purpose': 'Tag management for marketing and analytics'},
  {'tld': 'amplitude.com', 'requests': 3, 'purpose': 'Unknown'},
  {'tld': 'ebayrtm.com', 'requests': 2, 'purpose': 'Unknown'}],
 './updated_tld_summary.json')