# Tavily Web Scraper – Colab Notebook

This notebook is the main UI for the Tavily Web Research Engineer assignment.

It demonstrates a **hybrid scraping pipeline** over ~1k–10k mixed static and JS-heavy URLs:

- Stage 1: async `httpx` fast path for static / mostly-static pages.
- Stage 2: `playwright` (Chromium, headless) fallback for dynamic / blocked pages.

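Workflow: install dependencies, configure the environment, and load the input files (`urls.csv`, `proxy.json`) from the repository's `.sdd/raw/` directory. Then run the batch scraper via `run_all` and analyze the results through the visualizations and metrics below.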

> **Tip:** If you're viewing this on Colab from GitHub, the working directory is set up below so imports from `tavily_scraper` work out of the box.


In [None]:
# Clone repository (Colab only)

import sys

IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    !git clone https://github.com/chernistry/tavily.git
    %cd tavily
    print("Repository cloned and working directory set to /content/tavily")
else:
    print("Running locally, assuming repository is already present")


In [None]:
# Environment & path setup
#
# Records the current working directory as the repository root and reports
# which environment the notebook is running in.

import os
from pathlib import Path

repo_root = Path.cwd()

# Pick the message for the detected environment, then emit it once.
location_message = (
    f"Running in Colab, repo root: {repo_root}"
    if IN_COLAB
    else f"Running locally, CWD: {repo_root}"
)
print(location_message)


In [None]:
# Install dependencies (run once per fresh Colab session)

if IN_COLAB:
    # Use %pip so the environment is updated in the current kernel.
    %pip install -q -r requirements.txt
    # Install Chromium for Playwright (JS-enabled browser automation).
    !python -m playwright install --with-deps chromium
else:
    print("Assuming dependencies are already installed in the local environment.")


In [None]:
# Configure environment and data paths
#
# Creates the data directory, exports the pipeline's environment variables and
# stages the input files that ship with the repository.

import shutil
from pathlib import Path

from tavily_scraper.utils.io import ensure_canonical_urls_file

# Colab uses /content/data; local runs use ./data (resolved to an absolute path).
base_data_dir = Path("/content/data" if IN_COLAB else "data").resolve()
base_data_dir.mkdir(parents=True, exist_ok=True)

# Session-specific flags are always overwritten.
os.environ["TAVILY_ENV"] = "colab" if IN_COLAB else "local"
os.environ["TAVILY_DATA_DIR"] = str(base_data_dir)

# Tunables are defaults only: values already present in the environment win.
for env_name, env_default in (
    ("HTTPX_TIMEOUT_SECONDS", "10"),
    ("HTTPX_MAX_CONCURRENCY", "32"),
    ("PLAYWRIGHT_HEADLESS", "true"),
    ("PLAYWRIGHT_MAX_CONCURRENCY", "2"),
    ("SHARD_SIZE", "500"),
):
    os.environ.setdefault(env_name, env_default)

# Input files come from the repository's .sdd/raw/ directory.
repo_raw_dir = Path(".sdd/raw")
urls_csv_path = repo_raw_dir / "urls.csv"
proxy_json_path = repo_raw_dir / "proxy.json"

urls_txt_path = base_data_dir / "urls.txt"

# URL list: warn when absent, otherwise convert into the data directory.
if not urls_csv_path.exists():
    print(f"Warning: {urls_csv_path} not found in repository")
else:
    urls_txt_path = ensure_canonical_urls_file(urls_csv_path, urls_txt_path)
    print(f"Loaded URLs from {urls_csv_path} -> {urls_txt_path}")

# Proxy config: warn when absent, otherwise copy it and export its location.
if not proxy_json_path.exists():
    print(f"Warning: {proxy_json_path} not found in repository")
else:
    proxy_dst = base_data_dir / "proxy.json"
    shutil.copy(proxy_json_path, proxy_dst)
    os.environ["PROXY_CONFIG_PATH"] = str(proxy_dst)
    print(f"Loaded proxy config from {proxy_json_path} -> {proxy_dst}")


In [None]:
# Load configuration and inspect
#
# Builds the run configuration object consumed by the cells below: `config` is
# passed to `run_all` and supplies `config.data_dir` for the analysis cells.
# NOTE(review): presumably `load_run_config` reads the environment variables
# exported in the previous cell — confirm against tavily_scraper.config.env.

from tavily_scraper.config.env import load_run_config

config = load_run_config()
# Bare last expression so the notebook renders the config for a quick check.
config


In [None]:
# Run the hybrid scraper (HTTPX first, optional Playwright fallback)

from tavily_scraper.pipelines.batch_runner import run_all

# Tune these for your run. For the assignment, target ~1,000 successful URLs.
TARGET_SUCCESS = 1000
USE_BROWSER = True  # set to False to run HTTP-only

run_summary = await run_all(
    config,
    target_success=TARGET_SUCCESS,
    use_browser=USE_BROWSER,
)
run_summary


In [None]:
# Load per-URL stats and run summary from disk

import json

import pandas as pd

from tavily_scraper.utils.io import read_stats_jsonl

# Both artefacts are read from the configured data directory.
data_dir = config.data_dir
summary_path = data_dir / "run_summary.json"
stats_path = data_dir / "stats.jsonl"

# One DataFrame row per record returned by the JSONL reader.
stats_rows = read_stats_jsonl(stats_path)
df = pd.DataFrame(stats_rows)
print(f"Loaded {len(df)} UrlStats rows from {stats_path}")

# Parse the persisted summary and show it as the cell output.
with summary_path.open(encoding="utf-8") as summary_file:
    run_summary_from_disk = json.load(summary_file)
run_summary_from_disk


In [None]:
# Visualizations: latency distributions and status breakdowns
#
# Every chart is drawn on an explicitly created figure/axes pair so that no
# plot depends on which figure happens to be "current" in pyplot.

import matplotlib.pyplot as plt

# Fail fast with an actionable message instead of rendering empty charts.
if df.empty:
    raise RuntimeError("No stats loaded – run the scraper cell first.")

# Per-method slices; the content-length cell further down reuses both frames.
httpx_df = df[df["method"] == "httpx"]
playwright_df = df[df["method"] == "playwright"]

# --- Latency histograms, one panel per fetch method -------------------------
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

for ax, method_df, title, color in [
    (axes[0], httpx_df, "HTTPX latency (ms)", "tab:blue"),
    (axes[1], playwright_df, "Playwright latency (ms)", "tab:orange"),
]:
    ax.hist(method_df["latency_ms"].dropna(), bins=40, color=color, alpha=0.8)
    ax.set_title(title)
    ax.set_xlabel("Latency (ms)")
    ax.set_ylabel("Count")

fig.tight_layout()
plt.show()

# --- Overall status distribution --------------------------------------------
status_fig, status_ax = plt.subplots(figsize=(6, 4))
df["status"].value_counts().plot(kind="bar", title="Status distribution", ax=status_ax)
status_ax.set_ylabel("Count")
status_fig.tight_layout()
plt.show()

# --- Per-domain status breakdown (top domains by total URLs) ----------------
print("\n=== Top 20 Domains by Status ===")
domain_status = df.groupby(["domain", "status"]).size().unstack(fill_value=0)
# Sort by total URLs per domain
domain_status["total"] = domain_status.sum(axis=1)
top_domains = domain_status.sort_values("total", ascending=False).head(20)
# Drop total column for display
display_cols = [col for col in top_domains.columns if col != "total"]
print(top_domains[display_cols].to_string())

# --- Top error domains --------------------------------------------------------
# The column only exists when at least one row has status "http_error".
if "http_error" in domain_status.columns:
    # Keep only domains that actually produced errors so the "top 10" is not
    # padded with zero-error domains when fewer than ten domains failed.
    error_domains = (
        domain_status[domain_status["http_error"] > 0]
        .sort_values("http_error", ascending=False)
        .head(10)
    )
    error_fig, error_ax = plt.subplots()
    error_domains["http_error"].plot(
        kind="barh",
        title="Top 10 Domains by HTTP Errors",
        color="#ef4444",
        ax=error_ax,
    )
    # barh draws the first row at the bottom; flip so the worst domain is on top.
    error_ax.invert_yaxis()
    error_ax.set_xlabel("Error Count")
    error_fig.tight_layout()
    plt.show()


In [None]:
# Content length distribution by method (log x-axis)
#
# Draws one log-binned histogram of `content_len` per fetch method, using the
# `httpx_df` / `playwright_df` slices created in the visualization cell above.

import numpy as np

if df.empty:
    raise RuntimeError("No stats loaded – run the scraper cell first.")

fig, axes = plt.subplots(1, 2, figsize=(14, 4))

for ax, subset, title, color in [
    (axes[0], httpx_df, "HTTPX content length", "tab:blue"),
    (axes[1], playwright_df, "Playwright content length", "tab:orange"),
]:
    # Label the panel up front so it is identifiable even when it has no data.
    ax.set_title(title)
    ax.set_xlabel("Content length (bytes, log)")
    ax.set_ylabel("Count")

    # Consider only rows where we actually have meaningful content
    subset_ok = subset[subset["status"].isin(["success", "too_large"])]
    sizes = subset_ok["content_len"].dropna()
    # A log axis cannot represent zero or negative lengths.
    sizes = sizes[sizes > 0]
    if sizes.empty:
        ax.text(0.5, 0.5, "No data", ha="center", va="center", transform=ax.transAxes)
        continue

    smallest, largest = float(sizes.min()), float(sizes.max())
    if smallest == largest:
        # A single distinct size would collapse every bin edge onto one value
        # and render an invisible zero-width bar; widen the range around it.
        smallest, largest = smallest / 2, largest * 2
    bins = np.logspace(np.log10(smallest), np.log10(largest), 40)
    ax.hist(sizes, bins=bins, color=color, alpha=0.8)
    ax.set_xscale("log")

fig.tight_layout()
plt.show()
