# Tavily Web Scraper – Colab Notebook

This notebook is the main UI for the Tavily Web Research Engineer assignment.

It demonstrates a **hybrid scraping pipeline** over ~1k–10k mixed static and JS-heavy URLs:

- Stage 1: async `httpx` fast path for static / mostly-static pages.
- Stage 2: `playwright` (Chromium, headless) fallback for dynamic / blocked pages.

Workflow: install the dependencies, configure the environment, and load the input files (`urls.csv`, `proxy.json`) from the repository's `.sdd/raw/` directory. Then run the batch scraper via `run_all` and analyze the results through visualizations and metrics.

> **Tip:** If you're viewing this on Colab from GitHub, the working directory is set up below so imports from `tavily_scraper` work out of the box.


In [None]:
# Clone repository (Colab only)

import sys

IN_COLAB = "google.colab" in sys.modules

if IN_COLAB:
    !git clone https://github.com/chernistry/tavily.git
    %cd tavily
    print("Repository cloned and working directory set to /content/tavily")
else:
    print("Running locally, assuming repository is already present")


In [None]:
# Environment & path setup

import os
from pathlib import Path

# The kernel's current working directory is treated as the repository root.
repo_root = Path.cwd()

# Same report in both environments; only the label in front of the path differs.
location_label = "Running in Colab, repo root" if IN_COLAB else "Running locally, CWD"
print(f"{location_label}: {repo_root}")


In [None]:
# Install dependencies (run once per fresh Colab session)

if IN_COLAB:
    # Use %pip so the environment is updated in the current kernel.
    %pip install -q -r requirements.txt
    # Install Chromium for Playwright (JS-enabled browser automation).
    !python -m playwright install --with-deps chromium
else:
    print("Assuming dependencies are already installed in the local environment.")


In [None]:
# Configure environment and data paths

import shutil
from pathlib import Path

from tavily_scraper.utils.io import ensure_canonical_urls_file

# Data directory: fixed location on Colab, ./data (made absolute) locally.
base_data_dir = Path("/content/data" if IN_COLAB else "data").resolve()
base_data_dir.mkdir(parents=True, exist_ok=True)

# Core environment flags for the pipeline - these two are always overwritten.
os.environ.update(
    {
        "TAVILY_ENV": "colab" if IN_COLAB else "local",
        "TAVILY_DATA_DIR": str(base_data_dir),
    }
)

# Tunables: only filled in when the variable is not already set, so values
# exported before launching the notebook take precedence.
for env_name, fallback in (
    ("HTTPX_TIMEOUT_SECONDS", "10"),
    ("HTTPX_MAX_CONCURRENCY", "32"),
    ("PLAYWRIGHT_HEADLESS", "true"),
    ("PLAYWRIGHT_MAX_CONCURRENCY", "2"),
    ("SHARD_SIZE", "500"),
):
    os.environ.setdefault(env_name, fallback)

# Use files from repository .sdd/raw/ directory (relative to the working directory)
repo_raw_dir = Path(".sdd/raw")
urls_csv_path = repo_raw_dir / "urls.csv"
proxy_json_path = repo_raw_dir / "proxy.json"

urls_txt_path = base_data_dir / "urls.txt"

# URL list: hand the CSV to the project helper, which returns the path of the
# URLs file to use from here on.
if not urls_csv_path.exists():
    print(f"Warning: {urls_csv_path} not found in repository")
else:
    urls_txt_path = ensure_canonical_urls_file(urls_csv_path, urls_txt_path)
    print(f"Loaded URLs from {urls_csv_path} -> {urls_txt_path}")

# Proxy config: copy it next to the other data files and publish its location
# through PROXY_CONFIG_PATH.
if not proxy_json_path.exists():
    print(f"Warning: {proxy_json_path} not found in repository")
else:
    proxy_dst = base_data_dir / "proxy.json"
    shutil.copy(proxy_json_path, proxy_dst)
    os.environ["PROXY_CONFIG_PATH"] = str(proxy_dst)
    print(f"Loaded proxy config from {proxy_json_path} -> {proxy_dst}")


In [None]:
# Load configuration and inspect
#
# Builds the `config` object that the rest of the notebook relies on: it is
# passed to `run_all` in the scraper cell, and its `data_dir` attribute is used
# to locate stats.jsonl / run_summary.json afterwards.
# NOTE(review): presumably load_run_config() reads the environment variables
# set in the previous cell - confirm against tavily_scraper.config.env.

from tavily_scraper.config.env import load_run_config

config = load_run_config()
# Bare last expression so the notebook displays the loaded configuration.
config


In [None]:
# Run the hybrid scraper (HTTPX first, optional Playwright fallback)

from tavily_scraper.pipelines.batch_runner import run_all

# Tune these for your run. For the assignment, target ~1,000 successful URLs.
TARGET_SUCCESS = 1000
USE_BROWSER = True  # set to False to run HTTP-only

run_summary = await run_all(
    config,
    target_success=TARGET_SUCCESS,
    use_browser=USE_BROWSER,
)
run_summary


In [None]:
# Load per-URL stats and run summary from disk

import json

import pandas as pd

from tavily_scraper.utils.io import read_stats_jsonl

# Both artifacts are looked up inside the configured data directory.
data_dir = config.data_dir
stats_path = data_dir / "stats.jsonl"
summary_path = data_dir / "run_summary.json"

# Per-URL records -> one DataFrame row each.
stats_rows = read_stats_jsonl(stats_path)
df = pd.DataFrame(stats_rows)
print("Loaded {} UrlStats rows from {}".format(len(df), stats_path))

# Aggregate run summary, parsed from its UTF-8 JSON file.
with summary_path.open(encoding="utf-8") as summary_file:
    run_summary_from_disk = json.load(summary_file)
run_summary_from_disk


In [None]:
# Visualizations: high-level metrics and interactive charts
#
# Inputs (from the "load stats" cell): `df` with one row per URL and
# `run_summary_from_disk`, the parsed run_summary.json.
# Produces: status bar chart, latency histogram by method, P50/P95 bar chart
# and a printed per-domain status table.

import plotly.express as px
import plotly.io as pio

pio.templates.default = 'plotly_white'

# Fail fast with an actionable message instead of rendering empty charts.
if df.empty:
    raise RuntimeError('No stats loaded - run the scraper cell first.')

# Per-method subsets, kept as notebook-level names for ad-hoc inspection.
httpx_df = df[df['method'] == 'httpx']
playwright_df = df[df['method'] == 'playwright']

# 1) Status distribution (what happened?)
# Name the index and the counts explicitly so the resulting columns are
# ['status', 'count'] regardless of how value_counts() labels them.
status_counts = df['status'].value_counts().rename_axis('status').reset_index(name='count')

fig = px.bar(
    status_counts,
    x='status',
    y='count',
    title='Status distribution',
    text='count',
)
fig.update_traces(textposition='outside')
fig.update_layout(xaxis_title='Status', yaxis_title='Count')
fig.show()

# 2) Latency distributions by method (how fast?)
# Rows without a latency value are left out of the histogram.
latency_df = df.dropna(subset=['latency_ms'])
if not latency_df.empty:
    fig = px.histogram(
        latency_df,
        x='latency_ms',
        color='method',
        nbins=40,
        barmode='overlay',
        opacity=0.8,
        title='Latency distribution by method',
    )
    fig.update_layout(xaxis_title='Latency (ms)', yaxis_title='Count')
    fig.show()

# 3) Latency percentiles from run summary (P50/P95)
# A method is only charted when both of its percentile keys are present and
# non-null in the summary.
latency_rows: list[dict[str, object]] = []
for method_label, p50_key, p95_key in [
    ('HTTPX', 'p50_latency_httpx_ms', 'p95_latency_httpx_ms'),
    ('Playwright', 'p50_latency_playwright_ms', 'p95_latency_playwright_ms'),
]:
    p50 = run_summary_from_disk.get(p50_key)
    p95 = run_summary_from_disk.get(p95_key)
    if p50 is None or p95 is None:
        continue
    latency_rows.append({'method': method_label, 'metric': 'P50', 'latency_ms': p50})
    latency_rows.append({'method': method_label, 'metric': 'P95', 'latency_ms': p95})

if latency_rows:
    latency_summary_df = pd.DataFrame(latency_rows)
    fig = px.bar(
        latency_summary_df,
        x='method',
        y='latency_ms',
        color='metric',
        barmode='group',
        title='Latency by method (P50 vs P95)',
        labels={'latency_ms': 'Latency (ms)'},
    )
    fig.show()

# 4) Per-domain status breakdown (top 20 by URL count)
# The heading uses a '\n' escape: a single-quoted literal cannot span two
# physical lines, which previously made this whole cell a SyntaxError.
print('\n=== Top 20 Domains by Status ===')
domain_status = df.groupby(['domain', 'status']).size().unstack(fill_value=0)
# 'total' is a helper column used only for ranking; it is dropped before printing.
domain_status['total'] = domain_status.sum(axis=1)
top_domains = domain_status.sort_values('total', ascending=False).head(20)
display_cols = [col for col in top_domains.columns if col != 'total']
print(top_domains[display_cols].to_string())


In [None]:
# Content length distribution by method (log x-axis)

import numpy as np
import plotly.express as px

if df.empty:
    raise RuntimeError('No stats loaded - run the scraper cell first.')

# Keep only rows with a known, strictly positive content length; zero or
# missing lengths cannot be placed on a logarithmic axis.
has_content = (df['content_len'].notna()) & (df['content_len'] > 0)
content_df = df[has_content].copy()

if content_df.empty:
    print('No non-empty content rows found for content length analysis.')
else:
    histogram_options = dict(
        x='content_len',
        color='method',
        nbins=40,
        barmode='overlay',
        opacity=0.8,
        title='Content length distribution by method',
        labels={'content_len': 'Content length (bytes)'},
    )
    fig = px.histogram(content_df, **histogram_options)
    # NOTE(review): the bins are requested on the raw values and only the axis
    # is switched to log afterwards - confirm the rendering is as intended.
    fig.update_xaxes(type='log')
    fig.show()
