In [1]:
import requests
import pandas as pd
from tqdm import tqdm

BASE_URL = "https://api.ooni.io/api/v1/measurements"

params = {
    "test_name": "web_connectivity",
    "country": "CN",            # China
    "since": "2025-09-01",      # start date
    "until": "2025-09-05",      # end date
    "limit": 100                # number of results per page
}

measurements = []
next_url = BASE_URL

print("Fetching OONI web_connectivity data for China...")

for _ in tqdm(range(10)):  # fetch up to 10 pages (you can increase this)
    r = requests.get(next_url, params=params)
    data = r.json()

    if "results" not in data or not data["results"]:
        break

    measurements.extend(data["results"])

    if "metadata" in data and data["metadata"].get("next_url"):
        next_url = data["metadata"]["next_url"]
        params = {}  # clear params (next_url already contains them)
    else:
        break

print(f"Fetched {len(measurements)} measurements.")


Fetching OONI web_connectivity data for China...


100%|███████████████████████████████████████████| 10/10 [00:11<00:00,  1.19s/it]

Fetched 1000 measurements.





In [3]:
df = pd.DataFrame(measurements)

df.head()

Unnamed: 0,anomaly,confirmed,failure,input,probe_asn,probe_cc,report_id,scores,test_name,measurement_url,measurement_start_time,measurement_uid
0,False,False,True,https://www.imf.org/,AS701,US,20250904T235957Z_webconnectivity_US_701_n1_BXN...,"{'blocking_general': 0.0, 'blocking_global': 0...",web_connectivity,https://api.prod.ooni.io/api/v1/raw_measuremen...,2025-09-05T00:00:00.000000Z,20250905000003.499604_US_webconnectivity_00694...
1,False,False,False,https://www.coinbase.com/,AS701,US,20250904T235957Z_webconnectivity_US_701_n1_BXN...,"{'blocking_general': 0.0, 'blocking_global': 0...",web_connectivity,https://api.prod.ooni.io/api/v1/raw_measuremen...,2025-09-05T00:00:00.000000Z,20250905000002.083326_US_webconnectivity_8f34f...
2,False,False,False,https://www.getoutline.org/,AS9031,BE,20250904T235952Z_webconnectivity_BE_9031_n1_LD...,"{'blocking_general': 0.0, 'blocking_global': 0...",web_connectivity,https://api.prod.ooni.io/api/v1/raw_measuremen...,2025-09-05T00:00:00.000000Z,20250905000000.653177_BE_webconnectivity_1ec81...
3,False,False,False,http://www.gayegypt.com/,AS9031,BE,20250904T235952Z_webconnectivity_BE_9031_n1_LD...,"{'blocking_general': 0.0, 'blocking_global': 0...",web_connectivity,https://api.prod.ooni.io/api/v1/raw_measuremen...,2025-09-05T00:00:00.000000Z,20250904235959.740722_BE_webconnectivity_8559a...
4,False,False,False,http://www.humanrightsfirst.org/,AS28573,BR,20250904T235938Z_webconnectivity_BR_28573_n1_k...,"{'blocking_general': 0.0, 'blocking_global': 0...",web_connectivity,https://api.prod.ooni.io/api/v1/raw_measuremen...,2025-09-05T00:00:00.000000Z,20250905000003.444993_BR_webconnectivity_54a4b...


In [6]:
df.columns

Index(['anomaly', 'confirmed', 'failure', 'input', 'probe_asn', 'probe_cc',
       'report_id', 'scores', 'test_name', 'measurement_url',
       'measurement_start_time', 'measurement_uid'],
      dtype='object')

In [7]:
sample_url = measurements[0]["measurement_url"]

In [9]:
r = requests.get(sample_url)
full_measurement = r.json()

# Explore structure
print(full_measurement.keys())

dict_keys(['annotations', 'data_format_version', 'input', 'measurement_start_time', 'probe_asn', 'probe_cc', 'probe_ip', 'probe_network_name', 'report_id', 'resolver_asn', 'resolver_ip', 'resolver_network_name', 'software_name', 'software_version', 'test_helpers', 'test_keys', 'test_name', 'test_runtime', 'test_start_time', 'test_version'])


In [10]:
full_measurement

{'annotations': {'architecture': 'amd64',
  'engine_name': 'ooniprobe-engine',
  'engine_version': '3.23.0',
  'go_version': 'go1.21.11',
  'platform': 'windows',
  'vcs_modified': 'false',
  'vcs_revision': '7e9a078d54a1a0911654e6389a0e3d2b916c6b19',
  'vcs_time': '2024-08-08T06:53:24Z',
  'vcs_tool': 'git'},
 'data_format_version': '0.2.0',
 'input': 'https://www.imf.org/',
 'measurement_start_time': '2025-09-05 00:00:00',
 'probe_asn': 'AS701',
 'probe_cc': 'US',
 'probe_ip': '127.0.0.1',
 'probe_network_name': 'Verizon Business',
 'report_id': '20250904T235957Z_webconnectivity_US_701_n1_BXN3QAlsnmxlZSqr',
 'resolver_asn': 'AS15169',
 'resolver_ip': '192.178.115.215',
 'resolver_network_name': 'Google LLC',
 'software_name': 'ooniprobe-desktop-unattended',
 'software_version': '3.23.0',
 'test_helpers': {'backend': {'address': 'https://6.th.ooni.org',
   'type': 'https'}},
 'test_keys': {'agent': 'redirect',
  'client_resolver': '192.178.115.215',
  'retries': None,
  'socksproxy': 

In [11]:
import requests
import pandas as pd
import time
from tqdm import tqdm

def fetch_ooni_data(test_name: str, country: str, since: str, until: str,
                    limit: int = 100, max_pages: int = 10, sleep: float = 1.0):
    """
    Fetch OONI measurement summaries from the public API.

    Args:
        test_name (str): e.g. "web_connectivity", "telegram", "facebook_messenger"
        country (str): ISO country code (e.g., "CN", "US")
        since (str): start date (ISO format, e.g., "2025-09-01")
        until (str): end date (ISO format, e.g., "2025-09-05")
        limit (int): results per page (default 100)
        max_pages (int): maximum number of pages to fetch (default 10)
        sleep (float): delay between requests to avoid rate limits (default 1 sec)

    Returns:
        pd.DataFrame: A dataframe of summarized measurement metadata.
    """
    BASE_URL = "https://api.ooni.io/api/v1/measurements"
    params = {
        "test_name": test_name,
        "country": country,
        "since": since,
        "until": until,
        "limit": limit
    }

    measurements = []
    next_url = BASE_URL

    print(f"Fetching OONI data for {test_name.upper()} in {country} "
          f"from {since} to {until}...")

    for _ in tqdm(range(max_pages)):
        r = requests.get(next_url, params=params)
        if r.status_code != 200:
            print(f"Error {r.status_code}: {r.text}")
            break

        data = r.json()
        results = data.get("results", [])
        if not results:
            break

        measurements.extend(results)

        # follow pagination
        next_url = data.get("metadata", {}).get("next_url")
        if not next_url:
            break
        params = {}
        time.sleep(sleep)

    if not measurements:
        print("No results found.")
        return pd.DataFrame()

    df = pd.DataFrame(measurements)
    core_cols = ["measurement_start_time", "probe_cc", "probe_asn",
                 "test_name", "input", "scores", "failure", "report_id",
                 "measurement_url"]

    df = df[[c for c in core_cols if c in df.columns]]
    print(f"Fetched {len(df)} measurements.")
    return df


In [26]:
df_rs = fetch_ooni_data(
    test_name="web_connectivity",
    country="RU",
    since="2025-01-01",
    until="2025-08-03",
    limit=100,
    max_pages=5
)

df_rs.head()

Fetching OONI data for WEB_CONNECTIVITY in RU from 2025-01-01 to 2025-08-03...


100%|█████████████████████████████████████████████| 5/5 [00:10<00:00,  2.05s/it]

Fetched 500 measurements.





Unnamed: 0,measurement_start_time,probe_cc,probe_asn,test_name,input,scores,failure,report_id,measurement_url
0,2025-08-03T00:00:00.000000Z,FR,AS3215,web_connectivity,https://www.facebook.com/,"{'blocking_general': 0.0, 'blocking_global': 0...",False,20250802T235959Z_webconnectivity_FR_3215_n1_Z5...,https://api.prod.ooni.io/api/v1/raw_measuremen...
1,2025-08-03T00:00:00.000000Z,SK,AS6830,web_connectivity,https://secondlife.com/,"{'blocking_general': 0.0, 'blocking_global': 0...",False,20250802T235917Z_webconnectivity_SK_6830_n1_It...,https://api.prod.ooni.io/api/v1/raw_measuremen...
2,2025-08-03T00:00:00.000000Z,SK,AS6830,web_connectivity,https://scontent.xx.fbcdn.net/robots.txt,"{'blocking_general': 0.0, 'blocking_global': 0...",False,20250802T235917Z_webconnectivity_SK_6830_n1_It...,https://api.prod.ooni.io/api/v1/raw_measuremen...
3,2025-08-03T00:00:00.000000Z,SK,AS6830,web_connectivity,https://scontent.cdninstagram.com/robots.txt,"{'blocking_general': 0.0, 'blocking_global': 0...",False,20250802T235917Z_webconnectivity_SK_6830_n1_It...,https://api.prod.ooni.io/api/v1/raw_measuremen...
4,2025-08-03T00:00:00.000000Z,SK,AS6830,web_connectivity,https://scontent-vie1-1.cdninstagram.com/favic...,"{'blocking_general': 0.0, 'blocking_global': 0...",False,20250802T235917Z_webconnectivity_SK_6830_n1_It...,https://api.prod.ooni.io/api/v1/raw_measuremen...


In [21]:
rsteleurl=df_rs['measurement_url'][0]

In [22]:
r = requests.get(rsteleurl)
full_measurement = r.json()

# Explore structure
print(full_measurement.keys())

dict_keys(['annotations', 'data_format_version', 'extensions', 'input', 'measurement_start_time', 'probe_asn', 'probe_cc', 'probe_ip', 'probe_network_name', 'report_id', 'resolver_asn', 'resolver_ip', 'resolver_network_name', 'software_name', 'software_version', 'test_keys', 'test_name', 'test_runtime', 'test_start_time', 'test_version'])


In [39]:
full_measurement

{'annotations': {'architecture': 'arm64',
  'engine_name': 'ooniprobe-engine',
  'engine_version': '3.26.0',
  'flavor': 'ooniprobe-android-unattended',
  'go_version': 'go1.23.7',
  'network_type': 'wifi',
  'ooni_run_link_id': '',
  'origin': 'autorun',
  'os_version': '35',
  'platform': 'android',
  'vcs_modified': '',
  'vcs_revision': '',
  'vcs_time': '',
  'vcs_tool': ''},
 'data_format_version': '0.2.0',
 'extensions': {'dnst': 0,
  'httpt': 0,
  'netevents': 0,
  'tcpconnect': 0,
  'tlshandshake': 0,
  'tunnel': 0},
 'input': None,
 'measurement_start_time': '2025-08-02 23:59:52',
 'probe_asn': 'AS45758',
 'probe_cc': 'TH',
 'probe_ip': '127.0.0.1',
 'probe_network_name': 'Triple T Broadband Public Company Limited',
 'report_id': '20250802T235953Z_telegram_TH_45758_n1_shTlSfWm5uoVQXZL',
 'resolver_asn': 'AS45758',
 'resolver_ip': '110.164.15.244',
 'resolver_network_name': 'Triple T Broadband Public Company Limited',
 'software_name': 'ooniprobe-android-unattended',
 'softwar

In [42]:
!pip install boto3 pandas




In [45]:
!pip uninstall -y panda

Found existing installation: panda 0.3.1
Uninstalling panda-0.3.1:
  Successfully uninstalled panda-0.3.1


In [59]:
import boto3, gzip, json
import pandas as pd
from io import BytesIO
from datetime import date, timedelta
from botocore import UNSIGNED
from botocore.client import Config

def fetch_ooni_raw(
    country: str,
    test_name: str,
    start_day: str,
    end_day: str,
    max_measurements: int = None,
    verbose: bool = True
):
    """
    Fetch OONI measurements from the public S3 bucket (raw/ structure).
    
    Example S3 layout:
    s3://ooni-data-eu-fra/raw/20220223/15/RU/webconnectivity/2022022315_RU_webconnectivity.n0.0.jsonl.gz
    """
    s3 = boto3.client("s3", region_name="eu-central-1", config=Config(signature_version=UNSIGNED))
    bucket = "ooni-data-eu-fra"
    base_prefix = "raw"

    start = date.fromisoformat(start_day)
    end = date.fromisoformat(end_day)
    records = []
    total_jsonl = 0
    total_tar = 0

    for d in (start + timedelta(n) for n in range((end - start).days + 1)):
        for hour in range(24):
            prefix = f"{base_prefix}/{d.strftime('%Y%m%d')}/{hour:02d}/{country}/{test_name}/"
            if verbose:
                print(f"📦 Listing {prefix} ...")

            paginator = s3.get_paginator("list_objects_v2")
            pages = paginator.paginate(Bucket=bucket, Prefix=prefix)

            for page in pages:
                if "Contents" not in page:
                    continue

                for obj in page["Contents"]:
                    key = obj["Key"]

                    # Skip tar.gz (postcans)
                    if key.endswith(".tar.gz"):
                        total_tar += 1
                        if verbose:
                            print(f"⏭️  Skipping postcan: {key}")
                        continue

                    if not key.endswith(".jsonl.gz"):
                        continue

                    total_jsonl += 1
                    if verbose:
                        print(f"⬇️  Downloading {key} ...")

                    data = s3.get_object(Bucket=bucket, Key=key)["Body"].read()

                    try:
                        with gzip.open(BytesIO(data), "rt", encoding="utf-8") as fh:
                            for line in fh:
                                record = json.loads(line)
                                records.append(record)

                                # Stop early if we hit the max_measurements limit
                                if max_measurements and len(records) >= max_measurements:
                                    if verbose:
                                        print(f"🚫 Reached max_measurements ({max_measurements}), stopping early.")
                                    df = pd.DataFrame(records)
                                    print(f"✅ Loaded {len(records)} measurements "
                                          f"({total_jsonl} JSONL, {total_tar} TAR files skipped)")
                                    return df
                    except Exception as e:
                        print(f"⚠️  Failed to decompress {key}: {e}")

    df = pd.DataFrame(records)
    print(f"✅ Loaded {len(records)} measurements ({total_jsonl} JSONL, {total_tar} TAR files skipped)")
    return df


In [60]:
df = fetch_ooni_raw(
    country="RU",
    test_name="webconnectivity",
    start_day="2022-02-23",
    end_day="2022-02-23",
    max_measurements=5000,  # stop after 5000 lines
)
print(df[["measurement_start_time", "probe_cc", "input"]].head())


📦 Listing raw/20220223/00/RU/webconnectivity/ ...
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.0.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.0.tar.gz
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.1.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.1.tar.gz
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.10.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.10.tar.gz
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.11.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.11.tar.gz
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.12.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivi

In [61]:
df

Unnamed: 0,annotations,data_format_version,input,measurement_start_time,probe_asn,probe_cc,probe_ip,probe_network_name,report_id,resolver_asn,resolver_ip,resolver_network_name,software_name,software_version,test_helpers,test_keys,test_name,test_runtime,test_start_time,test_version
0,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,https://top.jdu.ru/,2022-02-22 23:59:44,AS12389,RU,127.0.0.1,PJSC Rostelecom,20220222T235210Z_webconnectivity_RU_12389_n1_C...,AS12389,78.37.77.86,PJSC Rostelecom,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '78.3...",web_connectivity,13.854797,2022-02-22 23:52:09,0.4.1
1,"{'architecture': 'arm', 'engine_name': 'oonipr...",0.2.0,http://www.match.com/,2022-02-22 23:59:54,AS31257,RU,127.0.0.1,Orion Telecom LLC,20220222T234931Z_webconnectivity_RU_31257_n1_8...,AS60068,185.76.9.81,Datacamp Limited,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '185....",web_connectivity,5.903487,2022-02-22 23:49:31,0.4.1
2,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,https://www.shroomery.org/,2022-02-22 23:59:59,AS12389,RU,127.0.0.1,PJSC Rostelecom,20220222T235210Z_webconnectivity_RU_12389_n1_C...,AS12389,78.37.77.86,PJSC Rostelecom,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '78.3...",web_connectivity,2.203999,2022-02-22 23:52:09,0.4.1
3,"{'engine_name': 'ooniprobe-engine', 'engine_ve...",0.2.0,https://www.bnaibrith.org/,2022-02-22 23:59:30,AS25490,RU,127.0.0.1,PJSC Rostelecom,20220222T235807Z_webconnectivity_RU_25490_n1_N...,AS15169,172.217.37.134,Google LLC,ooniprobe-desktop-unattended,3.10.0-beta.3,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '172....",web_connectivity,32.222068,2022-02-22 23:58:04,0.4.0
4,"{'engine_name': 'ooniprobe-engine', 'engine_ve...",0.2.0,https://www.dea.gov/index.shtml,2022-02-22 23:59:56,AS8492,RU,127.0.0.1,"""OBIT"" Ltd.",20220222T235325Z_webconnectivity_RU_8492_n1_4D...,AS13335,172.69.9.12,"Cloudflare, Inc.",ooniprobe-desktop-unattended,3.10.0-beta.3,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '172....",web_connectivity,6.092541,2022-02-22 23:53:22,0.4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,https://www.wsj.com/,2022-02-23 00:26:11,AS51604,RU,127.0.0.1,"JSC ""ER-Telecom Holding""",20220223T002422Z_webconnectivity_RU_51604_n1_P...,AS6939,216.66.80.90,Hurricane Electric LLC,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '216....",web_connectivity,3.037764,2022-02-23 00:24:21,0.4.1
4996,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,http://www.on-instant.com/,2022-02-23 00:26:13,AS34757,RU,127.0.0.1,Sibirskie Seti Ltd.,20220223T000150Z_webconnectivity_RU_34757_n1_D...,AS34757,193.238.131.65,Sibirskie Seti Ltd.,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '193....",web_connectivity,1.798442,2022-02-23 00:01:49,0.4.1
4997,"{'engine_name': 'ooniprobe-engine', 'engine_ve...",0.2.0,https://dl.google.com/robots.txt,2022-02-23 00:26:15,AS12389,RU,127.0.0.1,PJSC Rostelecom,20220223T002535Z_webconnectivity_RU_12389_n1_M...,AS12389,212.48.197.66,PJSC Rostelecom,ooniprobe-desktop-unattended,3.10.0-beta.3,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '212....",web_connectivity,0.355002,2022-02-23 00:25:35,0.4.0
4998,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,https://search.brave.com/,2022-02-23 00:25:26,AS8402,RU,127.0.0.1,"PJSC ""Vimpelcom""",20220223T002427Z_webconnectivity_RU_8402_n1_ps...,AS15169,172.217.37.135,Google LLC,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '172....",web_connectivity,45.834149,2022-02-23 00:24:23,0.4.1


In [62]:
df.columns

Index(['annotations', 'data_format_version', 'input', 'measurement_start_time',
       'probe_asn', 'probe_cc', 'probe_ip', 'probe_network_name', 'report_id',
       'resolver_asn', 'resolver_ip', 'resolver_network_name', 'software_name',
       'software_version', 'test_helpers', 'test_keys', 'test_name',
       'test_runtime', 'test_start_time', 'test_version'],
      dtype='object')