In [1]:
import boto3, gzip, json
import pandas as pd
from io import BytesIO
from datetime import date, timedelta
from botocore import UNSIGNED
from botocore.client import Config

def _iter_jsonl_gz_records(payload, key):
    """
    Yield one parsed JSON value per non-blank line of a gzipped JSONL payload.

    Parsing is done line by line so that a single bad line does not discard
    the rest of the file: blank lines are ignored, lines that are not valid
    JSON are counted and reported once at the end. If the payload cannot be
    decompressed or decoded as UTF-8, a warning naming ``key`` is printed and
    the generator simply ends (records yielded before the failure are kept).

    Args:
        payload: Raw bytes of the ``.jsonl.gz`` object.
        key: Object key, used only in warning messages.
    """
    import zlib  # local import: only needed to name the decompression error below

    malformed = 0
    try:
        with gzip.open(BytesIO(payload), "rt", encoding="utf-8") as fh:
            for line in fh:
                if not line.strip():
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    malformed += 1
                    continue
                yield record
    except (OSError, EOFError, zlib.error, UnicodeDecodeError) as e:
        # gzip raises OSError/BadGzipFile for a bad header, EOFError for a
        # truncated stream and zlib.error for corrupted compressed data.
        print(f"⚠️  Failed to decompress {key}: {e}")
    if malformed:
        print(f"⚠️  Skipped {malformed} malformed JSON line(s) in {key}")


def fetch_ooni_raw(
    country: str,
    test_name: str,
    start_day: str,
    end_day: str,
    max_measurements: int = None,
    verbose: bool = True
):
    """
    Fetch OONI measurements from the public S3 bucket (raw/ structure).

    For every day from ``start_day`` to ``end_day`` (inclusive) and every hour
    00-23, lists the prefix ``raw/YYYYMMDD/HH/<country>/<test_name>/``,
    downloads each ``*.jsonl.gz`` object found there and parses every line as
    one JSON measurement. ``*.tar.gz`` objects (postcans) are counted and
    skipped; any other object is ignored.

    Example S3 layout:
    s3://ooni-data-eu-fra/raw/20220223/15/RU/webconnectivity/2022022315_RU_webconnectivity.n0.0.jsonl.gz

    Args:
        country: Country segment of the prefix (e.g. ``"RU"``).
        test_name: Test-name segment of the prefix (e.g. ``"webconnectivity"``).
        start_day: First day to fetch, ISO format ``YYYY-MM-DD``.
        end_day: Last day to fetch (inclusive), ISO format ``YYYY-MM-DD``.
            If it precedes ``start_day`` nothing is listed and the result
            is empty.
        max_measurements: Stop as soon as this many records have been
            collected. ``None`` or ``0`` disables the limit.
        verbose: Print a progress line for each listing, download and skip.
            Warnings and the final summary are printed regardless.

    Returns:
        pandas.DataFrame built from the list of parsed records.

    Raises:
        ValueError: If ``start_day`` or ``end_day`` is not a valid ISO date.
        Errors raised by the S3 client while listing or downloading are not
        caught and propagate to the caller.
    """
    s3 = boto3.client("s3", region_name="eu-central-1", config=Config(signature_version=UNSIGNED))
    bucket = "ooni-data-eu-fra"
    base_prefix = "raw"

    start = date.fromisoformat(start_day)
    end = date.fromisoformat(end_day)

    # The paginator does not depend on the prefix, so build it once.
    paginator = s3.get_paginator("list_objects_v2")

    # Mutable counters shared with the nested generator below.
    counts = {"jsonl": 0, "tar": 0}

    def iter_measurements():
        """Lazily walk day/hour prefixes and yield parsed records one by one."""
        for offset in range((end - start).days + 1):
            day = start + timedelta(days=offset)
            for hour in range(24):
                prefix = f"{base_prefix}/{day.strftime('%Y%m%d')}/{hour:02d}/{country}/{test_name}/"
                if verbose:
                    print(f"📦 Listing {prefix} ...")

                for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
                    # Pages for an empty prefix carry no "Contents" entry.
                    for obj in page.get("Contents", []):
                        key = obj["Key"]

                        # Skip tar.gz (postcans)
                        if key.endswith(".tar.gz"):
                            counts["tar"] += 1
                            if verbose:
                                print(f"⏭️  Skipping postcan: {key}")
                            continue

                        if not key.endswith(".jsonl.gz"):
                            continue

                        counts["jsonl"] += 1
                        if verbose:
                            print(f"⬇️  Downloading {key} ...")

                        payload = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
                        yield from _iter_jsonl_gz_records(payload, key)

    records = []
    stream = iter_measurements()
    for record in stream:
        records.append(record)
        # Stop early if we hit the max_measurements limit; because the stream
        # is lazy, no further objects are listed or downloaded after this.
        if max_measurements and len(records) >= max_measurements:
            if verbose:
                print(f"🚫 Reached max_measurements ({max_measurements}), stopping early.")
            break
    stream.close()

    total_jsonl = counts["jsonl"]
    total_tar = counts["tar"]
    df = pd.DataFrame(records)
    print(f"✅ Loaded {len(records)} measurements ({total_jsonl} JSONL, {total_tar} TAR files skipped)")
    return df


In [2]:
# Sample run: a single day of Russian web-connectivity data, capped at 5000 records.
df = fetch_ooni_raw(
    "RU",
    "webconnectivity",
    start_day="2022-02-23",
    end_day="2022-02-23",
    max_measurements=5000,
)
# Quick sanity check on a few identifying columns of the first rows.
print(df.loc[:, ["measurement_start_time", "probe_cc", "input"]].head())


📦 Listing raw/20220223/00/RU/webconnectivity/ ...
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.0.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.0.tar.gz
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.1.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.1.tar.gz
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.10.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.10.tar.gz
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.11.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.11.tar.gz
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.12.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivi

In [3]:
# Show the loaded measurements; the rendered table is abbreviated to the first and last rows.
df

Unnamed: 0,annotations,data_format_version,input,measurement_start_time,probe_asn,probe_cc,probe_ip,probe_network_name,report_id,resolver_asn,resolver_ip,resolver_network_name,software_name,software_version,test_helpers,test_keys,test_name,test_runtime,test_start_time,test_version
0,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,https://top.jdu.ru/,2022-02-22 23:59:44,AS12389,RU,127.0.0.1,PJSC Rostelecom,20220222T235210Z_webconnectivity_RU_12389_n1_C...,AS12389,78.37.77.86,PJSC Rostelecom,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '78.3...",web_connectivity,13.854797,2022-02-22 23:52:09,0.4.1
1,"{'architecture': 'arm', 'engine_name': 'oonipr...",0.2.0,http://www.match.com/,2022-02-22 23:59:54,AS31257,RU,127.0.0.1,Orion Telecom LLC,20220222T234931Z_webconnectivity_RU_31257_n1_8...,AS60068,185.76.9.81,Datacamp Limited,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '185....",web_connectivity,5.903487,2022-02-22 23:49:31,0.4.1
2,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,https://www.shroomery.org/,2022-02-22 23:59:59,AS12389,RU,127.0.0.1,PJSC Rostelecom,20220222T235210Z_webconnectivity_RU_12389_n1_C...,AS12389,78.37.77.86,PJSC Rostelecom,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '78.3...",web_connectivity,2.203999,2022-02-22 23:52:09,0.4.1
3,"{'engine_name': 'ooniprobe-engine', 'engine_ve...",0.2.0,https://www.bnaibrith.org/,2022-02-22 23:59:30,AS25490,RU,127.0.0.1,PJSC Rostelecom,20220222T235807Z_webconnectivity_RU_25490_n1_N...,AS15169,172.217.37.134,Google LLC,ooniprobe-desktop-unattended,3.10.0-beta.3,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '172....",web_connectivity,32.222068,2022-02-22 23:58:04,0.4.0
4,"{'engine_name': 'ooniprobe-engine', 'engine_ve...",0.2.0,https://www.dea.gov/index.shtml,2022-02-22 23:59:56,AS8492,RU,127.0.0.1,"""OBIT"" Ltd.",20220222T235325Z_webconnectivity_RU_8492_n1_4D...,AS13335,172.69.9.12,"Cloudflare, Inc.",ooniprobe-desktop-unattended,3.10.0-beta.3,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '172....",web_connectivity,6.092541,2022-02-22 23:53:22,0.4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,https://www.wsj.com/,2022-02-23 00:26:11,AS51604,RU,127.0.0.1,"JSC ""ER-Telecom Holding""",20220223T002422Z_webconnectivity_RU_51604_n1_P...,AS6939,216.66.80.90,Hurricane Electric LLC,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '216....",web_connectivity,3.037764,2022-02-23 00:24:21,0.4.1
4996,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,http://www.on-instant.com/,2022-02-23 00:26:13,AS34757,RU,127.0.0.1,Sibirskie Seti Ltd.,20220223T000150Z_webconnectivity_RU_34757_n1_D...,AS34757,193.238.131.65,Sibirskie Seti Ltd.,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '193....",web_connectivity,1.798442,2022-02-23 00:01:49,0.4.1
4997,"{'engine_name': 'ooniprobe-engine', 'engine_ve...",0.2.0,https://dl.google.com/robots.txt,2022-02-23 00:26:15,AS12389,RU,127.0.0.1,PJSC Rostelecom,20220223T002535Z_webconnectivity_RU_12389_n1_M...,AS12389,212.48.197.66,PJSC Rostelecom,ooniprobe-desktop-unattended,3.10.0-beta.3,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '212....",web_connectivity,0.355002,2022-02-23 00:25:35,0.4.0
4998,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,https://search.brave.com/,2022-02-23 00:25:26,AS8402,RU,127.0.0.1,"PJSC ""Vimpelcom""",20220223T002427Z_webconnectivity_RU_8402_n1_ps...,AS15169,172.217.37.135,Google LLC,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '172....",web_connectivity,45.834149,2022-02-23 00:24:23,0.4.1


In [62]:
# Inspect the column labels available on the measurement frame.
df.columns

Index(['annotations', 'data_format_version', 'input', 'measurement_start_time',
       'probe_asn', 'probe_cc', 'probe_ip', 'probe_network_name', 'report_id',
       'resolver_asn', 'resolver_ip', 'resolver_network_name', 'software_name',
       'software_version', 'test_helpers', 'test_keys', 'test_name',
       'test_runtime', 'test_start_time', 'test_version'],
      dtype='object')