In [1]:
import boto3, gzip, json
import pandas as pd
from io import BytesIO
from datetime import date, timedelta
from botocore import UNSIGNED
from botocore.client import Config

def _iter_jsonl_gz_records(data: bytes, key: str):
    """
    Yield one parsed JSON value per non-blank line of a gzip-compressed JSONL payload.

    Blank lines are skipped, and lines that are not valid JSON are reported and
    skipped, so a single bad line does not discard the rest of the file.
    Decompression and decoding errors are NOT handled here; they propagate to
    the caller.

    Args:
        data: Raw bytes of a ``.jsonl.gz`` object.
        key: Object key; used only in the warning message.
    """
    with gzip.open(BytesIO(data), "rt", encoding="utf-8") as fh:
        for line_no, line in enumerate(fh, start=1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"⚠️  Skipping malformed JSON in {key} (line {line_no}): {e}")
                continue
            yield record


def fetch_ooni_raw(
    country: str,
    test_name: str,
    start_day: str,
    end_day: str,
    max_measurements: int = None,
    verbose: bool = True,
    s3_client=None,
) -> pd.DataFrame:
    """
    Fetch OONI measurements from the public S3 bucket (raw/ structure).

    Every hour of every day in ``[start_day, end_day]`` (inclusive) is listed
    under ``raw/<YYYYMMDD>/<HH>/<country>/<test_name>/``. Each ``.jsonl.gz``
    object found is downloaded fully into memory, decompressed and parsed one
    JSON document per line. ``.tar.gz`` objects (postcans) are counted but
    skipped; any other key is ignored.

    Example S3 layout:
    s3://ooni-data-eu-fra/raw/20220223/15/RU/webconnectivity/2022022315_RU_webconnectivity.n0.0.jsonl.gz

    Args:
        country: Country path component, e.g. ``"RU"``.
        test_name: Test-name path component, e.g. ``"webconnectivity"``.
        start_day: First day to fetch, ISO format ``YYYY-MM-DD``.
        end_day: Last day to fetch (inclusive), ISO format ``YYYY-MM-DD``.
            If it is earlier than ``start_day`` nothing is listed and an
            empty frame is returned.
        max_measurements: Stop as soon as this many records have been
            collected. ``None`` (or ``0``) means no limit.
        verbose: Print per-prefix / per-object progress. The final summary
            line and warnings are printed regardless.
        s3_client: Optional pre-built S3 client (anything exposing
            ``get_paginator`` and ``get_object``). When omitted, an unsigned
            (anonymous) boto3 client for ``eu-central-1`` is created.

    Returns:
        A DataFrame with one row per parsed measurement, in listing order.

    Raises:
        ValueError: If ``start_day`` or ``end_day`` is not a valid ISO date.
    """
    if s3_client is None:
        s3_client = boto3.client(
            "s3", region_name="eu-central-1", config=Config(signature_version=UNSIGNED)
        )
    bucket = "ooni-data-eu-fra"
    base_prefix = "raw"

    start = date.fromisoformat(start_day)
    end = date.fromisoformat(end_day)
    records = []
    total_jsonl = 0
    total_tar = 0

    def _finish() -> pd.DataFrame:
        # Single exit point shared by the early-stop and the normal path.
        df = pd.DataFrame(records)
        print(f"✅ Loaded {len(records)} measurements ({total_jsonl} JSONL, {total_tar} TAR files skipped)")
        return df

    # The paginator does not depend on the prefix, so build it once.
    paginator = s3_client.get_paginator("list_objects_v2")

    for d in (start + timedelta(n) for n in range((end - start).days + 1)):
        for hour in range(24):
            prefix = f"{base_prefix}/{d.strftime('%Y%m%d')}/{hour:02d}/{country}/{test_name}/"
            if verbose:
                print(f"📦 Listing {prefix} ...")

            for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
                # Pages for an empty prefix carry no "Contents" key.
                for obj in page.get("Contents", []):
                    key = obj["Key"]

                    # Skip tar.gz (postcans)
                    if key.endswith(".tar.gz"):
                        total_tar += 1
                        if verbose:
                            print(f"⏭️  Skipping postcan: {key}")
                        continue

                    if not key.endswith(".jsonl.gz"):
                        continue

                    total_jsonl += 1
                    if verbose:
                        print(f"⬇️  Downloading {key} ...")

                    data = s3_client.get_object(Bucket=bucket, Key=key)["Body"].read()

                    limit_reached = False
                    try:
                        for record in _iter_jsonl_gz_records(data, key):
                            records.append(record)
                            # Stop early if we hit the max_measurements limit
                            if max_measurements and len(records) >= max_measurements:
                                limit_reached = True
                                break
                    except Exception as e:
                        # Deliberately broad, best-effort: a corrupt or truncated
                        # archive must not abort the whole fetch. Records parsed
                        # before the failure are kept. Per-line JSON errors are
                        # already handled inside the helper.
                        print(f"⚠️  Failed to decompress {key}: {e}")

                    if limit_reached:
                        if verbose:
                            print(f"🚫 Reached max_measurements ({max_measurements}), stopping early.")
                        return _finish()

    return _finish()


In [2]:
# Load RU / webconnectivity measurements for 2022-02-23, capped at 5000 records.
df = fetch_ooni_raw(
    country="RU",
    test_name="webconnectivity",
    start_day="2022-02-23",
    end_day="2022-02-23",
    max_measurements=5000,  # stop after 5000 lines
)
# Leave the preview as the cell's last expression so the notebook renders it
# as a rich table instead of print()'s plain-text dump.
df[["measurement_start_time", "probe_cc", "input"]].head()


📦 Listing raw/20220223/00/RU/webconnectivity/ ...
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.0.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.0.tar.gz
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.1.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.1.tar.gz
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.10.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.10.tar.gz
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.11.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.11.tar.gz
⬇️  Downloading raw/20220223/00/RU/webconnectivity/2022022300_RU_webconnectivity.n0.12.jsonl.gz ...
⏭️  Skipping postcan: raw/20220223/00/RU/webconnectivi

In [3]:
# Display the loaded frame; the recorded output shows only the first and last rows.
df

Unnamed: 0,annotations,data_format_version,input,measurement_start_time,probe_asn,probe_cc,probe_ip,probe_network_name,report_id,resolver_asn,resolver_ip,resolver_network_name,software_name,software_version,test_helpers,test_keys,test_name,test_runtime,test_start_time,test_version
0,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,https://top.jdu.ru/,2022-02-22 23:59:44,AS12389,RU,127.0.0.1,PJSC Rostelecom,20220222T235210Z_webconnectivity_RU_12389_n1_C...,AS12389,78.37.77.86,PJSC Rostelecom,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '78.3...",web_connectivity,13.854797,2022-02-22 23:52:09,0.4.1
1,"{'architecture': 'arm', 'engine_name': 'oonipr...",0.2.0,http://www.match.com/,2022-02-22 23:59:54,AS31257,RU,127.0.0.1,Orion Telecom LLC,20220222T234931Z_webconnectivity_RU_31257_n1_8...,AS60068,185.76.9.81,Datacamp Limited,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '185....",web_connectivity,5.903487,2022-02-22 23:49:31,0.4.1
2,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,https://www.shroomery.org/,2022-02-22 23:59:59,AS12389,RU,127.0.0.1,PJSC Rostelecom,20220222T235210Z_webconnectivity_RU_12389_n1_C...,AS12389,78.37.77.86,PJSC Rostelecom,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '78.3...",web_connectivity,2.203999,2022-02-22 23:52:09,0.4.1
3,"{'engine_name': 'ooniprobe-engine', 'engine_ve...",0.2.0,https://www.bnaibrith.org/,2022-02-22 23:59:30,AS25490,RU,127.0.0.1,PJSC Rostelecom,20220222T235807Z_webconnectivity_RU_25490_n1_N...,AS15169,172.217.37.134,Google LLC,ooniprobe-desktop-unattended,3.10.0-beta.3,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '172....",web_connectivity,32.222068,2022-02-22 23:58:04,0.4.0
4,"{'engine_name': 'ooniprobe-engine', 'engine_ve...",0.2.0,https://www.dea.gov/index.shtml,2022-02-22 23:59:56,AS8492,RU,127.0.0.1,"""OBIT"" Ltd.",20220222T235325Z_webconnectivity_RU_8492_n1_4D...,AS13335,172.69.9.12,"Cloudflare, Inc.",ooniprobe-desktop-unattended,3.10.0-beta.3,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '172....",web_connectivity,6.092541,2022-02-22 23:53:22,0.4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,https://www.wsj.com/,2022-02-23 00:26:11,AS51604,RU,127.0.0.1,"JSC ""ER-Telecom Holding""",20220223T002422Z_webconnectivity_RU_51604_n1_P...,AS6939,216.66.80.90,Hurricane Electric LLC,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '216....",web_connectivity,3.037764,2022-02-23 00:24:21,0.4.1
4996,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,http://www.on-instant.com/,2022-02-23 00:26:13,AS34757,RU,127.0.0.1,Sibirskie Seti Ltd.,20220223T000150Z_webconnectivity_RU_34757_n1_D...,AS34757,193.238.131.65,Sibirskie Seti Ltd.,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '193....",web_connectivity,1.798442,2022-02-23 00:01:49,0.4.1
4997,"{'engine_name': 'ooniprobe-engine', 'engine_ve...",0.2.0,https://dl.google.com/robots.txt,2022-02-23 00:26:15,AS12389,RU,127.0.0.1,PJSC Rostelecom,20220223T002535Z_webconnectivity_RU_12389_n1_M...,AS12389,212.48.197.66,PJSC Rostelecom,ooniprobe-desktop-unattended,3.10.0-beta.3,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '212....",web_connectivity,0.355002,2022-02-23 00:25:35,0.4.0
4998,"{'architecture': 'arm64', 'engine_name': 'ooni...",0.2.0,https://search.brave.com/,2022-02-23 00:25:26,AS8402,RU,127.0.0.1,"PJSC ""Vimpelcom""",20220223T002427Z_webconnectivity_RU_8402_n1_ps...,AS15169,172.217.37.135,Google LLC,ooniprobe-android,3.5.0,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '172....",web_connectivity,45.834149,2022-02-23 00:24:23,0.4.1


In [62]:
# List the column labels of the measurement frame.
df.columns

Index(['annotations', 'data_format_version', 'input', 'measurement_start_time',
       'probe_asn', 'probe_cc', 'probe_ip', 'probe_network_name', 'report_id',
       'resolver_asn', 'resolver_ip', 'resolver_network_name', 'software_name',
       'software_version', 'test_helpers', 'test_keys', 'test_name',
       'test_runtime', 'test_start_time', 'test_version'],
      dtype='object')

In [17]:
# Annotations value stored in row label 10 (single .loc lookup instead of chained indexing).
df.loc[10, 'annotations']

{'architecture': 'arm64',
 'engine_name': 'ooniprobe-engine',
 'engine_version': '3.13.0',
 'flavor': 'stableFull',
 'network_type': 'wifi',
 'origin': 'autorun',
 'platform': 'android'}

In [7]:
# test_keys value stored in row label 0 (single .loc lookup instead of chained indexing).
df.loc[0, 'test_keys']

{'agent': 'redirect',
 'client_resolver': '78.37.77.86',
 'retries': None,
 'socksproxy': None,
 'network_events': [{'address': '193.0.170.23:443',
   'failure': None,
   'operation': 'connect',
   'proto': 'tcp',
   't': 13.256913539,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'operation': 'tls_handshake_start',
   't': 13.256953924,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 280,
   'operation': 'write',
   't': 13.260765539,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 517,
   'operation': 'read',
   't': 13.285954539,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 3949,
   'operation': 'read',
   't': 13.286219078,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 126,
   'operation': 'write',
   't': 13.303253616,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 51,
   'operation': 'read',
   't': 13.322337309,
   'tags': ['tcptls_experiment']},
 

In [8]:
# test_name value stored in row label 0.
df.loc[0, 'test_name']

'web_connectivity'

In [9]:
# Number of rows per distinct resolver_asn value, most frequent first.
df['resolver_asn'].value_counts()

AS15169    688
AS13335    593
AS12389    538
AS8402     421
AS42       400
AS60068    304
AS42387    299
AS12714    198
AS43966    137
AS31200    118
AS8790     105
AS35807    100
AS21479    100
AS51410    100
AS20485    100
AS34267    100
AS31059    100
AS16276    100
AS25513     93
AS42139     84
AS34757     78
AS56724     72
AS35473     65
AS8359      60
AS6939      42
AS43832      5
Name: resolver_asn, dtype: int64

In [10]:
# Number of rows per distinct probe_asn value, most frequent first.
df['probe_asn'].value_counts()

AS12389    1095
AS8402      518
AS42387     299
AS25513     293
AS8359      260
AS31257     204
AS8334      200
AS12714     198
AS25490     141
AS43966     137
AS44927     129
AS31200     118
AS8790      105
AS35807     100
AS12668     100
AS42610     100
AS29125     100
AS31163     100
AS21479     100
AS51813     100
AS31059     100
AS44724      84
AS34757      78
AS56724      72
AS41691      66
AS25086      65
AS8492       51
AS51604      42
AS41733      40
AS34456       5
Name: probe_asn, dtype: int64

In [11]:
# Number of rows per distinct resolver_network_name value, most frequent first.
df['resolver_network_name'].value_counts()

PJSC Rostelecom                                                                                 738
Google LLC                                                                                      688
Cloudflare, Inc.                                                                                593
PJSC "Vimpelcom"                                                                                421
WoodyNet                                                                                        400
Datacamp Limited                                                                                304
Limited Company Svyazservice                                                                    299
Net By Net Holding LLC                                                                          198
IT REGION LTD                                                                                   137
MTS PJSC                                                                                        125


In [18]:
# List the column labels of the measurement frame.
# NOTE(review): this repeats the earlier `df.columns` cell with identical output — consider removing one.
df.columns

Index(['annotations', 'data_format_version', 'input', 'measurement_start_time',
       'probe_asn', 'probe_cc', 'probe_ip', 'probe_network_name', 'report_id',
       'resolver_asn', 'resolver_ip', 'resolver_network_name', 'software_name',
       'software_version', 'test_helpers', 'test_keys', 'test_name',
       'test_runtime', 'test_start_time', 'test_version'],
      dtype='object')

In [20]:
# test_runtime value stored in row label 1.
df.loc[1, 'test_runtime']

5.903487188

In [23]:
# Number of rows per distinct probe_asn value, most frequent first.
# NOTE(review): this repeats the earlier probe_asn value_counts cell with identical output — consider removing one.
df['probe_asn'].value_counts()

AS12389    1095
AS8402      518
AS42387     299
AS25513     293
AS8359      260
AS31257     204
AS8334      200
AS12714     198
AS25490     141
AS43966     137
AS44927     129
AS31200     118
AS8790      105
AS35807     100
AS12668     100
AS42610     100
AS29125     100
AS31163     100
AS21479     100
AS51813     100
AS31059     100
AS44724      84
AS34757      78
AS56724      72
AS41691      66
AS25086      65
AS8492       51
AS51604      42
AS41733      40
AS34456       5
Name: probe_asn, dtype: int64

In [39]:
# test_keys of row label 184, looked up among the rows whose probe_asn is AS34456.
df.loc[df['probe_asn'] == 'AS34456', 'test_keys'].loc[184]

{'agent': 'redirect',
 'client_resolver': '194.226.75.81',
 'retries': None,
 'socksproxy': None,
 'network_events': [{'address': '172.224.37.10:443',
   'failure': 'connection_refused',
   'operation': 'connect',
   'proto': 'tcp',
   't': 1.560841,
   'tags': ['tcptls_experiment']},
  {'address': '172.224.37.15:443',
   'failure': 'connection_refused',
   'operation': 'connect',
   'proto': 'tcp',
   't': 1.560938,
   'tags': ['tcptls_experiment']},
  {'address': '172.224.37.16:443',
   'failure': 'connection_refused',
   'operation': 'connect',
   'proto': 'tcp',
   't': 1.562028,
   'tags': ['tcptls_experiment']},
  {'address': '172.224.37.17:443',
   'failure': 'connection_refused',
   'operation': 'connect',
   'proto': 'tcp',
   't': 1.61152,
   'tags': ['tcptls_experiment']},
  {'address': '172.224.37.4:443',
   'failure': 'connection_refused',
   'operation': 'connect',
   'proto': 'tcp',
   't': 1.61299,
   'tags': ['tcptls_experiment']},
  {'address': '172.224.37.6:443',
   

In [40]:
# test_keys of row label 177, looked up among the rows whose probe_asn is AS34456.
df.loc[df['probe_asn'] == 'AS34456', 'test_keys'].loc[177]

{'agent': 'redirect',
 'client_resolver': '194.226.75.81',
 'retries': None,
 'socksproxy': None,
 'network_events': [{'address': '17.248.150.196:443',
   'failure': None,
   'operation': 'connect',
   'proto': 'tcp',
   't': 5.537738,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'operation': 'tls_handshake_start',
   't': 5.537753,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 288,
   'operation': 'write',
   't': 5.538771,
   'tags': ['tcptls_experiment']},
  {'failure': 'eof_error',
   'operation': 'read',
   't': 5.563343,
   'tags': ['tcptls_experiment']},
  {'failure': 'eof_error',
   'operation': 'tls_handshake_done',
   't': 5.56347,
   'tags': ['tcptls_experiment']},
  {'address': '17.248.150.197:443',
   'failure': None,
   'operation': 'connect',
   'proto': 'tcp',
   't': 5.538439,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'operation': 'tls_handshake_start',
   't': 5.538447,
   'tags': ['tcptls_experiment']},
  {'failur

In [43]:
# test_keys value stored in row label 10.
df.loc[10, 'test_keys']

{'agent': 'redirect',
 'client_resolver': '5.129.181.74',
 'retries': None,
 'socksproxy': None,
 'network_events': [{'address': '172.67.74.153:443',
   'failure': None,
   'operation': 'connect',
   'proto': 'tcp',
   't': 1.757886666,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'operation': 'tls_handshake_start',
   't': 1.758076093,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 285,
   'operation': 'write',
   't': 1.76242802,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 517,
   'operation': 'read',
   't': 1.82775677,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 2392,
   'operation': 'read',
   't': 1.831352603,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 64,
   'operation': 'write',
   't': 1.838874426,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'operation': 'tls_handshake_done',
   't': 1.839061301,
   'tags': ['tcptls_experiment']},
  {'failure': N

In [45]:
# Number of rows per distinct software_version value, most frequent first.
df['software_version'].value_counts()

3.5.0            3237
3.10.0-beta.3    1259
1.0.0             504
Name: software_version, dtype: int64

In [47]:
# test_keys of row label 0, looked up among the rows reporting software_version 3.5.0.
df.loc[df['software_version'] == '3.5.0', 'test_keys'].loc[0]

{'agent': 'redirect',
 'client_resolver': '78.37.77.86',
 'retries': None,
 'socksproxy': None,
 'network_events': [{'address': '193.0.170.23:443',
   'failure': None,
   'operation': 'connect',
   'proto': 'tcp',
   't': 13.256913539,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'operation': 'tls_handshake_start',
   't': 13.256953924,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 280,
   'operation': 'write',
   't': 13.260765539,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 517,
   'operation': 'read',
   't': 13.285954539,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 3949,
   'operation': 'read',
   't': 13.286219078,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 126,
   'operation': 'write',
   't': 13.303253616,
   'tags': ['tcptls_experiment']},
  {'failure': None,
   'num_bytes': 51,
   'operation': 'read',
   't': 13.322337309,
   'tags': ['tcptls_experiment']},
 

In [51]:
# test_keys of row label 3, looked up among the rows reporting software_version 3.10.0-beta.3.
df.loc[df['software_version'] == '3.10.0-beta.3', 'test_keys'].loc[3]

{'agent': 'redirect',
 'client_resolver': '172.217.37.134',
 'retries': None,
 'socksproxy': None,
 'network_events': [{'address': '199.34.228.49:443',
   'failure': 'generic_timeout_error',
   'operation': 'connect',
   'proto': 'tcp',
   't': 17.1777439,
   'tags': ['tcptls_experiment']}],
 'tls_handshakes': None,
 'queries': [{'answers': [{'asn': 27647,
     'as_org_name': 'Weebly, Inc.',
     'answer_type': 'A',
     'ipv4': '199.34.228.49',
     'ttl': None}],
   'engine': 'system',
   'failure': None,
   'hostname': 'www.bnaibrith.org',
   'query_type': 'A',
   'resolver_hostname': None,
   'resolver_port': None,
   'resolver_address': '',
   't': 0.1062524}],
 'dns_experiment_failure': None,
 'dns_consistency': 'consistent',
 'control_failure': None,
 'control': {'tcp_connect': {'199.34.228.49:443': {'status': True,
    'failure': None}},
  'http_request': {'body_length': 159268,
   'failure': None,
   'title': "B'nai B'rith International - B'nai B'rith International",
   'heade

In [53]:
# Number of rows per distinct annotations value, most frequent first.
# NOTE(review): the annotations cells hold dicts (unhashable) — confirm value_counts()
# behaves this way on the pandas version in use before relying on it.
df['annotations'].value_counts()

{'architecture': 'arm64', 'engine_name': 'ooniprobe-engine', 'engine_version': '3.13.0', 'flavor': 'stableFull', 'network_type': 'wifi', 'origin': 'autorun', 'platform': 'android'}                                                  2294
{'engine_name': 'ooniprobe-engine', 'engine_version': '3.10.0-beta.3', 'platform': 'windows'}                                                                                                                                          925
{'engine_name': 'ooniprobe-engine', 'engine_version': '3.10.0-beta.3', 'platform': 'linux'}                                                                                                                                            490
{'architecture': 'arm64', 'engine_name': 'ooniprobe-engine', 'engine_version': '3.13.0', 'flavor': 'stableFdroid', 'network_type': 'wifi', 'origin': 'autorun', 'platform': 'android'}                                                 489
{'architecture': 'arm', 'engine_name': 'ooniprobe-engine', '

In [60]:
# Annotations dict used as the exact-match value when filtering df['annotations'].
test = {
    '_probe_engine_sanitize_test_keys': 'true',
    'engine_name': 'ooniprobe-engine',
    'engine_version': '3.10.0-beta.3',
    'platform': 'macos',
}

In [65]:
# Rows whose annotations value compares equal to the `test` dict.
df.loc[df['annotations'] == test]

Unnamed: 0,annotations,data_format_version,input,measurement_start_time,probe_asn,probe_cc,probe_ip,probe_network_name,report_id,resolver_asn,resolver_ip,resolver_network_name,software_name,software_version,test_helpers,test_keys,test_name,test_runtime,test_start_time,test_version
615,"{'_probe_engine_sanitize_test_keys': 'true', '...",0.2.0,http://www.sina.com.cn/,2022-02-23 00:29:59,AS41733,RU,127.0.0.1,"JSC ""ER-Telecom Holding""",20220223T002600Z_webconnectivity_RU_41733_n1_H...,AS15169,173.194.98.7,Google LLC,ooniprobe-desktop-unattended,3.10.0-beta.3,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '173....",web_connectivity,1.112522,2022-02-23 00:26:00,0.4.0
635,"{'_probe_engine_sanitize_test_keys': 'true', '...",0.2.0,https://www.queerussia.info/,2022-02-23 00:30:04,AS41733,RU,127.0.0.1,"JSC ""ER-Telecom Holding""",20220223T002600Z_webconnectivity_RU_41733_n1_H...,AS15169,173.194.98.7,Google LLC,ooniprobe-desktop-unattended,3.10.0-beta.3,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '173....",web_connectivity,0.493637,2022-02-23 00:26:00,0.4.0
2961,"{'_probe_engine_sanitize_test_keys': 'true', '...",0.2.0,https://ru.m.wikisource.org/,2022-02-23 00:52:58,AS25086,RU,127.0.0.1,MTS PJSC,20220223T004441Z_webconnectivity_RU_25086_n1_2...,AS35473,213.87.74.17,MTS PJSC,ooniprobe-desktop-unattended,3.10.0-beta.3,{'backend': {'address': 'https://wcth.ooni.io'...,"{'agent': 'redirect', 'client_resolver': '213....",web_connectivity,30.972973,2022-02-23 00:44:41,0.4.0
