## Amazon ECS proportion

In [3]:
from collections import defaultdict
from geogiant.common.files_utils import load_json, dump_json
from geogiant.common.ip_addresses_utils import get_prefix_from_ip
from geogiant.common.settings import PathSettings

path_settings = PathSettings()

# Dataset file: each hostname maps to an iterable of organizations.
org_per_hostname = load_json(
    path_settings.DATASET / "ecs_hostnames_organization_original.json",
)

# Invert the mapping: organization -> set of hostnames attributed to it.
# (The former `enumerate` counters were never read and have been dropped.)
hostname_per_org = defaultdict(set)
for hostname, orgs in org_per_hostname.items():
    for org in orgs:
        hostname_per_org[org].add(hostname)

# Every hostname that is attached to at least one organization.
all_hostnames = set().union(*hostname_per_org.values())

# Share (in percent) of those hostnames attributed to "AMAZON-02".
print(len(hostname_per_org["AMAZON-02"]) / len(all_hostnames) * 100)

13.679361163135331


In [3]:
from datetime import datetime, timedelta

# POSIX timestamps (seconds since the epoch) delimiting the measured interval.
t1 = 1715292343.221449
t2 = 1715294212.4327064

# Build the duration straight from the timestamp difference. Converting each
# timestamp with `datetime.fromtimestamp` yields naive *local* datetimes, whose
# difference is off by the UTC-offset change if a DST/timezone transition falls
# between them; the raw difference has no such dependency.
t = timedelta(seconds=t2 - t1)

t

datetime.timedelta(seconds=1869, microseconds=211257)

# Last dataset of hostname

In [4]:
# Nested mapping read from disk; walked below as
# name server -> organization -> hostnames.
hostname_per_org_per_ns = load_json(
    path_settings.DATASET
    / "hostname_geo_score_selection_20_BGP_3_hostnames_per_org_ns.json"
)

selected_hostnames = set()
selected_hostnames_per_cdn = defaultdict(list)
for hostnames_per_org in hostname_per_org_per_ns.values():
    for org in hostnames_per_org:
        org_hostnames = hostnames_per_org[org]
        selected_hostnames.update(org_hostnames)
        # Kept as a list: a hostname listed under several name servers is
        # counted once per occurrence here, but only once in selected_hostnames.
        selected_hostnames_per_cdn[org] += org_hostnames

print(f"{len(selected_hostnames)=}")

amazon_share = len(selected_hostnames_per_cdn["AMAZON"]) / len(selected_hostnames) * 100
print(f"AMAZON:: {amazon_share}")

len(selected_hostnames)=389
AMAZON:: 4.627249357326478


In [18]:
cdn_per_hostname = load_json(path_settings.DATASET / "ecs_hostnames_organization.json")

# Collapse hostname -> {cdn: prefixes} into hostname -> set of all its prefixes.
bgp_prefix_per_hostname = defaultdict(set)
for hostname, prefixes_by_cdn in cdn_per_hostname.items():
    for cdn_prefixes in prefixes_by_cdn.values():
        bgp_prefix_per_hostname[hostname].update(cdn_prefixes)

# NOTE(review): every name below is suffixed "_google" while the selection key
# is "AKAMAI" — confirm which CDN this cell is meant to describe.
google_hostnames = selected_hostnames_per_cdn["AKAMAI"]
all_bgp_prefixes_google = set()
bgp_prefixes_per_hostname_google = {
    hostname: bgp_prefix_per_hostname[hostname] for hostname in google_hostnames
}

# Rank hostnames by number of prefixes, largest first (the dict becomes a list
# of (hostname, prefixes) pairs from here on).
bgp_prefixes_per_hostname_google = sorted(
    bgp_prefixes_per_hostname_google.items(),
    key=lambda item: len(item[1]),
    reverse=True,
)
top_hostnames = bgp_prefixes_per_hostname_google[:3]

# NOTE(review): the reference set only contains the prefixes of the top 3
# hostnames, so the last printed value is always 100 — confirm this is intended.
for _, prefixes in top_hostnames:
    all_bgp_prefixes_google.update(prefixes)

# Cumulative share of the reference set reached after each ranked hostname.
cumulative_bgp_prefixes = set()
for _, prefixes in top_hostnames:
    cumulative_bgp_prefixes = cumulative_bgp_prefixes | set(prefixes)
    coverage = len(cumulative_bgp_prefixes) / len(all_bgp_prefixes_google) * 100
    print(round(coverage, 2))

54.09
80.56
100.0


# Removed probes / anchors

In [2]:
from geogiant.common.queries import (
    load_targets,
    load_vps,
)
from collections import defaultdict
from geogiant.common.files_utils import load_json, dump_json
from geogiant.common.ip_addresses_utils import get_prefix_from_ip
from geogiant.common.settings import PathSettings, ClickhouseSettings

path_settings = PathSettings()
clickhouse_settings = ClickhouseSettings()

wrongly_geolocated_probes = load_json(
    path_settings.DATASET / "wrongly_geolocated_probes.json"
)

# Addresses of the targets and of the vantage points, both read from the raw
# VPs table.
targets = load_targets(clickhouse_settings.VPS_RAW_TABLE)
targets = [target["addr"] for target in targets]
vps = load_vps(clickhouse_settings.VPS_RAW_TABLE)
vps = [vp["addr"] for vp in vps]

# Set views for O(1) membership tests; the former `in <list>` checks made the
# loop below quadratic in the number of addresses.
target_addrs = set(targets)
vp_addrs = set(vps)

# An address may be reported in both lists when it is a target and a VP.
removed_anchors = [
    addr for addr in wrongly_geolocated_probes if addr in target_addrs
]
removed_probes = [addr for addr in wrongly_geolocated_probes if addr in vp_addrs]


print(f"{len(removed_anchors)=}")
print(f"{len(removed_probes)=}")

[32m2024-05-10 07:49:57.375[0m | [1mINFO    [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [1mquery=GetVPs table_name=vps_raw  limit=None[0m
[32m2024-05-10 07:49:57.488[0m | [1mINFO    [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [1mquery=GetVPs table_name=vps_raw  limit=None[0m


len(vps)=11265
len(filtered_vps)=10864
len(removed_anchors)=47
len(removed_probes)=139


# Last mile delay VP

In [8]:
from geogiant.common.queries import get_min_rtt_per_vp
from geogiant.common.ip_addresses_utils import get_prefix_from_ip
from geogiant.common.settings import PathSettings, ClickhouseSettings

path_settings = PathSettings()
clickhouse_settings = ClickhouseSettings()

# Mapping VP -> minimum RTT, as returned by the last-mile-delay query.
last_mile_delay = get_min_rtt_per_vp(clickhouse_settings.VPS_MESHED_TRACEROUTE_TABLE)

# VPs whose minimum RTT exceeds this value are discarded (the `vp_above_2ms`
# name suggests the unit is milliseconds — TODO confirm).
LAST_MILE_DELAY_THRESHOLD = 2

vp_above_2ms = [
    vp
    for vp, min_rtt in last_mile_delay.items()
    if min_rtt > LAST_MILE_DELAY_THRESHOLD
]
filtered_vps = [
    vp
    for vp, min_rtt in last_mile_delay.items()
    if not min_rtt > LAST_MILE_DELAY_THRESHOLD
]

print(f"{len(last_mile_delay)=}")
print(f"{len(vp_above_2ms)=}")
# Count before removing wrongly geolocated probes (it used to be printed twice).
print(f"{len(filtered_vps)=}")

# NOTE(review): `wrongly_geolocated_probes` and `dump_json` come from an earlier
# cell; this cell does not run on its own in a fresh kernel.
filtered_vps = set(filtered_vps).difference(wrongly_geolocated_probes)
# Count after removing wrongly geolocated probes.
print(f"{len(filtered_vps)=}")

filtered_vps_subnets = [get_prefix_from_ip(vp_addr) for vp_addr in filtered_vps]

print(f"{len(filtered_vps_subnets)}")
dump_json(filtered_vps_subnets, path_settings.DATASET / "vps_subnets_filtered.json")

[32m2024-05-10 07:53:51.656[0m | [1mINFO    [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [1mquery=GetLastMileDelay table_name=traceroutes_last_mile_delay  limit=None[0m


len(last_mile_delay)=10735
len(vp_above_2ms)=4851
len(filtered_vps)=5884
len(filtered_vps)=5884
len(filtered_vps)=5884
5884


# Anchors dataset taxonomy

In [19]:
from collections import defaultdict
from geogiant.common.queries import load_targets
from geogiant.common.files_utils import load_json, load_countries_continent
from geogiant.common.settings import PathSettings, ClickhouseSettings

path_settings = PathSettings()
clickhouse_settings = ClickhouseSettings()

wrongly_geolocated_probes_file = path_settings.DATASET / "wrongly_geolocated_probes.json"
wrongly_geolocated_probes = load_json(wrongly_geolocated_probes_file)

# Targets from the raw VPs table, plus the country -> continent lookup.
targets = load_targets(clickhouse_settings.VPS_RAW_TABLE)
countries_continent = load_countries_continent()

# Distinct countries / ASNs seen among targets, and a per-continent tally.
target_countries = set()
target_asns = set()
target_per_continent = defaultdict(int)
for target in targets:
    country_code = target["country_code"]
    target_countries.add(country_code)
    target_asns.add(target["asn_v4"])
    target_per_continent[countries_continent[country_code]] += 1

print(f"{len(target_countries)=}")
print(f"{len(target_asns)=}")

for continent in target_per_continent:
    nb_targets = target_per_continent[continent]
    print(f"{continent=}, {nb_targets=}")

[32m2024-05-09 18:56:25.338[0m | [1mINFO    [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [1mquery=GetVPs table_name=vps_raw  limit=None[0m


len(target_countries)=96
len(target_asns)=603
continent='Europe', nb_targets=475
continent='Asia', nb_targets=124
continent='South America', nb_targets=29
continent='North America', nb_targets=141
continent='Oceania', nb_targets=23
continent='Africa', nb_targets=19


In [4]:
from geogiant.common.queries import load_targets, get_pings_per_target
from geogiant.common.settings import ClickhouseSettings

# Number of targets stored in the filtered VPs table.
filtered_table = ClickhouseSettings().VPS_FILTERED_TABLE
targets = load_targets(filtered_table)
# pings_anchors = get_pings_per_target(ClickhouseSettings().VPS_VPS_MESHED_PINGS_TABLE)

print(len(targets))
# print(f"{len(pings_anchors)}")

[32m2024-05-10 12:14:02.668[0m | [1mINFO    [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [1mquery=GetVPs table_name=filtered_vps  limit=None[0m


759


In [1]:
from collections import defaultdict
from geogiant.common.queries import load_targets, get_pings_per_target
from geogiant.common.files_utils import load_json, load_countries_continent
from geogiant.common.settings import PathSettings, ClickhouseSettings

path_settings = PathSettings()
clickhouse_settings = ClickhouseSettings()

wrongly_geolocated_probes_file = path_settings.DATASET / "wrongly_geolocated_probes.json"
wrongly_geolocated_probes = load_json(wrongly_geolocated_probes_file)

# Targets from the filtered VPs table and the pings grouped per target.
filtered_vps_table = clickhouse_settings.VPS_FILTERED_TABLE
targets = load_targets(filtered_vps_table)
meshed_pings_table = clickhouse_settings.VPS_MESHED_PINGS_TABLE
pings_target = get_pings_per_target(meshed_pings_table)

# Country -> continent lookup used by the following cell.
countries_continent = load_countries_continent()

[32m2024-09-02 12:13:56.440[0m | [1mINFO    [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [1mquery=GetVPs table_name=filtered_vps  limit=None[0m
[32m2024-09-02 12:13:56.562[0m | [1mINFO    [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [1mquery=CreatePingTable table_name=ping_vps_to_targets  limit=None[0m
[32m2024-09-02 12:13:56.580[0m | [1mINFO    [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [1mquery=GetPingsPerTarget table_name=ping_vps_to_targets  limit=None[0m
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fe42b059ff0>>
Traceback (most recent call last):
  File "/home/hugo/.cache/pypoetry/virtualenvs/geogiant-pnmpKHwb-py3.10/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [None]:
# Targets left out of the statistics below.
# NOTE(review): the reason these eight addresses are excluded is not recorded
# in the notebook — TODO document it.
EXCLUDED_TARGET_ADDRS = (
    "2.59.58.2",
    "5.182.48.97",
    "37.202.7.126",
    "45.86.126.73",
    "81.169.160.78",
    "104.225.15.170",
    "185.142.156.40",
    "194.15.98.10",
)

target_countries = set()
target_asns = set()
target_per_continent = defaultdict(int)

for target in targets:
    addr = target["addr"]
    if addr in EXCLUDED_TARGET_ADDRS:
        # Echo each skipped address so the exclusion is visible in the output.
        print(addr)
        continue

    country_code = target["country_code"]
    target_countries.add(country_code)
    target_asns.add(target["asn_v4"])
    target_per_continent[countries_continent[country_code]] += 1

print(f"{len(target_countries)=}")
print(f"{len(target_asns)=}")
# Total number of counted targets, then number of distinct continents.
print(sum(target_per_continent.values()))
print(len(target_per_continent))
for continent in target_per_continent:
    nb_targets = target_per_continent[continent]
    print(f"{continent=}, {nb_targets=}")

2.59.58.2
5.182.48.97
37.202.7.126
45.86.126.73
81.169.160.78
104.225.15.170
185.142.156.40
194.15.98.10
len(target_countries)=94
len(target_asns)=567
751
6
continent='Europe', nb_targets=444
continent='Asia', nb_targets=111
continent='South America', nb_targets=27
continent='North America', nb_targets=129
continent='Africa', nb_targets=19
continent='Oceania', nb_targets=21


In [1]:
from pych_client import ClickHouseClient

from geogiant.clickhouse import GetAllResponsiveIP, GetMetroIP
from geogiant.common.settings import ClickhouseSettings

clickhouse_settings = ClickhouseSettings()


def get_metro_ip(ping_table: str) -> set:
    """Return the distinct `dst_addr` values produced by the `GetMetroIP` query.

    Args:
        ping_table: name of the ClickHouse table the query is executed on.

    Returns:
        A new set holding the `dst_addr` field of every returned row.
    """
    # The result set must be local: it was previously never initialised here, so
    # the function either raised NameError on a fresh kernel or filled (and
    # returned) a module-level `metro_ip`, making successive calls share one set.
    metro_ip = set()
    with ClickHouseClient(**clickhouse_settings.clickhouse) as client:
        rows = GetMetroIP().execute(client, ping_table)
        for row in rows:
            metro_ip.add(row["dst_addr"])

    return metro_ip


def get_responsive_ip(ping_table: str) -> set:
    """Return the distinct `dst_addr` values produced by `GetAllResponsiveIP`.

    Args:
        ping_table: name of the ClickHouse table the query is executed on.

    Returns:
        A new set holding the `dst_addr` field of every returned row.
    """
    responsive_ip = set()
    with ClickHouseClient(**clickhouse_settings.clickhouse) as client:
        query = GetAllResponsiveIP()
        # Rows are consumed while the client is still open.
        responsive_ip.update(
            row["dst_addr"] for row in query.execute(client, ping_table)
        )

    return responsive_ip


# Per-table address sets; queries run in the same order as before.
metro_ip = get_metro_ip("pings_internet_scale")
responsive_ip = get_responsive_ip("pings_internet_scale")
metro_ip_aggregation = get_metro_ip("pings_internet_scale_aggregation")
responsive_ip_aggregation = get_responsive_ip("pings_internet_scale_aggregation")

# Union of both tables for each kind of address set.
all_metro_ip = metro_ip | metro_ip_aggregation
all_responsive_ip = responsive_ip | responsive_ip_aggregation

[32m2024-09-02 12:14:19.672[0m | [1mINFO    [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [1mquery=GetMetroIP table_name=pings_internet_scale  limit=None[0m
[32m2024-09-02 12:14:22.878[0m | [1mINFO    [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [1mquery=GetAllResponsiveIP table_name=pings_internet_scale  limit=None[0m
[32m2024-09-02 12:14:29.830[0m | [1mINFO    [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [1mquery=GetMetroIP table_name=pings_internet_scale_aggregation  limit=None[0m
[32m2024-09-02 12:14:31.276[0m | [1mINFO    [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [1mquery=GetAllResponsiveIP table_name=pings_internet_scale_aggregation  limit=None[0m


In [2]:
# For each table: size of the metro set, size of the responsive set, and ratio.
ip_set_pairs = (
    ("metro_ip", metro_ip, "responsive_ip", responsive_ip),
    (
        "metro_ip_aggregation",
        metro_ip_aggregation,
        "responsive_ip_aggregation",
        responsive_ip_aggregation,
    ),
)
for metro_name, metro_set, responsive_name, responsive_set in ip_set_pairs:
    print(f"len({metro_name})={len(metro_set)}")
    print(f"len({responsive_name})={len(responsive_set)}")
    print(f"Proportion:: {len(metro_set) / len(responsive_set)}")

# Size of the union of both metro sets.
print(f"{len(all_metro_ip)=}")

len(metro_ip)=239915
len(responsive_ip)=1015779
Proportion:: 0.23618818660358207
len(metro_ip_aggregation)=223729
len(responsive_ip_aggregation)=270981
Proportion:: 0.8256261509109495
len(all_metro_ip)=455737


# ITDK DATASET

In [1]:
from geogiant.common.files_utils import load_csv, dump_csv
from geogiant.common.ip_addresses_utils import get_prefix_from_ip
from geogiant.common.settings import PathSettings

path_settings = PathSettings()

In [2]:
# Read the ITDK nodes file. The following cells treat every element as a text
# line (they call `row.startswith("node")` on it).
raw_itdk_data = load_csv(path_settings.DATASET / "static_files/midar-iff.nodes")

In [3]:
# One router per line starting with "node".
nb_routers = sum(1 for row in raw_itdk_data if row.startswith("node"))

print(f"Nb routers:: {nb_routers}")

Nb routers:: 137749857


In [None]:
# Scratch figures kept as bare literals; they are not assigned to anything.
# NOTE(review): 137_749_857 equals the "Nb routers" count printed by the
# previous cell; why it is listed twice is not recorded — TODO confirm.
137_749_857
137_749_857

# ITDK routers
# NOTE(review): the origin of the two figures below is not shown in this
# notebook — TODO confirm what each one measures.
10_574_963
10_550_268

In [14]:
# Parse the ITDK nodes file: on each "node" line, the addresses are the
# whitespace-separated tokens that follow the ":  " separator.
itdk_addrs = []
for row in raw_itdk_data:
    if row.startswith("node"):
        # `split()` with no argument splits on any run of whitespace and drops
        # empty tokens, so trailing spaces, "\r\n" endings or doubled spaces no
        # longer leak empty/dirty strings into the address list.
        itdk_addrs.extend(row.split(":  ")[-1].split())

dump_csv(itdk_addrs, path_settings.DATASET / "static_files/itdk_addrs.csv")

In [None]:
# Load the ZMap scan results. A later cell passes each element to
# `get_prefix_from_ip`, so rows are presumably IP address strings — TODO confirm.
zmap_addrs = load_csv(path_settings.DATASET / "zmap_scan_results_2024_07_05.csv")

In [None]:
# Map every ITDK / ZMap address to its prefix (lists, duplicates preserved).
itdk_subnets = list(map(get_prefix_from_ip, itdk_addrs))
zmap_subnets = list(map(get_prefix_from_ip, zmap_addrs))

In [10]:
# Prefixes present in both the ITDK dataset and the ZMap scan.
responsive_ip_itdk = set(itdk_subnets) & set(zmap_subnets)

In [11]:
# `responsive_ip_itdk` is a set of distinct entries, so the denominator must
# also count distinct ITDK entries; `len(itdk_subnets)` on the raw list would
# include duplicates and understate the proportion.
nb_distinct_itdk_subnets = len(set(itdk_subnets))

# NOTE(review): the values are `get_prefix_from_ip` outputs rather than raw
# addresses, although the messages say "IP addrs" — confirm the wording.
print(f"Number of responsive IP addrs in ITDK dataset:: {len(responsive_ip_itdk)}")
print(
    f"Proportion of responsive IP addrs in ITDK:: {round(len(responsive_ip_itdk) * 100 / nb_distinct_itdk_subnets)} [%]"
)

Number of responsive IP addrs in ITDK dataset:: 585233
Proportion of responsive IP addrs in ITDK:: 16 [%]


In [None]:
# Persist the ITDK/ZMap intersection computed above.
# NOTE(review): `responsive_ip_itdk` is a set, so the row order of the written
# file may differ between runs unless `dump_csv` sorts its input — TODO confirm.
dump_csv(responsive_ip_itdk, path_settings.DATASET / "itdk_responsive_addrs.csv")

In [None]:
# Report how many ITDK addresses were parsed (length of the list as-is).
print(f"Number of unique router IP addrs:: {len(itdk_addrs)}")

# Distinct prefixes obtained by dropping the last dot-separated field.
# NOTE(review): other cells derive prefixes with `get_prefix_from_ip`; confirm
# both produce the same format before mixing their results.
itdk_subnets = {".".join(ip_addr.split(".")[:-1]) for ip_addr in itdk_addrs}

# New geolocation dataset

In [3]:
from pych_client import ClickHouseClient

from geogiant.clickhouse import GetAllResponsiveIP, GetMetroIP
from geogiant.common.settings import ClickhouseSettings

clickhouse_settings = ClickhouseSettings()


def get_metro_ip(ping_table: str) -> set:
    """Return the distinct `dst_addr` values produced by the `GetMetroIP` query.

    Args:
        ping_table: name of the ClickHouse table the query is executed on.

    Returns:
        A new set holding the `dst_addr` field of every returned row.
    """
    with ClickHouseClient(**clickhouse_settings.clickhouse) as client:
        metro_ip = set()
        query = GetMetroIP()
        # Rows are consumed while the client is still open.
        metro_ip.update(row["dst_addr"] for row in query.execute(client, ping_table))

    return metro_ip


def get_responsive_ip(ping_table: str) -> set:
    """Return the distinct `dst_addr` values produced by `GetAllResponsiveIP`.

    Args:
        ping_table: name of the ClickHouse table the query is executed on.

    Returns:
        A new set holding the `dst_addr` field of every returned row.
    """
    responsive_ip = set()
    with ClickHouseClient(**clickhouse_settings.clickhouse) as client:
        query = GetAllResponsiveIP()
        # Rows are consumed while the client is still open.
        responsive_ip.update(
            row["dst_addr"] for row in query.execute(client, ping_table)
        )

    return responsive_ip


# Both address sets are read from the configured target ping table.
target_ping_table = clickhouse_settings.TARGET_PING_TABLE
metro_ip = get_metro_ip(target_ping_table)
responsive_ip = get_responsive_ip(target_ping_table)

[32m2024-09-19 08:58:22.241[0m | [34m[1mDEBUG   [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [34m[1mquery=GetMetroIP table_name=target_ping  limit=None[0m


[32m2024-09-19 08:58:22.333[0m | [34m[1mDEBUG   [0m | [36mgeogiant.clickhouse.query[0m:[36mexecute[0m:[36m122[0m - [34m[1mquery=GetAllResponsiveIP table_name=target_ping  limit=None[0m


In [5]:
# Sizes of both sets, then the metro / responsive ratio.
nb_metro_ip = len(metro_ip)
nb_responsive_ip = len(responsive_ip)
print(f"len(metro_ip)={nb_metro_ip}")
print(f"len(responsive_ip)={nb_responsive_ip}")
print(f"Proportion:: {nb_metro_ip / nb_responsive_ip}")

len(metro_ip)=197
len(responsive_ip)=1144
Proportion:: 0.17220279720279721
