In [13]:
from geogiant.clickhouse import Query
from collections import defaultdict
from loguru import logger

from geogiant.common.files_utils import load_json, load_csv, dump_csv
from geogiant.common.settings import PathSettings, ClickhouseSettings

path_settings = PathSettings()
clickhouse_settings = ClickhouseSettings()


hostname_ranks = load_csv(path_settings.DATASET / "202402.csv")
org_per_hostname = load_json(path_settings.DATASET / "ecs_hostnames_organization.json")

46016


# From single subnet resolution to VPs subnets resolution

## Filter Anycast prefixes

In [7]:
def load_anycatch_data() -> None:
    """get all anycast prefixes detected by anycatch and remove them"""
    anycast_prefixes = set()
    with path_settings.ANYCATCH_DATA.open("r") as f:
        for row in f.readlines():
            prefix = row.split(",")[0].strip("\n")
            anycast_prefixes.add(prefix)

    return anycast_prefixes


anycatch_db = load_anycatch_data()

hostname_per_org = defaultdict(set)
anycast_hostnames = set()

unicast_hostnames = set()
for hostname, bgp_prefix_per_org in org_per_hostname.items():
    anycast_hostname = False
    for org, bgp_prefixes in bgp_prefix_per_org.items():
        for bgp_prefix in bgp_prefixes:
            if bgp_prefix in anycatch_db:
                anycast_hostnames.add(hostname)
                anycast_hostname = True

    if not anycast_hostname:
        hostname_per_org[org].add(hostname)
        unicast_hostnames.add(hostname)


hostname_per_org_sorted = sorted(
    hostname_per_org.items(), key=lambda x: len(x[1]), reverse=True
)

In [15]:
logger.info(f"Unicast hostnames = {len(unicast_hostnames)}")
logger.info(f"Anycast hostnames = {len(anycast_hostnames)}")

logger.info(f"Nb orgs = {len(hostname_per_org)}")

[32m2024-04-21 16:11:43.320[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mUnicast hostnames = 120963[0m
[32m2024-04-21 16:11:43.325[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mAnycast hostnames = 257600[0m
[32m2024-04-21 16:11:43.327[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mNb orgs = 3932[0m


## Filter per organization

In [None]:
import random
from collections import OrderedDict

rank_per_hostname = {}
for hostname_with_rank in hostname_ranks[1:]:
    hostname, rank = hostname_with_rank.split(",")
    hostname = hostname.split("//")[-1]
    rank_per_hostname[hostname] = rank

logger.info(len(rank_per_hostname))

max_hostnames_per_org = 1_500

selected_hostnames_per_org = defaultdict(set)
for org, total_hostnames in hostname_per_org_sorted:
    hostname_per_org_per_rank = OrderedDict()
    for hostname in total_hostnames:
        try:
            rank = rank_per_hostname[hostname]
        except KeyError:
            continue

        try:
            hostname_per_org_per_rank[int(rank)].append(hostname)
        except KeyError:
            hostname_per_org_per_rank[int(rank)] = [hostname]

    # sort per rank
    hostname_per_org_per_rank = OrderedDict(
        sorted(hostname_per_org_per_rank.items(), key=lambda x: x[0], reverse=True)
    )

    original_nb_hostnames = set()
    for rank, hostnames in hostname_per_org_per_rank.items():
        original_nb_hostnames.update(hostnames)
        if len(selected_hostnames_per_org[org]) < max_hostnames_per_org:
            remaining_hostnames_to_select = max_hostnames_per_org - len(
                selected_hostnames_per_org[org]
            )
            selected_hostnames_per_org[org].update(
                random.sample(
                    hostnames,
                    (
                        remaining_hostnames_to_select
                        if len(hostnames) > remaining_hostnames_to_select
                        else len(hostnames)
                    ),
                )
            )

total_selected_hostnames = set()
for org, selected_hostnames in selected_hostnames_per_org.items():
    logger.info(
        f"{org}, nb_hostnames={len(selected_hostnames)}, {len(set(hostname_per_org[org]))=}"
    )

    total_selected_hostnames.update(selected_hostnames)

total_selected_hostnames = list(total_selected_hostnames)
random.shuffle(total_selected_hostnames)
dump_csv(
    total_selected_hostnames, path_settings.DATASET / "ecs_selected_hostnames_test.csv"
)

logger.info(f"total number of organization:: {len(selected_hostnames_per_org)}")
logger.info(f"total number of hostnames:: {len(total_selected_hostnames)}")

In [32]:
count = 0
large_org_hostnames = set()
for org, hostnames in hostname_per_org_sorted:
    if len(hostnames) > 1500:
        count += 1
        large_org_hostnames.update(hostnames)
        print(org, len(hostnames))

print(count)

AMAZON-02 49478
GOOGLE 8242
MICROSOFT-CORP-MSN-AS-BLOCK 6036
HETZNER-AS 3267
OVH 2957
AKAMAI-ASN1 2377
AMAZON-AES 2291
DIGITALOCEAN-ASN 2121
INCAPSULA 2088
AKAMAI-AS 1945
ALIBABA-CN-NET 1874
GOOGLE-CLOUD-PLATFORM 1559
12


In [31]:
print(len(large_org_hostnames) * 100 / len(unicast_hostnames))

69.63699643692699
