This code can be used to create your own datasets

In [1]:
from random import shuffle
from pprint import pprint

from utils.common import country_filtering
from utils.file_utils import load_json, dump_json
from utils.atlas_api import get_atlas_anchors, get_atlas_probes
from default import PROBES_FILE, ANCHORS_FILE, PROBES_AND_ANCHORS_FILE, COUNTRIES_TXT_FILE, COUNTRIES_JSON_FILE, ADDRESS_FILE, HITLIST_FILE

## retrieve atlas probes

In [2]:
# create and fill the probes.json dataset.
probes, probes_rejected, probes_geoloc_disputed = get_atlas_probes()
dump_json(probes, PROBES_FILE)

# display number of selected and rejected probes.
print(
    f"selected probes: {len(probes)} ({round(len(probes) * 100 / (probes_rejected+len(probes)), 2)}%), rejected: {probes_rejected}, geoloc rejected: {probes_geoloc_disputed}")

selected probes: 10347 (22.86%), rejected: 34917, geoloc rejected: 1


In [3]:
# create and fill the anchors.json dataset.
anchors, anchors_rejected, anchors_geoloc_disputed = get_atlas_anchors()
dump_json(anchors, ANCHORS_FILE)

# display number of selected and rejected anchors.
print(
    f"selected anchors: {len(anchors)} ({round(len(anchors) * 100 / (anchors_rejected+len(anchors)), 2)}%), rejected: {anchors_rejected}, geoloc rejected: {anchors_geoloc_disputed}")

selected anchors: 779 (62.62%), rejected: 465, geoloc rejected: 1


## test loading

In [3]:
probes = load_json(PROBES_FILE)
anchors = load_json(ANCHORS_FILE)

print(f"selected probes: {len(probes)}")
print(f"selected anchors: {len(anchors)}")

for i, probe in enumerate(probes):
    if i > 10:
        break
    pprint(probe)

selected probes: 10067
selected anchors: 784
{'address_v4': '82.217.219.124',
 'address_v6': None,
 'asn_v4': 33915,
 'asn_v6': None,
 'country_code': 'NL',
 'description': '@milan ZIGGO 200/20 Mbit/s',
 'first_connected': 1288686777,
 'geometry': {'coordinates': [6.0075, 51.2005], 'type': 'Point'}}
{'address_v4': '83.81.83.145',
 'address_v6': '2001:1c05:2011:fa00:220:4aff:fec8:2464',
 'asn_v4': 33915,
 'asn_v6': 33915,
 'country_code': 'NL',
 'description': '@dfk ZIGGO 1000/75 Mbit/s Cable',
 'first_connected': 1288619044,
 'geometry': {'coordinates': [5.9975, 51.1915], 'type': 'Point'}}
{'address_v4': '95.232.13.43',
 'address_v6': None,
 'asn_v4': 3269,
 'asn_v6': None,
 'country_code': 'IT',
 'description': 'rome probe',
 'first_connected': 1289551375,
 'geometry': {'coordinates': [12.4375, 41.8995], 'type': 'Point'}}
{'address_v4': '193.0.0.78',
 'address_v6': '2001:67c:2e8:ffe1:220:4aff:fec6:cc6e',
 'asn_v4': 3333,
 'asn_v6': 3333,
 'country_code': 'NL',
 'description': 'SG offi

## get country goeloc dataset

In [5]:
countries = {}
with open(COUNTRIES_TXT_FILE, "r", encoding="utf8") as f:
    for i, row in enumerate(f.readlines()):

        row = [value.strip() for value in row.split(" ")]
        countries[row[0]] = {
            "latitude": row[1],
            "longitude": row[2],
            "name": row[3],
        }

for i, (country_code, geoloc) in enumerate(countries.items()):
    if i > 10:
        break
    print(f"{country_code} : {geoloc}")

# save results
dump_json(countries, COUNTRIES_JSON_FILE)

CI : {'latitude': '7.539989', 'longitude': '-5.54708', 'name': 'Côte'}


## eliminate default geoloc probes

In [5]:
countries = load_json(COUNTRIES_JSON_FILE)

filtered_probes = country_filtering(probes, countries)
filtered_anchors = country_filtering(anchors, countries)

print(
    f"Number of Atlas probes kept: {len(filtered_probes)}, rejected: {len(probes) - len(filtered_probes)}")
print(
    f"Number of Atlas anchors kept: {len(filtered_anchors)}, rejected: {len(anchors) - len(filtered_anchors)}")

# save results
dump_json(filtered_anchors, ANCHORS_FILE)
dump_json(filtered_probes, PROBES_FILE)

Number of Atlas probes kept: 9998, rejected: 69
Number of Atlas anchors kept: 776, rejected: 8


In [7]:
# create and fill the probes_and_anchors.json dataset.
probes_and_anchors = filtered_probes + filtered_anchors
print(len(probes_and_anchors))
shuffle(probes_and_anchors)

dump_json(probes_and_anchors, PROBES_AND_ANCHORS_FILE)

10774


## generate ip level target list for all /24 prefixes

In [3]:
verfploeter_hitlist_file = ADDRESS_FILE

targets_per_prefix = {}

with open(verfploeter_hitlist_file, "r") as f:
    for i, row in enumerate(f.readlines()[1:]):
        row = row.split("\t")

        # get prefix from hex value
        prefix_hex = row[0]
        prefix = ["".join(x) for x in zip(*[iter(prefix_hex)]*2)]
        prefix = [int(x, 16) for x in prefix]
        prefix = ".".join(str(x) for x in prefix)

        target_list = row[-1].strip("\n")
        target_list = target_list.split(",")

        # parse and save targets
        if target_list[0] != '-':
            for i, target in enumerate(target_list):
                target_list[i] = prefix.split(".")[:-1]
                target_list[i].append(str(int(target, 16)))
                target_list[i] = ".".join(target_list[i])

            try:
                targets_per_prefix[prefix].extend(target_list)
            except KeyError:
                targets_per_prefix[prefix] = target_list

In [5]:
dump_json(targets_per_prefix, HITLIST_FILE)

print("target hitlist")
for i, prefix in enumerate(targets_per_prefix):
    if i > 10:
        break
    print("prefix:", prefix, "target hitlist:", targets_per_prefix[prefix])

target hitlist
prefix: 1.0.0.0 target hitlist: ['1.0.0.0', '1.0.0.1', '1.0.0.2']
prefix: 1.0.4.0 target hitlist: ['1.0.4.1', '1.0.4.4']
prefix: 1.0.5.0 target hitlist: ['1.0.5.1', '1.0.5.5']
prefix: 1.0.6.0 target hitlist: ['1.0.6.1', '1.0.6.6']
prefix: 1.0.7.0 target hitlist: ['1.0.7.1', '1.0.7.7']
prefix: 1.0.16.0 target hitlist: ['1.0.16.14', '1.0.16.9', '1.0.16.10', '1.0.16.11']
prefix: 1.0.64.0 target hitlist: ['1.0.64.25', '1.0.64.94', '1.0.64.95']
prefix: 1.0.65.0 target hitlist: ['1.0.65.6', '1.0.65.176', '1.0.65.243']
prefix: 1.0.66.0 target hitlist: ['1.0.66.10', '1.0.66.13', '1.0.66.205']
prefix: 1.0.67.0 target hitlist: ['1.0.67.15', '1.0.67.23', '1.0.67.43']
prefix: 1.0.68.0 target hitlist: ['1.0.68.21', '1.0.68.68', '1.0.68.131']
