This code can be used to create your own datasets

In [4]:
from utils.helpers import distance
from utils.measurement_utils import load_pickle, dump_pickle, load_json
from utils.atlas_api import get_atlas_anchors, get_atlas_probes, get_atlas_probes_and_anchors
from default import PROBES_FILE, ANCHORS_FILE, PROBES_AND_ANCHORS_FILE, COUNTRIES_TXT_FILE, COUNTRIES_PICKLE_FILE, REMOVED_PROBES_FILE, ADDRESS_FILE, HITLIST_FILE

## retrieve atlas probes

In [2]:
# create and fill the probes_and_anchors.pickle dataset.
probes_and_anchors = get_atlas_probes_and_anchors()
dump_pickle(probes_and_anchors, PROBES_AND_ANCHORS_FILE)

print(
    f"selected probes and anchors: {len(probes_and_anchors)}")


selected probes and anchors: 25740


In [16]:
# create and fill the probes.pickle dataset.
probes, probes_rejected, probes_geoloc_disputed = get_atlas_probes()
dump_pickle(probes, PROBES_FILE)

# display number of selected and rejected probes.
print(
    f"selected probes: {len(probes)} ({round(len(probes) * 100 / (probes_rejected+len(probes)), 2)}%), rejected: {probes_rejected}, geoloc rejected: {probes_geoloc_disputed}")

selected probes: 10083 (22.42%), rejected: 34888, geoloc rejected: 1


In [17]:
# create and fill the anchors.pickle dataset.
anchors, anchors_rejected, anchors_geoloc_disputed = get_atlas_anchors()
dump_pickle(anchors, ANCHORS_FILE)

# display number of selected and rejected anchors.
print(
    f"selected anchors: {len(anchors)} ({round(len(anchors) * 100 / (anchors_rejected+len(anchors)), 2)}%), rejected: {anchors_rejected}, geoloc rejected: {anchors_geoloc_disputed}")

selected anchors: 775 (62.55%), rejected: 464, geoloc rejected: 1


## test loading

In [18]:
probes = load_pickle(PROBES_FILE)
anchors = load_pickle(ANCHORS_FILE)
print(f"selected probes: {len(probes)}")
print(f"selected anchors: {len(anchors)}")

for i, probe in enumerate(probes):
    if i > 10:
        break
    print(probes[probe])

selected probes: 10083
selected anchors: 775
{'id': 60052, 'ip': '45.138.229.91', 'is_anchor': False, 'country_code': 'NL', 'latitude': 52.3685, 'longitude': 4.8995}
{'id': 7, 'ip': '82.217.219.124', 'is_anchor': False, 'country_code': 'NL', 'latitude': 51.2005, 'longitude': 6.0075}
{'id': 8, 'ip': '83.81.83.145', 'is_anchor': False, 'country_code': 'NL', 'latitude': 51.1915, 'longitude': 5.9975}
{'id': 14, 'ip': '95.232.13.43', 'is_anchor': False, 'country_code': 'IT', 'latitude': 41.8995, 'longitude': 12.4375}
{'id': 20, 'ip': '77.174.30.45', 'is_anchor': False, 'country_code': 'NL', 'latitude': 52.0075, 'longitude': 5.9585}
{'id': 24, 'ip': '89.115.7.184', 'is_anchor': False, 'country_code': 'PT', 'latitude': 38.7295, 'longitude': -9.1515}
{'id': 26, 'ip': '87.195.139.46', 'is_anchor': False, 'country_code': 'NL', 'latitude': 52.3605, 'longitude': 4.8485}
{'id': 28, 'ip': '98.97.56.50', 'is_anchor': False, 'country_code': 'US', 'latitude': 35.9515, 'longitude': -115.0215}
{'id': 32,

## get country goeloc dataset

In [19]:
countries = {}
with open(COUNTRIES_TXT_FILE, "r") as f:
    for i, row in enumerate(f.readlines()):

        row = [value.strip() for value in row.split(" ")]
        countries[row[0]] = {
            "latitude": row[1],
            "longitude": row[2],
            "name": row[3],
        }

for i, (country_code, geoloc) in enumerate(countries.items()):
    if i > 10:
        break
    print(f"{country_code} : {geoloc}")

# save results
dump_pickle(countries, COUNTRIES_PICKLE_FILE)

AD : {'latitude': '42.546245', 'longitude': '1.601554', 'name': 'Andorra'}
AE : {'latitude': '23.424076', 'longitude': '53.847818', 'name': 'United'}
AF : {'latitude': '33.93911', 'longitude': '67.709953', 'name': 'Afghanistan'}
AG : {'latitude': '17.060816', 'longitude': '-61.796428', 'name': 'Antigua'}
AI : {'latitude': '18.220554', 'longitude': '-63.068615', 'name': 'Anguilla'}
AL : {'latitude': '41.153332', 'longitude': '20.168331', 'name': 'Albania'}
AM : {'latitude': '40.069099', 'longitude': '45.038189', 'name': 'Armenia'}
AN : {'latitude': '12.226079', 'longitude': '-69.060087', 'name': 'Netherlands'}
AO : {'latitude': '-11.202692', 'longitude': '17.873887', 'name': 'Angola'}
AQ : {'latitude': '-75.250973', 'longitude': '-0.071389', 'name': 'Antarctica'}
AR : {'latitude': '-38.416097', 'longitude': '-63.616672', 'name': 'Argentina'}


## eliminate default geoloc probes

In [21]:
def country_filtering(probes: dict, countries: dict) -> dict:
    filtered_probes = {}
    for probe, probe_description in probes.items():

        # check if probe coordinates are close to default location
        try:
            country_geo = countries[probe_description["country_code"]]
        except KeyError as e:
            print(f"error country code {country_code} is unknown")
            continue

        # if the country code is unknown, remove probe from dataset
        country_lat = float(country_geo["latitude"])
        country_lon = float(country_geo["longitude"])

        probe_lat = float(probe_description["latitude"])
        probe_lon = float(probe_description["longitude"])

        dist = distance(country_lat, probe_lat, country_lon, probe_lon)

        if dist > 5:
            filtered_probes[probe] = probe_description

    return filtered_probes

In [23]:
countries = load_pickle(COUNTRIES_PICKLE_FILE)

filtered_probes = country_filtering(probes, countries)
filtered_anchors = country_filtering(anchors, countries)

print(
    f"Number of Atlas probes kept: {len(filtered_probes)}, rejected: {len(probes) - len(filtered_probes)}")
print(
    f"Number of Atlas anchors kept: {len(filtered_anchors)}, rejected: {len(anchors) - len(filtered_anchors)}")

# save results
dump_pickle(filtered_anchors, ANCHORS_FILE)
dump_pickle(filtered_probes, PROBES_FILE)

error country code AS is unknown
error country code AS is unknown
error country code AS is unknown
error country code AS is unknown
Number of Atlas probes kept: 10009, rejected: 74
Number of Atlas anchors kept: 767, rejected: 8


In [None]:
# create "removed_probes.json"

In [5]:
from clickhouse_driver import Client

In [6]:
anchors = load_pickle(ANCHORS_FILE)
probes = load_pickle(PROBES_FILE)

miss_placed_probes_ip = load_json(REMOVED_PROBES_FILE)

In [7]:
print(len(anchors))

print(len(probes))


print(len(miss_placed_probes_ip))

767
10009
105


In [10]:
good_anchors = []
bad_anchors = []
incorrect_geolocation_count = 0
not_enough_vps = 0
client = Client('127.0.0.1')
for p in anchors:
    print(p)
    if anchors[p]['ip'] in miss_placed_probes_ip:
        bad_anchors.append(anchors[p])
        bad_anchors[-1]['remove_reason'] = "Incorrect geolocation"
        incorrect_geolocation_count += 1
        continue
    target_ip = anchors[p]['ip']
    tmp_res_db = client.execute(f'select distinct src_addr from bgp_interdomain_te.street_lvl_traceroutes where resp_addr = \'{target_ip}\' and dst_addr = \'{target_ip}\' and src_addr <> \'{target_ip}\'')
    if len(tmp_res_db) < 100:
        bad_anchors.append(anchors[p])
        bad_anchors[-1]['remove_reason'] = "Not enought traceroute data towards"
        bad_anchors[-1]['vp_traceroutes_count'] = len(tmp_res_db)
        not_enough_vps += 1
        continue
    good_anchors.append(anchors[p])

print(f"{len(good_anchors)} anchors to keep")
print(f"{incorrect_geolocation_count} anchors removed because of incorrect geolocation")
print(f"{not_enough_vps} anchors removed because the lack of traceroute data towards")


213.225.160.239


UnexpectedPacketFromServerError: Code: 102. Unexpected packet from server 127.0.0.1:9000 (expected Hello or Exception, got Unknown packet)

In [8]:
good_probes = []
bad_probes = []
incorrect_geolocation_count = 0
not_enough_vps = 0
for _, p in enumerate(probes):
    if probes[p]['ip'] in wrong_geolocated_probes_and_anchors:
        bad_probes.append(p)
        incorrect_geolocation_count += 1
        continue
    good_probes.append(p)

print(f"{len(good_probes)} anchors to keep")
print(f"{incorrect_geolocation_count} anchors removed because of incorrect geolocation")

print(len(good_probes))

9953 anchors to keep
56 anchors removed because of incorrect geolocation
9953


## generate ip level target list for all /24 prefixes

In [2]:
verfploeter_hitlist_file = ADDRESS_FILE

targets_per_prefix = {}

with open(verfploeter_hitlist_file, "r") as f:
    for i, row in enumerate(f.readlines()[1:]):
        row = row.split("\t")

        # get prefix from hex value
        prefix_hex = row[0]
        prefix = ["".join(x) for x in zip(*[iter(prefix_hex)]*2)]
        prefix = [int(x, 16) for x in prefix]
        prefix = ".".join(str(x) for x in prefix)

        target_list = row[-1].strip("\n")
        target_list = target_list.split(",")

        # parse and save targets
        if target_list[0] != '-':
            for i, target in enumerate(target_list):
                target_list[i] = prefix.split(".")[:-1]
                target_list[i].append(str(int(target, 16)))
                target_list[i] = ".".join(target_list[i])

            try:
                targets_per_prefix[prefix].extend(target_list)
            except KeyError:
                targets_per_prefix[prefix] = target_list

In [3]:
dump_pickle(targets_per_prefix, HITLIST_FILE)

print("target hitlist")
for i, prefix in enumerate(targets_per_prefix):
    if i > 10:
        break
    print("prefix:", prefix, "target hitlist:", targets_per_prefix[prefix])

target hitlist
prefix: 1.0.0.0 target hitlist: ['1.0.0.0', '1.0.0.1', '1.0.0.2']
prefix: 1.0.4.0 target hitlist: ['1.0.4.1', '1.0.4.4']
prefix: 1.0.5.0 target hitlist: ['1.0.5.1', '1.0.5.5']
prefix: 1.0.6.0 target hitlist: ['1.0.6.1', '1.0.6.6']
prefix: 1.0.7.0 target hitlist: ['1.0.7.1', '1.0.7.7']
prefix: 1.0.16.0 target hitlist: ['1.0.16.14', '1.0.16.9', '1.0.16.10', '1.0.16.11']
prefix: 1.0.64.0 target hitlist: ['1.0.64.25', '1.0.64.94', '1.0.64.95']
prefix: 1.0.65.0 target hitlist: ['1.0.65.6', '1.0.65.176', '1.0.65.243']
prefix: 1.0.66.0 target hitlist: ['1.0.66.10', '1.0.66.13', '1.0.66.205']
prefix: 1.0.67.0 target hitlist: ['1.0.67.15', '1.0.67.23', '1.0.67.43']
prefix: 1.0.68.0 target hitlist: ['1.0.68.21', '1.0.68.68', '1.0.68.131']
