# 1) ATLAS DATASETS

In [1]:
from pprint import pprint
from random import shuffle

from utils.file_utils import load_json, dump_json
from utils.atlas_api import get_atlas_anchors, get_atlas_probes
from default import ANCHORS_FILE, PROBES_FILE, PROBES_AND_ANCHORS_FILE

## Retrieve atlas probes and anchors

In [2]:
# create and fill the probes.json dataset.
probes, probes_rejected, probes_geoloc_disputed = get_atlas_probes()
dump_json(probes, PROBES_FILE)

# display number of selected and rejected probes.
print(
    f"selected probes: {len(probes)} ({round(len(probes) * 100 / (probes_rejected+len(probes)), 2)}%), rejected: {probes_rejected}, geoloc rejected: {probes_geoloc_disputed}")

selected probes: 10344 (22.82%), rejected: 34986, geoloc rejected: 1


In [3]:
# create and fill the anchors.json dataset.
anchors, anchors_rejected, anchors_geoloc_disputed = get_atlas_anchors()
dump_json(anchors, ANCHORS_FILE)

# display number of selected and rejected anchors.
print(
    f"selected anchors: {len(anchors)} ({round(len(anchors) * 100 / (anchors_rejected+len(anchors)), 2)}%), rejected: {anchors_rejected}, geoloc rejected: {anchors_geoloc_disputed}")

selected anchors: 780 (62.4%), rejected: 470, geoloc rejected: 1


These two files will be filtered and updated step by step during the execution of this notebook

### test loading

In [4]:
probes = load_json(PROBES_FILE)
anchors = load_json(ANCHORS_FILE)

print(f"selected probes: {len(probes)}")
print(f"selected anchors: {len(anchors)}")

for i, probe in enumerate(probes):
    if i > 10:
        break
    pprint(probe)

selected probes: 9996
selected anchors: 731
{'address_v4': '82.217.219.124',
 'asn_v4': 33915,
 'country_code': 'NL',
 'geometry': {'coordinates': [6.0075, 51.2005], 'type': 'Point'}}
{'address_v4': '83.81.82.33',
 'asn_v4': 33915,
 'country_code': 'NL',
 'geometry': {'coordinates': [6.0375, 51.2315], 'type': 'Point'}}
{'address_v4': '95.247.234.173',
 'asn_v4': 3269,
 'country_code': 'IT',
 'geometry': {'coordinates': [12.4375, 41.8995], 'type': 'Point'}}
{'address_v4': '193.0.0.78',
 'asn_v4': 3333,
 'country_code': 'NL',
 'geometry': {'coordinates': [4.8975, 52.3795], 'type': 'Point'}}
{'address_v4': '77.174.30.45',
 'asn_v4': 1136,
 'country_code': 'NL',
 'geometry': {'coordinates': [5.9585, 52.0075], 'type': 'Point'}}
{'address_v4': '93.108.219.48',
 'asn_v4': 12353,
 'country_code': 'PT',
 'geometry': {'coordinates': [-9.1515, 38.7295], 'type': 'Point'}}
{'address_v4': '86.89.224.211',
 'asn_v4': 12414,
 'country_code': 'NL',
 'geometry': {'coordinates': [4.8485, 52.3605], 'type'

In [5]:
probes_and_anchors = probes + anchors
print(len(probes_and_anchors))
shuffle(probes_and_anchors)

dump_json(probes_and_anchors, PROBES_AND_ANCHORS_FILE)

10727


# 2) GEOGRAPHIC DATASETS

In [1]:
from utils.file_utils import load_json, dump_json
from utils.common import country_filtering
from default import COUNTRIES_TXT_FILE, COUNTRIES_JSON_FILE, ANCHORS_FILE, PROBES_FILE

## Get country dataset

In [2]:
countries = {}
with open(COUNTRIES_TXT_FILE, "r", encoding="utf8") as f:
    for i, row in enumerate(f.readlines()):

        row = [value.strip() for value in row.split(" ")]
        countries[row[0]] = {
            "latitude": row[1],
            "longitude": row[2],
            "name": row[3],
        }

for i, (country_code, geoloc) in enumerate(countries.items()):
    if i > 10:
        break
    print(f"{country_code} : {geoloc}")

# save results
dump_json(countries, COUNTRIES_JSON_FILE)

AD : {'latitude': '42.546245', 'longitude': '1.601554', 'name': 'Andorra'}
AE : {'latitude': '23.424076', 'longitude': '53.847818', 'name': 'United'}
AF : {'latitude': '33.93911', 'longitude': '67.709953', 'name': 'Afghanistan'}
AG : {'latitude': '17.060816', 'longitude': '-61.796428', 'name': 'Antigua'}
AI : {'latitude': '18.220554', 'longitude': '-63.068615', 'name': 'Anguilla'}
AL : {'latitude': '41.153332', 'longitude': '20.168331', 'name': 'Albania'}
AM : {'latitude': '40.069099', 'longitude': '45.038189', 'name': 'Armenia'}
AN : {'latitude': '12.226079', 'longitude': '-69.060087', 'name': 'Netherlands'}
AO : {'latitude': '-11.202692', 'longitude': '17.873887', 'name': 'Angola'}
AQ : {'latitude': '-75.250973', 'longitude': '-0.071389', 'name': 'Antarctica'}
AR : {'latitude': '-38.416097', 'longitude': '-63.616672', 'name': 'Argentina'}


## Eliminate default geolocated probes

In [3]:
probes = load_json(PROBES_FILE)

anchors = load_json(ANCHORS_FILE)

In [4]:
filtered_probes = country_filtering(probes, countries)
filtered_anchors = country_filtering(anchors, countries)

print(
    f"Number of Atlas probes kept: {len(filtered_probes)}, rejected: {len(probes) - len(filtered_probes)}")
print(
    f"Number of Atlas anchors kept: {len(filtered_anchors)}, rejected: {len(anchors) - len(filtered_anchors)}")

# save results
dump_json(filtered_anchors, ANCHORS_FILE)
dump_json(filtered_probes, PROBES_FILE)

Number of Atlas probes kept: 9996, rejected: 0
Number of Atlas anchors kept: 731, rejected: 0


# 3) OTHER VARIOUS FILES

In [19]:
import math
import pickle
import requests

from clickhouse_driver import Client
from multiprocessing import Pool

from utils.file_utils import load_json, dump_json
from utils.helpers import haversine
from utils.common import compute_remove_wrongly_geolocated_probes, compute_rtts_per_dst_src
from utils.clickhouse_query import get_min_rtt_per_src_dst_query_ping_table, get_min_rtt_per_src_dst_prefix_query_ping_table
from default import ADDRESS_FILE, HITLIST_FILE, ANCHORS_FILE, PROBES_FILE, PAIRWISE_DISTANCE_ANCHOR_FILE, PAIRWISE_DISTANCE_PROBE_FILE, PROBES_AND_ANCHORS_FILE, REMOVED_PROBES_FILE, GREEDY_PROBES_FILE, IP_INFO_GEO_FILE, MAXMIND_GEO_FILE

DB_HOST = "localhost"
GEO_REPLICATION_DB = "geolocation_replication"
ANCHORS_MESHED_PING_TABLE = f"anchors_meshed_pings"
PROBES_TO_ANCHORS_PING_TABLE = f"ping_10k_to_anchors"

LIMIT = 1000

## Generate ip level target list for all /24 prefixes

In [None]:
targets_per_prefix = {}

with open(ADDRESS_FILE, "r") as f:
    for i, row in enumerate(f.readlines()[1:]):
        row = row.split("\t")

        # get prefix from hex value
        prefix_hex = row[0]
        prefix = ["".join(x) for x in zip(*[iter(prefix_hex)]*2)]
        prefix = [int(x, 16) for x in prefix]
        prefix = ".".join(str(x) for x in prefix)

        target_list = row[-1].strip("\n")
        target_list = target_list.split(",")

        # parse and save targets
        if target_list[0] != '-':
            for i, target in enumerate(target_list):
                target_list[i] = prefix.split(".")[:-1]
                target_list[i].append(str(int(target, 16)))
                target_list[i] = ".".join(target_list[i])

            try:
                targets_per_prefix[prefix].extend(target_list)
            except KeyError:
                targets_per_prefix[prefix] = target_list

In [None]:
dump_json(targets_per_prefix, HITLIST_FILE)

print("target hitlist")
for i, prefix in enumerate(targets_per_prefix):
    if i > 10:
        break
    print("prefix:", prefix, "target hitlist:", targets_per_prefix[prefix])

## Build pairwise matrices

### anchors

In [3]:
anchors = load_json(ANCHORS_FILE)

In [4]:
vp_coordinates_per_ip = {}

for anchor in anchors:
    if "address_v4" in anchor and "geometry" in anchor and "coordinates" in anchor["geometry"]:
        ip_v4_address = anchor["address_v4"]
        if ip_v4_address is None:
            continue
        long, lat = anchor["geometry"]["coordinates"]
        vp_coordinates_per_ip[ip_v4_address] = lat, long


vp_distance_matrix = {}
vp_coordinates_per_ip_l = sorted(
    vp_coordinates_per_ip.items(), key=lambda x: x[0])
for i in range(len(vp_coordinates_per_ip_l)):
    vp_i, vp_i_coordinates = vp_coordinates_per_ip_l[i]
    for j in range(len(vp_coordinates_per_ip)):
        vp_j, vp_j_coordinates = vp_coordinates_per_ip_l[j]
        distance = haversine(vp_i_coordinates, vp_j_coordinates)
        vp_distance_matrix.setdefault(vp_i, {})[vp_j] = distance
        vp_distance_matrix.setdefault(vp_j, {})[vp_i] = distance


dump_json(vp_distance_matrix, PAIRWISE_DISTANCE_ANCHOR_FILE)

### probes

In [5]:
probes = load_json(PROBES_FILE)

In [None]:
vp_coordinates_per_ip = {}

for probe in probes:
    if "address_v4" in probe and "geometry" in probe and "coordinates" in probe["geometry"]:
        ip_v4_address = probe["address_v4"]
        if ip_v4_address is None:
            continue
        long, lat = probe["geometry"]["coordinates"]
        vp_coordinates_per_ip[ip_v4_address] = lat, long


vp_distance_matrix = {}
vp_coordinates_per_ip_l = sorted(
    vp_coordinates_per_ip.items(), key=lambda x: x[0])
for i in range(len(vp_coordinates_per_ip_l)):
    vp_i, vp_i_coordinates = vp_coordinates_per_ip_l[i]
    for j in range(len(vp_coordinates_per_ip)):
        vp_j, vp_j_coordinates = vp_coordinates_per_ip_l[j]
        distance = haversine(vp_i_coordinates, vp_j_coordinates)
        vp_distance_matrix.setdefault(vp_i, {})[vp_j] = distance
        vp_distance_matrix.setdefault(vp_j, {})[vp_i] = distance


dump_json(vp_distance_matrix, PAIRWISE_DISTANCE_PROBE_FILE)

## Find wrongly geolocated probes

### anchors

In [7]:
anchors = load_json(ANCHORS_FILE)

vp_distance_matrix = load_json(PAIRWISE_DISTANCE_ANCHOR_FILE)

In [8]:
vp_coordinates_per_ip = {}

for anchor in anchors:
    if "address_v4" in anchor and "geometry" in anchor and "coordinates" in anchor["geometry"]:
        ip_v4_address = anchor["address_v4"]
        if ip_v4_address is None:
            continue
        long, lat = anchor["geometry"]["coordinates"]
        vp_coordinates_per_ip[ip_v4_address] = lat, long

In [None]:
removed_anchors = set()
filter = ""

rtt_per_srcs_dst = compute_rtts_per_dst_src(ANCHORS_MESHED_PING_TABLE, filter, threshold=300)

removed_anchors = compute_remove_wrongly_geolocated_probes(rtt_per_srcs_dst,
                                                            vp_coordinates_per_ip,
                                                            vp_distance_matrix,
                                                            removed_anchors)

### probes

In [None]:
probes = load_json(PROBES_FILE)

vp_distance_matrix = load_json(PAIRWISE_DISTANCE_PROBE_FILE)

In [None]:
vp_coordinates_per_ip = {}

for probe in probes:
    if "address_v4" in probe and "geometry" in probe and "coordinates" in probe["geometry"]:
        ip_v4_address = probe["address_v4"]
        if ip_v4_address is None:
            continue
        long, lat = probe["geometry"]["coordinates"]
        vp_coordinates_per_ip[ip_v4_address] = lat, long

In [None]:
removed_probes = set()
filter = ""

vp_coordinates_per_ip = {ip: vp_coordinates_per_ip[ip]
                            for ip in vp_coordinates_per_ip}

rtt_per_srcs_dst = compute_rtts_per_dst_src(PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=300)

removed_probes = compute_remove_wrongly_geolocated_probes(rtt_per_srcs_dst,
                                                            vp_coordinates_per_ip,
                                                            vp_distance_matrix,
                                                            removed_anchors)

removed_probes.update(removed_anchors)

print(f"Removing {len(removed_probes)} probes")
dump_json(removed_probes, REMOVED_PROBES_FILE)


## Remove bad results

### anchors

In [10]:
anchors = load_json(ANCHORS_FILE)

print(f"{len(anchors)} total anchors")

removed_probes = load_json(REMOVED_PROBES_FILE)

731 total anchors


In [None]:
good_anchors = []
incorrect_geolocation_count = 0
not_enough_vps = 0
client = Client('127.0.0.1')

for anchor in anchors:
    if anchor['address_v4'] in removed_probes:
        incorrect_geolocation_count += 1
        continue
    
    target_ip = anchor['address_v4']
    tmp_res_db = client.execute(
        f'select distinct src_addr from bgp_interdomain_te.street_lvl_traceroutes where resp_addr = \'{target_ip}\' and dst_addr = \'{target_ip}\' and src_addr <> \'{target_ip}\'')
    if len(tmp_res_db) < 100:
        not_enough_vps += 1
        continue
    good_anchors.append(anchor)

print(f"{len(good_anchors)} anchors to keep")
print(f"{incorrect_geolocation_count} anchors removed because of incorrect geolocation")
print(f"{not_enough_vps} anchors removed because the lack of traceroute data towards")

dump_json(good_anchors, ANCHORS_FILE)

### probes

In [13]:
probes = load_json(PROBES_FILE)

print(f"{len(probes)} total probes")

removed_probes = load_json(REMOVED_PROBES_FILE)

9996 total probes


In [14]:
good_probes = []
incorrect_geolocation_count = 0

for probe in probes:
    if probe['address_v4'] in removed_probes:
        incorrect_geolocation_count += 1
        continue
    good_probes.append(probe)

print(f"{len(good_probes)} probes to keep")
print(f"{incorrect_geolocation_count} probes removed because of incorrect geolocation")

dump_json(good_probes, PROBES_FILE)

9996 probes to keep
0 probes removed because of incorrect geolocation


## Create removed probes file

In [15]:
original_probes = load_json(PROBES_AND_ANCHORS_FILE)
good_anchors = load_json(ANCHORS_FILE)
good_probes = load_json(PROBES_FILE)

final_probes = []
removed_probes = []
for probe in original_probes:
    if probe in good_anchors or probe in good_probes:
        final_probes.append(probe)
    else:
        removed_probes.append(probe)

dump_json(final_probes, PROBES_AND_ANCHORS_FILE)
dump_json(removed_probes, REMOVED_PROBES_FILE)

## Select greedy probes

In [None]:
removed_probes = load_json(REMOVED_PROBES_FILE)

vp_distance_matrix = load_json(PAIRWISE_DISTANCE_PROBE_FILE)

In [None]:
# Greedily compute the probe with the greatest distance to other probes

# First of all remove entries with removed probes
for probe in removed_probes:
    if probe in vp_distance_matrix:
        del vp_distance_matrix[probe]

for probe, distance_per_probe in vp_distance_matrix.items():
    for removed_probe in removed_probes:
        if removed_probe in distance_per_probe:
            del distance_per_probe[removed_probe]


print("Starting greedy algorithm")
selected_probes = []
remaining_probes = set(vp_distance_matrix.keys())
with Pool(12) as p:
    while len(remaining_probes) > 0 and len(selected_probes) < LIMIT:
        args = []
        for probe in remaining_probes:
            args.append(
                (probe, vp_distance_matrix[probe], selected_probes))
        
        distances_log = [math.log(distance_per_probe[p]) for p in selected_probes
                     if p in distance_per_probe and distance_per_probe[p] > 0]
        total_distance = sum(distances_log)
        impl = probe, total_distance
        results = p.starmap(impl, args)

        furthest_probe_from_selected, _ = max(results, key=lambda x: x[1])
        selected_probes.append(furthest_probe_from_selected)
        remaining_probes.remove(furthest_probe_from_selected)


dump_json(selected_probes, GREEDY_PROBES_FILE)

## Find IP info and maxmind results

In [18]:
snapshot_date = "20230516"
token = "4f6c895ec9224f"

maxmind_block_file = f"GeoLite2-City-CSV_{snapshot_date}/GeoLite2-City-Blocks-IPv4.csv"
maxmind_tree_file = f"{maxmind_block_file[:-4]}_{snapshot_date}.tree"

In [16]:
anchors = load_json(ANCHORS_FILE)

removed_probes = load_json(REMOVED_PROBES_FILE)

In [None]:
with open(maxmind_tree_file, "rb") as f:
    tree = pickle.load(f)

ip_info_geo = {}
maxmind_geo = {}

for i, anchor in enumerate(sorted(anchors, key=lambda x: x["address_v4"])):
    ip = anchor["address_v4"]

    # ip_info
    url = f"https://ipinfo.io/{ip}?token={token}"
    result = requests.get(url).json()
    ip_info_geo[ip] = result

    # maxmind_results
    node = tree.search_best(ip)
    if node is not None:
        if "city" in node.data:
            maxmind_geo[ip] = node.data["coordinates"]

dump_json(ip_info_geo, IP_INFO_GEO_FILE)
dump_json(maxmind_geo, MAXMIND_GEO_FILE)

# 4) FINISH COMPLETING GEOGRAPHIC DATASETS

The following file needed the final version of probes and anchors dataset

In [12]:
import requests

from pprint import pprint
from subprocess import Popen, PIPE, STDOUT

from utils.file_utils import load_json, dump_json
from default import ANCHORS_FILE, POPULATION_CITY_FILE, CITIES_500_FILE, POPULATION_THRESHOLD

'''import geopandas as gpd
import rasterio

from rasterio.transform import from_bounds'''

'import geopandas as gpd\nimport rasterio\n\nfrom rasterio.transform import from_bounds'

## Get population data

### city coordinates

In [2]:
anchors = load_json(ANCHORS_FILE)

In [None]:
anchors_with_location = []

for anchor in anchors:
    ip = anchor['address_v4']
    lat = anchor['geometry']['coordinates'][1]
    lon = anchor['geometry']['coordinates'][0]
    country_code = anchor['country_code']
    anchors_with_location.append({'target_ip': ip, 'lat': lat, 'lon': lon,
                'country_code': country_code})


anchors_with_city = []
for anchor in anchors_with_location:
    url = f"http://nominatim.openstreetmap.org/reverse?format=geojson&lat={anchor['lat']}&lon={anchor['lon']}"
    r = requests.get(url)
    elem = r.json()
    if 'features' not in elem or len(elem['features']) != 1:
        continue
    info = elem['features'][0]
    if 'properties' not in info or 'address' not in info['properties']:
        continue
    address = info['properties']['address']
    if 'city' in address:
        anchor['city'] = address['city']
    elif 'village' in address:
        anchor['city'] = address['village']
    elif 'town' in address:
        anchor['city'] = address['town']
    elif 'country' in address:
        anchor['city'] = address['country']
    else:
        pprint(info)
        break
    anchors_with_city.append(anchor)

### city population

In [7]:
population_by_city = {}


with open(CITIES_500_FILE, "r", encoding="utf8") as f:
    for row in f.readlines():
        row = [value.strip() for value in row.split("\t")]
        city = row[1]
        ascii_city = row[2]
        # Iso2 code
        country = row[8]
        population = row[14]
        city_key = f"{city}_{country}"
        ascii_city_key = f"{ascii_city}_{country}"
        if population != "":
            population = int(float(population))
            if population >= POPULATION_THRESHOLD:
                population_by_city[ascii_city_key] = population
                population_by_city[city_key] = population


print(len(population_by_city)//2)

2601


In [13]:
anchors_with_pop = []
c1 = 0
c2 = 0
c3 = 0

for anchor in anchors_with_city:
    try:
        anchor['population'] = population_by_city[f"{anchor['city']}_{anchor['country_code']}"]
        c1 += 1
    except:
        try:
            cmd = f"cat {CITIES_500_FILE} | grep \"{anchor['city']}\""
            print(cmd)
            process = Popen(cmd, stdout=PIPE, stderr=STDOUT, shell=True)
            output, err = process.communicate()
            row = output.decode().split("\t")
            anchor['population'] = int(row[14])
            anchors_with_pop.append(anchor)
            c2 += 1
        except Exception as e:
            pprint(e)
            pprint(row)
            pprint(anchor)
            print(f"{anchor['lat']}, {anchor['lon']}")
            anchor['population'] = 0
            anchors_with_pop.append(anchor)
            c3 += 1

print(c1)
print(c2)
print(c3)
dump_json(anchors_with_pop, POPULATION_CITY_FILE)

cat C:\Users\milo2\Desktop\review\datasets\geography\cities500.txt | grep "Wien"
UnicodeDecodeError('utf-8', b"'cat' n'est pas reconnu en tant que commande interne\r\nou externe, un programme ex\x82cutable ou un fichier de commandes.\r\n", 81, 82, 'invalid start byte')
['1106542',
 'Chitungwiza',
 'Chitungwiza',
 'Chitungviza,Chitungwiza,Chytungviza,Citungviza,chytwngwyza,Čitungviza,Читунгвиза,Читунгвіза,Чытунгвіза,چیتونگویزا',
 '-18.01274',
 '31.07555',
 'P',
 'PPL',
 'ZW',
 '',
 '10',
 '',
 '',
 '',
 '340360',
 '',
 '1435',
 'Africa/Harare',
 '2019-09-05']
{'city': 'Wien',
 'country_code': 'AT',
 'lat': 48.2085,
 'lon': 16.3695,
 'target_ip': '193.171.255.2'}
48.2085, 16.3695
cat C:\Users\milo2\Desktop\review\datasets\geography\cities500.txt | grep "Satigny"
UnicodeDecodeError('utf-8', b"'cat' n'est pas reconnu en tant que commande interne\r\nou externe, un programme ex\x82cutable ou un fichier de commandes.\r\n", 81, 82, 'invalid start byte')
['1106542',
 'Chitungwiza',
 'Chitungwiz

### city density

In [None]:
# Load the population density data
with rasterio.open(f'{resources_dir}/gpw_v4_population_density_rev11_2020_30_sec.tif') as dataset:
    population_density = dataset.read(1)

In [None]:
anchors_with_pop = load_json(POPULATION_CITY_FILE)

anchors_with_density = []
for anchor in anchors_with_pop:
    lat, lon = anchor['lat'], anchor['lon']
    point = gpd.GeoDataFrame(geometry=gpd.points_from_xy([lon], [lat]))

    # Convert lat-lon to pixel coordinates
    xmin, ymin, xmax, ymax = dataset.bounds
    transform = from_bounds(
        xmin, ymin, xmax, ymax, dataset.width, dataset.height)
    row, col = dataset.index(lon, lat)

    # Extract the population density value
    population_density_value = population_density[row, col]
    anchor['density'] = float(population_density_value)

    anchors_with_density.append(anchor)

dump_json(anchors_with_density, POPULATION_CITY_FILE)

In [None]:
max_pop = 0
max_v = None
for p in anchors_with_density:
    if p['density'] > max_pop:
        max_pop = p['density']
        max_v = p

pprint(max_v)