# 1) ATLAS DATASETS

In [1]:
from pprint import pprint
from random import shuffle

from scripts.utils.file_utils import load_json, dump_json
from scripts.ripe_atlas.atlas_api import get_atlas_anchors, get_atlas_probes
from default import ANCHORS_FILE, PROBES_FILE, PROBES_AND_ANCHORS_FILE

## Retrieve atlas probes and anchors

In [None]:
# create and fill the probes.json dataset.
probes, probes_rejected, probes_geoloc_disputed = get_atlas_probes()
dump_json(probes, PROBES_FILE)

# display number of selected and rejected probes.
print(
    f"selected probes: {len(probes)} ({round(len(probes) * 100 / (probes_rejected+len(probes)), 2)}%), rejected: {probes_rejected}, geoloc rejected: {probes_geoloc_disputed}")

In [None]:
# create and fill the anchors.json dataset.
anchors, anchors_rejected, anchors_geoloc_disputed = get_atlas_anchors()
dump_json(anchors, ANCHORS_FILE)

# display number of selected and rejected anchors.
print(
    f"selected anchors: {len(anchors)} ({round(len(anchors) * 100 / (anchors_rejected+len(anchors)), 2)}%), rejected: {anchors_rejected}, geoloc rejected: {anchors_geoloc_disputed}")

These two files will be filtered and updated step by step during the execution of this notebook

### test loading

In [3]:
probes = load_json(PROBES_FILE)
anchors = load_json(ANCHORS_FILE)

print(f"selected probes: {len(probes)}")
print(f"selected anchors: {len(anchors)}")

for i, probe in enumerate(probes):
    if i > 10:
        break
    pprint(probe)

selected probes: 9973
selected anchors: 785
{'address_v4': '82.217.219.124',
 'asn_v4': 33915,
 'country_code': 'NL',
 'geometry': {'coordinates': [6.0075, 51.2005], 'type': 'Point'}}
{'address_v4': '83.81.82.33',
 'asn_v4': 33915,
 'country_code': 'NL',
 'geometry': {'coordinates': [6.0375, 51.2315], 'type': 'Point'}}
{'address_v4': '95.247.234.173',
 'asn_v4': 3269,
 'country_code': 'IT',
 'geometry': {'coordinates': [12.4375, 41.8995], 'type': 'Point'}}
{'address_v4': '193.0.0.78',
 'asn_v4': 3333,
 'country_code': 'NL',
 'geometry': {'coordinates': [4.8975, 52.3795], 'type': 'Point'}}
{'address_v4': '77.174.30.45',
 'asn_v4': 1136,
 'country_code': 'NL',
 'geometry': {'coordinates': [5.9585, 52.0075], 'type': 'Point'}}
{'address_v4': '93.108.219.48',
 'asn_v4': 12353,
 'country_code': 'PT',
 'geometry': {'coordinates': [-9.1515, 38.7295], 'type': 'Point'}}
{'address_v4': '86.89.224.211',
 'asn_v4': 12414,
 'country_code': 'NL',
 'geometry': {'coordinates': [4.8485, 52.3605], 'type'

In [3]:
probes_and_anchors = probes + anchors
print(len(probes_and_anchors))
shuffle(probes_and_anchors)

dump_json(probes_and_anchors, PROBES_AND_ANCHORS_FILE)

10900


# 2) GEOGRAPHIC DATASETS

In [5]:
from scripts.utils.file_utils import load_json, dump_json
from scripts.utils.helpers import distance
from default import COUNTRIES_TXT_FILE, COUNTRIES_JSON_FILE, ANCHORS_FILE, PROBES_FILE

## Get country dataset

In [6]:
countries = {}
with open(COUNTRIES_TXT_FILE, "r", encoding="utf8") as f:
    for i, row in enumerate(f.readlines()):

        row = [value.strip() for value in row.split(" ")]
        countries[row[0]] = {
            "latitude": row[1],
            "longitude": row[2],
            "name": row[3],
        }

for i, (country_code, geoloc) in enumerate(countries.items()):
    if i > 10:
        break
    print(f"{country_code} : {geoloc}")

# save results
dump_json(countries, COUNTRIES_JSON_FILE)

AD : {'latitude': '42.546245', 'longitude': '1.601554', 'name': 'Andorra'}
AE : {'latitude': '23.424076', 'longitude': '53.847818', 'name': 'United'}
AF : {'latitude': '33.93911', 'longitude': '67.709953', 'name': 'Afghanistan'}
AG : {'latitude': '17.060816', 'longitude': '-61.796428', 'name': 'Antigua'}
AI : {'latitude': '18.220554', 'longitude': '-63.068615', 'name': 'Anguilla'}
AL : {'latitude': '41.153332', 'longitude': '20.168331', 'name': 'Albania'}
AM : {'latitude': '40.069099', 'longitude': '45.038189', 'name': 'Armenia'}
AN : {'latitude': '12.226079', 'longitude': '-69.060087', 'name': 'Netherlands'}
AO : {'latitude': '-11.202692', 'longitude': '17.873887', 'name': 'Angola'}
AQ : {'latitude': '-75.250973', 'longitude': '-0.071389', 'name': 'Antarctica'}
AR : {'latitude': '-38.416097', 'longitude': '-63.616672', 'name': 'Argentina'}


## Eliminate default geolocated probes

In [7]:
probes = load_json(PROBES_FILE)

anchors = load_json(ANCHORS_FILE)

In [8]:
def country_filtering(probes: list, countries: dict) -> list:
    filtered_probes = []
    for probe in probes:

        # check if probe coordinates are close to default location
        try:
            country_geo = countries[probe["country_code"]]
        except KeyError as e:
            print(
                f"error country code {probe['country_code']} is unknown")
            continue

        # if the country code is unknown, remove probe from dataset
        country_lat = float(country_geo["latitude"])
        country_lon = float(country_geo["longitude"])

        probe_lat = float(probe["geometry"]["coordinates"][1])
        probe_lon = float(probe["geometry"]["coordinates"][0])

        dist = distance(country_lat, probe_lat, country_lon, probe_lon)

        if dist > 5:
            filtered_probes.append(probe)

    return filtered_probes

In [9]:
filtered_probes = country_filtering(probes, countries)
filtered_anchors = country_filtering(anchors, countries)

print(
    f"Number of Atlas probes kept: {len(filtered_probes)}, rejected: {len(probes) - len(filtered_probes)}")
print(
    f"Number of Atlas anchors kept: {len(filtered_anchors)}, rejected: {len(anchors) - len(filtered_anchors)}")

# save results
dump_json(filtered_anchors, ANCHORS_FILE)
dump_json(filtered_probes, PROBES_FILE)

error country code SX is unknown
Number of Atlas anchors kept: 777, rejected: 8


# 3) OTHER VARIOUS FILES

In [1]:
import math
import pickle
import radix
import requests
from copy import deepcopy

from clickhouse_driver import Client
from multiprocessing import Pool

from scripts.utils.file_utils import load_json, dump_json
from scripts.utils.helpers import haversine
from scripts.analysis.analysis import compute_remove_wrongly_geolocated_probes, compute_rtts_per_dst_src
from default import *

DB_HOST = "localhost"
GEO_REPLICATION_DB = "geolocation_replication"
ANCHORS_MESHED_PING_TABLE = f"anchors_meshed_pings"
PROBES_TO_ANCHORS_PING_TABLE = f"ping_10k_to_anchors"

LIMIT = 1000

## Generate ip level target list for all /24 prefixes

In [None]:
targets_per_prefix = {}

with open(ADDRESS_FILE, "r") as f:
    for i, row in enumerate(f.readlines()[1:]):
        row = row.split("\t")

        # get prefix from hex value
        prefix_hex = row[0]
        prefix = ["".join(x) for x in zip(*[iter(prefix_hex)]*2)]
        prefix = [int(x, 16) for x in prefix]
        prefix = ".".join(str(x) for x in prefix)

        target_list = row[-1].strip("\n")
        target_list = target_list.split(",")

        # parse and save targets
        if target_list[0] != '-':
            for i, target in enumerate(target_list):
                target_list[i] = prefix.split(".")[:-1]
                target_list[i].append(str(int(target, 16)))
                target_list[i] = ".".join(target_list[i])

            try:
                targets_per_prefix[prefix].extend(target_list)
            except KeyError:
                targets_per_prefix[prefix] = target_list

In [None]:
dump_json(targets_per_prefix, HITLIST_FILE)

print("target hitlist")
for i, prefix in enumerate(targets_per_prefix):
    if i > 10:
        break
    print("prefix:", prefix, "target hitlist:", targets_per_prefix[prefix])

## Build pairwise matrix

In [6]:
probes = load_json(PROBES_FILE)
anchors = load_json(ANCHORS_FILE)
anchors_ip_list = [anchor["address_v4"] for anchor in anchors]
probes.extend(anchors)

In [None]:
vp_coordinates_per_ip = {}

for probe in probes:
    ip_v4_address = probe["address_v4"]
    long, lat = probe["geometry"]["coordinates"]
    vp_coordinates_per_ip[ip_v4_address] = lat, long


vp_distance_matrix = {}
vp_coordinates_per_ip_l = sorted(vp_coordinates_per_ip.items(), key=lambda x: x[0])

for i in range(len(vp_coordinates_per_ip_l)):
    vp_i, vp_i_coordinates = vp_coordinates_per_ip_l[i]
    if vp_i not in anchors_ip_list:
        continue
    for j in range(len(vp_coordinates_per_ip_l)):
        vp_j, vp_j_coordinates = vp_coordinates_per_ip_l[j]
        distance = haversine(vp_i_coordinates, vp_j_coordinates)
        vp_distance_matrix.setdefault(vp_i, {})[vp_j] = distance
        vp_distance_matrix.setdefault(vp_j, {})[vp_i] = distance


dump_json(vp_distance_matrix, PAIRWISE_DISTANCE_FILE)

## Find wrongly geolocated probes

In [11]:
anchors = load_json(ANCHORS_FILE)

vp_distance_matrix = load_json(PAIRWISE_DISTANCE_FILE)

In [12]:
vp_coordinates_per_ip = {}

for anchor in anchors:
    if "address_v4" in anchor and "geometry" in anchor and "coordinates" in anchor["geometry"]:
        ip_v4_address = anchor["address_v4"]
        if ip_v4_address is None:
            continue
        long, lat = anchor["geometry"]["coordinates"]
        vp_coordinates_per_ip[ip_v4_address] = lat, long

In [14]:
removed_anchors = set()
filter = ""

rtt_per_srcs_dst = compute_rtts_per_dst_src(ANCHORS_MESHED_PING_TABLE, filter, threshold=300)

removed_anchors = compute_remove_wrongly_geolocated_probes(rtt_per_srcs_dst,
                                                            vp_coordinates_per_ip,
                                                            vp_distance_matrix,
                                                            removed_anchors)

0


In [8]:
probes = load_json(PROBES_FILE)
probes.extend(anchors)

In [9]:
vp_coordinates_per_ip = {}

for probe in probes:
    if "address_v4" in probe and "geometry" in probe and "coordinates" in probe["geometry"]:
        ip_v4_address = probe["address_v4"]
        if ip_v4_address is None:
            continue
        long, lat = probe["geometry"]["coordinates"]
        vp_coordinates_per_ip[ip_v4_address] = lat, long

In [10]:
removed_probes = set()

filter = ""
in_clause = f"".join([f",toIPv4('{p}')" for p in removed_anchors])[1:]
filter += f"AND dst not in ({in_clause}) AND src not in ({in_clause}) "

vp_coordinates_per_ip = {ip: vp_coordinates_per_ip[ip]
                            for ip in vp_coordinates_per_ip
                            if ip not in removed_anchors}

rtt_per_srcs_dst = compute_rtts_per_dst_src(PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=300)

removed_probes = compute_remove_wrongly_geolocated_probes(rtt_per_srcs_dst,
                                                            vp_coordinates_per_ip,
                                                            vp_distance_matrix,
                                                            removed_anchors)

removed_probes.update(removed_anchors)

print(f"Removing {len(removed_probes)} probes")
dump_json(list(removed_probes), REMOVED_PROBES_FILE)


0
Removing 0 probes


## Remove bad results

### anchors

In [19]:
anchors = load_json(ANCHORS_FILE)

print(f"{len(anchors)} total anchors")

removed_probes = load_json(REMOVED_PROBES_FILE)

776 total anchors


In [15]:
removed_probes = load_json(REMOVED_PROBES_FILE)

In [None]:
good_anchors = []
incorrect_geolocation_count = 0
not_enough_vps = 0
client = Client('127.0.0.1')

for anchor in anchors:
    if anchor['address_v4'] in removed_probes:
        incorrect_geolocation_count += 1
        continue
    
    target_ip = anchor['address_v4']
    tmp_res_db = client.execute(
        f'select distinct src_addr from bgp_interdomain_te.street_lvl_traceroutes where resp_addr = \'{target_ip}\' and dst_addr = \'{target_ip}\' and src_addr <> \'{target_ip}\'')
    if len(tmp_res_db) < 100:
        not_enough_vps += 1
        continue
    good_anchors.append(anchor)

print(f"{len(good_anchors)} anchors to keep")
print(f"{incorrect_geolocation_count} anchors removed because of incorrect geolocation")
print(f"{not_enough_vps} anchors removed because the lack of traceroute data towards")

dump_json(good_anchors, ANCHORS_FILE)

### probes

In [22]:
probes = load_json(PROBES_FILE)

print(f"{len(probes)} total probes")

removed_probes = load_json(REMOVED_PROBES_FILE)

10046 total probes


In [23]:
good_probes = []
incorrect_geolocation_count = 0

for probe in probes:
    if probe['address_v4'] in removed_probes:
        incorrect_geolocation_count += 1
        continue
    good_probes.append(probe)

print(f"{len(good_probes)} probes to keep")
print(f"{incorrect_geolocation_count} probes removed because of incorrect geolocation")

dump_json(good_probes, PROBES_FILE)

9973 probes to keep
73 probes removed because of incorrect geolocation


## Create removed and filtered probes files

### Anchors

In [11]:
anchors = load_json(ANCHORS_FILE)

filter = ""
    
# clickhouse is required here
rtt_per_srcs_dst = compute_rtts_per_dst_src(ANCHORS_MESHED_PING_TABLE, filter, threshold=100)

In [13]:
for anchor in anchors:
    if anchor["address_v4"] not in rtt_per_srcs_dst:
        index = anchors.index(anchor)
        anchors.pop(index)
        print(anchor)

print(len(anchors))
dump_json(anchors, ANCHORS_FILE)

anchors_list = [anchor["address_v4"] for anchor in anchors]

711


In [15]:
# Remove probes that are in the meshed table but not in the dataset
filtered_probes = []
copy = deepcopy(rtt_per_srcs_dst)
for anchor in copy:
    if anchor not in anchors_list:
        filtered_probes.append(anchor)

copy = deepcopy(rtt_per_srcs_dst)
for anchor in copy:
    for element in copy[anchor]:
        if element not in anchors_list:
            filtered_probes.append(element)

### Probes

In [16]:
anchors = load_json(ANCHORS_FILE)
anchors_list = [anchor["address_v4"] for anchor in anchors]

probes = load_json(PROBES_FILE)
probes_list = [probe["address_v4"] for probe in probes]
probes_list.extend(anchors_list)


filter = ""
    
# clickhouse is required here
rtt_per_srcs_dst = compute_rtts_per_dst_src(PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=100)

In [17]:
# Remove probes that are in the meshed table but not in the dataset

copy = deepcopy(rtt_per_srcs_dst)
for anchor in copy:
    if anchor not in anchors_list:
        filtered_probes.append(anchor)

copy = deepcopy(rtt_per_srcs_dst)
for anchor in copy:
    for probe in copy[anchor]:
        if probe not in probes_list:
            filtered_probes.append(probe)

### Global

In [9]:
original_probes = load_json(PROBES_AND_ANCHORS_FILE)
good_anchors = load_json(ANCHORS_FILE)
good_probes = load_json(PROBES_FILE)

final_probes = []
removed_probes = []
for probe in original_probes:
    if probe in good_anchors or probe in good_probes:
        final_probes.append(probe)
    else:
        removed_probes.append(probe["address_v4"])

dump_json(final_probes, PROBES_AND_ANCHORS_FILE)
dump_json(removed_probes, REMOVED_PROBES_FILE)

In [11]:
filtered_probes.extend(removed_probes)
dump_json(list(set(filtered_probes)), FILTERED_PROBES_FILE)

## Update pairwise

In [24]:
pairwise_probes = load_json(PAIRWISE_DISTANCE_FILE)
removed_probes = load_json(REMOVED_PROBES_FILE)

In [None]:
# Remove entries with removed probes

for removed_probe in removed_probes:
    try:
        del pairwise_probes[removed_probe]
    except KeyError:
        continue

copy = deepcopy(pairwise_probes)
for probe in copy:
    for removed_probe in removed_probes:
        try:
            del pairwise_probes[probe][removed_probe]
        except KeyError:
            continue


In [32]:
dump_json(pairwise_probes, PAIRWISE_DISTANCE_FILE)

## Select greedy probes

In [2]:
def greedy_selection_probes_impl(probe, distance_per_probe, selected_probes):

    distances_log = [math.log(distance_per_probe[p]) for p in selected_probes
                     if p in distance_per_probe and distance_per_probe[p] > 0]
    total_distance = sum(distances_log)
    return probe, total_distance

In [5]:
removed_probes = load_json(REMOVED_PROBES_FILE)

vp_distance_matrix = load_json(PAIRWISE_DISTANCE_FILE)

In [5]:
# Greedily compute the probe with the greatest distance to other probes

print("Starting greedy algorithm")
selected_probes = []
remaining_probes = set(vp_distance_matrix.keys())
with Pool(12) as p:
    while len(remaining_probes) > 0 and len(selected_probes) < LIMIT:
        args = []
        for probe in remaining_probes:
            args.append((probe, vp_distance_matrix[probe], selected_probes))

        results = p.starmap(greedy_selection_probes_impl, args)

        furthest_probe_from_selected, _ = max(results, key=lambda x:x[1])
        selected_probes.append(furthest_probe_from_selected)
        remaining_probes.remove(furthest_probe_from_selected)

dump_json(selected_probes, GREEDY_PROBES_FILE)

Starting greedy algorithm
0


## Find IP info and maxmind results

In [3]:
token = "4f6c895ec9224f"

maxmind_tree_file = GEOLITE_FILE

In [4]:
anchors = load_json(ANCHORS_FILE)

removed_probes = load_json(REMOVED_PROBES_FILE)

In [5]:
with open(maxmind_tree_file, "rb") as f:
    tree = pickle.load(f)

ip_info_geo = {}
maxmind_geo = {}


for i, anchor in enumerate(sorted(anchors, key=lambda x: x["address_v4"])):
    ip = anchor["address_v4"]

    # ip_info
    url = f"https://ipinfo.io/{ip}?token={token}"
    result = requests.get(url).json()
    ip_info_geo[ip] = result

    # maxmind_results
    node = tree.search_best(ip)
    if node is not None:
        if "city" in node.data:
            maxmind_geo[ip] = node.data["coordinates"]

dump_json(ip_info_geo, IP_INFO_GEO_FILE)
dump_json(maxmind_geo, MAXMIND_GEO_FILE)

# 4) FINISH COMPLETING GEOGRAPHIC DATASETS

The following file needed the final version of probes and anchors dataset

In [1]:
import requests
import geopandas as gpd
import rasterio

from rasterio.transform import from_bounds
from pprint import pprint
from subprocess import Popen, PIPE, STDOUT

from scripts.utils.file_utils import load_json, dump_json
from default import ANCHORS_FILE, POPULATION_CITY_FILE, CITIES_500_FILE, POPULATION_DENSITY_FILE

## Get population data

### city coordinates

In [8]:
anchors = load_json(ANCHORS_FILE)

In [9]:
anchors_with_location = []

for anchor in anchors:
    ip = anchor['address_v4']
    lat = anchor['geometry']['coordinates'][1]
    lon = anchor['geometry']['coordinates'][0]
    country_code = anchor['country_code']
    anchors_with_location.append({'target_ip': ip, 'lat': lat, 'lon': lon,
                'country_code': country_code})


anchors_with_city = []
for anchor in anchors_with_location:
    url = f"http://nominatim.openstreetmap.org/reverse?format=geojson&lat={anchor['lat']}&lon={anchor['lon']}"
    r = requests.get(url)
    elem = r.json()
    if 'features' not in elem or len(elem['features']) != 1:
        continue
    info = elem['features'][0]
    if 'properties' not in info or 'address' not in info['properties']:
        continue
    address = info['properties']['address']
    if 'city' in address:
        anchor['city'] = address['city']
    elif 'village' in address:
        anchor['city'] = address['village']
    elif 'town' in address:
        anchor['city'] = address['town']
    elif 'country' in address:
        anchor['city'] = address['country']
    else:
        pprint(info)
        break
    anchors_with_city.append(anchor)

dump_json(anchors_with_city, POPULATION_CITY_FILE)

### city population

In [10]:
population_by_city = {}
POPULATION_THRESHOLD = 100000

with open(CITIES_500_FILE, "r", encoding="utf8") as f:
    for row in f.readlines():
        row = [value.strip() for value in row.split("\t")]
        city = row[1]
        ascii_city = row[2]
        # Iso2 code
        country = row[8]
        population = row[14]
        city_key = f"{city}_{country}"
        ascii_city_key = f"{ascii_city}_{country}"
        if population != "":
            population = int(float(population))
            if population >= POPULATION_THRESHOLD:
                population_by_city[ascii_city_key] = population
                population_by_city[city_key] = population


print(len(population_by_city)//2)

2601


In [11]:
anchors_with_city = load_json(POPULATION_CITY_FILE)
anchors_with_pop = []

for anchor in anchors_with_city:
    try:
        anchor['population'] = population_by_city[f"{anchor['city']}_{anchor['country_code']}"]
        anchors_with_pop.append(anchor)
    except:
        try:
            # unix
            cmd = f"cat {CITIES_500_FILE} | grep \"{anchor['city']}\""
            # windows
            # t = "type"
            # cmd = f"{t} {CITIES_500_FILE} | findstr \"{anchor['city']}\""
            process = Popen(cmd, stdout=PIPE, stderr=STDOUT, shell=True)
            output, err = process.communicate()
            row = output.decode().split("\t")
            anchor['population'] = int(row[14])
            anchors_with_pop.append(anchor)
        except Exception as e:
            anchor['population'] = 0
            anchors_with_pop.append(anchor)

dump_json(anchors_with_pop, POPULATION_CITY_FILE)

### city density

In [3]:
anchors_with_pop = load_json(POPULATION_CITY_FILE)

# Load the population density data
with rasterio.open(POPULATION_DENSITY_FILE) as dataset:
    population_density = dataset.read(1)

In [4]:
anchors_with_density = []
for anchor in anchors_with_pop:
    lat, lon = anchor['lat'], anchor['lon']
    point = gpd.GeoDataFrame(geometry=gpd.points_from_xy([lon], [lat]))

    # Convert lat-lon to pixel coordinates
    xmin, ymin, xmax, ymax = dataset.bounds
    transform = from_bounds(xmin, ymin, xmax, ymax, dataset.width, dataset.height)
    row, col = dataset.index(lon, lat)

    # Extract the population density value
    population_density_value = population_density[row, col]
    anchor['density'] = float(population_density_value)

    anchors_with_density.append(anchor)

dump_json(anchors_with_density, POPULATION_CITY_FILE)