In [1]:
import json
import logging
import math

from utils.file_utils import load_json, dump_json
from utils.common import compute_geo_info, compute_rtts_per_dst_src, compute_remove_wrongly_geolocated_anchors
from utils.helpers import haversine
from default import *

## Pairwise files

### Probes

In [3]:
# Load the probe records; later cells iterate over them and read the
# "address_v4" and "geometry" entries of each record.
probes = load_json(PROBES_FILE)

In [None]:
# Map each probe's IPv4 address to its (lat, long) position. Probes without
# an IPv4 address or without coordinates are skipped.
vp_coordinates_per_ip = {}

for probe in probes:
    if "address_v4" in probe and "geometry" in probe and "coordinates" in probe["geometry"]:
        ip_v4_address = probe["address_v4"]
        if ip_v4_address is None:
            continue
        # The record stores (long, lat); we keep (lat, long).
        long, lat = probe["geometry"]["coordinates"]
        vp_coordinates_per_ip[ip_v4_address] = lat, long


# Pairwise distance matrix: vp_distance_matrix[ip_a][ip_b] is the haversine
# distance between the two probes (the diagonal is included).
vp_distance_matrix = {}
vp_coordinates_per_ip_l = sorted(
    vp_coordinates_per_ip.items(), key=lambda x: x[0])
for i in range(len(vp_coordinates_per_ip_l)):
    vp_i, vp_i_coordinates = vp_coordinates_per_ip_l[i]
    # Start at i: every iteration fills both [i][j] and [j][i], so visiting
    # each unordered pair once is enough and halves the haversine calls.
    # This relies on haversine(a, b) == haversine(b, a).
    for j in range(i, len(vp_coordinates_per_ip_l)):
        vp_j, vp_j_coordinates = vp_coordinates_per_ip_l[j]
        distance = haversine(vp_i_coordinates, vp_j_coordinates)
        vp_distance_matrix.setdefault(vp_i, {})[vp_j] = distance
        vp_distance_matrix.setdefault(vp_j, {})[vp_i] = distance


dump_json(vp_distance_matrix, PAIRWISE_DISTANCE_PROBE_FILE)

### Anchors

In [None]:
# Load the anchor records; the next cell iterates over them and reads the
# "address_v4" and "geometry" entries of each record.
anchors = load_json(ANCHORS_FILE)

In [None]:
# Map each anchor's IPv4 address to its (lat, long) position. Anchors without
# an IPv4 address or without coordinates are skipped.
vp_coordinates_per_ip = {}

for anchor in anchors:
    if "address_v4" in anchor and "geometry" in anchor and "coordinates" in anchor["geometry"]:
        ip_v4_address = anchor["address_v4"]
        if ip_v4_address is None:
            continue
        # The record stores (long, lat); we keep (lat, long).
        long, lat = anchor["geometry"]["coordinates"]
        vp_coordinates_per_ip[ip_v4_address] = lat, long


# Pairwise distance matrix: vp_distance_matrix[ip_a][ip_b] is the haversine
# distance between the two anchors (the diagonal is included).
vp_distance_matrix = {}
vp_coordinates_per_ip_l = sorted(
    vp_coordinates_per_ip.items(), key=lambda x: x[0])
for i in range(len(vp_coordinates_per_ip_l)):
    vp_i, vp_i_coordinates = vp_coordinates_per_ip_l[i]
    # Start at i: every iteration fills both [i][j] and [j][i], so visiting
    # each unordered pair once is enough and halves the haversine calls.
    # This relies on haversine(a, b) == haversine(b, a).
    for j in range(i, len(vp_coordinates_per_ip_l)):
        vp_j, vp_j_coordinates = vp_coordinates_per_ip_l[j]
        distance = haversine(vp_i_coordinates, vp_j_coordinates)
        vp_distance_matrix.setdefault(vp_i, {})[vp_j] = distance
        vp_distance_matrix.setdefault(vp_j, {})[vp_i] = distance


dump_json(vp_distance_matrix, PAIRWISE_DISTANCE_ANCHOR_FILE)

## Find probes that violate the Internet speed rule

### Anchors

In [None]:
# Reload the anchor records from ANCHORS_FILE (the same file the pairwise
# section reads), so `anchors` is defined for the cells of this section.
anchors = load_json(ANCHORS_FILE)

In [None]:
# Build the IPv4 -> (lat, long) lookup for the anchors.
vp_coordinates_per_ip = {}

for anchor in anchors:
    has_position = (
        "address_v4" in anchor
        and "geometry" in anchor
        and "coordinates" in anchor["geometry"]
    )
    if not has_position:
        continue
    ip_v4_address = anchor["address_v4"]
    if ip_v4_address is not None:
        # Unpacked as (long, lat), stored swapped as (lat, long).
        long, lat = anchor["geometry"]["coordinates"]
        vp_coordinates_per_ip[ip_v4_address] = (lat, long)

In [None]:
# Accumulator handed to the detection helper below.
removed_anchors = set()

vp_distance_matrix = load_json(PAIRWISE_DISTANCE_ANCHOR_FILE)

# NOTE(review): `filter` is not defined anywhere in this notebook. Unless
# `from default import *` provides it, this passes Python's builtin `filter`
# function to the helper -- confirm what the second argument should be.
rtt_per_srcs_dst = compute_rtts_per_dst_src(ANCHORS_MESHED_PING_TABLE, filter, threshold=300)

# The import cell only brings in `compute_remove_wrongly_geolocated_anchors`;
# the `..._probes` spelling used here before was never imported and raised a
# NameError. NOTE(review): confirm this is the intended helper.
removed_anchors = compute_remove_wrongly_geolocated_anchors(rtt_per_srcs_dst,
                                                            vp_coordinates_per_ip,
                                                            vp_distance_matrix,
                                                            removed_anchors)

### Probes

In [None]:
# Reload the probe records from PROBES_FILE (the same file the pairwise
# section reads), so `probes` is defined for the cells of this section.
probes = load_json(PROBES_FILE)

In [5]:
# Build the IPv4 -> (lat, long) lookup for the probes.
vp_coordinates_per_ip = {}

for probe in probes:
    has_position = (
        "address_v4" in probe
        and "geometry" in probe
        and "coordinates" in probe["geometry"]
    )
    if not has_position:
        continue
    ip_v4_address = probe["address_v4"]
    if ip_v4_address is not None:
        # Unpacked as (long, lat), stored swapped as (lat, long).
        long, lat = probe["geometry"]["coordinates"]
        vp_coordinates_per_ip[ip_v4_address] = (lat, long)

In [None]:
removed_probes = set()

# Ignore the vantage points already rejected in the anchors section.
vp_coordinates_per_ip = {ip: vp_coordinates_per_ip[ip]
                            for ip in vp_coordinates_per_ip
                            if ip not in removed_anchors}

# NOTE(review): `filter` is not defined anywhere in this notebook. Unless
# `from default import *` provides it, this passes Python's builtin `filter`
# function to the helper -- confirm what the second argument should be.
rtt_per_srcs_dst = compute_rtts_per_dst_src(PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=300)

# The import cell only brings in `compute_remove_wrongly_geolocated_anchors`;
# the `..._probes` spelling used here before was never imported and raised a
# NameError. NOTE(review): confirm this is the intended helper.
# NOTE(review): `vp_distance_matrix` is whatever the anchors section left in
# the kernel (loaded from PAIRWISE_DISTANCE_ANCHOR_FILE) -- confirm that it is
# the matrix wanted for the probes.
removed_probes = compute_remove_wrongly_geolocated_anchors(rtt_per_srcs_dst,
                                                            vp_coordinates_per_ip,
                                                            vp_distance_matrix,
                                                            removed_anchors)

# The saved list covers both the rejected anchors and the rejected probes.
removed_probes.update(removed_anchors)

print(f"Removing {len(removed_probes)} probes")
dump_json(list(removed_probes), REMOVED_PROBES_FILE)

## Bad anchors

In [None]:
def remove_bad_anchors():
    """Split the anchors into the ones to keep and the ones to discard.

    An anchor is discarded when its IPv4 address is listed in
    MISLOCATED_PROBES_FILE_PATH, or when fewer than 100 distinct sources have
    traceroutes towards it in the `street_lvl_traceroutes` table. Every
    discarded anchor record is annotated in place with a 'remove_reason'
    (and 'vp_traceroutes_count' for the second case).

    Returns:
        tuple: (good_anchors, bad_anchors), two lists of anchor records.
        Previously the lists were computed and then dropped, since the file
        dumps at the end are commented out.

    NOTE(review): `Client` is not imported in the visible part of the
    notebook; it must be in scope before this function is called.
    """
    with open(ALL_ANCHORS_FILE_PATH, 'r') as json_file:
        anchors = json.load(json_file)
    with open(MISLOCATED_PROBES_FILE_PATH, 'r') as json_file:
        miss_placed_probes_ip = json.load(json_file)

    print(f"{len(anchors)} total anchors")
    good_anchors = []
    bad_anchors = []
    incorrect_geolocation_count = 0
    not_enough_vps = 0
    client = Client('127.0.0.1')
    for anchor in anchors:
        target_ip = anchor['address_v4']
        if target_ip in miss_placed_probes_ip:
            anchor['remove_reason'] = "Incorrect geolocation"
            bad_anchors.append(anchor)
            incorrect_geolocation_count += 1
            continue
        # NOTE(review): the query is built by string interpolation; target_ip
        # comes from the anchors file. Prefer the client's parameter binding
        # if that file is not fully trusted.
        tmp_res_db = client.execute(
            f'select distinct src_addr from bgp_interdomain_te.street_lvl_traceroutes where resp_addr = \'{target_ip}\' and dst_addr = \'{target_ip}\' and src_addr <> \'{target_ip}\'')
        if len(tmp_res_db) < 100:
            anchor['remove_reason'] = "Not enought traceroute data towards"
            anchor['vp_traceroutes_count'] = len(tmp_res_db)
            bad_anchors.append(anchor)
            not_enough_vps += 1
            continue
        good_anchors.append(anchor)

    print(f"{len(good_anchors)} anchors to keep")
    print(f"{incorrect_geolocation_count} anchors removed because of incorrect geolocation")
    print(f"{not_enough_vps} anchors removed because the lack of traceroute data towards")

    # Persisting the two lists is currently disabled:
    # with open(GOOD_ANCHORS_FILE_PATH, 'w') as outfile:
    #     json.dump(good_anchors, outfile)
    # with open(BAD_ANCHORS_FILE_PATH, 'w') as outfile:
    #     json.dump(bad_anchors, outfile)
    return good_anchors, bad_anchors

## Greedy probes

In [None]:
# Greedily pick up to LIMIT probes: at every round, select the remaining probe
# whose sum of log-distances to the already selected probes is the greatest.
LIMIT = 1000

# NOTE(review): `vp_distance_matrix` is whatever an earlier cell left in the
# kernel (the last visible assignment loads PAIRWISE_DISTANCE_ANCHOR_FILE) --
# confirm that it is the matrix wanted here. It is pruned in place below.
removed_probes = load_json(REMOVED_PROBES_FILE)
# First of all remove entries with removed probes
for probe in removed_probes:
    if probe in vp_distance_matrix:
        del vp_distance_matrix[probe]

for probe, distance_per_probe in vp_distance_matrix.items():
    for removed_probe in removed_probes:
        if removed_probe in distance_per_probe:
            del distance_per_probe[removed_probe]

logging.info("Starting greedy algorithm")
selected_probes = []
# Running score per remaining probe: sum of log(distance) to every selected
# probe, counting only distances that exist and are > 0. The previous version
# handed a tuple to Pool.starmap as the callable (TypeError) and scored a
# single leaked `distance_per_probe` instead of one score per candidate.
# Candidates are inserted in sorted order so that ties (every score is 0 in
# the first round) are always broken the same way: max() returns the first
# maximal key.
score_per_remaining_probe = {probe: 0.0 for probe in sorted(vp_distance_matrix)}
while len(score_per_remaining_probe) > 0 and len(selected_probes) < LIMIT:
    furthest_probe_from_selected = max(
        score_per_remaining_probe, key=score_per_remaining_probe.get)
    selected_probes.append(furthest_probe_from_selected)
    del score_per_remaining_probe[furthest_probe_from_selected]

    # Only the newly selected probe changes the scores, so update them
    # incrementally instead of re-summing over all selected probes.
    for probe in score_per_remaining_probe:
        distance = vp_distance_matrix[probe].get(furthest_probe_from_selected)
        if distance is not None and distance > 0:
            score_per_remaining_probe[probe] += math.log(distance)


dump_json(selected_probes, GREEDY_PROBES_FILE)

## IP list

In [None]:
def get_ip_list(anchors_file):
    """Write the IPv4 address of every anchor to "anchors_ip_lst.json".

    Args:
        anchors_file: path of a JSON file holding the anchor records; each
            record must have an 'address_v4' entry.

    Prints the number of addresses written.
    """
    with open(anchors_file, 'r') as json_file:
        anchors = json.load(json_file)

    ip_lst = [anchor['address_v4'] for anchor in anchors]
    print(len(ip_lst))

    with open("anchors_ip_lst.json", 'w') as outfile:
        json.dump(ip_lst, outfile)


# This call used to be indented into the function body, which made the
# function call itself without end and left the cell never invoking it.
get_ip_list(ANCHORS_FILE)

## Density

In [None]:
import rasterio
import geopandas as gpd
from rasterio.transform import from_bounds
import ujson as json


# Fill in the 'density' field of every target that does not have one yet,
# reading the value from the population density raster, then rewrite the
# targets file in place.
if __name__ == "__main__":
    resources_dir = "resources/replicability/street/"
    ofile = f"{resources_dir}/population_target.json"

    with open(ofile) as f:
        all_data = json.load(f)

    res = []
    # Keep the raster open for the whole loop: `dataset.index` is called for
    # each target, and the dataset was previously used after its `with` block
    # had already closed it.
    with rasterio.open(f'{resources_dir}/gpw_v4_population_density_rev11_2020_30_sec.tif') as dataset:
        # Load the population density data (first band)
        population_density = dataset.read(1)

        for d in all_data:
            if 'density' not in d:
                lat, lon = d['lat'], d['lon']

                # Convert lat-lon to pixel coordinates
                row, col = dataset.index(lon, lat)

                # Extract the population density value
                population_density_value = population_density[row, col]
                d['density'] = float(population_density_value)

            res.append(d)

    with open(ofile, "w") as f:
        json.dump(res, f)