In [1]:
from scripts.utils.file_utils import load_json, dump_json
from scripts.analysis.analysis import *
from default import *


# Tier-1 vantage-point counts swept when running round_based_algorithm below.
TIER1_VPS = [10, 100, 300, 500, 1000]

### Loading data

In [2]:
# Load every JSON input used by the cells below.
anchors = load_json(ANCHORS_FILE)
probes = load_json(PROBES_FILE)
all_probes = load_json(PROBES_AND_ANCHORS_FILE)
greedy_probes = load_json(GREEDY_PROBES_FILE)
filtered_probes = load_json(FILTERED_PROBES_FILE)

# Extra SQL condition handed to the ping queries; stays empty when there is
# nothing to exclude.
filter = ""
if len(filtered_probes) > 0:
    # Remove probes that are wrongly geolocated
    in_clause = ",".join(f"toIPv4('{p}')" for p in filtered_probes)
    filter += f"AND dst not in ({in_clause}) AND src not in ({in_clause}) "

## Round Algorithm

In [None]:
# Index each probe's ASN and (lat, long) position by its IPv4 address.
asn_per_vp_ip = {}
vp_coordinates_per_ip = {}

for probe in all_probes:
    # Skip entries lacking an IPv4 field or a coordinates record.
    if "address_v4" not in probe:
        continue
    if "geometry" not in probe or "coordinates" not in probe["geometry"]:
        continue

    vp_ip = probe["address_v4"]
    if vp_ip is None:
        continue

    # The source stores coordinates as (long, lat); they are kept as (lat, long).
    longitude, latitude = probe["geometry"]["coordinates"]
    asn_per_vp_ip[vp_ip] = probe["asn_v4"]
    vp_coordinates_per_ip[vp_ip] = (latitude, longitude)


In [None]:
# SQL condition excluding the filtered probes as either source or destination.
filter = ""
if len(filtered_probes) > 0:
    # Remove probes that are wrongly geolocated
    in_clause = ",".join(f"toIPv4('{p}')" for p in filtered_probes)
    filter += f"AND dst not in ({in_clause}) AND src not in ({in_clause}) "

# clickhouse is required here
rtt_per_srcs_dst = compute_rtts_per_dst_src(
    PROBES_TO_ANCHORS_PING_TABLE,
    filter,
    threshold=100,
)

In [None]:
# Run the round-based algorithm once per tier-1 VP count and keep each result.
error_cdf_per_tier1_vps = {}
for tier1_vps in TIER1_VPS:
    print(f"Using {tier1_vps} tier1_vps")
    error_cdf_per_tier1_vps[tier1_vps] = round_based_algorithm(
        greedy_probes,
        rtt_per_srcs_dst,
        vp_coordinates_per_ip,
        asn_per_vp_ip,
        tier1_vps,
        threshold=40,
    )

# Persist the whole sweep in one file.
dump_json(error_cdf_per_tier1_vps, ROUND_BASED_ALGORITHM_FILE)

## Evaluate

### Accuracy vs number of vps anchors

Figs. 2 and 3 of the millions paper.

### Accuracy vs number of vps probes

In [3]:
# Geographic lookups for all probes; the pairwise distances are passed a
# serialized file path.
(
    vp_coordinates_per_ip,
    ip_per_coordinates,
    country_per_vp,
    asn_per_vp,
    vp_distance_matrix,
    probe_per_ip,
) = compute_geo_info(all_probes, serialized_file=PAIRWISE_DISTANCE_FILE)

In [4]:
# VP subset sizes: 100..900 in steps of 100, then 1000..10000 in steps of 1000.
subset_sizes = [*range(100, 1000, 100), *range(1000, 10001, 1000)]

rtt_per_srcs_dst = compute_rtts_per_dst_src(
    PROBES_TO_ANCHORS_PING_TABLE,
    filter,
    threshold=50,
)

# Every VP with known coordinates is a candidate.
available_vps = list(vp_coordinates_per_ip.keys())
accuracy_vs_nb_vps = compute_accuracy_vs_number_of_vps(
    available_vps,
    rtt_per_srcs_dst,
    vp_coordinates_per_ip,
    vp_distance_matrix,
    subset_sizes,
)

dump_json(accuracy_vs_nb_vps, ACCURACY_VS_N_VPS_PROBES_FILE)

Starting computing for random VPs 100


### Compute errors

In [3]:
# Geographic lookups for all probes, needed by the error computation below.
(
    vp_coordinates_per_ip,
    ip_per_coordinates,
    country_per_vp,
    asn_per_vp,
    vp_distance_matrix,
    probes_per_ip,
) = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE)

In [4]:
rtt_per_srcs_dst = compute_rtts_per_dst_src(
    PROBES_TO_ANCHORS_PING_TABLE,
    filter,
    threshold=70,
)

# Every destination may use every VP with known coordinates; each destination
# gets its own set object.
vps_per_target = {}
for dst in rtt_per_srcs_dst:
    vps_per_target[dst] = set(vp_coordinates_per_ip)

features = compute_geolocation_features_per_ip(
    rtt_per_srcs_dst,
    vp_coordinates_per_ip,
    THRESHOLD_DISTANCES,
    vps_per_target=vps_per_target,
    distance_operator=">",
    max_vps=100000,
    is_use_prefix=False,
    vp_distance_matrix=vp_distance_matrix,
)

dump_json(features, PROBES_TO_ANCHORS_RESULT_FILE)

here
711
pool
ok
711


### VPs selection algorithm

Fig. 5 of the millions paper.

In [6]:
# Geographic lookups for all probes, needed by the VP selection cells below.
(
    vp_coordinates_per_ip,
    ip_per_coordinates,
    country_per_vp,
    asn_per_vp,
    vp_distance_matrix,
    probes_per_ip,
) = compute_geo_info(all_probes, PAIRWISE_DISTANCE_FILE)

In [None]:
def compute_closest_rtt_probes(rtts_per_dst_prefix, vp_coordinates_per_ip, vp_distance_matrix, is_prefix, n_shortest=10):
    """Pick, for every destination, the probes with the lowest RTTs.

    Args:
        rtts_per_dst_prefix: mapping ``destination -> {probe: rtts}`` where
            ``rtts`` is a non-empty sequence of RTT values.
        vp_coordinates_per_ip: container of vantage points with known
            coordinates; only membership of the destination is tested, and
            only when ``is_prefix`` is false.
        vp_distance_matrix: mapping ``destination -> {probe: distance}``; only
            read when ``is_prefix`` is false.
        is_prefix: when true, the ``n_shortest`` probes are returned without
            any plausibility check. When false, destinations absent from
            ``vp_coordinates_per_ip`` are dropped and each selected probe must
            pass the distance check below.
        n_shortest: maximum number of probes kept per destination.

    Returns:
        dict mapping each retained destination to ``{probe: rtts}`` for the
        selected probes (possibly empty when every candidate is rejected).
    """
    vps_per_prefix = {}
    for dst, src_min_rtt in rtts_per_dst_prefix.items():
        if not is_prefix and dst not in vp_coordinates_per_ip:
            continue

        # Probes are ranked on the first RTT of their list.
        # NOTE(review): presumably the lists are sorted or single-valued - confirm.
        sorted_probes = sorted(src_min_rtt.items(), key=lambda x: x[1][0])
        n_shortest_probes = dict(sorted_probes[:n_shortest])

        if is_prefix:
            vps_per_prefix[dst] = n_shortest_probes
            continue

        # A destination without a row in the distance matrix has no probe that
        # can be validated, so it ends up with an empty selection (same
        # treatment as a probe missing from an existing row).
        distances_from_dst = vp_distance_matrix[dst] if dst in vp_distance_matrix else {}

        # Check if the shortest probes respect the speed of Internet
        n_shortest_probes_checked = {}
        for probe, rtts in n_shortest_probes.items():
            if probe not in distances_from_dst:
                continue
            # Half the round trip bounds the one-way distance.
            # NOTE(review): the /1000 suggests RTTs in ms - confirm units.
            max_theoretical_distance = (SPEED_OF_INTERNET * min(rtts) / 1000) / 2
            if distances_from_dst[probe] > max_theoretical_distance:
                # Impossible distance
                continue
            n_shortest_probes_checked[probe] = rtts

        vps_per_prefix[dst] = n_shortest_probes_checked

    return vps_per_prefix

In [7]:
ping_table_prefix = PROBES_TO_PREFIX_TABLE
ping_table = PROBES_TO_ANCHORS_PING_TABLE

# Number of closest VPs kept per target, paired position-wise with the file
# that receives the corresponding results.
N_VPS_SELECTION_ALGORITHM = [1, 3, 10]
results_files = [VP_SELECTION_ALGORITHM_PROBES_1_FILE, VP_SELECTION_ALGORITHM_PROBES_3_FILE, VP_SELECTION_ALGORITHM_PROBES_10_FILE]

rtt_per_srcs_dst_prefix = compute_rtts_per_dst_src(ping_table_prefix, filter, threshold=100, is_per_prefix=True)
rtt_per_srcs_dst = compute_rtts_per_dst_src(ping_table, filter, threshold=70)

for n_vp, ofile in zip(N_VPS_SELECTION_ALGORITHM, results_files):
    closest_probes = compute_closest_rtt_probes(rtt_per_srcs_dst_prefix,
                                                vp_coordinates_per_ip,
                                                vp_distance_matrix,
                                                n_shortest=n_vp,
                                                is_prefix=True)
    # compute_closest_rtt_probes as defined in this notebook returns a single
    # dict, so unpacking it into two names fails. A tuple is still accepted in
    # case the star-imported variant (which may return one) is the one in scope.
    if isinstance(closest_probes, tuple):
        vps_per_target = closest_probes[0]
    else:
        vps_per_target = closest_probes

    features = compute_geolocation_features_per_ip(rtt_per_srcs_dst, vp_coordinates_per_ip,
                                                    [0],
                                                    vps_per_target=vps_per_target,
                                                    distance_operator=">", max_vps=100000,
                                                    is_use_prefix=True,
                                                    vp_distance_matrix=vp_distance_matrix,
                                                    is_multiprocess=True)

    dump_json(features, ofile)