In [None]:
import ipaddress
import time
import numpy as np
import csv
import random
import os.path

from clickhouse_driver import Client
from multiprocessing.pool import Pool, ThreadPool

from utils.file_utils import load_json, dump_json
#from clickhouse_utils.traceroutes.queries import select_traceroutes_query
from utils.clickhouse_query import get_min_rtt_per_src_dst_query_ping_table, get_min_rtt_per_src_dst_prefix_query_ping_table
from utils.helpers import select_best_guess_centroid, haversine
from utils.common import compute_rtts_per_dst_src, compute_geo_info, compute_error, get_prefix_from_ip
from utils.plot_utils import plot_multiple_cdf, plot_save, homogenize_legend, plot_multiple_error_bars
from default import *

## Compute cities

In [None]:
# Collect the distinct values of the "city" column across the geocoded CSV exports.
resources_dir = "resources/replicability"
files = [
    f"{resources_dir}/geocoded_by_geoapify-10_05_2023_0_500.csv",
    f"{resources_dir}/geocoded_by_geoapify-10_05_2023_500_last.csv",
]

cities = set()
for file in files:
    # newline="" hands line-ending handling to the csv module, so quoted fields
    # that contain embedded newlines are parsed as a single field.
    with open(file, newline="") as f:
        reader = csv.reader(f, quotechar='"', delimiter=',')
        headers = next(reader, None)
        if headers is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise ValueError(f"{file} is empty: no header row to locate the 'city' column")
        city_index = headers.index("city")
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is no city to read.
                continue
            city = row[city_index]
            cities.add(city)

print(len(cities), cities)

## Evaluate

### Loading data

In [None]:
# Measurement metadata loaded from the JSON files named by the project constants.
anchors = load_json(ANCHORS_FILE)
probes = load_json(PROBES_FILE)
all_probes = load_json(PROBES_AND_ANCHORS_FILE)
removed_probes = load_json(REMOVED_PROBES_FILE)

# SQL fragment handed to compute_rtts_per_dst_src in the later cells; it stays
# empty when there is nothing to exclude.
# NOTE: the name shadows the builtin `filter`, but later cells refer to it.
filter = ""
if len(removed_probes) > 0:
    # Remove probes that are wrongly geolocated, both as source and destination.
    in_clause = ",".join(f"toIPv4('{p}')" for p in removed_probes)
    filter += f"AND dst not in ({in_clause}) AND src not in ({in_clause}) "

In [None]:
# Figs. 2 and 3 of the millions paper.

### Accuracy vs number of vps anchors

In [None]:
# Build the geographic lookup structures from the anchors.
# PAIRWISE_DISTANCE_ANCHOR_FILE is passed as compute_geo_info's serialized_file.
(
    vp_coordinates_per_ip,
    ip_per_coordinates,
    country_per_vp,
    asn_per_vp,
    vp_distance_matrix,
    probe_per_ip,
) = compute_geo_info(anchors, serialized_file=PAIRWISE_DISTANCE_ANCHOR_FILE)

In [None]:
# Subset sizes to evaluate: 10, then 100..700 in steps of 100.
subset_sizes = [10, *range(100, 800, 100)]

rtt_per_srcs_dst = compute_rtts_per_dst_src(
    ANCHORS_MESHED_PING_TABLE, filter, threshold=50
)

# Keep only the keys of the RTT mapping for which coordinates are known.
available_vps = [vp for vp in rtt_per_srcs_dst if vp in vp_coordinates_per_ip]

compute_accuracy_vs_number_of_vps(
    available_vps,
    rtt_per_srcs_dst,
    vp_coordinates_per_ip,
    ip_per_coordinates,
    country_per_vp,
    vp_distance_matrix,
    probe_per_ip,
    subset_sizes,
    ACCURACY_VS_N_VPS_FILE,
)

### Accuracy vs number of vps probes

In [None]:
# Build the geographic lookup structures for the probes section.
# The original call passed `anchors` together with the *probe* pairwise-distance
# file, which looks like a copy-paste from the anchors cell; `all_probes`
# (loaded from PROBES_AND_ANCHORS_FILE) was otherwise never used.
# NOTE(review): confirm that `all_probes` (rather than `probes`) is the set that
# PAIRWISE_DISTANCE_PROBE_FILE was computed for.
vp_coordinates_per_ip, ip_per_coordinates, country_per_vp, asn_per_vp, \
        vp_distance_matrix, probe_per_ip = compute_geo_info(
            all_probes, serialized_file=PAIRWISE_DISTANCE_PROBE_FILE)

In [None]:
# Subset sizes to evaluate: 100..900 in steps of 100.
# Size 10 and the 1000..10000 range (step 1000) are disabled for this run.
subset_sizes = list(range(100, 1000, 100))

rtt_per_srcs_dst = compute_rtts_per_dst_src(PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=50)

# Every vantage point with known coordinates, minus the removed probes.
available_vps = set(vp_coordinates_per_ip.keys())
# difference_update accepts any iterable, whereas `-=` requires a set operand and
# raises TypeError otherwise. removed_probes comes from load_json and is
# presumably a list or dict (JSON has no set type), so `-=` would fail here.
available_vps.difference_update(removed_probes)

compute_accuracy_vs_number_of_vps(available_vps, rtt_per_srcs_dst, vp_coordinates_per_ip,
                                    ip_per_coordinates, country_per_vp,
                                    vp_distance_matrix, probe_per_ip, subset_sizes,
                                    ACCURACY_VS_N_VPS_PROBES_FILE)

### Compute errors

In [None]:
# Geolocation features for the anchors-mesh ping table (RTT threshold 70).
rtt_per_srcs_dst = compute_rtts_per_dst_src(ANCHORS_MESHED_PING_TABLE, filter, threshold=70)

# Every target gets its own copy of the full set of vantage points with coordinates.
# NOTE(review): vp_coordinates_per_ip is whatever the most recently executed
# compute_geo_info cell produced — confirm it is the intended one for this table.
vps_per_target = {
    dst: set(vp_coordinates_per_ip)
    for dst in rtt_per_srcs_dst
}

features = compute_geolocation_features_per_ip(
    rtt_per_srcs_dst,
    vp_coordinates_per_ip,
    THRESHOLD_DISTANCES,
    vps_per_target=vps_per_target,
    distance_operator=">",
    max_vps=100000,
    is_use_prefix=False,
    ip_per_coordinates=ip_per_coordinates,
    country_per_vp=country_per_vp,
    vp_distance_matrix=vp_distance_matrix,
    anchor_per_ip=probe_per_ip,
)

dump_json(features, ANCHORS_TO_ANCHORS_RESULT_FILE)

In [None]:
# Geolocation features for the probes-to-anchors ping table (RTT threshold 70).
rtt_per_srcs_dst = compute_rtts_per_dst_src(PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=70)

# Every target gets its own copy of the full set of vantage points with coordinates.
vps_per_target = {
    dst: set(vp_coordinates_per_ip)
    for dst in rtt_per_srcs_dst
}

features = compute_geolocation_features_per_ip(
    rtt_per_srcs_dst,
    vp_coordinates_per_ip,
    THRESHOLD_DISTANCES,
    vps_per_target=vps_per_target,
    distance_operator=">",
    max_vps=100000,
    is_use_prefix=False,
    ip_per_coordinates=ip_per_coordinates,
    country_per_vp=country_per_vp,
    vp_distance_matrix=vp_distance_matrix,
    anchor_per_ip=probe_per_ip,
)

dump_json(features, PROBES_TO_ANCHORS_RESULT_FILE)

### VPs selection algorithm

In [None]:
# Fig. 5 of the millions paper.

In [None]:
# VP selection algorithm, anchors as vantage points.
ping_table_prefix = ANCHORS_TO_PREFIX_TABLE
ping_table = ANCHORS_MESHED_PING_TABLE
# One output file per entry of N_VPS_SELECTION_ALGORITHM, matched by position.
# NOTE(review): two of these names end in _FILES while the probes variant of this
# cell uses _FILE throughout — confirm the constant names where they are defined.
results_files = [
    VP_SELECTION_ALGORITHM_1_FILE,
    VP_SELECTION_ALGORITHM_3_FILES,
    VP_SELECTION_ALGORITHM_10_FILES,
]

rtt_per_srcs_dst_prefix = compute_rtts_per_dst_src(
    ping_table_prefix, filter, threshold=100, is_per_prefix=True
)
rtt_per_srcs_dst = compute_rtts_per_dst_src(ping_table, filter, threshold=70)

for file_idx, n_vp in enumerate(N_VPS_SELECTION_ALGORITHM):
    # Pick the n_vp vantage points per target from the per-prefix RTTs ...
    vps_per_target, _ = compute_closest_rtt_probes(
        rtt_per_srcs_dst_prefix,
        vp_coordinates_per_ip,
        vp_distance_matrix,
        n_shortest=n_vp,
        is_prefix=True,
    )
    # ... then compute the features using only those vantage points.
    features = compute_geolocation_features_per_ip(
        rtt_per_srcs_dst,
        vp_coordinates_per_ip,
        [0],
        vps_per_target=vps_per_target,
        distance_operator=">",
        max_vps=100000,
        is_use_prefix=True,
        ip_per_coordinates=ip_per_coordinates,
        country_per_vp=country_per_vp,
        vp_distance_matrix=vp_distance_matrix,
        anchor_per_ip=probe_per_ip,
        is_multiprocess=True,
    )

    dump_json(features, results_files[file_idx])

In [None]:
# VP selection algorithm, probes as vantage points.
ping_table_prefix = PROBES_TO_PREFIX_TABLE
ping_table = PROBES_TO_ANCHORS_PING_TABLE
# One output file per entry of N_VPS_SELECTION_ALGORITHM, matched by position.
results_files = [
    VP_SELECTION_ALGORITHM_PROBES_1_FILE,
    VP_SELECTION_ALGORITHM_PROBES_3_FILE,
    VP_SELECTION_ALGORITHM_PROBES_10_FILE,
]

rtt_per_srcs_dst_prefix = compute_rtts_per_dst_src(
    ping_table_prefix, filter, threshold=100, is_per_prefix=True
)
rtt_per_srcs_dst = compute_rtts_per_dst_src(ping_table, filter, threshold=70)

for file_idx, n_vp in enumerate(N_VPS_SELECTION_ALGORITHM):
    # Pick the n_vp vantage points per target from the per-prefix RTTs ...
    vps_per_target, _ = compute_closest_rtt_probes(
        rtt_per_srcs_dst_prefix,
        vp_coordinates_per_ip,
        vp_distance_matrix,
        n_shortest=n_vp,
        is_prefix=True,
    )
    # ... then compute the features using only those vantage points.
    features = compute_geolocation_features_per_ip(
        rtt_per_srcs_dst,
        vp_coordinates_per_ip,
        [0],
        vps_per_target=vps_per_target,
        distance_operator=">",
        max_vps=100000,
        is_use_prefix=True,
        ip_per_coordinates=ip_per_coordinates,
        country_per_vp=country_per_vp,
        vp_distance_matrix=vp_distance_matrix,
        anchor_per_ip=probe_per_ip,
        is_multiprocess=True,
    )

    dump_json(features, results_files[file_idx])

### Closest rtt probe

In [None]:
# Probes-to-anchors RTTs with a threshold of 300.
rtt_per_srcs_dst = compute_rtts_per_dst_src(
    PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=300
)

# Only the second element of the returned pair is kept and written out.
_, min_rtt_per_dst = compute_closest_rtt_probes(
    rtt_per_srcs_dst,
    vp_coordinates_per_ip,
    vp_distance_matrix,
    n_shortest=1,
    is_prefix=False,
)

dump_json(min_rtt_per_dst, MIN_RTT_PER_DIST_FILE)

### Fixed set

In [None]:
# For several fixed-set sizes, compute the selected VPs and the minimum RTT per
# target, once with and once without the ASN taken into account.

# Seed before any selection so repeated runs of this cell start from the same state.
random.seed(42)
min_rtt_per_target_one_per_city = {}
min_rtt_per_target_one_per_city_asn = {}

rtt_per_srcs_dst = compute_rtts_per_dst_src(
    PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=300
)

for n_vps_per_granularity in [1, 3, 10, 100]:
    # Arguments shared by both variants below.
    common_kwargs = {
        "n_vps_per_granularity": n_vps_per_granularity,
        "threshold": 40,
        "asn_per_vp": asn_per_vp,
    }

    # Keep this call order (ASN variant first): the cell seeds `random`, so the
    # order may influence the selection — presumably the helper draws from it.
    vps_per_target_with_asn, min_rtt_per_target_with_asn = compute_fixed_set_of_probes(
        rtt_per_srcs_dst,
        vp_coordinates_per_ip,
        vp_distance_matrix,
        is_with_as=True,
        **common_kwargs,
    )
    vps_per_target, min_rtt_per_target = compute_fixed_set_of_probes(
        rtt_per_srcs_dst,
        vp_coordinates_per_ip,
        vp_distance_matrix,
        is_with_as=False,
        **common_kwargs,
    )

    # Each entry is a (vps_per_target, min_rtt_per_target) pair keyed by set size.
    min_rtt_per_target_one_per_city[n_vps_per_granularity] = (
        vps_per_target,
        min_rtt_per_target,
    )
    min_rtt_per_target_one_per_city_asn[n_vps_per_granularity] = (
        vps_per_target_with_asn,
        min_rtt_per_target_with_asn,
    )

dump_json(min_rtt_per_target_one_per_city, FIXED_SET_ONE_PROBE_PER_CITY_FILE)
dump_json(min_rtt_per_target_one_per_city_asn, FIXED_SET_ONE_PROBE_PER_CITY_ASN_FILE)

### Probe redundancy

In [None]:
# Question: are some probes always worse in terms of RTT than others in the same city?
rtt_per_srcs_dst = compute_rtts_per_dst_src(
    PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=300
)

# NOTE(review): `probes=` receives the list loaded from PROBES_FILE here, while the
# "Closest probe" cell passes probe_per_ip to the same parameter — confirm which
# one compute_closest_probes expects.
closest_probes_per_vp = compute_closest_probes(
    rtt_per_srcs_dst,
    vp_coordinates_per_ip,
    vp_distance_matrix,
    threshold=40,
    probes=probes,
)
redundant_probes = compute_redundancy_probes(
    rtt_per_srcs_dst,
    vp_coordinates_per_ip,
    closest_probes_per_vp,
)

print(len(redundant_probes))

### Closest probe

In [None]:
# Probes-to-anchors RTTs with a threshold of 100.
rtt_per_srcs_dst = compute_rtts_per_dst_src(
    PROBES_TO_ANCHORS_PING_TABLE, filter, threshold=100
)

# Result of compute_closest_probes (threshold 40), written out unchanged.
rtts_per_closest_probes = compute_closest_probes(
    rtt_per_srcs_dst,
    vp_coordinates_per_ip,
    vp_distance_matrix,
    threshold=40,
    probes=probe_per_ip,
)

dump_json(rtts_per_closest_probes, RTTS_PER_CLOSEST_PROBES_FILE)