In [None]:
import numpy as np
import matplotlib.pyplot as plt

from clickhouse_driver import Client

from utils.file_utils import load_json
from utils.common import compute_geo_info, compute_error_threshold_cdfs, iso_code_2_to_country, every_tier_result_and_errors
from utils.plot_utils import plot_multiple_cdf, homogenize_legend, plot_save, plot_multiple_error_bars
from utils.clickhouse_query import get_min_rtt_per_src_dst_query_ping_table
from utils.helpers import haversine
from default import *

In [None]:
# Measurement-platform metadata. The *_FILE constants are not defined in this
# notebook; presumably they come from the star-import of `default`.
anchors = load_json(ANCHORS_FILE)
probes = load_json(PROBES_FILE)
all_probes = load_json(PROBES_AND_ANCHORS_FILE)
removed_probes = load_json(REMOVED_PROBES_FILE)

# Result files read by the plotting cells. Order matters: later cells read
# index 0 (anchors to anchors) and index 1 (probes to anchors) directly, and
# scan the remaining entries by file-name substring.
results_file = [
    ANCHORS_TO_ANCHORS_RESULT_FILE,
    PROBES_TO_ANCHORS_RESULT_FILE,
    VP_SELECTION_ALGORITHM_PROBES_1_FILE,
    VP_SELECTION_ALGORITHM_PROBES_3_FILE,
    VP_SELECTION_ALGORITHM_PROBES_10_FILE,
    FIXED_SET_ONE_PROBE_PER_CITY_FILE,
]

## Step 1: Geolocation error per distance threshold (anchors to anchors)

In [None]:
# Geographic metadata for every probe and anchor.
# NOTE(review): the content of each returned mapping is only known from its
# name here -- confirm against utils.common.compute_geo_info.
(
    vp_coordinates_per_ip,
    ip_per_coordinates,
    country_per_vp,
    asn_per_vp_ip,
    vp_distance_matrix,
    probe_per_ip,
) = compute_geo_info(all_probes, serialized_file=PAIRWISE_DISTANCE_PROBE_FILE)

# Group the VPs by country, leaving out the ones listed in removed_probes.
vps_per_country = {}
for vp, country in country_per_vp.items():
    if vp in removed_probes:
        continue
    if country not in vps_per_country:
        vps_per_country[country] = set()
    vps_per_country[country].add(vp)

In [None]:
# Anchors-to-anchors results (first entry of results_file).
errors_threshold_anchors_to_anchors = load_json(results_file[0])
(
    error_threshold_cdfs_a_to_a,
    circles_threshold_cdfs_a_to_a,
    _,
) = compute_error_threshold_cdfs(errors_threshold_anchors_to_anchors)

# Figure 1: geolocation error, one curve per legend entry.
# NOTE(review): the legend pairs the curves one-to-one with
# THRESHOLD_DISTANCES -- presumably one CDF per threshold; confirm.
Ys = error_threshold_cdfs_a_to_a
labels = [str(t) for t in THRESHOLD_DISTANCES]
fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of targets",
    xscale="log", yscale="linear", legend=labels,
)
homogenize_legend(ax, "lower right")
ofile = CBG_THRESHOLD_BASIC_FILE
plot_save(ofile, is_tight_layout=True)

# Figure 2: number of useful VPs; the fourth argument follows the largest
# value found in the data instead of a fixed 10000.
Ys = circles_threshold_cdfs_a_to_a
labels = [str(t) for t in THRESHOLD_DISTANCES]
fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, max(max(Y) for Y in Ys),
    "Number of useful VPs",
    "CDF of targets",
    xscale="log", yscale="linear", legend=labels,
)
homogenize_legend(ax, "lower right")
ofile = CBG_THRESHOLD_CIRCLES_FILE
plot_save(ofile, is_tight_layout=True)

## Step 2: Geolocation error per distance threshold (probes to anchors)

In [None]:
print("Computing probes to anchors")

# Probes-to-anchors results (second entry of results_file).
errors_threshold_probes_to_anchors = load_json(results_file[1])
(
    error_threshold_cdfs_p_to_a,
    circles_threshold_cdfs_p_to_a,
    _,
) = compute_error_threshold_cdfs(errors_threshold_probes_to_anchors)

Ys = error_threshold_cdfs_p_to_a
# Size of the first CDF, printed as a quick sanity check.
print(len(Ys[0]))

# The first curve is labelled "All VPs"; every strictly positive threshold
# gets its own "VPs > t km" label.
labels = ["All VPs"] + [f"VPs > {t} km" for t in THRESHOLD_DISTANCES if t > 0]
fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of targets",
    xscale="log", yscale="linear", legend=labels,
)
homogenize_legend(ax, "lower right", legend_size=12)

ofile = CBG_THRESHOLD_PROBES_FILE
plot_save(ofile, is_tight_layout=True)

## Step 3: All probes vs. only anchors

In [None]:
# Compare the first CDF of the probes-to-anchors run with the first CDF of the
# anchors-to-anchors run. Both lists come from the Step 1 / Step 2 cells, so
# those cells must have been executed before this one.
labels = ["All probes", "Only anchors"]
Ys = [
    error_threshold_cdfs_p_to_a[0],
    error_threshold_cdfs_a_to_a[0],
]
fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of targets",
    xscale="log", yscale="linear", legend=labels,
)
homogenize_legend(ax, "lower right")
ofile = CBG_THRESHOLD_ALL_FILE
plot_save(ofile, is_tight_layout=True)

## Step 4: VP selection algorithm (closest VPs by RTT)

In [None]:
# Plot the geolocation error obtained with the n closest VPs (by RTT), plus an
# "All VPs" baseline computed alongside the 10-VP run.
Ys = []
labels = []
for file in results_file:
    if "vp_selection_algorithm_probes" not in file:
        continue

    # The number of VPs is encoded in the file name:
    # "...vp_selection_algorithm_probes_<n>.json".
    n_vps = int(
        file.split(".json")[0].split("vp_selection_algorithm_probes_")[1])

    errors_threshold_vp_selection_algorithm = load_json(file)
    (
        error_threshold_cdfs_p_to_a_vp_selection,
        circles_threshold_cdfs_p_to_a_vp_selection,
        _,
    ) = compute_error_threshold_cdfs(errors_threshold_vp_selection_algorithm)
    Ys.append(list(error_threshold_cdfs_p_to_a_vp_selection[0]))
    labels.append(f"{n_vps} closest VP (RTT)")

    if n_vps == 10:
        # Take the baseline where 10 VPs are used to geolocate a target.
        # NOTE(review): the second argument presumably restricts the baseline
        # to the targets covered by the 10-VP run -- confirm in utils.common.
        #
        # The baseline is bound to its own name on purpose: rebinding
        # error_threshold_cdfs_p_to_a / circles_threshold_cdfs_p_to_a here
        # would silently change what the Step 3 cell plots if it is re-run
        # after this cell.
        error_threshold_cdfs_p_to_a_baseline, _, _ = compute_error_threshold_cdfs(
            errors_threshold_probes_to_anchors,
            errors_threshold_vp_selection_algorithm)
        Ys.append(list(error_threshold_cdfs_p_to_a_baseline[0]))
        labels.append("All VPs")

fig, ax = plot_multiple_cdf(Ys, 10000, 1, 10000,
                            "Geolocation error (km)",
                            "CDF of targets",
                            xscale="log",
                            yscale="linear",
                            legend=labels)
homogenize_legend(ax, "lower right")
ofile = CBG_THRESHOLD_VP_SELECTION_FILE
plot_save(ofile, is_tight_layout=True)

## Step 5: Fixed set of probes (one per city)

In [None]:
"""
Plot results with fixed set of probes
"""

Ys = []
labels = []
for i, file in enumerate(results_file):
    if "fixed_set" in file:
        errors_threshold_fixed_set = load_json(results_file[i])
        error_threshold_cdfs_a_to_a_fixed_set, circles_threshold_cdfs_a_to_a_fixed_set, _ = compute_error_threshold_cdfs(errors_threshold_fixed_set)
        # Take the baseline where IP addresses where geolocated by the technique
        error_threshold_cdfs_a_to_a, circles_threshold_cdfs_a_to_a, _ = compute_error_threshold_cdfs(errors_threshold_anchors_to_anchors, errors_threshold_fixed_set)
        Ys.append(list(error_threshold_cdfs_a_to_a[0]))
        labels.append("All anchors")
        Ys.append(list(error_threshold_cdfs_a_to_a_fixed_set[0]))
        labels.append(f"Fixed set (one per city)")
        break

fig, ax = plot_multiple_cdf(Ys, 10000, 1, 10000,
                        "Geolocation error (km)",
                        "CDF of targets",
                        xscale="log",
                        yscale="linear",
                        legend=labels)
homogenize_legend(ax, "lower right")
ofile = CBG_THRESHOLD_FIXED_SET_FILE
plot_save(ofile, is_tight_layout=True)

## Step 6: Results per continent and per country

In [None]:
"""
Compute results per continent
"""

errors_threshold_probes_to_anchors = load_json(results_file[1])
continent_name_by_continent_code, continent_by_iso_2, country_by_iso_2 = iso_code_2_to_country()
_, _, error_per_ip = compute_error_threshold_cdfs(
    errors_threshold_probes_to_anchors)

error_per_continent_cdf = {}
error_per_country_cdf = {}

# Match the anchors of the second replicated paper
anchors = load_json(ANCHORS_FILE)
ip_list = []
for anchor in anchors:
    ip_list.append(anchor['address_v4'])
anchors_second = set(anchors)
print(len(anchors_second))

error_per_country_cdf_med = {country_by_iso_2[x]: (np.median(error_per_country_cdf[x]),
                                                    len(error_per_country_cdf[x]), len(vps_per_country[x])) for x in error_per_country_cdf}

error_per_country_cdf_med_sorted = sorted(
    error_per_country_cdf_med.items(), key=lambda x: x[1][0], reverse=True)
print(error_per_country_cdf_med_sorted)

Ys = [list(error_per_continent_cdf[c])
        for c in error_per_continent_cdf]
labels = [
    f"{c} ({len(error_per_continent_cdf[c])})" for c in error_per_continent_cdf]
fig, ax = plot_multiple_cdf(Ys, 10000, 1, 10000,
                            "Geolocation error (km)",
                            "CDF of targets",
                            xscale="log",
                            yscale="linear",
                            legend=labels)
homogenize_legend(ax, "lower right")
ofile = CBG_THRESHOLD_CONTINENT_FILE
plot_save(ofile, is_tight_layout=True)

## Step 7: RTT mean and standard deviation of the closest probes

In [None]:
rtts_per_closest_probes = load_json(RTTS_PER_CLOSEST_PROBES_FILE)

# One RTT distribution per destination; distributions with a single sample
# (or none) are skipped.
multi_sample_rtts = [
    rtt_dist
    for rtt_dist in rtts_per_closest_probes.values()
    if len(rtt_dist) > 1
]
mean_cdf = [np.mean(rtt_dist) for rtt_dist in multi_sample_rtts]
stddev_cdf = [np.std(rtt_dist) for rtt_dist in multi_sample_rtts]

Ys = [mean_cdf, stddev_cdf]
labels = ["Mean", "stddev"]
fig, ax = plot_multiple_cdf(
    Ys, 10000, 0.1, max(max(Y) for Y in Ys),
    "RTT Metric (ms)",
    "CDF of targets",
    xscale="log", yscale="linear", legend=labels,
)
homogenize_legend(ax, "lower right")
ofile = RTTS_PDF_FILE
plot_save(ofile, is_tight_layout=True)

## Step 8: Accuracy vs. number of VPs (Fig. 3a and 3b)

In [None]:
# JSON object keys are strings; convert them to int so the subset sizes sort
# numerically and can be looked up by number.
accuracy_vs_n_vps_probes = {
    int(size): values
    for size, values in load_json(ACCURACY_VS_N_VPS_PROBES_FILE).items()
}
X = sorted(accuracy_vs_n_vps_probes)
Ys = [accuracy_vs_n_vps_probes[size] for size in X]
# One series only: median and standard deviation of the values stored for
# each subset size.
Ys_med = [[np.median(values) for values in Ys]]
Ys_err = [[np.std(values) for values in Ys]]

# Fig 3.a of the paper
fig, ax = plot_multiple_error_bars(
    X, Ys_med, Ys_err,
    xmin=10, xmax=10500, ymin=1, ymax=10000,
    xlabel="Number of VPs",
    ylabel="Geolocation error (km)",
    xscale="log",
    yscale="log",
    labels=[""],
)

homogenize_legend(ax, "lower right")
ofile = FIG_3A_FILE
plot_save(ofile, is_tight_layout=True)

# Fig 3.b of the paper
subset_sizes = [100, 500, 1000, 2000]
labels = [f"{s} VPs" for s in subset_sizes]
Ys = [accuracy_vs_n_vps_probes[size] for size in subset_sizes]

# Range of the values stored for the 100-VP subsets.
print(min(accuracy_vs_n_vps_probes[100]),
      max(accuracy_vs_n_vps_probes[100]))

fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of median error",
    xscale="log", yscale="linear", legend=labels,
)
homogenize_legend(ax, "lower right")
ofile = FIG_3B_FILE
plot_save(ofile, is_tight_layout=True)

## Step 9: Min RTT with a fixed set of VPs per city and per AS/city

In [None]:
"""
Sort the min_rtt distributions per their number of RTTs under 0.4 ms
:param fixed_set_probes_file:
:return:
"""
fixed_set_probes_results_per_city = load_json(
    FIXED_SET_ONE_PROBE_PER_CITY_FILE)
fixed_set_probes_results_per_city_asn = load_json(
    FIXED_SET_ONE_PROBE_PER_CITY_ASN_FILE)
min_rtt_per_dst = load_json(MIN_RTT_PER_DIST_FILE)

min_rtt_per_dst = {
    x: min_rtt_per_dst[x] for x in min_rtt_per_dst if x in fixed_set_probes_results_per_city["1"][1]}
min_rtt_per_dst_cdf = list(min_rtt_per_dst.values())
n_vps_per_granularity_l = [1, 10]
Ys = [min_rtt_per_dst_cdf]
labels = ["All VPs"]
for n_vps_per_granularity in n_vps_per_granularity_l:
    n_vps_per_granularity = str(n_vps_per_granularity)
    vps_per_target, fixed_set_probes_results_per_city_n_vps = fixed_set_probes_results_per_city[
        n_vps_per_granularity]
    vps_per_target_asn, fixed_set_probes_results_per_city_asn_n_vps = fixed_set_probes_results_per_city_asn[
        n_vps_per_granularity]
    fixed_set_probes_results_per_city_cdf = [
        min(x) for x in fixed_set_probes_results_per_city_n_vps.values()]
    fixed_set_probes_results_per_city_asn_cdf = [
        min(x) for x in fixed_set_probes_results_per_city_asn_n_vps.values()]
    # Compute the number of VPs needed
    n_vps_used = set()
    n_vps_used_asn = set()

    for _, vps in vps_per_target.items():
        n_vps_used.update(vps)

    for _, vps in vps_per_target_asn.items():
        n_vps_used_asn.update(vps)

    # DEBUG
    # if n_vps_per_granularity == "100":
    #     for dst, rtts in fixed_set_probes_results_per_city_asn_n_vps.items():
    #         if min(rtts) > 1 and min_rtt_per_dst[dst] < 1:
    #             print(dst)
    Ys.append(fixed_set_probes_results_per_city_asn_cdf)
    labels.append(
        f"{n_vps_per_granularity} VP per AS/City ({len(n_vps_used_asn)})")
    Ys.append(fixed_set_probes_results_per_city_cdf)
    labels.append(
        f"{n_vps_per_granularity} VP per city ({len(n_vps_used)})")
# sorted_fixed_set = sorted(fixed_set_probes_results, key=lambda x: len([y for y in x.values() if y <= 0.4]))
#
# top_fixed_set = list(sorted_fixed_set[-1].values())
# bottom_fixed_set = list(sorted_fixed_set[0].values())
# median_fixed_set = list(sorted_fixed_set[int(len(sorted_fixed_set)/2)].values())

fig, ax = plot_multiple_cdf(Ys, 10000, 0.1, max(max(Y) for Y in Ys),
                            "Min RTT (ms)",
                            "CDF of targets",
                            xscale="log",
                            yscale="linear",
                            legend=labels)
homogenize_legend(ax, "lower right")
ofile = RTTS_FIXED_SET_FILE
plot_save(ofile, is_tight_layout=True)

## Step 10: Round-based algorithm (error and number of VPs)

In [None]:
# Results of the round-based algorithm, re-keyed by integer (JSON object keys
# are strings).
round_based_algorithm_results = {
    int(key): value
    for key, value in load_json(ROUND_BASED_ALGORITHM_FILE).items()
}

# "All VPs" reference curve, recomputed from the probes-to-anchors results.
errors_threshold_probes_to_anchors = load_json(PROBES_TO_ANCHORS_RESULT_FILE)
(
    error_threshold_cdfs_p_to_a,
    circles_threshold_cdfs_p_to_a,
    _,
) = compute_error_threshold_cdfs(errors_threshold_probes_to_anchors)

Ys_error = [error_threshold_cdfs_p_to_a[0]]
labels_error = ["All VPs"]
Ys_n_vps = []
labels_n_vps = []

for tier1_vps in sorted(round_based_algorithm_results):
    results = round_based_algorithm_results[tier1_vps]
    # Each result is indexable: index 1 feeds the error CDF, index 2 is a VP
    # count to which the dict key (tier1_vps) is added. None entries are
    # skipped.
    error_cdf = [r[1] for r in results if r[1] is not None]
    n_vps_cdf = [r[2] + tier1_vps for r in results if r[2] is not None]
    label = f"{tier1_vps} VPs"
    labels_error.append(label)
    labels_n_vps.append(label)
    Ys_error.append(error_cdf)
    Ys_n_vps.append(n_vps_cdf)
    print(tier1_vps, 3 * sum(n_vps_cdf))

fig, ax = plot_multiple_cdf(
    Ys_error, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of targets",
    xscale="log", yscale="linear", legend=labels_error,
)
homogenize_legend(ax, "lower right")
ofile = ROUND_ALGORITHM_ERROR_FILE
plot_save(ofile, is_tight_layout=True)

fig, ax = plot_multiple_cdf(
    Ys_n_vps, 10000, 10, 10000,
    "Vantage points",
    "CDF of representatives",
    xscale="log", yscale="linear", legend=labels_n_vps,
)
homogenize_legend(ax, "upper left")
ofile = ROUND_ALGORITHM_VPS_FILE
plot_save(ofile, is_tight_layout=True)

## Step 11: Min RTT to landmarks and closest-landmark error

In [None]:
data = load_json(FINAL_ANALYSABLE_FILE)

# Fetch the min RTT rows from the local ClickHouse instance.
query = get_min_rtt_per_src_dst_query_ping_table(
    'geolocation_replication', 'targets_to_landmarks_pings', '', 1000000)
client = Client('127.0.0.1')
try:
    db_table = client.execute(query)
finally:
    # Close the connection once the rows are fetched instead of keeping the
    # socket open for the lifetime of the kernel.
    client.disconnect()

# rtts: every returned RTT, for the CDFs below.
# remove_dict: RTT indexed by the first two columns of each row; the next cell
# looks it up with (landmark IP, target IP) keys.
# NOTE(review): the column order is presumably (src, dst, min_rtt) -- confirm
# against utils.clickhouse_query.
rtts = []
remove_dict = {}
print(len(db_table))
for row in db_table:
    rtts.append(row[2])
    remove_dict[(row[0], row[1])] = row[2]
print(len(rtts))

plot_multiple_cdf([rtts], 10000, 0, None, 'Min RTT (ms)',
                  'CDF of (landmark, target) pairs', None)
plot_save(CLOSE_LANDMARK_FILE, is_tight_layout=True)

plot_multiple_cdf([rtts], 10000, 0.1, None, 'Min RTT (ms)',
                  'CDF of (landmark, target) pairs', None, xscale="log")
plot_save(CLOSE_LANDMARK_LOG_FILE, is_tight_layout=True)

error1, error2, error3, error4 = [], [], [], []
error1ms, error2ms, error5ms, error10ms = [], [], [], []

# Maximum accepted RTT (ms) -> list receiving, for each target, the distance
# to its closest landmark that passes that RTT filter.
filtered_errors = {1: error1ms, 2: error2ms, 5: error5ms, 10: error10ms}
# Sentinel meaning "no landmark passed the filter for this target".
NO_LANDMARK_DISTANCE = 50000

for d in data.values():
    errors = every_tier_result_and_errors(d)
    error1.append(errors['error1'])
    error2.append(errors['error2'])
    error3.append(errors['error3'])
    error4.append(errors['error4'])

    closest = dict.fromkeys(filtered_errors, NO_LANDMARK_DISTANCE)
    for f in ('tier2:landmarks', 'tier3:landmarks'):
        if f not in d:
            continue
        for l_ip, _, l_lat, l_lon in d[f]:
            dist = haversine((l_lat, l_lon), (d['lat_c'], d['lon_c']))
            key_rtt = (l_ip, d['target_ip'])
            # A landmark without any RTT row in remove_dict is never
            # filtered out.
            has_rtt = key_rtt in remove_dict
            for max_rtt in closest:
                if dist < closest[max_rtt] and (
                        not has_rtt or remove_dict[key_rtt] <= max_rtt):
                    closest[max_rtt] = dist

    # When no landmark qualified, fall back to this target's error1 value
    # (the same fallback is used for every RTT filter).
    for max_rtt, best in closest.items():
        if best != NO_LANDMARK_DISTANCE:
            filtered_errors[max_rtt].append(best)
        else:
            filtered_errors[max_rtt].append(error1[-1])

# Same six series on a linear and then on a logarithmic x axis.
error_series = [error3, error4, error1ms, error2ms, error5ms, error10ms]
series_labels = [
    "Street Level",
    "Closest landmark unfiltered",
    "Closest landmark <= 1ms",
    "Closest landmark <= 2ms",
    "Closest landmark <= 5ms",
    "Closest landmark <= 10ms",
]

plot_multiple_cdf(list(error_series), 10000, 0, None,
                  'Geolocation error (km)', 'CDF of targets',
                  list(series_labels))
plt.legend(fontsize="10")
plot_save(CLOSE_LANDMARK_FILE_2, is_tight_layout=True)

plot_multiple_cdf(list(error_series), 10000, 0.1, None,
                  'Geolocation error (km)', 'CDF of targets',
                  list(series_labels), xscale="log")
plt.legend(fontsize="10")
plot_save(CLOSE_LANDMARK_LOG_FILE_2, is_tight_layout=True)

# Count and fraction of error1ms entries at or below each bound.
for i in [1, 5, 10, 40, 9999999999]:
    c = sum(1 for j in error1ms if j <= i)
    print(f"{c} targets with landmarks (ping <= {i}) or {c/len(error1ms)}")
## Step X: Comparison with geolocation databases (MaxMind, IPinfo)

In [None]:
# Inputs of the database comparison: the IPinfo and MaxMind files plus the
# probes-to-anchors results, loaded in that order.
ip_info_geo, mm_geo, errors_threshold_probes_to_anchors = (
    load_json(path)
    for path in (IP_INFO_GEO_FILE, MAXMIND_GEO_FILE, PROBES_TO_ANCHORS_RESULT_FILE)
)

In [None]:
(
    error_threshold_cdfs_p_to_a,
    circles_threshold_cdfs_p_to_a,
    _,
) = compute_error_threshold_cdfs(errors_threshold_probes_to_anchors)

# Error of each database for every anchor that is not removed and has a known
# position.
maxmind_error = {}
ip_info_error = {}
for anchor in sorted(anchors, key=lambda a: a["address_v4"]):
    ip = anchor["address_v4"]
    if ip in removed_probes or "geometry" not in anchor:
        continue

    # The coordinates are unpacked as (longitude, latitude) and swapped before
    # being passed to haversine.
    long, lat = anchor["geometry"]["coordinates"]
    anchor_position = (lat, long)

    if ip in mm_geo:
        maxmind_error[ip] = haversine(mm_geo[ip], anchor_position)

    if ip in ip_info_geo:
        # IPinfo stores the position as a "lat,long" string.
        ipinfo_lat, ipinfo_long = map(float, ip_info_geo[ip]["loc"].split(","))
        ip_info_error[ip] = haversine((ipinfo_lat, ipinfo_long), anchor_position)

Ys = [
    error_threshold_cdfs_p_to_a[0],
    list(maxmind_error.values()),
    list(ip_info_error.values()),
]
# Number of targets behind each curve.
print([len(Y) for Y in Ys])
labels = ["All VPs", "Maxmind (Free)", "IPinfo"]
fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of targets",
    xscale="log", yscale="linear", legend=labels,
)
homogenize_legend(ax, "lower right")

ofile = GEO_DATABASE_FILE
plot_save(ofile, is_tight_layout=True)