In [1]:
import numpy as np
import matplotlib.pyplot as plt
import csv

from clickhouse_driver import Client

from scripts.utils.file_utils import load_json
from scripts.analysis.analysis import compute_error_threshold_cdfs, every_tier_result_and_errors
from scripts.utils.plot_utils import plot_multiple_cdf, homogenize_legend, plot_save, plot_multiple_error_bars
from scripts.utils.clickhouse_utils import get_min_rtt_per_src_dst_query_ping_table
from scripts.utils.helpers import haversine
from default import *

## Loading data

In [2]:
# Load the four JSON datasets used by the cells below, in a fixed order:
# anchors, probes, probes + anchors, and the probes that were removed.
anchors, probes, all_probes, removed_probes = (
    load_json(dataset_path)
    for dataset_path in (
        ANCHORS_FILE,
        PROBES_FILE,
        PROBES_AND_ANCHORS_FILE,
        REMOVED_PROBES_FILE,
    )
)

## Step 2: Geolocation error using all VPs and per VP-distance threshold

In [5]:
# Probes -> anchors results; also reused by the VP-selection cell further down.
errors_threshold_probes_to_anchors = load_json(PROBES_TO_ANCHORS_RESULT_FILE)

(
    error_threshold_cdfs_p_to_a,
    circles_threshold_cdfs_p_to_a,
    _,
) = compute_error_threshold_cdfs(errors_threshold_probes_to_anchors)

Ys = error_threshold_cdfs_p_to_a
# Number of values in the first ("All VPs") series
print(len(Ys[0]))

# First curve uses every VP; the others only VPs farther than each positive threshold.
# NOTE(review): assumes the series order matches THRESHOLD_DISTANCES - confirm.
labels = ["All VPs"] + [
    f"VPs > {t} km" for t in THRESHOLD_DISTANCES if t > 0
]

fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of targets",
    xscale="log",
    yscale="linear",
    legend=labels,
)
homogenize_legend(ax, "lower right", legend_size=12)

ofile = CBG_THRESHOLD_PROBES_FILE
plot_save(ofile, is_tight_layout=True)

Computing probes to anchors
Threshold 0 no geolocation 0
Threshold 40 no geolocation 0
Threshold 100 no geolocation 0
Threshold 500 no geolocation 0
Threshold 1000 no geolocation 1
711


## Step 4: Geolocation error with the 1, 3 and 10 closest VPs (by RTT)

In [8]:
# Compare the error obtained with the 1, 3 and 10 closest VPs (RTT-based
# selection) against the "All VPs" baseline.
#
# NOTE: this cell depends on `errors_threshold_probes_to_anchors` loaded by the
# Step 2 cell, and it overwrites `error_threshold_cdfs_p_to_a` /
# `circles_threshold_cdfs_p_to_a` with the baseline computed below.
Ys = []
labels = []
results_file = [
    VP_SELECTION_ALGORITHM_PROBES_1_FILE,
    VP_SELECTION_ALGORITHM_PROBES_3_FILE,
    VP_SELECTION_ALGORITHM_PROBES_10_FILE,
]
index = [1, 3, 10]

# Walk both lists together so a VP count can never be paired with the wrong file.
for n_vps, results_path in zip(index, results_file):
    errors_threshold_vp_selection_algorithm = load_json(results_path)
    error_threshold_cdfs_p_to_a_vp_selection, circles_threshold_cdfs_p_to_a_vp_selection, _ = compute_error_threshold_cdfs(
        errors_threshold_vp_selection_algorithm)
    # Only the first series (threshold 0 in the printed output) is plotted
    Ys.append(list(error_threshold_cdfs_p_to_a_vp_selection[0]))
    labels.append(f"{n_vps} closest VP (RTT)")
    if n_vps == 10:
        # Take the baseline where 10 VPs are used to geolocate a target
        # NOTE(review): the second argument presumably restricts the baseline
        # to the targets of the 10-VP run - confirm in compute_error_threshold_cdfs.
        error_threshold_cdfs_p_to_a, circles_threshold_cdfs_p_to_a, _ = compute_error_threshold_cdfs(
            errors_threshold_probes_to_anchors, errors_threshold_vp_selection_algorithm)
        Ys.append(list(error_threshold_cdfs_p_to_a[0]))
        labels.append("All VPs")

fig, ax = plot_multiple_cdf(Ys, 10000, 1, 10000,
                            "Geolocation error (km)",
                            "CDF of targets",
                            xscale="log",
                            yscale="linear",
                            legend=labels)
homogenize_legend(ax, "lower right")
ofile = CBG_THRESHOLD_VP_SELECTION_FILE
plot_save(ofile, is_tight_layout=True)

Threshold 0 no geolocation 33
Threshold 0 no geolocation 7
Threshold 0 no geolocation 5
Threshold 0 no geolocation 0
Threshold 40 no geolocation 0
Threshold 100 no geolocation 0
Threshold 500 no geolocation 0
Threshold 1000 no geolocation 1


## Step 6: Geolocation error per continent and per country

In [10]:
def iso_code_2_to_country(csv_path=None):
    """Build lookups from two-letter country code to continent code and country name.

    The CSV is expected to start with a header row followed by rows laid out as:
    Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,
    Three_Letter_Country_Code,Country_Number

    Args:
        csv_path: Path of the countries CSV. Defaults to ``COUNTRIES_CSV_FILE``
            when omitted, which keeps the original zero-argument call working.

    Returns:
        A ``(continent_by_iso_2, country_by_iso_2)`` tuple of dicts, both keyed
        by the two-letter country code. Country names are cut at the first
        comma (e.g. "Korea, Republic of" becomes "Korea").

    Raises:
        OSError: If the CSV file cannot be opened.
    """
    if csv_path is None:
        csv_path = COUNTRIES_CSV_FILE

    country_by_iso_2 = {}
    continent_by_iso_2 = {}
    # newline="" is what the csv module requires so that it handles line
    # endings (including newlines embedded in quoted fields) itself.
    # NOTE(review): no explicit encoding is given, so the platform default is
    # used - confirm the encoding of the CSV file.
    with open(csv_path, newline="") as f:
        reader = csv.reader(f, delimiter=",", quotechar='"')
        next(reader, None)  # skip the header row
        for row in reader:
            # Blank lines come back as [] and truncated rows lack the code
            # column; skip them instead of failing with IndexError.
            if len(row) < 4:
                continue
            continent_code = row[1]
            country_name = row[2].split(",")[0]
            country_iso_code_2 = row[3]
            country_by_iso_2[country_iso_code_2] = country_name
            continent_by_iso_2[country_iso_code_2] = continent_code
    return continent_by_iso_2, country_by_iso_2

In [27]:
# NOTE: the two names below read inverted relative to their contents; they are
# kept because later cells use them.
#   ip_per_country : IPv4 address -> country code
#   country_per_ip : country code -> list of IPv4 addresses
ip_per_country = {}
for anchor in anchors:
    has_address_and_coordinates = (
        "address_v4" in anchor
        and "geometry" in anchor
        and "coordinates" in anchor["geometry"]
    )
    if not has_address_and_coordinates:
        continue
    ip_v4_address = anchor["address_v4"]
    if ip_v4_address is None:
        # The key can be present with no IPv4 address attached
        continue
    ip_per_country[ip_v4_address] = anchor["country_code"]

# Invert the mapping: group the addresses by country code
country_per_ip = {}
for ip, country in ip_per_country.items():
    if country not in country_per_ip:
        country_per_ip[country] = []
    country_per_ip[country].append(ip)

In [30]:
"""
Compute results per continent
"""

# Per-continent CDFs and per-country median of the geolocation error,
# restricted to the anchors of the second replicated paper.
errors_threshold_probes_to_anchors = load_json(PROBES_TO_ANCHORS_RESULT_FILE)

continent_by_iso_2, country_by_iso_2 = iso_code_2_to_country()

_, _, error_per_ip = compute_error_threshold_cdfs(errors_threshold_probes_to_anchors)

error_per_continent_cdf = {}
error_per_country_cdf = {}

# Match the anchors of the second replicated paper.
# Kept as a set: it is only used for membership tests, which were a linear
# scan per lookup when the set was converted back to a list.
anchors_second = set(load_json(ANCHORS_SECOND_PAPER_FILE))
for ip, error in error_per_ip.items():
    if ip not in anchors_second:
        continue
    # `ip_per_country` maps IPv4 address -> country code (see the cell above)
    country = ip_per_country[ip]
    continent = continent_by_iso_2[country]
    error_per_continent_cdf.setdefault(continent, []).append(error)
    error_per_country_cdf.setdefault(country, []).append(error)

# country name -> (median error, number of errors kept, number of anchor IPs in the country)
error_per_country_cdf_med = {
    country_by_iso_2[code]: (
        np.median(country_errors),
        len(country_errors),
        len(country_per_ip[code]),
    )
    for code, country_errors in error_per_country_cdf.items()
}

# Largest median error first
error_per_country_cdf_med_sorted = sorted(
    error_per_country_cdf_med.items(), key=lambda x: x[1][0], reverse=True)
print(error_per_country_cdf_med_sorted)

# One curve per continent, labelled with the number of errors it holds
Ys = [list(continent_errors)
      for continent_errors in error_per_continent_cdf.values()]
labels = [
    f"{c} ({len(continent_errors)})"
    for c, continent_errors in error_per_continent_cdf.items()]
fig, ax = plot_multiple_cdf(Ys, 10000, 1, 10000,
                            "Geolocation error (km)",
                            "CDF of targets",
                            xscale="log",
                            yscale="linear",
                            legend=labels)
homogenize_legend(ax, "lower right")
ofile = CBG_THRESHOLD_CONTINENT_FILE
plot_save(ofile, is_tight_layout=True)

Threshold 0 no geolocation 0
Threshold 40 no geolocation 0
Threshold 100 no geolocation 0
Threshold 500 no geolocation 0
Threshold 1000 no geolocation 1
[('Venezuela', (2192.9140091405034, 1, 1)), ('Pakistan', (1543.1755865726761, 1, 1)), ('Colombia', (1434.2320893110607, 1, 1)), ('Mozambique', (832.4551341422323, 1, 1)), ('Kuwait', (717.8898014224587, 1, 1)), ('Peru', (383.09155748407676, 2, 2)), ('Ecuador', (263.85158220593513, 1, 1)), ('Argentina', (221.1132724306607, 4, 4)), ('Hungary', (161.44000325140453, 1, 1)), ('Macedonia', (132.45677655906775, 1, 1)), ('Paraguay', (117.97642875966719, 1, 1)), ('Ghana', (98.31229536436996, 2, 2)), ('Thailand', (93.312454164549, 2, 2)), ('Kenya', (61.11453915894795, 1, 1)), ('Iraq', (54.572811751748695, 1, 1)), ('Poland', (49.969744441399314, 8, 8)), ('Belarus', (43.57770007033401, 1, 1)), ('Dominican Republic', (42.13216173978507, 2, 2)), ('Belgium', (36.693758379143034, 6, 6)), ('United Kingdom', (30.11516867734334, 34, 34)), ('Bulgaria', (29

## Step 8: Geolocation error vs. number of VPs (Fig. 3a and 3b)

In [None]:
# JSON object keys are strings; convert them to int so the VP counts sort numerically.
accuracy_vs_n_vps_probes = {
    int(n_vps): errors
    for n_vps, errors in load_json(ACCURACY_VS_N_VPS_PROBES_FILE).items()
}
X = sorted(accuracy_vs_n_vps_probes)
Ys = [accuracy_vs_n_vps_probes[n_vps] for n_vps in X]
# A single series: median of the errors for each VP count, with their
# standard deviation as the error bar.
Ys_med = [[np.median(errors) for errors in Ys]]
Ys_err = [[np.std(errors) for errors in Ys]]

# Fig 3.a of the paper
fig, ax = plot_multiple_error_bars(
    X, Ys_med, Ys_err,
    xmin=10, xmax=10500, ymin=1, ymax=10000,
    xlabel="Number of VPs",
    ylabel="Geolocation error (km)",
    xscale="log",
    yscale="log",
    labels=[""],
)

homogenize_legend(ax, "lower right")
ofile = FIG_3A_FILE
plot_save(ofile, is_tight_layout=True)

# Fig 3.b of the paper
subset_sizes = [100, 500, 1000, 2000]

labels = [f"{size} VPs" for size in subset_sizes]

Ys = [accuracy_vs_n_vps_probes[size] for size in subset_sizes]
# Range of the errors obtained with 100 VPs
print(min(accuracy_vs_n_vps_probes[100]),
      max(accuracy_vs_n_vps_probes[100]))

fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of median error",
    xscale="log",
    yscale="linear",
    legend=labels,
)
homogenize_legend(ax, "lower right")
ofile = FIG_3B_FILE
plot_save(ofile, is_tight_layout=True)

## Step 10: Round-based algorithm (geolocation error and number of VPs used)

In [21]:
# Results keyed by the number of first-tier VPs; JSON keys are strings, so
# convert them to int to get a numeric ordering below.
round_based_algorithm_results = {
    int(n_tier1): tier_results
    for n_tier1, tier_results in load_json(ROUND_BASED_ALGORITHM_FILE).items()
}

# "All VPs" baseline, recomputed from the probes -> anchors results
errors_threshold_probes_to_anchors = load_json(PROBES_TO_ANCHORS_RESULT_FILE)
(
    error_threshold_cdfs_p_to_a,
    circles_threshold_cdfs_p_to_a,
    _,
) = compute_error_threshold_cdfs(errors_threshold_probes_to_anchors)

Ys_error = [error_threshold_cdfs_p_to_a[0]]
Ys_n_vps = []

labels_error = ["All VPs"]
labels_n_vps = []

for tier1_vps, results in sorted(round_based_algorithm_results.items()):
    error_cdf = []
    n_vps_cdf = []
    for result in results:
        # NOTE(review): result[1] looks like the error and result[2] the number
        # of additional VPs used after the first tier - confirm against the producer.
        if result[1] is not None:
            error_cdf.append(result[1])
        if result[2] is not None:
            n_vps_cdf.append(result[2] + tier1_vps)

    label = f"{tier1_vps} VPs"
    labels_error.append(label)
    labels_n_vps.append(label)
    Ys_error.append(error_cdf)
    Ys_n_vps.append(n_vps_cdf)
    print(tier1_vps, 3 * sum(n_vps_cdf))

fig, ax = plot_multiple_cdf(
    Ys_error, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of targets",
    xscale="log",
    yscale="linear",
    legend=labels_error,
)
homogenize_legend(ax, "lower right")
ofile = ROUND_ALGORITHM_ERROR_FILE
plot_save(ofile, is_tight_layout=True)

fig, ax = plot_multiple_cdf(
    Ys_n_vps, 10000, 10, 10000,
    "Vantage points",
    "CDF of representatives",
    xscale="log",
    yscale="linear",
    legend=labels_n_vps,
)
homogenize_legend(ax, "upper left")
# NOTE(review): this is the same path the results were loaded from at the top
# of the cell, so saving the figure here looks like it overwrites the input
# JSON. A dedicated figure path was probably intended - confirm and replace.
ofile = ROUND_BASED_ALGORITHM_FILE
plot_save(ofile, is_tight_layout=True)

Threshold 0 no geolocation 0
Threshold 40 no geolocation 0
Threshold 100 no geolocation 0
Threshold 500 no geolocation 0
Threshold 1000 no geolocation 1
10 3051555
100 4529640
300 2918748
500 2309946
1000 2639067


## Step 11: Street-level error and closest landmark filtered by minimum RTT

In [22]:
# Per-target street-level results; only the values of this dict are used.
data = load_json(FINAL_ANALYSABLE_FILE)

# NOTE(review): the server address, database and table names are hard-coded.
# Rows are used as (row[0], row[1], row[2]); presumably (landmark IP,
# target IP, min RTT in ms), judging by the lookup key built below - confirm.
query = get_min_rtt_per_src_dst_query_ping_table(
    'geolocation_replication', 'targets_to_landmarks_pings', '', 1000000)
client = Client('127.0.0.1')
try:
    db_table = client.execute(query)
finally:
    # Release the connection even when the query fails
    client.disconnect()

print(len(db_table))
rtts = [row[2] for row in db_table]
# (row[0], row[1]) -> row[2]; pairs missing from this dict are never filtered out
remove_dict = {(row[0], row[1]): row[2] for row in db_table}
print(len(rtts))

plot_multiple_cdf([rtts], 10000, 0, None, 'Min RTT (ms)',
                  'CDF of (landmark, target) pairs', None)
plot_save(CLOSE_LANDMARK_FILE, is_tight_layout=True)

plot_multiple_cdf([rtts], 10000, 0.1, None, 'Min RTT (ms)',
                  'CDF of (landmark, target) pairs', None, xscale="log")
plot_save(CLOSE_LANDMARK_LOG_FILE, is_tight_layout=True)

# RTT limits (ms) a landmark must satisfy to be kept as a candidate
rtt_thresholds_ms = (1, 2, 5, 10)
# Larger than any distance compared against it; marks "no landmark kept"
no_landmark_sentinel = 50000

error1 = []
error2 = []
error3 = []
error4 = []
closest_landmark_error = {t: [] for t in rtt_thresholds_ms}

for d in data.values():
    errors = every_tier_result_and_errors(d)
    error1.append(errors['error1'])
    error2.append(errors['error2'])
    error3.append(errors['error3'])
    error4.append(errors['error4'])

    # Smallest landmark distance seen so far, per RTT threshold
    best = {t: no_landmark_sentinel for t in rtt_thresholds_ms}
    for f in ['tier2:landmarks', 'tier3:landmarks']:
        if f not in d:
            continue
        for l_ip, _, l_lat, l_lon in d[f]:
            dist = haversine((l_lat, l_lon), (d['lat_c'], d['lon_c']))
            key_rtt = (l_ip, d['target_ip'])
            for t in rtt_thresholds_ms:
                # A landmark qualifies when it has no RTT entry at all or
                # when its RTT is within the threshold.
                if dist < best[t] and (key_rtt not in remove_dict or remove_dict[key_rtt] <= t):
                    best[t] = dist

    for t in rtt_thresholds_ms:
        # Fall back to this target's error1 when no landmark qualified
        closest_landmark_error[t].append(
            best[t] if best[t] != no_landmark_sentinel else error1[-1])

# Keep the original per-threshold names available to the rest of the notebook
error1ms, error2ms, error5ms, error10ms = (
    closest_landmark_error[t] for t in rtt_thresholds_ms)

error_series = [error3, error4, error1ms, error2ms, error5ms, error10ms]
error_labels = ["Street Level", "Closest landmark unfiltered"] + [
    f"Closest landmark <= {t}ms" for t in rtt_thresholds_ms]

plot_multiple_cdf(error_series, 10000, 0, None,
                  'Geolocation error (km)', 'CDF of targets', error_labels)
plt.legend(fontsize="10")
plot_save(CLOSE_LANDMARK_FILE_2, is_tight_layout=True)

plot_multiple_cdf(error_series, 10000, 0.1, None,
                  'Geolocation error (km)', 'CDF of targets', error_labels, xscale="log")
plt.legend(fontsize="10")
plot_save(CLOSE_LANDMARK_LOG_FILE_2, is_tight_layout=True)

# Count the targets whose 1 ms-filtered error is within each limit.
# NOTE(review): the limit is compared against error1ms values (distances),
# while the message says "ping" - confirm which one is intended.
n_targets = len(error1ms)
for limit in [1, 5, 10, 40, 9999999999]:
    c = len([err for err in error1ms if err <= limit])
    # Avoid ZeroDivisionError when there are no targets at all
    share = c / n_targets if n_targets else float("nan")
    print(f"{c} targets with landmarks (ping <= {limit}) or {share}")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\milo2\\Desktop\\review\\datasets\\measurements\\street_level\\final_all_res.json'

## Step X: Comparison with geolocation databases (MaxMind, IPinfo)

In [None]:
# Geolocation database results (IPinfo, MaxMind) and the probes -> anchors
# results they are compared against in the next cell.
ip_info_geo, mm_geo, errors_threshold_probes_to_anchors = map(
    load_json,
    (IP_INFO_GEO_FILE, MAXMIND_GEO_FILE, PROBES_TO_ANCHORS_RESULT_FILE),
)

In [None]:
# Compare the "All VPs" error with the error of two geolocation databases,
# measured as the distance between each anchor's coordinates and the
# location each database reports for it.
error_threshold_cdfs_p_to_a, circles_threshold_cdfs_p_to_a, _ = compute_error_threshold_cdfs(
    errors_threshold_probes_to_anchors)

# Anchors may lack "address_v4" or carry None for it (the country-mapping cell
# guards against both); such entries would break the sort key below.
anchors_with_ipv4 = [a for a in anchors if a.get("address_v4") is not None]

maxmind_error = {}
ip_info_error = {}
for anchor in sorted(anchors_with_ipv4, key=lambda a: a["address_v4"]):
    ip = anchor["address_v4"]
    if ip in removed_probes:
        continue

    # Same location guard as the country-mapping cell
    if "geometry" not in anchor or "coordinates" not in anchor["geometry"]:
        continue

    # Coordinates are stored as (longitude, latitude); haversine gets (lat, lon)
    lon, lat = anchor["geometry"]["coordinates"]
    if ip in mm_geo:
        maxmind_error[ip] = haversine(mm_geo[ip], (lat, lon))

    if ip in ip_info_geo:
        # "loc" is a "lat,lon" string
        ipinfo_lat, ipinfo_lon = (
            float(part) for part in ip_info_geo[ip]["loc"].split(","))
        ip_info_error[ip] = haversine((ipinfo_lat, ipinfo_lon), (lat, lon))

Ys = [error_threshold_cdfs_p_to_a[0], list(
    maxmind_error.values()), list(ip_info_error.values())]
print([len(Y) for Y in Ys])
labels = ["All VPs", "Maxmind (Free)", "IPinfo"]
fig, ax = plot_multiple_cdf(Ys, 10000, 1, 10000,
                            "Geolocation error (km)",
                            "CDF of targets",
                            xscale="log",
                            yscale="linear",
                            legend=labels)
homogenize_legend(ax, "lower right")

ofile = GEO_DATABASE_FILE
plot_save(ofile, is_tight_layout=True)