# Plot
Plot all the figures of the replication paper.  
Run this notebook after `analysis/million_scale.ipynb`.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import csv

from scipy.stats import pearsonr

from scripts.utils.file_utils import load_json
from scripts.analysis.analysis import compute_error_threshold_cdfs, every_tier_result_and_errors
from scripts.utils.plot_utils import plot_multiple_cdf, homogenize_legend, plot_save, plot_multiple_error_bars, plot_scatter_multiple
from scripts.utils.helpers import haversine, rtt_to_km
from default import *

# Select which family of result files the rest of the notebook works on.
# With repro=True the working names below are bound to the REPRO_* constants,
# otherwise to the USER_* constants (both presumably provided by
# `from default import *` — no other definition is visible in this notebook).
# NOTE(review): several constants used by later cells (e.g.
# CBG_THRESHOLD_VP_SELECTION_FILE, ROUND_BASED_ALGORITHM_FILE, ANALYZABLE_FILE)
# are not rebound here, so this switch has no effect on them.
repro = False
if repro:
    ACCURACY_VS_N_VPS_PROBES_FILE = REPRO_ACCURACY_VS_N_VPS_PROBES_FILE
    ACCURACY_VS_NB_VPS_FILE = REPRO_ACCURACY_VS_NB_VPS_FILE
    ACCURACY_VS_SUBSET_SIZES_FILE = REPRO_ACCURACY_VS_SUBSET_SIZES_FILE
    PROBES_TO_ANCHORS_RESULT_FILE = REPRO_PROBES_TO_ANCHORS_RESULT_FILE
    CBG_THRESHOLD_PROBES_FILE = REPRO_CBG_THRESHOLD_PROBES_FILE
    VP_SELECTION_ALGORITHM_PROBES_1_FILE = REPRO_VP_SELECTION_ALGORITHM_PROBES_1_FILE
    VP_SELECTION_ALGORITHM_PROBES_3_FILE = REPRO_VP_SELECTION_ALGORITHM_PROBES_3_FILE
    VP_SELECTION_ALGORITHM_PROBES_10_FILE = REPRO_VP_SELECTION_ALGORITHM_PROBES_10_FILE
    CBG_THRESHOLD_CONTINENT_FILE = REPRO_CBG_THRESHOLD_CONTINENT_FILE
    CLOSE_LANDMARK_FILE = REPRO_CLOSE_LANDMARK_FILE
    INVALID_RTT_FILE = REPRO_INVALID_RTT_FILE
else:
    ACCURACY_VS_N_VPS_PROBES_FILE = USER_ACCURACY_VS_N_VPS_PROBES_FILE
    ACCURACY_VS_NB_VPS_FILE = USER_ACCURACY_VS_NB_VPS_FILE
    ACCURACY_VS_SUBSET_SIZES_FILE = USER_ACCURACY_VS_SUBSET_SIZES_FILE
    PROBES_TO_ANCHORS_RESULT_FILE = USER_PROBES_TO_ANCHORS_RESULT_FILE
    CBG_THRESHOLD_PROBES_FILE = USER_CBG_THRESHOLD_PROBES_FILE
    VP_SELECTION_ALGORITHM_PROBES_1_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_1_FILE
    VP_SELECTION_ALGORITHM_PROBES_3_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_3_FILE
    VP_SELECTION_ALGORITHM_PROBES_10_FILE = USER_VP_SELECTION_ALGORITHM_PROBES_10_FILE
    CBG_THRESHOLD_CONTINENT_FILE = USER_CBG_THRESHOLD_CONTINENT_FILE
    CLOSE_LANDMARK_FILE = USER_CLOSE_LANDMARK_FILE
    INVALID_RTT_FILE = USER_INVALID_RTT_FILE


## Accuracy vs number of VPs and subset sizes

In [2]:
# Load the per-subset-size error lists and convert the JSON string keys to ints
# so that they sort numerically.
accuracy_vs_n_vps_probes = {
    int(n_vps): errors
    for n_vps, errors in load_json(ACCURACY_VS_N_VPS_PROBES_FILE).items()
}

# X: subset sizes in ascending order; Ys: the matching error lists.
X = sorted(accuracy_vs_n_vps_probes)
Ys = [accuracy_vs_n_vps_probes[n_vps] for n_vps in X]

# Both are wrapped in an outer list because the plotting helper takes one
# series per entry (a single series here).
Ys_med = [[np.median(errors) for errors in Ys]]
Ys_err = [[np.std(errors) for errors in Ys]]

### Fig 2.a of the replication paper

In [3]:
# Fig 2.a: median geolocation error with error bars (Ys_err) against the
# number of vantage points, on log-log axes. The figure is written to
# ACCURACY_VS_NB_VPS_FILE.
fig, ax = plot_multiple_error_bars(
    X, Ys_med, Ys_err,
    xmin=10, xmax=10500,
    ymin=1, ymax=10000,
    xlabel="Number of VPs",
    ylabel="Geolocation error (km)",
    xscale="log", yscale="log",
    labels=[""],
)
homogenize_legend(ax, "lower right")
plot_save(ACCURACY_VS_NB_VPS_FILE, is_tight_layout=True)

  miny = np.nanmin(masked_verts[..., 1])
  maxy = np.nanmax(masked_verts[..., 1])
2023-09-13 16:46:47::INFO:fontTools.subset:__init__:: maxp pruned
2023-09-13 16:46:47::INFO:fontTools.subset:__init__:: cmap pruned
2023-09-13 16:46:47::INFO:fontTools.subset:__init__:: kern dropped
2023-09-13 16:46:47::INFO:fontTools.subset:__init__:: post pruned
2023-09-13 16:46:47::INFO:fontTools.subset:__init__:: FFTM dropped
2023-09-13 16:46:47::INFO:fontTools.subset:__init__:: GPOS pruned
2023-09-13 16:46:47::INFO:fontTools.subset:__init__:: GSUB pruned
2023-09-13 16:46:47::INFO:fontTools.subset:__init__:: glyf pruned
2023-09-13 16:46:47::INFO:fontTools.subset:__init__:: Added gid0 to subset
2023-09-13 16:46:47::INFO:fontTools.subset:__init__:: Added first four glyphs to subset
2023-09-13 16:46:47::INFO:fontTools.subset:__init__:: Closing glyph list over 'GSUB': 9 glyphs before
2023-09-13 16:46:47::INFO:fontTools.subset:__init__:: Glyph names: ['.notdef', '.null', 'four', 'nonmarkingreturn', 'one', 

### Fig 2.b of the replication paper

In [4]:
# Fig 2.b: one CDF of geolocation error per selected VP subset size.
subset_sizes = [100]

labels = []
Ys = []
for size in subset_sizes:
    labels.append(f"{size} VPs")
    Ys.append(accuracy_vs_n_vps_probes[size])

fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of median error",
    xscale="log", yscale="linear",
    legend=labels,
)
homogenize_legend(ax, "lower right")

ofile = ACCURACY_VS_SUBSET_SIZES_FILE
plot_save(ofile, is_tight_layout=True)

2023-09-13 16:46:48::INFO:fontTools.subset:__init__:: maxp pruned
2023-09-13 16:46:48::INFO:fontTools.subset:__init__:: cmap pruned
2023-09-13 16:46:48::INFO:fontTools.subset:__init__:: kern dropped
2023-09-13 16:46:48::INFO:fontTools.subset:__init__:: post pruned
2023-09-13 16:46:48::INFO:fontTools.subset:__init__:: FFTM dropped
2023-09-13 16:46:48::INFO:fontTools.subset:__init__:: GPOS pruned
2023-09-13 16:46:48::INFO:fontTools.subset:__init__:: GSUB pruned
2023-09-13 16:46:48::INFO:fontTools.subset:__init__:: glyf pruned
2023-09-13 16:46:48::INFO:fontTools.subset:__init__:: Added gid0 to subset
2023-09-13 16:46:48::INFO:fontTools.subset:__init__:: Added first four glyphs to subset
2023-09-13 16:46:48::INFO:fontTools.subset:__init__:: Closing glyph list over 'GSUB': 12 glyphs before
2023-09-13 16:46:48::INFO:fontTools.subset:__init__:: Glyph names: ['.notdef', '.null', 'eight', 'four', 'nonmarkingreturn', 'one', 'period', 'six', 'space', 'three', 'two', 'zero']
2023-09-13 16:46:48::I

## CBG with VPs threshold

### Fig 2.c of the replication paper

In [5]:
# Fig 2.c: CDF of geolocation error, one curve per VP distance threshold.
errors_threshold_probes_to_anchors = load_json(PROBES_TO_ANCHORS_RESULT_FILE)

(error_threshold_cdfs_p_to_a,
 circles_threshold_cdfs_p_to_a,
 _) = compute_error_threshold_cdfs(errors_threshold_probes_to_anchors)

Ys = error_threshold_cdfs_p_to_a
# Sanity print: size of the first series (the one labelled "All VPs").
print(len(error_threshold_cdfs_p_to_a[0]))

# First curve is the unfiltered one; the others are labelled by their
# strictly positive threshold.
labels = ["All VPs"] + [f"VPs > {t} km" for t in THRESHOLD_DISTANCES if t > 0]

fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of targets",
    xscale="log", yscale="linear",
    legend=labels,
)
homogenize_legend(ax, "lower right", legend_size=12)

ofile = CBG_THRESHOLD_PROBES_FILE
plot_save(ofile, is_tight_layout=True)

2023-09-13 16:46:49::INFO:root:analysis:: Threshold 0 no geolocation 0
2023-09-13 16:46:49::INFO:root:analysis:: Threshold 40 no geolocation 0
2023-09-13 16:46:49::INFO:root:analysis:: Threshold 100 no geolocation 0
2023-09-13 16:46:49::INFO:root:analysis:: Threshold 500 no geolocation 1
2023-09-13 16:46:49::INFO:root:analysis:: Threshold 1000 no geolocation 1


1


  return n/db/n.sum(), bin_edges
2023-09-13 16:46:50::INFO:fontTools.subset:__init__:: maxp pruned
2023-09-13 16:46:50::INFO:fontTools.subset:__init__:: cmap pruned
2023-09-13 16:46:50::INFO:fontTools.subset:__init__:: kern dropped
2023-09-13 16:46:50::INFO:fontTools.subset:__init__:: post pruned
2023-09-13 16:46:50::INFO:fontTools.subset:__init__:: FFTM dropped
2023-09-13 16:46:50::INFO:fontTools.subset:__init__:: GPOS pruned
2023-09-13 16:46:51::INFO:fontTools.subset:__init__:: GSUB pruned
2023-09-13 16:46:51::INFO:fontTools.subset:__init__:: glyf pruned
2023-09-13 16:46:51::INFO:fontTools.subset:__init__:: Added gid0 to subset
2023-09-13 16:46:51::INFO:fontTools.subset:__init__:: Added first four glyphs to subset
2023-09-13 16:46:51::INFO:fontTools.subset:__init__:: Closing glyph list over 'GSUB': 21 glyphs before
2023-09-13 16:46:51::INFO:fontTools.subset:__init__:: Glyph names: ['.notdef', '.null', 'A', 'P', 'V', 'eight', 'five', 'four', 'greater', 'k', 'l', 'm', 'nonmarkingreturn

## CBG performance with original VP selection algorithm and new VP selection algorithm

### Fig 3.a of the replication paper

In [None]:
# Fig 3.a: CDF of geolocation error when only the 1, 3 or 10 closest VPs
# (by RTT) are used, plus an "All VPs" baseline.
# Relies on errors_threshold_probes_to_anchors loaded by the Fig 2.c cell.
Ys = []
labels = []
results_file = [
    VP_SELECTION_ALGORITHM_PROBES_1_FILE,
    VP_SELECTION_ALGORITHM_PROBES_3_FILE,
    VP_SELECTION_ALGORITHM_PROBES_10_FILE,
]
index = [1, 3, 10]

for n_vps, result_path in zip(index, results_file):
    errors_threshold_vp_selection_algorithm = load_json(result_path)
    (error_threshold_cdfs_p_to_a_vp_selection,
     circles_threshold_cdfs_p_to_a_vp_selection,
     _) = compute_error_threshold_cdfs(errors_threshold_vp_selection_algorithm)
    Ys.append(list(error_threshold_cdfs_p_to_a_vp_selection[0]))
    labels.append(f"{n_vps} closest VP (RTT)")

    if n_vps != 10:
        continue

    # Take the baseline where 10 VPs are used to geolocate a target: the
    # all-VP results are recomputed with the 10-VP results passed as the
    # second argument of compute_error_threshold_cdfs.
    (error_threshold_cdfs_p_to_a,
     circles_threshold_cdfs_p_to_a,
     _) = compute_error_threshold_cdfs(
        errors_threshold_probes_to_anchors,
        errors_threshold_vp_selection_algorithm)
    Ys.append(list(error_threshold_cdfs_p_to_a[0]))
    labels.append("All VPs")

fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of targets",
    xscale="log", yscale="linear",
    legend=labels,
)
homogenize_legend(ax, "lower right")

ofile = CBG_THRESHOLD_VP_SELECTION_FILE
plot_save(ofile, is_tight_layout=True)

### Fig 3.b of the replication paper

In [None]:
# Fig 3.b: CDF of geolocation error of the round-based algorithm for each
# number of first-round VPs, compared with the "All VPs" curve.
round_based_algorithm_results = {
    int(n_first_round): rows
    for n_first_round, rows in load_json(ROUND_BASED_ALGORITHM_FILE).items()
}

errors_threshold_probes_to_anchors = load_json(PROBES_TO_ANCHORS_RESULT_FILE)
(error_threshold_cdfs_p_to_a,
 circles_threshold_cdfs_p_to_a,
 _) = compute_error_threshold_cdfs(errors_threshold_probes_to_anchors)

Ys_error = [error_threshold_cdfs_p_to_a[0]]
Ys_n_vps = []

labels_error = ["All VPs"]
labels_n_vps = []

# Keys are already ints, so iterating the sorted keys gives ascending order.
for tier1_vps in sorted(round_based_algorithm_results):
    results = round_based_algorithm_results[tier1_vps]

    # r[1] is collected as the error and r[2] (plus the first-round VPs) as
    # the VP count; None entries are skipped independently for each list.
    error_cdf = []
    n_vps_cdf = []
    for r in results:
        if r[1] is not None:
            error_cdf.append(r[1])
        if r[2] is not None:
            n_vps_cdf.append(r[2] + tier1_vps)

    label = f"{tier1_vps} VPs"
    labels_error.append(label)
    labels_n_vps.append(label)
    Ys_error.append(error_cdf)
    Ys_n_vps.append(n_vps_cdf)
    # NOTE(review): the factor 3 is not explained in this notebook — confirm
    # what it stands for.
    print(tier1_vps, 3 * sum(n_vps_cdf))

fig, ax = plot_multiple_cdf(
    Ys_error, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of targets",
    xscale="log", yscale="linear",
    legend=labels_error,
)
homogenize_legend(ax, "lower right")

ofile = ROUND_ALGORITHM_ERROR_FILE
plot_save(ofile, is_tight_layout=True)

2023-09-13 14:53:14::INFO:root:analysis:: Threshold 0 no geolocation 0
2023-09-13 14:53:14::INFO:root:analysis:: Threshold 40 no geolocation 0
2023-09-13 14:53:14::INFO:root:analysis:: Threshold 100 no geolocation 0
2023-09-13 14:53:14::INFO:root:analysis:: Threshold 500 no geolocation 1
2023-09-13 14:53:14::INFO:root:analysis:: Threshold 1000 no geolocation 3


10 5785182
100 4459050
300 3205290
500 2800245
1000 2817933


2023-09-13 14:53:15::INFO:fontTools.subset:__init__:: maxp pruned
2023-09-13 14:53:15::INFO:fontTools.subset:__init__:: cmap pruned
2023-09-13 14:53:15::INFO:fontTools.subset:__init__:: kern dropped
2023-09-13 14:53:15::INFO:fontTools.subset:__init__:: post pruned
2023-09-13 14:53:15::INFO:fontTools.subset:__init__:: FFTM dropped
2023-09-13 14:53:15::INFO:fontTools.subset:__init__:: GPOS pruned
2023-09-13 14:53:15::INFO:fontTools.subset:__init__:: GSUB pruned
2023-09-13 14:53:15::INFO:fontTools.subset:__init__:: glyf pruned
2023-09-13 14:53:15::INFO:fontTools.subset:__init__:: Added gid0 to subset
2023-09-13 14:53:15::INFO:fontTools.subset:__init__:: Added first four glyphs to subset
2023-09-13 14:53:15::INFO:fontTools.subset:__init__:: Closing glyph list over 'GSUB': 18 glyphs before
2023-09-13 14:53:15::INFO:fontTools.subset:__init__:: Glyph names: ['.notdef', '.null', 'A', 'P', 'V', 'eight', 'five', 'four', 'l', 'nonmarkingreturn', 'one', 'period', 's', 'six', 'space', 'three', 'two

## Error per continent

### Fig 4 of the replication paper

In [None]:
def iso_code_2_to_country(csv_file=None):
    """Build two lookups keyed by two-letter country code from a countries CSV.

    The file is expected to have a header row followed by rows laid out as
    Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,
    Three_Letter_Country_Code,Country_Number.

    Args:
        csv_file: Path of the CSV file to read. Defaults to the module-level
            COUNTRIES_CSV_FILE when omitted.

    Returns:
        A tuple ``(continent_by_iso_2, country_by_iso_2)`` where the first dict
        maps a two-letter country code to its continent code and the second
        maps it to the country name (only the part before the first comma).

    Raises:
        OSError: If the file cannot be opened.
    """
    if csv_file is None:
        csv_file = COUNTRIES_CSV_FILE

    country_by_iso_2 = {}
    continent_by_iso_2 = {}
    # newline="" is what the csv module requires for correct handling of
    # quoted fields; the explicit encoding keeps non-ASCII country names
    # readable regardless of the platform's default locale.
    with open(csv_file, newline="", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter=",", quotechar='"')
        next(reader, None)  # skip the header row
        for line in reader:
            # Blank or truncated rows carry no usable code; skip them instead
            # of failing with IndexError.
            if len(line) < 4:
                continue
            continent_code = line[1]
            # Names such as "France, French Republic" are shortened to "France".
            country_name = line[2].split(",")[0]
            country_iso_code_2 = line[3]
            country_by_iso_2[country_iso_code_2] = country_name
            continent_by_iso_2[country_iso_code_2] = continent_code
    return continent_by_iso_2, country_by_iso_2

In [None]:
anchors = load_json(REPRO_ANCHORS_FILE)

# NOTE: despite its name, ip_per_country maps an IPv4 address to the anchor's
# country code. Only anchors with a non-null IPv4 address and a geometry that
# has coordinates are kept.
ip_per_country = {}
for anchor in anchors:
    if "address_v4" not in anchor or "geometry" not in anchor:
        continue
    if "coordinates" not in anchor["geometry"]:
        continue
    ip_v4_address = anchor["address_v4"]
    if ip_v4_address is not None:
        ip_per_country[ip_v4_address] = anchor["country_code"]

# NOTE: despite its name, country_per_ip maps a country code to the list of
# IPv4 addresses located in it (the inverse of the dict above).
country_per_ip = {}
for ip, country in ip_per_country.items():
    if country not in country_per_ip:
        country_per_ip[country] = []
    country_per_ip[country].append(ip)

In [None]:
# Compute results per continent

# Fig 4: CDF of geolocation error per continent, restricted to the anchors
# listed in ANCHORS_SECOND_PAPER_FILE.
# Relies on ip_per_country / country_per_ip built in the previous cell.

errors_threshold_probes_to_anchors = load_json(PROBES_TO_ANCHORS_RESULT_FILE)

continent_by_iso_2, country_by_iso_2 = iso_code_2_to_country()

_, _, error_per_ip = compute_error_threshold_cdfs(errors_threshold_probes_to_anchors)

error_per_continent_cdf = {}
error_per_country_cdf = {}

# Match the anchors of the second replicated paper.
# Kept as a set: it is only used for membership tests inside the loop below,
# and a list would make every test a linear scan.
anchors_second = set(load_json(ANCHORS_SECOND_PAPER_FILE))
for ip, error in error_per_ip.items():
    if ip not in anchors_second:
        continue
    country = ip_per_country[ip]
    continent = continent_by_iso_2[country]
    error_per_continent_cdf.setdefault(continent, []).append(error)
    error_per_country_cdf.setdefault(country, []).append(error)

# Per country name: (median error, number of targets with an error,
# number of anchor IPs known for that country).
error_per_country_cdf_med = {
    country_by_iso_2[x]: (
        np.median(error_per_country_cdf[x]),
        len(error_per_country_cdf[x]),
        len(country_per_ip[x]),
    )
    for x in error_per_country_cdf
}

# Worst median error first.
error_per_country_cdf_med_sorted = sorted(
    error_per_country_cdf_med.items(), key=lambda x: x[1][0], reverse=True)
print(error_per_country_cdf_med_sorted)

Ys = [list(error_per_continent_cdf[c])
        for c in error_per_continent_cdf]
# Legend entries show the continent code and its number of targets.
labels = [
    f"{c} ({len(error_per_continent_cdf[c])})" for c in error_per_continent_cdf]
fig, ax = plot_multiple_cdf(Ys, 10000, 1, 10000,
                            "Geolocation error (km)",
                            "CDF of targets",
                            xscale="log",
                            yscale="linear",
                            legend=labels)
homogenize_legend(ax, "lower right")
ofile = CBG_THRESHOLD_CONTINENT_FILE
plot_save(ofile, is_tight_layout=True)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/hugo/Documents/geoloc/geoloc-imc-2023/datasets/static_datasets/iso_code_2.csv'

## Performance of the street level technique

### Fig 5.a of the replication paper

In [2]:
# Fig 5.a: CDFs of error1, error3 and error4 as returned by
# every_tier_result_and_errors, labelled "CBG", "Street Level" and
# "Closest Landmark" respectively, plus summary statistics.
data = load_json(ANALYZABLE_FILE)

error1, error2, error3, error4 = [], [], [], []
filtered_error1, filtered_error2, filtered_error3, filtered_error4 = [], [], [], []

for d in data.values():
    errors = every_tier_result_and_errors(d)
    error1.append(errors['error1'])
    error2.append(errors['error2'])
    error3.append(errors['error3'])
    error4.append(errors['error4'])
    # Same errors, restricted to targets whose tier 1 completed and that
    # have at least one tier-2 landmark.
    if d['tier1:done'] and 'tier2:landmarks' in d and len(d['tier2:landmarks']) > 0:
        filtered_error1.append(errors['error1'])
        filtered_error2.append(errors['error2'])
        filtered_error3.append(errors['error3'])
        filtered_error4.append(errors['error4'])

# "Street level" below means an error of at most 1 km.
print(sum(1 for e in error4 if e <= 1))

street_lvl_count_cbg = sum(1 for e in error1 if e <= 1)
street_lvl_count_tech = sum(1 for e in error3 if e <= 1)
print(f"{street_lvl_count_cbg} targets are geolocated at street lvl using CBG {street_lvl_count_cbg/len(error1)}")
print(f"{street_lvl_count_tech} targets are geolocated at street lvl using tech {street_lvl_count_tech/len(error3)}")

median1, median2, median3, median4 = (
    np.median(error1),
    np.median(error2),
    np.median(error3),
    np.median(error4),
)

print(f"CBG median error = {median1}")
print(f"Median error of tier 2 = {median2}")
print(f"Street lvl median error = {median3}")
print(f"closest landmark distance median = {median4}")

less_then_1 = sum(1 for e in error3 if e <= 1)
less_then_1_lm = sum(1 for e in error4 if e <= 1)
print(f"{less_then_1} targets are geolocated at street lvl out of {len(error3)} or {less_then_1*100/len(error3)}%")
print(f"{less_then_1_lm} targets has a landmark at street lvl out of {len(error4)} or {less_then_1_lm*100/len(error4)}%")

plot_multiple_cdf(
    [error3, error1, error4], 10000, 0.1, None,
    'Geolocation error (km)',
    'CDF of targets',
    ["Street Level", "CBG", "Closest Landmark"],
    xscale="log",
)
plt.legend(fontsize="14")
plot_save(CLOSE_LANDMARK_FILE, is_tight_layout=True)

2023-09-14 13:22:57::INFO:root:analysis:: Tier1 Failed


207
77 targets are geolocated at street lvl using CBG 0.10650069156293222
17 targets are geolocated at street lvl using tech 0.02351313969571231
CBG median error = 29.368729465989418
Median error of tier 2 = 34.41749100848292
Street lvl median error = 27.917440080911355
closest landmark distance median = 3.316187115363793
17 targets are geolocated at street lvl out of 723 or 2.351313969571231%
207 targets has a landmark at street lvl out of 723 or 28.630705394190873%


2023-09-14 13:23:04::INFO:fontTools.subset:__init__:: maxp pruned
2023-09-14 13:23:04::INFO:fontTools.subset:__init__:: cmap pruned
2023-09-14 13:23:04::INFO:fontTools.subset:__init__:: kern dropped
2023-09-14 13:23:04::INFO:fontTools.subset:__init__:: post pruned
2023-09-14 13:23:04::INFO:fontTools.subset:__init__:: FFTM dropped
2023-09-14 13:23:04::INFO:fontTools.subset:__init__:: GPOS pruned
2023-09-14 13:23:04::INFO:fontTools.subset:__init__:: GSUB pruned
2023-09-14 13:23:04::INFO:fontTools.subset:__init__:: glyf pruned
2023-09-14 13:23:04::INFO:fontTools.subset:__init__:: Added gid0 to subset
2023-09-14 13:23:04::INFO:fontTools.subset:__init__:: Added first four glyphs to subset
2023-09-14 13:23:04::INFO:fontTools.subset:__init__:: Closing glyph list over 'GSUB': 30 glyphs before
2023-09-14 13:23:04::INFO:fontTools.subset:__init__:: Glyph names: ['.notdef', '.null', 'B', 'C', 'G', 'L', 'S', 'a', 'd', 'e', 'eight', 'four', 'k', 'l', 'm', 'minus', 'n', 'nonmarkingreturn', 'o', 'one'

### Fig 5.c of the replication paper

In [3]:
# Fig 5.c: scatter of measured distance (derived from traceroute delays)
# against geographical distance between landmarks and the target, for one
# example target in each of four street-level error bands.
data = load_json(ANALYZABLE_FILE)

correlations = []
mdvd = {}
scater_plot_data = {}
for target_ip, d in data.items():
    # Per landmark IP: (measured distance, geographical distance), keeping the
    # entry with the smallest measured distance seen for that landmark.
    tmp_landmarks = {}
    for f in ['tier2:traceroutes', 'tier3:traceroutes']:
        if f in d:
            for t in d[f]:
                # Negative t[4] values are skipped (the Fig 6.a cell counts
                # the same condition as an unusable delay).
                if t[4] < 0:
                    continue
                landmarks_ip = t[2]
                # NOTE(review): the meaning of the 4/9 and 300 arguments is
                # defined by rtt_to_km, which is not visible here — confirm.
                measured_distance = rtt_to_km(t[4], 4/9, 300)
                # Distance between (t[5], t[6]) and the target's RIPE
                # coordinates; presumably (lat, lon) of the landmark.
                distance = haversine(
                    (t[5], t[6]), (d['RIPE:lat'], d['RIPE:lon']))
                if landmarks_ip not in tmp_landmarks:
                    tmp_landmarks[landmarks_ip] = (
                        measured_distance, distance)
                if measured_distance < tmp_landmarks[landmarks_ip][0]:
                    tmp_landmarks[landmarks_ip] = (
                        measured_distance, distance)
    if len(tmp_landmarks) != 0:
        # 'md' = measured distances, 'd' = geographical distances, index-aligned.
        tmp_dict = {'md': [], 'd': []}
        for k, v in tmp_landmarks.items():
            # Keep only the first landmark seen for each distinct geographical
            # distance value (exact float equality).
            all_diff = True
            for i in range(len(tmp_dict['d'])):
                if v[1] == tmp_dict['d'][i]:
                    all_diff = False
            if all_diff:
                tmp_dict['md'].append(v[0])
                tmp_dict['d'].append(v[1])
        if len(tmp_dict['md']) > 1:
            # NOTE(review): with exactly two points the Pearson coefficient is
            # always +1 or -1 (or undefined), so targets with two landmarks
            # pull the min/max reported below to the extremes — consider a
            # higher minimum number of points.
            correlation = pearsonr(tmp_dict['md'], tmp_dict['d'])[0]
            tmp_dict['correlation'] = correlation
            correlations.append(correlation)
            # NOTE(review): keyed by d['target_ip'] here but by the dict key
            # target_ip for scater_plot_data below — confirm they are equal.
            mdvd[d['target_ip']] = tmp_dict
        if len(tmp_dict['md']) >= 5:  # and len(tmp_dict['md']) <= 15:
            # Candidate for the scatter plot: at least 5 distinct points and
            # an error3 below 45.
            error = every_tier_result_and_errors(d)
            if error['error3'] < 45:
                scater_plot_data[target_ip] = {
                    'geo_loc_data': d, 'error_data': error, 'mdvd_data': tmp_dict}

# min()/max() raise ValueError if no target produced a correlation.
medianc = np.median(correlations)
minc = min(correlations)
maxc = max(correlations)

print(f"Measured Distance vs Distance median correlation = {medianc}")
print(f"Measured Distance vs Distance min correlation = {minc}")
print(f"Measured Distance vs Distance max correlation = {maxc}")


# One (x, y) series per error band: x = geographical distances,
# y = measured distances of the selected target.
x1 = []
x2 = []
x3 = []
x4 = []
y1 = []
y2 = []
y3 = []
y4 = []
# For each band, among the candidates whose error3 differs from error1, keep
# the one with the fewest points (the first one encountered wins ties).
for _, d in scater_plot_data.items():
    # Band 1: error3 below 1.
    if d['error_data']['error3'] != d['error_data']['error1'] and d['error_data']['error3'] < 1:
        if len(x1) == 0:
            x1 = d['mdvd_data']['d']
            y1 = d['mdvd_data']['md']
        if len(x1) > len(d['mdvd_data']['d']):
            x1 = d['mdvd_data']['d']
            y1 = d['mdvd_data']['md']
    # Band 2: error3 strictly between 4 and 6.
    if d['error_data']['error3'] != d['error_data']['error1'] and d['error_data']['error3'] < 6 and d['error_data']['error3'] > 4:
        if len(x2) == 0:
            x2 = d['mdvd_data']['d']
            y2 = d['mdvd_data']['md']
        if len(x2) > len(d['mdvd_data']['d']):
            x2 = d['mdvd_data']['d']
            y2 = d['mdvd_data']['md']
    # Band 3: error3 strictly between 9 and 11.
    if d['error_data']['error3'] != d['error_data']['error1'] and d['error_data']['error3'] < 11 and d['error_data']['error3'] > 9:
        if len(x3) == 0:
            x3 = d['mdvd_data']['d']
            y3 = d['mdvd_data']['md']
        if len(x3) > len(d['mdvd_data']['d']):
            x3 = d['mdvd_data']['d']
            y3 = d['mdvd_data']['md']
    # Band 4: error3 strictly between 39 and 41.
    if d['error_data']['error3'] != d['error_data']['error1'] and d['error_data']['error3'] < 41 and d['error_data']['error3'] > 39:
        if len(x4) == 0:
            x4 = d['mdvd_data']['d']
            y4 = d['mdvd_data']['md']
        if len(x4) > len(d['mdvd_data']['d']):
            x4 = d['mdvd_data']['d']
            y4 = d['mdvd_data']['md']

# Colour, marker and legend label for the four bands, in band order.
list_color = ['r', 'b', 'g', 'y']
list_mak = ['o', '*', 'x', '+']
list_lab = ['< 1 km error', '5 km error', '10 km error', '40 km error']
plot_scatter_multiple([x1, x2, x3, x4], [y1, y2, y3, y4], None, None, 1, None, "log", "log",
                        'Geographical distance (km)', 'Measured distance (km)', list_mak, list_color, [10, 10, 10, 10])
plt.legend(list_lab, fontsize="14")
plot_save(SCATTER_DISTANCE_FILE, is_tight_layout=True)

Measured Distance vs Distance median correlation = 0.07190527319632016
Measured Distance vs Distance min correlation = -1.0
Measured Distance vs Distance max correlation = 1.0


2023-09-14 13:23:15::INFO:fontTools.subset:__init__:: maxp pruned
2023-09-14 13:23:15::INFO:fontTools.subset:__init__:: cmap pruned
2023-09-14 13:23:15::INFO:fontTools.subset:__init__:: kern dropped
2023-09-14 13:23:15::INFO:fontTools.subset:__init__:: post pruned
2023-09-14 13:23:15::INFO:fontTools.subset:__init__:: FFTM dropped
2023-09-14 13:23:15::INFO:fontTools.subset:__init__:: GPOS pruned
2023-09-14 13:23:15::INFO:fontTools.subset:__init__:: GSUB pruned
2023-09-14 13:23:15::INFO:fontTools.subset:__init__:: glyf pruned
2023-09-14 13:23:15::INFO:fontTools.subset:__init__:: Added gid0 to subset
2023-09-14 13:23:15::INFO:fontTools.subset:__init__:: Added first four glyphs to subset
2023-09-14 13:23:15::INFO:fontTools.subset:__init__:: Closing glyph list over 'GSUB': 16 glyphs before
2023-09-14 13:23:15::INFO:fontTools.subset:__init__:: Glyph names: ['.notdef', '.null', 'e', 'five', 'four', 'k', 'less', 'm', 'nonmarkingreturn', 'o', 'one', 'r', 'space', 'three', 'two', 'zero']
2023-09

## Fraction of landmarks per target with unusable delays

### Fig 6.a of the replication paper

In [4]:
# Fig 6.a: for every target, the fraction of its tier-2/tier-3 traceroutes
# whose t[4] value is negative (plotted as "D1 + D2 < 0").
data = load_json(ANALYZABLE_FILE)

values = []
all_traceroutes_count = 0
no_r1_traceroutes_count = 0

for _, d in data.items():
    good = 0
    bad = 0

    for f in ['tier2:traceroutes', 'tier3:traceroutes']:
        if f in d:
            for t in d[f]:
                if t[4] < 0:
                    bad += 1
                else:
                    good += 1

                all_traceroutes_count += 1
                # Identity test is the correct way to detect a missing t[3]
                # (reported below as "no r1 found").
                if t[3] is None:
                    no_r1_traceroutes_count += 1

    # Targets without any traceroute are left out of the CDF (and would
    # otherwise divide by zero).
    total = good + bad
    if total != 0:
        values.append(bad/total)

print(f"{no_r1_traceroutes_count} no r1 found out of {all_traceroutes_count}")
plot_multiple_cdf([values], 10000, 0, 1,
                    'Fraction of landmarks with\nD1 + D2 < 0', 'CDF of targets', None)
plot_save(INVALID_RTT_FILE, is_tight_layout=True)

44833 no r1 found out of 143601


2023-09-14 13:23:18::INFO:fontTools.subset:__init__:: maxp pruned
2023-09-14 13:23:18::INFO:fontTools.subset:__init__:: cmap pruned
2023-09-14 13:23:18::INFO:fontTools.subset:__init__:: kern dropped
2023-09-14 13:23:18::INFO:fontTools.subset:__init__:: post pruned
2023-09-14 13:23:18::INFO:fontTools.subset:__init__:: FFTM dropped
2023-09-14 13:23:18::INFO:fontTools.subset:__init__:: GPOS pruned
2023-09-14 13:23:18::INFO:fontTools.subset:__init__:: GSUB pruned
2023-09-14 13:23:18::INFO:fontTools.subset:__init__:: glyf pruned
2023-09-14 13:23:18::INFO:fontTools.subset:__init__:: Added gid0 to subset
2023-09-14 13:23:18::INFO:fontTools.subset:__init__:: Added first four glyphs to subset
2023-09-14 13:23:18::INFO:fontTools.subset:__init__:: Closing glyph list over 'GSUB': 11 glyphs before
2023-09-14 13:23:18::INFO:fontTools.subset:__init__:: Glyph names: ['.notdef', '.null', 'eight', 'four', 'nonmarkingreturn', 'one', 'period', 'six', 'space', 'two', 'zero']
2023-09-14 13:23:18::INFO:fontT

## Error distance vs population density

In [6]:
# Build two index-aligned lists for the targets present in both files:
# the population density and the error3 value of each target.
data = load_json(ANALYZABLE_FILE)
pop_data = load_json(POPULATION_CITY_FILE)

dens_lst = []
error_lst = []
for entry in pop_data:
    ip = entry['target_ip']
    if ip in data:
        dens_lst.append(entry['density'])
        error_lst.append(every_tier_result_and_errors(data[ip])['error3'])

2023-09-14 13:41:58::INFO:root:analysis:: Tier1 Failed


### Fig 6.b of the replication paper

In [7]:
# Fig 6.b: population density against error distance on log-log axes, with a
# least-squares polynomial fit drawn on top.
fig, ax = plot_scatter_multiple([error_lst], [dens_lst], 0.1, 10000, 0.1, 100000, "log",
                                "log", 'Error distance (km)', 'Population Density (people/km²)', ["x"], ["b"], [10])
degree = 1
coef = np.polyfit(error_lst, dens_lst, deg=degree)
# Sample the fitted curve at points evenly spaced on the log axis, starting at
# 0.1 instead of 0: zero cannot be placed on a log scale, and linear spacing
# leaves the lowest decades with a single segment.
xseq = np.geomspace(0.1, 10000, num=100)
# polyfit returns coefficients highest power first, which is the order
# polyval expects.
yseq = np.polyval(coef, xseq)
ax.plot(xseq, yseq, color="k", lw=2.5)
plot_save(SCATTER_DENSITY_FILE, is_tight_layout=True)

2023-09-14 13:42:02::INFO:fontTools.subset:__init__:: maxp pruned
2023-09-14 13:42:02::INFO:fontTools.subset:__init__:: cmap pruned
2023-09-14 13:42:02::INFO:fontTools.subset:__init__:: kern dropped
2023-09-14 13:42:02::INFO:fontTools.subset:__init__:: post pruned
2023-09-14 13:42:02::INFO:fontTools.subset:__init__:: FFTM dropped
2023-09-14 13:42:02::INFO:fontTools.subset:__init__:: GPOS pruned
2023-09-14 13:42:02::INFO:fontTools.subset:__init__:: GSUB pruned
2023-09-14 13:42:02::INFO:fontTools.subset:__init__:: glyf pruned
2023-09-14 13:42:02::INFO:fontTools.subset:__init__:: Added gid0 to subset
2023-09-14 13:42:02::INFO:fontTools.subset:__init__:: Added first four glyphs to subset
2023-09-14 13:42:02::INFO:fontTools.subset:__init__:: Closing glyph list over 'GSUB': 11 glyphs before
2023-09-14 13:42:02::INFO:fontTools.subset:__init__:: Glyph names: ['.notdef', '.null', 'five', 'four', 'minus', 'nonmarkingreturn', 'one', 'space', 'three', 'two', 'zero']
2023-09-14 13:42:02::INFO:fontT

### Fig 8 of the replication paper (appendix)

In [8]:
# Fig 8 (appendix): CDF of the population density of the targets, log x-axis.
plot_multiple_cdf(
    [dens_lst], 10000, None, None,
    'Population Density (people/km²)',
    'CDF of targets',
    None,
    xscale="log",
)
plot_save(CDF_DENSITY_FILE, is_tight_layout=True)

2023-09-14 13:42:04::INFO:fontTools.subset:__init__:: maxp pruned
2023-09-14 13:42:04::INFO:fontTools.subset:__init__:: cmap pruned
2023-09-14 13:42:04::INFO:fontTools.subset:__init__:: kern dropped
2023-09-14 13:42:04::INFO:fontTools.subset:__init__:: post pruned
2023-09-14 13:42:04::INFO:fontTools.subset:__init__:: FFTM dropped
2023-09-14 13:42:04::INFO:fontTools.subset:__init__:: GPOS pruned
2023-09-14 13:42:04::INFO:fontTools.subset:__init__:: GSUB pruned
2023-09-14 13:42:04::INFO:fontTools.subset:__init__:: glyf pruned
2023-09-14 13:42:04::INFO:fontTools.subset:__init__:: Added gid0 to subset
2023-09-14 13:42:04::INFO:fontTools.subset:__init__:: Added first four glyphs to subset
2023-09-14 13:42:04::INFO:fontTools.subset:__init__:: Closing glyph list over 'GSUB': 13 glyphs before
2023-09-14 13:42:04::INFO:fontTools.subset:__init__:: Glyph names: ['.notdef', '.null', 'eight', 'five', 'four', 'nonmarkingreturn', 'one', 'period', 'six', 'space', 'three', 'two', 'zero']
2023-09-14 13:

## Time to geolocate targets

### Fig 6.c of the replication paper

In [9]:
# Fig 6.c: CDF of the total time (tier 1 + tier 2 + tier 3) needed to
# geolocate a target, plus the median duration of each tier.
data = load_json(ANALYZABLE_FILE)

time1, time2, time3 = [], [], []
values = []
for d in data.values():
    if d['tier1:done'] and 'tier1:duration' in d:
        time1.append(d['tier1:duration'])
    if d['tier2:done'] and 'tier2:duration' in d:
        time2.append(d['tier2:duration'])
    if not (d['tier3:done'] and 'tier3:duration' in d):
        continue
    time3.append(d['tier3:duration'])
    # NOTE(review): tier-1 and tier-2 durations are read without a presence
    # check here; a target with tier 3 done but one of them missing raises
    # KeyError.
    values.append(d['tier1:duration'] +
                    d['tier2:duration']+d['tier3:duration'])

median1, median2, median3 = np.median(time1), np.median(time2), np.median(time3)
median = np.median(values)

print(f"CBG median duration = {median1}")
print(f"tier 2 median duration = {median2}")
print(f"tier 3 median duration = {median3}")
print(f"Street Level median duration = {median}")

plot_multiple_cdf(
    [values], 1000, None, None,
    'Time to geolocate a target (sec)',
    'CDF of targets',
    None,
)
plot_save(TIME_TO_GEOLOCATE_FILE, is_tight_layout=True)

CBG median duration = 0.20766615867614746
tier 2 median duration = 394.77880704402924
tier 3 median duration = 736.932727098465
Street Level median duration = 1210.4626953601837


2023-09-14 13:42:06::INFO:fontTools.subset:__init__:: maxp pruned
2023-09-14 13:42:06::INFO:fontTools.subset:__init__:: cmap pruned
2023-09-14 13:42:06::INFO:fontTools.subset:__init__:: kern dropped
2023-09-14 13:42:06::INFO:fontTools.subset:__init__:: post pruned
2023-09-14 13:42:06::INFO:fontTools.subset:__init__:: FFTM dropped
2023-09-14 13:42:06::INFO:fontTools.subset:__init__:: GPOS pruned
2023-09-14 13:42:06::INFO:fontTools.subset:__init__:: GSUB pruned
2023-09-14 13:42:06::INFO:fontTools.subset:__init__:: glyf pruned
2023-09-14 13:42:06::INFO:fontTools.subset:__init__:: Added gid0 to subset
2023-09-14 13:42:06::INFO:fontTools.subset:__init__:: Added first four glyphs to subset
2023-09-14 13:42:06::INFO:fontTools.subset:__init__:: Closing glyph list over 'GSUB': 12 glyphs before
2023-09-14 13:42:06::INFO:fontTools.subset:__init__:: Glyph names: ['.notdef', '.null', 'eight', 'five', 'four', 'nonmarkingreturn', 'one', 'period', 'six', 'space', 'two', 'zero']
2023-09-14 13:42:06::IN

## Geolocation of CBG with all the RIPE Atlas VPs versus geolocation databases

### Fig 7 of the replication paper

In [10]:
# Inputs for Fig 7 (consumed by the next cell):
# - ip_info_geo: per-IP records whose "loc" field is read as a "lat,long" string
# - mm_geo: per-IP coordinates, passed directly to haversine
# - errors_threshold_probes_to_anchors: the CBG results, as in the cells above
# - removed_probes: collection of IPs to exclude from the comparison
ip_info_geo = load_json(IP_INFO_GEO_FILE)
mm_geo = load_json(MAXMIND_GEO_FILE)
errors_threshold_probes_to_anchors = load_json(PROBES_TO_ANCHORS_RESULT_FILE)
removed_probes = load_json(REMOVED_PROBES_FILE)

FileNotFoundError: [Errno 2] No such file or directory: '/srv/omar/geoloc-imc-2023/datasets/static_datasets/ip_info_geo_anchors.json'

In [None]:
# Fig 7: CDF of geolocation error of CBG with all VPs compared with the
# MaxMind and IPinfo databases, using the anchors' own coordinates as truth.
# Relies on `anchors` loaded by the "Error per continent" cell and on the
# inputs loaded by the previous cell.
error_threshold_cdfs_p_to_a, circles_threshold_cdfs_p_to_a, _ = compute_error_threshold_cdfs(
    errors_threshold_probes_to_anchors)

maxmind_error = {}
ip_info_error = {}

# Anchors without an IPv4 address (missing key or null value) cannot be looked
# up in either database and would make the sort below fail when None is
# compared with a string, so they are dropped first.
anchors_with_ipv4 = [a for a in anchors if a.get("address_v4") is not None]
for anchor in sorted(anchors_with_ipv4, key=lambda a: a["address_v4"]):
    ip = anchor["address_v4"]
    if ip in removed_probes:
        continue

    if "geometry" not in anchor:
        continue

    # Anchor coordinates are stored as (longitude, latitude); haversine is
    # called with (latitude, longitude) pairs.
    long, lat = anchor["geometry"]["coordinates"]
    if ip in mm_geo:
        error = haversine(mm_geo[ip], (lat, long))
        maxmind_error[ip] = error

    if ip in ip_info_geo:
        # "loc" is a "lat,long" string.
        ipinfo_lat, ipinfo_long = ip_info_geo[ip]["loc"].split(",")
        ipinfo_lat, ipinfo_long = float(ipinfo_lat), float(ipinfo_long)
        error = haversine((ipinfo_lat, ipinfo_long), (lat, long))
        ip_info_error[ip] = error

Ys = [error_threshold_cdfs_p_to_a[0], list(
    maxmind_error.values()), list(ip_info_error.values())]
# Number of targets behind each curve.
print([len(Y) for Y in Ys])
labels = ["All VPs", "Maxmind (Free)", "IPinfo"]
fig, ax = plot_multiple_cdf(Ys, 10000, 1, 10000,
                            "Geolocation error (km)",
                            "CDF of targets",
                            xscale="log",
                            yscale="linear",
                            legend=labels)
homogenize_legend(ax, "lower right")

ofile = GEO_DATABASE_FILE
plot_save(ofile, is_tight_layout=True)