In [2]:
import csv
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from clickhouse_driver import Client

from scripts.analysis.analysis import compute_error_threshold_cdfs, every_tier_result_and_errors
from scripts.utils.clickhouse_utils import get_min_rtt_per_src_dst_query_ping_table
from scripts.utils.file_utils import load_json
from scripts.utils.helpers import haversine
from scripts.utils.plot_utils import plot_multiple_cdf, homogenize_legend, plot_save, plot_multiple_error_bars
# Kept last on purpose: the star import must keep overriding the names above
# exactly as it did before this block was regrouped.
from default import *

## Loading data

In [2]:
# Load the four JSON datasets used by the cells below (anchors, probes,
# probes + anchors, and the probes that were removed), in this order.
anchors, probes, all_probes, removed_probes = (
    load_json(dataset_file)
    for dataset_file in (
        ANCHORS_FILE,
        PROBES_FILE,
        PROBES_AND_ANCHORS_FILE,
        REMOVED_PROBES_FILE,
    )
)

## Step 2 :

In [5]:
# Geolocation results of the probes-to-anchors measurements.
errors_threshold_probes_to_anchors = load_json(PROBES_TO_ANCHORS_RESULT_FILE)

# The helper returns three values; the first is indexed per threshold below
# (presumably one error list per entry of THRESHOLD_DISTANCES — TODO confirm).
error_threshold_cdfs_p_to_a, circles_threshold_cdfs_p_to_a, _ = compute_error_threshold_cdfs(
    errors_threshold_probes_to_anchors
)

Ys = error_threshold_cdfs_p_to_a
print(len(Ys[0]))

# First curve uses every VP, the others only VPs farther than each threshold.
labels = ["All VPs"] + [f"VPs > {t} km" for t in THRESHOLD_DISTANCES if t > 0]

fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of targets",
    xscale="log",
    yscale="linear",
    legend=labels,
)
homogenize_legend(ax, "lower right", legend_size=12)

ofile = CBG_THRESHOLD_PROBES_FILE
plot_save(ofile, is_tight_layout=True)

Computing probes to anchors
Threshold 0 no geolocation 0
Threshold 40 no geolocation 0
Threshold 100 no geolocation 0
Threshold 500 no geolocation 0
Threshold 1000 no geolocation 1
711


## Step 4 :

In [8]:
Ys = []
labels = []
results_file = [VP_SELECTION_ALGORITHM_PROBES_1_FILE, VP_SELECTION_ALGORITHM_PROBES_3_FILE, VP_SELECTION_ALGORITHM_PROBES_10_FILE]
index = [1, 3, 10]

# One CDF per VP-selection result file; `index` gives the number of VPs each
# file corresponds to.
for n_vps, result_file in zip(index, results_file):
    errors_threshold_vp_selection_algorithm = load_json(result_file)
    (
        error_threshold_cdfs_p_to_a_vp_selection,
        circles_threshold_cdfs_p_to_a_vp_selection,
        _,
    ) = compute_error_threshold_cdfs(errors_threshold_vp_selection_algorithm)
    Ys.append(list(error_threshold_cdfs_p_to_a_vp_selection[0]))
    labels.append(f"{n_vps} closest VP (RTT)")

    if n_vps != 10:
        continue

    # Take the baseline where 10 VPs are used to geolocate a target.
    # NOTE: relies on `errors_threshold_probes_to_anchors` from an earlier cell.
    error_threshold_cdfs_p_to_a, circles_threshold_cdfs_p_to_a, _ = compute_error_threshold_cdfs(
        errors_threshold_probes_to_anchors, errors_threshold_vp_selection_algorithm
    )
    Ys.append(list(error_threshold_cdfs_p_to_a[0]))
    labels.append("All VPs")

fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of targets",
    xscale="log",
    yscale="linear",
    legend=labels,
)
homogenize_legend(ax, "lower right")
ofile = CBG_THRESHOLD_VP_SELECTION_FILE
plot_save(ofile, is_tight_layout=True)

Threshold 0 no geolocation 33
Threshold 0 no geolocation 7
Threshold 0 no geolocation 5
Threshold 0 no geolocation 0
Threshold 40 no geolocation 0
Threshold 100 no geolocation 0
Threshold 500 no geolocation 0
Threshold 1000 no geolocation 1


## Step 6 :

In [10]:
def iso_code_2_to_country(csv_file=None):
    """Build ISO-3166 alpha-2 lookups for continent codes and country names.

    Args:
        csv_file: Path of the countries CSV to read. Defaults to
            ``COUNTRIES_CSV_FILE`` when omitted, which is the original
            behaviour.

    Returns:
        A ``(continent_by_iso_2, country_by_iso_2)`` tuple of dicts, both keyed
        by the two-letter country code. Country names are truncated at their
        first comma (e.g. ``"France, French Republic"`` -> ``"France"``).
    """
    if csv_file is None:
        csv_file = COUNTRIES_CSV_FILE

    country_by_iso_2 = {}
    continent_by_iso_2 = {}
    # Columns:
    # Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,Three_Letter_Country_Code,Country_Number
    # newline="" is what the csv module requires to parse quoted fields that
    # contain line breaks correctly.
    with open(csv_file, newline="") as f:
        reader = csv.reader(f, delimiter=",", quotechar='"')
        next(reader, None)  # skip the header row
        for line in reader:
            # csv.reader yields [] for blank lines; skip those and any other
            # truncated row instead of failing with an IndexError.
            if len(line) < 4:
                continue
            continent_code = line[1]
            country_name = line[2].split(",")[0]
            country_iso_code_2 = line[3]
            country_by_iso_2[country_iso_code_2] = country_name
            continent_by_iso_2[country_iso_code_2] = continent_code
    return continent_by_iso_2, country_by_iso_2

In [27]:
# NOTE: the two names read inverted relative to their content; they are kept
# as-is because later cells use them:
#   ip_per_country : IPv4 address -> country code
#   country_per_ip : country code -> list of IPv4 addresses
ip_per_country = {}
for anchor in anchors:
    is_located = "address_v4" in anchor and "geometry" in anchor and "coordinates" in anchor["geometry"]
    if not is_located:
        continue
    ip_v4_address = anchor["address_v4"]
    # Anchors can carry an explicit null IPv4 address; ignore them.
    if ip_v4_address is not None:
        ip_per_country[ip_v4_address] = anchor["country_code"]

country_per_ip = {}
for ip, country in ip_per_country.items():
    if country not in country_per_ip:
        country_per_ip[country] = []
    country_per_ip[country].append(ip)

In [30]:
"""
Compute results per continent
"""

errors_threshold_probes_to_anchors = load_json(PROBES_TO_ANCHORS_RESULT_FILE)

continent_by_iso_2, country_by_iso_2 = iso_code_2_to_country()

# Only the third return value (indexed by IP below) is needed here.
_, _, error_per_ip = compute_error_threshold_cdfs(errors_threshold_probes_to_anchors)

error_per_continent_cdf = {}
error_per_country_cdf = {}

# Match the anchors of the second replicated paper
anchors_second = list(set(load_json(ANCHORS_SECOND_PAPER_FILE)))
# Membership is tested once per IP below: use a set so each test is O(1)
# instead of scanning the whole list.
anchors_second_lookup = set(anchors_second)
for ip, error in error_per_ip.items():
    if ip not in anchors_second_lookup:
        continue
    # ip_per_country maps IP -> country code (built in an earlier cell).
    country = ip_per_country[ip]
    continent = continent_by_iso_2[country]
    error_per_continent_cdf.setdefault(continent, []).append(error)
    error_per_country_cdf.setdefault(country, []).append(error)

# country name -> (median error, number of errors kept, number of anchor IPs in the country)
error_per_country_cdf_med = {country_by_iso_2[x]: (np.median(error_per_country_cdf[x]),
                                                len(error_per_country_cdf[x]), len(country_per_ip[x])) for x in error_per_country_cdf}


# Worst (largest median error) countries first.
error_per_country_cdf_med_sorted = sorted(
    error_per_country_cdf_med.items(), key=lambda x: x[1][0], reverse=True)
print(error_per_country_cdf_med_sorted)

Ys = [list(error_per_continent_cdf[c])
        for c in error_per_continent_cdf]
labels = [
    f"{c} ({len(error_per_continent_cdf[c])})" for c in error_per_continent_cdf]
fig, ax = plot_multiple_cdf(Ys, 10000, 1, 10000,
                            "Geolocation error (km)",
                            "CDF of targets",
                            xscale="log",
                            yscale="linear",
                            legend=labels)
homogenize_legend(ax, "lower right")
ofile = CBG_THRESHOLD_CONTINENT_FILE
plot_save(ofile, is_tight_layout=True)

Threshold 0 no geolocation 0
Threshold 40 no geolocation 0
Threshold 100 no geolocation 0
Threshold 500 no geolocation 0
Threshold 1000 no geolocation 1
[('Venezuela', (2192.9140091405034, 1, 1)), ('Pakistan', (1543.1755865726761, 1, 1)), ('Colombia', (1434.2320893110607, 1, 1)), ('Mozambique', (832.4551341422323, 1, 1)), ('Kuwait', (717.8898014224587, 1, 1)), ('Peru', (383.09155748407676, 2, 2)), ('Ecuador', (263.85158220593513, 1, 1)), ('Argentina', (221.1132724306607, 4, 4)), ('Hungary', (161.44000325140453, 1, 1)), ('Macedonia', (132.45677655906775, 1, 1)), ('Paraguay', (117.97642875966719, 1, 1)), ('Ghana', (98.31229536436996, 2, 2)), ('Thailand', (93.312454164549, 2, 2)), ('Kenya', (61.11453915894795, 1, 1)), ('Iraq', (54.572811751748695, 1, 1)), ('Poland', (49.969744441399314, 8, 8)), ('Belarus', (43.57770007033401, 1, 1)), ('Dominican Republic', (42.13216173978507, 2, 2)), ('Belgium', (36.693758379143034, 6, 6)), ('United Kingdom', (30.11516867734334, 34, 34)), ('Bulgaria', (29

## Step 8 :

In [None]:
# JSON object keys are strings: convert them back to integer VP counts.
accuracy_vs_n_vps_probes = {
    int(n_vps): errors
    for n_vps, errors in load_json(ACCURACY_VS_N_VPS_PROBES_FILE).items()
}
X = sorted(accuracy_vs_n_vps_probes)
Ys = [accuracy_vs_n_vps_probes[n_vps] for n_vps in X]
# A single series: median and standard deviation of the errors per VP count.
Ys_med = [[np.median(errors) for errors in Ys]]
Ys_err = [[np.std(errors) for errors in Ys]]

"""
Fig 3.a of the paper
"""

fig, ax = plot_multiple_error_bars(
    X, Ys_med, Ys_err,
    xmin=10, xmax=10500, ymin=1, ymax=10000,
    xlabel="Number of VPs",
    ylabel="Geolocation error (km)",
    xscale="log",
    yscale="log",
    labels=[""],
)

homogenize_legend(ax, "lower right")
ofile = FIG_3A_FILE
plot_save(ofile, is_tight_layout=True)

"""
Fig 3.b of the paper
"""

subset_sizes = [100, 500, 1000, 2000]

labels = [f"{s} VPs" for s in subset_sizes]

Ys = [accuracy_vs_n_vps_probes[size] for size in subset_sizes]
errors_100_vps = accuracy_vs_n_vps_probes[100]
print(min(errors_100_vps), max(errors_100_vps))

fig, ax = plot_multiple_cdf(
    Ys, 10000, 1, 10000,
    "Geolocation error (km)",
    "CDF of median error",
    xscale="log",
    yscale="linear",
    legend=labels,
)
homogenize_legend(ax, "lower right")
ofile = FIG_3B_FILE
plot_save(ofile, is_tight_layout=True)

## Step 10

In [21]:
round_based_algorithm_results = load_json(ROUND_BASED_ALGORITHM_FILE)

# JSON object keys are strings: convert them to integer tier-1 VP counts.
round_based_algorithm_results = {
    int(x): round_based_algorithm_results[x] for x in round_based_algorithm_results}

errors_threshold_probes_to_anchors = load_json(PROBES_TO_ANCHORS_RESULT_FILE)
error_threshold_cdfs_p_to_a, circles_threshold_cdfs_p_to_a, _ = compute_error_threshold_cdfs(
    errors_threshold_probes_to_anchors)

# Baseline curve: error obtained with all VPs.
Ys_error = [error_threshold_cdfs_p_to_a[0]]
Ys_n_vps = []

labels_error = ["All VPs"]
labels_n_vps = []

for tier1_vps, results in sorted(round_based_algorithm_results.items()):
    # Keys are already ints (see the conversion above).
    # Presumably r[1] is the geolocation error and r[2] the number of VPs used
    # after the first round — TODO confirm against the result writer.
    error_cdf = [r[1] for r in results if r[1] is not None]
    n_vps_cdf = [r[2] + tier1_vps for r in results if r[2] is not None]
    label = f"{tier1_vps} VPs"
    labels_error.append(label)
    labels_n_vps.append(label)
    Ys_error.append(error_cdf)
    Ys_n_vps.append(n_vps_cdf)
    print(tier1_vps, 3 * sum(n_vps_cdf))

fig, ax = plot_multiple_cdf(Ys_error, 10000, 1, 10000,
                        "Geolocation error (km)",
                        "CDF of targets",
                        xscale="log",
                        yscale="linear",
                        legend=labels_error)
homogenize_legend(ax, "lower right")
ofile = ROUND_ALGORITHM_ERROR_FILE
plot_save(ofile, is_tight_layout=True)

fig, ax = plot_multiple_cdf(Ys_n_vps, 10000, 10, 10000,
                        "Vantage points",
                        "CDF of representatives",
                        xscale="log",
                        yscale="linear",
                        legend=labels_n_vps)
homogenize_legend(ax, "upper left")
# BUG FIX: this figure used to be saved to ROUND_BASED_ALGORITHM_FILE, i.e. the
# very file the results were loaded from at the top of this cell, which would
# clobber the input data. Save it next to the error figure instead.
# TODO(review): replace with a dedicated constant in default.py.
error_figure_file = Path(ROUND_ALGORITHM_ERROR_FILE)
ofile = error_figure_file.with_name(f"round_algorithm_n_vps{error_figure_file.suffix}")
if isinstance(ROUND_ALGORITHM_ERROR_FILE, str):
    # Hand plot_save the same path type it receives for the other figure.
    ofile = str(ofile)
plot_save(ofile, is_tight_layout=True)

Threshold 0 no geolocation 0
Threshold 40 no geolocation 0
Threshold 100 no geolocation 0
Threshold 500 no geolocation 0
Threshold 1000 no geolocation 1
10 3051555
100 4529640
300 2918748
500 2309946
1000 2639067


## Step 11

In [22]:
data = load_json(ANALYZABLE_FILE)

query = get_min_rtt_per_src_dst_query_ping_table(
    'geolocation_replication', 'targets_to_landmarks_pings', '', 1000000)
client = Client('127.0.0.1')
db_table = client.execute(query)
print(len(db_table))
# Each row is used as (first IP, second IP, min RTT); the pair is looked up
# below as (landmark IP, target IP).
rtts = [row[2] for row in db_table]
remove_dict = {(row[0], row[1]): row[2] for row in db_table}
print(len(rtts))
plot_multiple_cdf([rtts], 10000, 0, None, 'Min RTT (ms)',
                    'CDF of (landmark, target) pairs', None)
plot_save(CLOSE_LANDMARK_FILE, is_tight_layout=True)

plot_multiple_cdf([rtts], 10000, 0.1, None, 'Min RTT (ms)',
                    'CDF of (landmark, target) pairs', None, xscale="log")
plot_save(CLOSE_LANDMARK_LOG_FILE, is_tight_layout=True)

error1, error2, error3, error4 = [], [], [], []

# RTT cut-offs (ms) applied to the landmarks, and the sentinel meaning
# "no landmark passed the filter".
rtt_thresholds_ms = (1, 2, 5, 10)
no_landmark = 50000
closest_within = {threshold: [] for threshold in rtt_thresholds_ms}

for d in data.values():
    errors = every_tier_result_and_errors(d)
    error1.append(errors['error1'])
    error2.append(errors['error2'])
    error3.append(errors['error3'])
    error4.append(errors['error4'])

    best = dict.fromkeys(rtt_thresholds_ms, no_landmark)
    for f in ('tier2:landmarks', 'tier3:landmarks'):
        if f not in d:
            continue
        for l_ip, _, l_lat, l_lon in d[f]:
            dist = haversine((l_lat, l_lon), (d['lat_c'], d['lon_c']))
            key_rtt = (l_ip, d['target_ip'])
            for threshold in rtt_thresholds_ms:
                # A landmark without any RTT measurement is kept; a measured
                # one is kept only if its min RTT is within the threshold.
                if dist < best[threshold] and (key_rtt not in remove_dict or remove_dict[key_rtt] <= threshold):
                    best[threshold] = dist

    for threshold in rtt_thresholds_ms:
        closest = best[threshold]
        # Fall back to the tier-1 error when no landmark passed the filter.
        closest_within[threshold].append(closest if closest != no_landmark else error1[-1])

error1ms, error2ms, error5ms, error10ms = (
    closest_within[threshold] for threshold in rtt_thresholds_ms)

closest_landmark_curves = [error3, error4, error1ms, error2ms, error5ms, error10ms]
closest_landmark_labels = [
    "Street Level", "Closest landmark unfiltered", "Closest landmark <= 1ms", "Closest landmark <= 2ms", "Closest landmark <= 5ms", "Closest landmark <= 10ms"]

plot_multiple_cdf(closest_landmark_curves, 10000, 0, None,
                  'Geolocation error (km)', 'CDF of targets', closest_landmark_labels)
plt.legend(fontsize="10")
plot_save(CLOSE_LANDMARK_FILE_2, is_tight_layout=True)

plot_multiple_cdf(closest_landmark_curves, 10000, 0.1, None,
                  'Geolocation error (km)', 'CDF of targets', closest_landmark_labels, xscale="log")
plt.legend(fontsize="10")
plot_save(CLOSE_LANDMARK_LOG_FILE_2, is_tight_layout=True)

# NOTE(review): the bound is compared with a distance (km) although the
# message says "ping" — confirm the intended wording.
for bound in [1, 5, 10, 40, 9999999999]:
    c = sum(1 for value in error1ms if value <= bound)
    print(f"{c} targets with landmarks (ping <= {bound}) or {c/len(error1ms)}")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\milo2\\Desktop\\review\\datasets\\measurements\\street_level\\final_all_res.json'

## Step X : comparison with geolocation databases (MaxMind, IPinfo)

In [3]:
# Load the two geolocation database dumps and the probes-to-anchors results
# that are compared in the next cell.
ip_info_geo, mm_geo, errors_threshold_probes_to_anchors = (
    load_json(dataset_file)
    for dataset_file in (
        IP_INFO_GEO_FILE,
        MAXMIND_GEO_FILE,
        PROBES_TO_ANCHORS_RESULT_FILE,
    )
)

In [4]:
error_threshold_cdfs_p_to_a, circles_threshold_cdfs_p_to_a, _ = compute_error_threshold_cdfs(
    errors_threshold_probes_to_anchors)

maxmind_error = {}
ip_info_error = {}

# Anchors may have a missing or null IPv4 address (the country-mapping cell
# guards against it too); sorting on None would raise a TypeError, so drop
# those anchors before sorting.
anchors_with_v4 = [a for a in anchors if a.get("address_v4") is not None]
for anchor in sorted(anchors_with_v4, key=lambda x: x["address_v4"]):
    ip = anchor["address_v4"]
    if ip in removed_probes:
        continue

    # Skip anchors without a usable ground-truth position.
    if "geometry" not in anchor or "coordinates" not in anchor["geometry"]:
        continue

    # Coordinates are unpacked as (longitude, latitude); haversine is called
    # with (latitude, longitude) pairs.
    long, lat = anchor["geometry"]["coordinates"]
    if ip in mm_geo:
        error = haversine(mm_geo[ip], (lat, long))
        maxmind_error[ip] = error

    if ip in ip_info_geo:
        # The "loc" field is a "lat,long" string.
        ipinfo_lat, ipinfo_long = ip_info_geo[ip]["loc"].split(",")
        ipinfo_lat, ipinfo_long = float(ipinfo_lat), float(ipinfo_long)
        error = haversine((ipinfo_lat, ipinfo_long), (lat, long))
        ip_info_error[ip] = error

Ys = [error_threshold_cdfs_p_to_a[0], list(
    maxmind_error.values()), list(ip_info_error.values())]
print([len(Y) for Y in Ys])
labels = ["All VPs", "Maxmind (Free)", "IPinfo"]
fig, ax = plot_multiple_cdf(Ys, 10000, 1, 10000,
                            "Geolocation error (km)",
                            "CDF of targets",
                            xscale="log",
                            yscale="linear",
                            legend=labels)
homogenize_legend(ax, "lower right")

ofile = GEO_DATABASE_FILE
plot_save(ofile, is_tight_layout=True)

Threshold 0 no geolocation 0
Threshold 40 no geolocation 0
Threshold 100 no geolocation 0
Threshold 500 no geolocation 0
Threshold 1000 no geolocation 1
[711, 709, 711]


## CDF of the geolocation error (street level, CBG, closest landmark)

In [3]:
data = load_json(ANALYZABLE_FILE)

error1, error2, error3, error4 = [], [], [], []
filtered_error1, filtered_error2, filtered_error3, filtered_error4 = [], [], [], []

for d in data.values():
    errors = every_tier_result_and_errors(d)
    tier_errors = (errors['error1'], errors['error2'], errors['error3'], errors['error4'])
    for bucket, value in zip((error1, error2, error3, error4), tier_errors):
        bucket.append(value)
    # "Filtered" series: tier 1 succeeded and tier 2 found at least one landmark.
    if d['tier1:done'] and 'tier2:landmarks' in d and len(d['tier2:landmarks']) > 0:
        for bucket, value in zip(
                (filtered_error1, filtered_error2, filtered_error3, filtered_error4), tier_errors):
            bucket.append(value)

for series in (error1, error2, error3, error4):
    print(len(series))
print(sum(1 for value in error4 if value <= 1))

# Street level means an error of at most 1 km.
street_lvl_count_cbg = sum(1 for e in error1 if e <= 1)
street_lvl_count_tech = sum(1 for e in error3 if e <= 1)
print(f"{street_lvl_count_cbg} targets are geolocated at street lvl using CBG {street_lvl_count_cbg/len(error1)}")
print(f"{street_lvl_count_tech} targets are geolocated at street lvl using tech {street_lvl_count_tech/len(error3)}")

median1, median2, median3, median4 = (
    np.median(series) for series in (error1, error2, error3, error4))

print(f"tier 1 median error = {median1}")
print(f"tier 2 median error = {median2}")
print(f"tier 3 median error = {median3}")
print(f"closest landmark distance median = {median4}")

fmedian1, fmedian2, fmedian3, fmedian4 = (
    np.median(series)
    for series in (filtered_error1, filtered_error2, filtered_error3, filtered_error4))

print(f"filtered tier 1 median error = {fmedian1}")
print(f"filtered tier 2 median error = {fmedian2}")
print(f"filtered tier 3 median error = {fmedian3}")
print(f"filtered closest landmark distance median = {fmedian4}")

less_then_1 = sum(1 for e in error3 if e <= 1)
less_then_1_lm = sum(1 for e in error4 if e <= 1)
print(f"{less_then_1} targets are geolocated at street lvl out of {len(error3)} or {less_then_1*100/len(error3)}%")
print(f"{less_then_1_lm} targets has a landmark at street lvl out of {len(error4)} or {less_then_1_lm*100/len(error4)}%")


plot_multiple_cdf([error3, error1, error4], 10000, 0.1, None, 'Geolocation error (km)',
                    'CDF of targets', ["Street Level", "CBG", "Closest Landmark"], xscale="log")
plt.legend(fontsize="14")
# NOTE(review): the Step 11 cell also saves its min-RTT figure to
# CLOSE_LANDMARK_FILE; whichever cell runs last overwrites the other figure.
plot_save(CLOSE_LANDMARK_FILE, is_tight_layout=True)

Tier1 Failed
723
723
723
723
207
77 targets are geolocated at street lvl using CBG 0.10650069156293222
17 targets are geolocated at street lvl using tech 0.02351313969571231
tier 1 median error = 29.368729465989418
tier 2 median error = 34.41749100849208
tier 3 median error = 27.917440080911355
closest landmark distance median = 3.316187115363793
filtered tier 1 median error = 24.701434871210086
filtered tier 2 median error = 27.976445217281974
filtered tier 3 median error = 22.686067574789096
filtered closest landmark distance median = 2.8434411415239755
17 targets are geolocated at street lvl out of 723 or 2.351313969571231%
207 targets has a landmark at street lvl out of 723 or 28.630705394190873%


## Min dist landmark

In [None]:
def cdf_min_dist_landmark(data):
    """Plot the CDF of each target's distance to its nearest landmark.

    For every entry of ``data`` the tier-2 and tier-3 landmarks are merged and
    the smallest haversine distance between the target (``lat_c``/``lon_c``)
    and a landmark (fields 2 and 3 of each landmark) is kept. Targets without
    any landmark are only counted. The median is printed, the CDF is saved
    with a linear and a log x-axis, and the two target counts are printed.
    """
    nearest_landmark_km = []
    targets_without_landmark = 0

    for target in data.values():
        landmarks = [
            landmark
            for tier_key in ('tier2:landmarks', 'tier3:landmarks')
            if tier_key in target
            for landmark in target[tier_key]
        ]
        if not landmarks:
            targets_without_landmark += 1
            continue
        target_geo = (target['lat_c'], target['lon_c'])
        nearest_landmark_km.append(
            min(haversine(target_geo, (landmark[2], landmark[3])) for landmark in landmarks)
        )

    median = np.median(nearest_landmark_km)
    print(f"Median distance to nearest landmark = {median}")

    for output_file, scale_options in (
        ("./fig/distance_to_landmark.pdf", {}),
        ("./fig/distance_to_landmark_log.pdf", {"xscale": "log"}),
    ):
        plot_multiple_cdf([nearest_landmark_km], 10000, 0.1, None,
                          'Distance to the nearest landmark (km)', 'CDF of targets', None,
                          **scale_options)
        plot_save(output_file, is_tight_layout=True)

    print(f"{len(nearest_landmark_km)} targets with at least one landmark")
    print(f"{targets_without_landmark} targets without any landmark")

## cdf landmarks

In [None]:
def cdf_landmarks(data):
    """Print landmark statistics and save several per-target CDF figures.

    For every entry of ``data`` this aggregates:
      * the tier-2/tier-3 landmark counters (valid, CDN, failed header test),
      * per-traceroute checks: negative ``t[4]`` values, missing ``t[3]``, and
        whether ``t[1]`` and ``t[2]`` share an ASN, a /24 and a BGP prefix,
      * the haversine distance between the target and each of its landmarks.

    Summaries are printed and the figures are written under ``./fig/``.
    Nothing is returned.

    NOTE(review): `pyasn`, `ip_network`, `get_all_bgp_prefixes`,
    `is_same_bgp_prefix` and `IP_TO_ASN_FILE_PATH` are not imported in the
    visible import cell; confirm they are in scope (e.g. through
    `from default import *`) before calling this function.
    NOTE(review): the percentage prints near the end divide by list lengths
    and counters that are zero when no landmark/traceroute was seen.
    """
    valid_landmarks_count = 0
    unvalid_landmarks_count = 0
    # Per-target fraction of traceroutes with a negative t[4].
    values = []
    # Per-target fraction of traceroutes sharing ASN / /24 / BGP prefix.
    same_asn_lst = []
    same_24_lst = []
    same_bgp_lst = []
    all_traceroutes_count = 0
    no_r1_traceroutes_count = 0
    asndb = pyasn.pyasn(IP_TO_ASN_FILE_PATH)
    # One list of target-to-landmark distances per target.
    distances_to_landmarks = []
    # One total (valid + CDN + failed header test) landmark count per target.
    all_landmarks = []
    bgp_prefixes = get_all_bgp_prefixes()
    for _, d in data.items():
        # Per-target counters, reset for each target.
        good = 0
        bad = 0
        same_asn = 0
        diff_asn = 0
        same_bgp = 0
        diff_bgp = 0
        same_24 = 0
        diff_24 = 0
        all_landmarks.append(0)
        # Tier-2 counters are only used when all three are present.
        if "tier2:cdn_count" in d and "tier2:landmark_count" in d and "tier2:failed_header_test_count" in d:
            all_landmarks[-1] += d['tier2:landmark_count'] + \
                d['tier2:cdn_count'] + d['tier2:failed_header_test_count']
            valid_landmarks_count += d['tier2:landmark_count']
            unvalid_landmarks_count += d['tier2:cdn_count'] + \
                d['tier2:failed_header_test_count']
        # Same for the tier-3 counters.
        if "tier3:cdn_count" in d and "tier3:landmark_count" in d and "tier3:failed_header_test_count" in d:
            all_landmarks[-1] += d['tier3:landmark_count'] + \
                d['tier3:cdn_count'] + d['tier3:failed_header_test_count']
            valid_landmarks_count += d['tier3:landmark_count']
            unvalid_landmarks_count += d['tier3:cdn_count'] + \
                d['tier3:failed_header_test_count']
        for f in ['tier2:traceroutes', 'tier3:traceroutes']:
            if f in d:
                for t in d[f]:
                    # A negative t[4] is counted as "bad" (the plot below
                    # labels this quantity "D1 + D2 < 0").
                    if t[4] < 0:
                        bad += 1
                    else:
                        good += 1

                    all_traceroutes_count += 1
                    # t[3] is reported as "r1" in the summary print below.
                    # NOTE(review): `is None` would be the idiomatic test.
                    if t[3] == None:
                        no_r1_traceroutes_count += 1

                    # Presumably t[1] is the target IP and t[2] the landmark
                    # IP (going by the variable names) — TODO confirm.
                    ipt = t[1]
                    ipl = t[2]
                    asnt = asndb.lookup(ipt)[0]
                    asnl = asndb.lookup(ipl)[0]
                    # Pairs with an unknown ASN on either side are ignored.
                    if asnl != None and asnt != None:
                        if asnt == asnl:
                            same_asn += 1
                        else:
                            diff_asn += 1
                    # Compare the /24 networks containing each address.
                    nt = ip_network(ipt+"/24", strict=False).network_address
                    nl = ip_network(ipl+"/24", strict=False).network_address
                    if nt == nl:
                        same_24 += 1
                    else:
                        diff_24 += 1

                    if is_same_bgp_prefix(ipt, ipl, bgp_prefixes):
                        same_bgp += 1
                    else:
                        diff_bgp += 1
        # Distance from the target to each of its tier-2/tier-3 landmarks.
        distances = []
        for f in ['tier2:landmarks', 'tier3:landmarks']:
            target_geo = (d['lat_c'], d['lon_c'])
            if f in d:
                for l in d[f]:
                    landmark_geo = (l[2], l[3])
                    distances.append(haversine(target_geo, landmark_geo))
        distances_to_landmarks.append(distances)

        # Fractions are only recorded for targets with at least one sample.
        if same_asn != 0 or diff_asn != 0:
            same_asn_lst.append(same_asn/(same_asn+diff_asn))

        if same_24 != 0 or diff_24 != 0:
            same_24_lst.append(same_24/(same_24+diff_24))
            if same_24 != 0:
                print(
                    f"Found {d['target_ip']} with a landmark in the same /24")
        if same_bgp != 0 or diff_bgp != 0:
            same_bgp_lst.append(same_bgp/(diff_bgp+same_bgp))

        if good != 0 or bad != 0:
            values.append(bad/(bad+good))

    print(f"{no_r1_traceroutes_count} no r1 found out of {all_traceroutes_count}")
    plot_multiple_cdf([values], 10000, 0, 1,
                      'Fraction of landmarks with\nD1 + D2 < 0', 'CDF of targets', None)
    plot_save("./fig/invalid_rtt.pdf", is_tight_layout=True)

    # plot_multiple_cdf([error3, error1, error4, error2], 10000, 0.1, None, 'Error distance (km)', 'CDF of error distance', ["Street Level", "CBG", "Closest Landmarks", "Tier 2 estimation"])
    # Count the targets whose landmarks never share its ASN / /24 / BGP prefix
    # (fraction exactly 0).
    only_outside_asn = 0
    for x in same_asn_lst:
        if x == 0:
            only_outside_asn += 1
    only_outside_24 = 0
    for x in same_24_lst:
        if x == 0:
            only_outside_24 += 1
    only_outside_bgp = 0
    for x in same_bgp_lst:
        if x == 0:
            only_outside_bgp += 1

    print(f"{valid_landmarks_count} total valid landmarks")
    print(f"{unvalid_landmarks_count} unvalid landmarks")
    print(f"{(valid_landmarks_count*100)/(valid_landmarks_count+unvalid_landmarks_count)}% valid landmarks")

    print(f"{only_outside_asn} targets has all its landmarks outside its AS out of {len(same_asn_lst)} {only_outside_asn*100/(len(same_asn_lst))}%")
    print(f"{only_outside_24} targets has all its landmarks outside its /24 out of {len(same_24_lst)} {only_outside_24*100/(len(same_24_lst))}%")
    print(f"{only_outside_bgp} targets has all its landmarks outside its BGP prefix out of {len(same_bgp_lst)} {only_outside_bgp*100/(len(same_bgp_lst))}%")

    plot_multiple_cdf([same_asn_lst, same_bgp_lst, same_24_lst], 10000, None, None,
                      'Fraction of landmarks and targets\nsharing network', 'CDF of targets', ['ASN', 'BGP prefix', '/24'])
    plt.legend(fontsize="14")
    plot_save("./fig/landmarks_targets_network.pdf", is_tight_layout=True)

    # Per-target number of landmarks: all of them, and those within 1, 5, 10
    # and 40 km of the target.
    landmarks_all = []
    landmarks_less_1 = []
    landmarks_less_5 = []
    landmarks_less_10 = []
    landmarks_less_40 = []
    total_count_ping = 0
    for landmark_distances in distances_to_landmarks:
        # if len(landmark_distances) == 0:
        #     continue
        landmarks_all.append(len(landmark_distances))
        landmarks_less_1.append(len([i for i in landmark_distances if i <= 1]))
        landmarks_less_5.append(len([i for i in landmark_distances if i <= 5]))
        landmarks_less_10.append(
            len([i for i in landmark_distances if i <= 10]))
        landmarks_less_40.append(
            len([i for i in landmark_distances if i <= 40]))
        # One ping is counted per landmark within 40 km.
        total_count_ping += len([i for i in landmark_distances if i <= 40])

    print(f"{total_count_ping} ping measurement to do")

    # Number of targets with at least one landmark in each category.
    lm_a_0 = len([i for i in all_landmarks if i > 0])
    lmv_a_0 = len([i for i in landmarks_all if i > 0])
    lm1_0 = len([i for i in landmarks_less_1 if i > 0])
    lm5_0 = len([i for i in landmarks_less_5 if i > 0])
    lm10_0 = len([i for i in landmarks_less_10 if i > 0])
    lm40_0 = len([i for i in landmarks_less_40 if i > 0])

    # NOTE(review): for integer counts this equals lm1_0 computed above.
    lm1_1 = len([i for i in landmarks_less_1 if i >= 1])
    print(lm1_1)

    len_all = len(data)
    print(f"{lm_a_0} target have potentail landmarks or {lm_a_0/len_all}")
    print(f"{lmv_a_0} target have valid landmarks or {lmv_a_0/len_all}")
    print(f"{lm1_0} target with a landmark within 1 km or {lm1_0/len_all}")
    print(f"{lm5_0} target with a landmark within 5 km or {lm5_0/len_all}")
    print(f"{lm10_0} target with a landmark within 10 km or {lm10_0/len_all}")
    print(f"{lm40_0} target with a landmark within 40 km or {lm40_0/len_all}")

    # Same counts plotted four ways: with/without the "all landmarks" series,
    # on a linear and on a log x-axis.
    plot_multiple_cdf([all_landmarks, landmarks_all, landmarks_less_1, landmarks_less_5, landmarks_less_10, landmarks_less_40], 10000, 1, 0, 'Number of landmarks', 'CDF of targets', [
                      'All potential landmarks', 'All valid landmarks', 'Landmarks within 1 km', 'Landmarks within 5 km', 'Landmarks within 10 km', 'Landmarks within 40 km'])
    plt.legend(fontsize="14")
    plot_save("./fig/landmarks_count_per_target.pdf", is_tight_layout=True)

    plot_multiple_cdf([landmarks_less_1, landmarks_less_5, landmarks_less_10, landmarks_less_40], 10000, 1, 0, 'Number of landmarks',
                      'CDF of targets', ['Landmarks within 1 km', 'Landmarks within 5 km', 'Landmarks within 10 km', 'Landmarks within 40 km'])
    plt.legend(fontsize="14")
    plot_save("./fig/landmarks_count_per_target_40.pdf", is_tight_layout=True)

    plot_multiple_cdf([all_landmarks, landmarks_all, landmarks_less_1, landmarks_less_5, landmarks_less_10, landmarks_less_40], 10000, 0.8, 0, 'Number of landmarks', 'CDF of targets', [
                      'All potential landmarks', 'All valid landmarks', 'Landmarks within 1 km', 'Landmarks within 5 km', 'Landmarks within 10 km', 'Landmarks within 40 km'], xscale="log")
    plt.legend(fontsize="14")
    plot_save("./fig/landmarks_count_per_target_log.pdf", is_tight_layout=True)

    plot_multiple_cdf([landmarks_less_1, landmarks_less_5, landmarks_less_10, landmarks_less_40], 10000, 1, 0, 'Number of landmarks', 'CDF of targets', [
                      'Landmarks within 1 km', 'Landmarks within 5 km', 'Landmarks within 10 km', 'Landmarks within 40 km'], xscale="log")
    plt.legend(fontsize="14")
    plot_save("./fig/landmarks_count_per_target_40_log.pdf",
              is_tight_layout=True)

## Time needed to geolocate a target

In [None]:
def cdf_time_needed_to_geoloc(data):
    """Print the median duration of each tier and plot the total-time CDF.

    Args:
        data: Mapping of target -> result dict. Each dict may contain
            ``tierN:done`` flags and ``tierN:duration`` values for N in 1..3.

    A tier's duration is kept when the tier is done and its duration is
    present. The total time of a target is recorded only when tier 3 is done
    and the durations of all three tiers are available; previously a missing
    tier-1 or tier-2 duration raised a ``KeyError`` at that point. The four
    medians are printed and the CDF of total times is saved to
    ``./fig/cdf_time_to_geolocate.pdf``. Nothing is returned.
    """
    time1 = []
    time2 = []
    time3 = []
    values = []
    durations_by_tier = {'tier1': time1, 'tier2': time2, 'tier3': time3}

    for d in data.values():
        for tier, durations in durations_by_tier.items():
            # .get(): a target that never reached a tier may lack its flag.
            if d.get(f'{tier}:done') and f'{tier}:duration' in d:
                durations.append(d[f'{tier}:duration'])

        # The total only makes sense when every tier reported a duration.
        has_all_durations = all(f'{tier}:duration' in d for tier in durations_by_tier)
        if d.get('tier3:done') and has_all_durations:
            values.append(d['tier1:duration'] +
                          d['tier2:duration']+d['tier3:duration'])

    median1 = np.median(time1)
    median2 = np.median(time2)
    median3 = np.median(time3)
    median = np.median(values)

    print(f"tier 1 median duration = {median1}")
    print(f"tier 2 median duration = {median2}")
    print(f"tier 3 median duration = {median3}")
    print(f"Street Level median duration = {median}")

    plot_multiple_cdf([values], 1000, None, None,
                      'Time to geolocate a target (sec)', 'CDF of targets', None)
    plot_save("./fig/cdf_time_to_geolocate.pdf", is_tight_layout=True)

## Measured distance

In [None]:
def _smallest_series_in_error_band(scatter_data, lower, upper):
    """Return the (distance, measured distance) series with the fewest points in an error band.

    A target is a candidate when its 'error3' differs from its 'error1' and
    lies strictly between `lower` and `upper` (`lower=None` disables the lower
    bound). On ties the first candidate encountered is kept.

    Returns:
        (x, y): the 'd' and 'md' lists of the selected target, or two empty
        lists when no target falls in the band.
    """
    selected = None
    for entry in scatter_data.values():
        errors = entry['error_data']
        error3 = errors['error3']
        if not (error3 != errors['error1'] and error3 < upper):
            continue
        if lower is not None and not error3 > lower:
            continue
        series = entry['mdvd_data']
        if selected is None or len(selected['d']) > len(series['d']):
            selected = series
    if selected is None:
        return [], []
    return selected['d'], selected['md']


def measured_distance_vs_distance(data):
    """Compare RTT-derived distances to landmarks with their geographical distances.

    For every target, each landmark seen in the tier-2 / tier-3 traceroutes is
    reduced to its smallest measured distance, paired with the haversine
    distance between the landmark and the target's ('lat_c', 'lon_c')
    position. The Pearson correlation between the two series is computed per
    target; the distribution of correlations is printed and plotted, and one
    example target per error band (< 1, ~5, ~10, ~40 km) is drawn in a scatter
    plot.

    Args:
        data: mapping target IP -> result dict. The dict is read for
            'lat_c', 'lon_c' and, when present, 'tier2:traceroutes' and
            'tier3:traceroutes'. Each traceroute record is indexed
            positionally: [2] is used as the landmark IP, [4] is converted to
            a distance by rtt_to_km (presumably an RTT -- confirm), and
            ([5], [6]) is passed to haversine as the landmark position
            (presumably lat, lon -- confirm).

    Side effects:
        Prints the median / min / max correlation and saves
        ./fig/cdf_md_vs_d.pdf and ./fig/scater_md_vs_d.pdf. When no
        correlation can be computed, prints a notice and returns without
        plotting.
    """
    # Local import so the cell does not depend on an import made elsewhere in
    # the notebook (it is not part of the top import cell).
    from scipy.stats import pearsonr

    correlations = []
    scatter_plot_data = {}
    for target_ip, d in data.items():
        # landmark IP -> (measured distance, geographical distance), keeping
        # the record with the smallest measured distance per landmark.
        closest_per_landmark = {}
        for field in ('tier2:traceroutes', 'tier3:traceroutes'):
            if field not in d:
                continue
            for t in d[field]:
                # Negative values are skipped (presumably failed measurements
                # -- confirm with the producer of the traceroute records).
                if t[4] < 0:
                    continue
                landmark_ip = t[2]
                # NOTE(review): meaning of the 4/9 and 300 arguments depends
                # on rtt_to_km's signature, which is not visible here.
                measured_distance = rtt_to_km(t[4], 4/9, 300)
                distance = haversine(
                    (t[5], t[6]), (d['lat_c'], d['lon_c']))
                known = closest_per_landmark.get(landmark_ip)
                if known is None or measured_distance < known[0]:
                    closest_per_landmark[landmark_ip] = (
                        measured_distance, distance)

        if not closest_per_landmark:
            continue

        # Keep a single landmark per distinct geographical distance.
        series = {'md': [], 'd': []}
        for measured_distance, distance in closest_per_landmark.values():
            if any(distance == kept for kept in series['d']):
                continue
            series['md'].append(measured_distance)
            series['d'].append(distance)

        if len(series['md']) > 1:
            correlation = pearsonr(series['md'], series['d'])[0]
            series['correlation'] = correlation
            # pearsonr yields NaN for constant input; NaN would make the
            # median NaN and min()/max() order-dependent, so leave it out of
            # the summary statistics.
            if not np.isnan(correlation):
                correlations.append(correlation)
        if len(series['md']) >= 5:
            error = every_tier_result_and_errors(d)
            if error['error3'] < 45:
                scatter_plot_data[target_ip] = {
                    'geo_loc_data': d, 'error_data': error, 'mdvd_data': series}

    if not correlations:
        # min()/max() raise on an empty list; nothing meaningful to plot.
        print("Measured Distance vs Distance: no correlation could be computed")
        return

    medianc = np.median(correlations)
    minc = min(correlations)
    maxc = max(correlations)

    print(f"Measured Distance vs Distance median correlation = {medianc}")
    print(f"Measured Distance vs Distance min correlation = {minc}")
    print(f"Measured Distance vs Distance max correlation = {maxc}")

    plot_multiple_cdf([correlations], 10000, -1, 1,
                      'Correlation Coef MD Vs D', 'CDF of Correlation Coef', None)
    plot_save("./fig/cdf_md_vs_d.pdf", is_tight_layout=True)

    # One example target per error band (km), in the same order as list_lab.
    error_bands = [(None, 1), (4, 6), (9, 11), (39, 41)]
    xs = []
    ys = []
    for lower, upper in error_bands:
        x, y = _smallest_series_in_error_band(scatter_plot_data, lower, upper)
        xs.append(x)
        ys.append(y)

    list_color = ['r', 'b', 'g', 'y']
    list_mak = ['o', '*', 'x', '+']
    list_lab = ['< 1 km error', '5 km error', '10 km error', '40 km error']
    plot_scatter_multiple(xs, ys, None, None, 1, None, "log", "log",
                          'Geographical distance (km)', 'Measured distance (km)', list_mak, list_color, [10, 10, 10, 10])
    plt.legend(list_lab, fontsize="14")
    plot_save("./fig/scater_md_vs_d.pdf", is_tight_layout=True)

## Population density vs. geolocation error

In [None]:
def density_plot(data):
    """Plot population density against tier-3 geolocation error, plus the density CDF.

    Args:
        data: mapping target IP -> result dict, passed per target to
            every_tier_result_and_errors; only the returned 'error3' value is
            used here.

    The population records are read from POPULATION_CITY_FILE_PATH, a JSON
    list of dicts with at least the keys 'target_ip' and 'density'.

    Side effects:
        Saves ./fig/scater_density.pdf (scatter plus a degree-1 least-squares
        fit) and ./fig/cdf_density.pdf.

    Raises:
        KeyError: if a population record refers to a target absent from `data`.
    """
    # Local import so the cell does not depend on an import made elsewhere in
    # the notebook (only load_json is part of the top import cell).
    import json

    with open(POPULATION_CITY_FILE_PATH, 'r') as json_file:
        pop_data = json.load(json_file)

    dens_lst = []
    error_lst = []
    for record in pop_data:
        ip = record['target_ip']
        dens_lst.append(record['density'])
        errors = every_tier_result_and_errors(data[ip])
        error_lst.append(errors['error3'])

    _fig, ax = plot_scatter_multiple([error_lst], [dens_lst], 0.1, 10000, 0.1, 100000, "log",
                                     "log", 'Error distance (km)', 'Population Density (people/km²)', ["x"], ["b"], [10])
    degree = 1
    # A degree-n fit needs more than n points; skip the trend line otherwise.
    if len(error_lst) > degree:
        coef = np.polyfit(error_lst, dens_lst, deg=degree)
        # Sample the fit log-uniformly over 0.1..10000 (the same bounds passed
        # to plot_scatter_multiple above): the x axis is logarithmic, so a
        # linear grid starting at 0 cannot show its first point and leaves the
        # lowest decades without any sample.
        xseq = np.geomspace(0.1, 10000, num=100)
        yseq = np.polyval(coef, xseq)
        ax.plot(xseq, yseq, color="k", lw=2.5)
    plot_save("./fig/scater_density.pdf", is_tight_layout=True)

    plot_multiple_cdf([dens_lst], 10000, None, None,
                      'Population Density (people/km²)', 'CDF of targets', None, xscale="log")
    plot_save("./fig/cdf_density.pdf", is_tight_layout=True)