In [1]:
import pyasn
import numpy as np
import seaborn as sns
import math
import statistics

from scipy.stats import pearsonr
from ipaddress import ip_network
from pprint import pprint
from matplotlib import pyplot as plt
from clickhouse_driver import Client

from scripts.utils.file_utils import load_json, dump_json
from scripts.analysis.analysis import get_all_bgp_prefixes, is_same_bgp_prefix, every_tier_result_and_errors
from scripts.utils.helpers import haversine, rtt_to_km, is_within_cirle, polygon_centroid, circle_intersections, distance, get_points_in_poly
from default import IP_TO_ASN_FILE, ANALYZABLE_FILE, ROUND_BASED_ALGORITHM_FILE

### Loading data

In [3]:
# Load the per-target records from ANALYZABLE_FILE; the cells below iterate
# over them with `data.items()`.
data = load_json(ANALYZABLE_FILE)

## CBG evaluation

In [4]:
# CBG (tier 1) evaluation, in two passes over `data`.
# Pass 1: count the targets whose tier 1 did not complete and look at why
#         (would a 2/3 speed factor have produced points? did the target have
#         any VP? was a traceroute recorded towards it?).
# Pass 2: for the targets whose tier 1 completed, check whether the stored
#         (lat_c, lon_c) position lies inside the circle of every VP.
CBG_SPEED = 4/9      # speed factor evaluated first in both passes
RELAXED_SPEED = 2/3  # alternative speed factor evaluated as a what-if

# Parameterized query: the driver escapes the target IP instead of it being
# spliced into the SQL text.
TRACEROUTE_QUERY = (
    'select src_addr, rtt, tstamp '
    'from bgp_interdomain_te.street_lvl_traceroutes '
    'where dst_addr = %(target_ip)s')


def _inside_all_circles(vps, point, speed):
    """Return True when `point` is within the circle of every VP in `vps`.

    Each VP is read as (vp[0], vp[1]) for its position and vp[2] for the value
    handed to `is_within_cirle` (presumably an RTT -- TODO confirm).
    An empty `vps` yields True.
    """
    return all(
        is_within_cirle((vp[0], vp[1]), vp[2], point, speed_threshold=speed)
        for vp in vps)


bad = 0
good = 0
good_23_only = 0
empty_vps = 0
not_empty_vps = 0
targeted_traceroutes = 0
not_targeted_traceroutes = 0
vps_not_working = []
client = None  # created on first use, then reused for every query

for _, d in data.items():
    if d['tier1:done']:
        good += 1
        continue

    bad += 1
    points = get_points_in_poly(d['vps'], 36, 5, CBG_SPEED)
    if len(points) != 0:
        print(len(points))

    # Retry with the relaxed speed; the VPs are rebuilt as 5-tuples whose last
    # two fields are None, exactly as the original cell did.
    tmp_vps = [(vp[0], vp[1], vp[2], None, None) for vp in d['vps']]
    points = get_points_in_poly(tmp_vps, 36, 5, RELAXED_SPEED)
    if len(points) != 0:
        good_23_only += 1
    elif len(d['vps']) > 0:
        not_empty_vps += 1
        vps_not_working.append(d['target_ip'])
    else:
        empty_vps += 1
        if client is None:
            client = Client('127.0.0.1')
        tmp_row = client.execute(
            TRACEROUTE_QUERY, {'target_ip': d['target_ip']})
        if len(tmp_row) != 0:
            targeted_traceroutes += 1
            print(f"{d['target_ip']} was targeted by traceroutes")
        else:
            not_targeted_traceroutes += 1

print(vps_not_working)
# Guard the ratio so an empty dataset prints nan instead of raising.
total_targets = bad + good
bad_ratio = bad/total_targets if total_targets else float('nan')
print(f"{bad} no intersection out or {total_targets} = {bad_ratio}")
print(
    f"If the speed where to be 2/3 CBG would have worked for {good_23_only} more targets")
print(f"{empty_vps} no vp, {not_empty_vps} some vps")
print(
    f"When no vp found {targeted_traceroutes} target had a traceroute dedecated to it and {not_targeted_traceroutes} did not")

position_in = 0
position_out = 0
would_be_in = 0
for _, d in data.items():
    if not d['tier1:done']:
        continue
    candidate_geo = (d['lat_c'], d['lon_c'])
    if _inside_all_circles(d['vps'], candidate_geo, CBG_SPEED):
        position_in += 1
        continue

    position_out += 1
    if _inside_all_circles(d['vps'], candidate_geo, RELAXED_SPEED):
        would_be_in += 1
    else:
        print(f"{d['target_ip']} is always outside the CBG area")

print(f"the target was in the CBG area {position_in} times")
print(f"the target was out of the CBG area {position_out} times")
# Same guard as above: no tier-1 success means there is no rate to report.
checked_targets = position_in + position_out
failure_pct = position_out*100/checked_targets if checked_targets else float('nan')
print(f"CBG failed {failure_pct}%")
print(
    f"If we would use 2/3 {would_be_in} extra targets would be in the CBG area")

['92.38.184.82']
1 no intersection out or 723 = 0.0013831258644536654
If the speed where to be 2/3 CBG would have worked for 0 more targets
0 no vp, 1 some vps
When no vp found 0 target had a traceroute dedecated to it and 0 did not
the target was in the CBG area 677 times
the target was out of the CBG area 45 times
CBG failed 6.232686980609419%
If we would use 2/3 45 extra targets would be in the CBG area


## Success rate

In [5]:
# How many targets completed each tier.
feilds_count = {'tier1:done': 0, 'tier2:done': 0, 'tier3:done': 0}
for target in data.values():
    for done_flag in feilds_count:
        if target[done_flag]:
            feilds_count[done_flag] += 1
print(f"{len(data)} Total targets done")
for flag_name, flag_total in feilds_count.items():
    print(f"{flag_total} {flag_name}")

# Why tier 2 / tier 3 did not complete, one counter per (tier, reason).
dict_reasons = {
    'tier2_failed_because_no_zipcodes': 0,
    'tier2_failed_because_no_landmark': 0,
    'tier2_failed_because_no_valid_traceroute': 0,
    'tier2_failed_because_other': 0,
    'tier3_failed_because_no_zipcodes': 0,
    'tier3_failed_because_no_landmark': 0,
    'tier3_failed_because_no_valid_traceroute': 0,
    'tier3_failed_because_other': 0
}

for target in data.values():
    if not target['tier1:done']:  # Here you should analyse tier1
        continue
    # Only the first tier that did not complete is classified for a target.
    for tier in ('tier2', 'tier3'):
        if target[f'{tier}:done']:
            continue
        if target[f'{tier}:inspected_points_count'] == 0:
            reason = 'no_zipcodes'
        elif target[f'{tier}:landmark_count'] == 0:
            reason = 'no_landmark'
        elif not any(trace[4] > 0 for trace in target[f'{tier}:traceroutes']):
            # No traceroute with a strictly positive value at index 4.
            reason = 'no_valid_traceroute'
        else:
            reason = 'other'
        dict_reasons[f'{tier}_failed_because_{reason}'] += 1
        break

for reason_name, reason_total in dict_reasons.items():
    print(f"{reason_name} {reason_total}")

723 Total targets done
722 tier1:done
660 tier2:done
425 tier3:done
tier2_failed_because_no_zipcodes 0
tier2_failed_because_no_landmark 45
tier2_failed_because_no_valid_traceroute 17
tier2_failed_because_other 0
tier3_failed_because_no_zipcodes 161
tier3_failed_because_no_landmark 74
tier3_failed_because_no_valid_traceroute 0
tier3_failed_because_other 0


## API calls count

In [6]:
# Per-target counts of zipcodes inspected, landmarks checked and traceroutes
# stored, summed over tier 2 and tier 3. Targets whose count is 0 are left out
# of the corresponding list, so the medians are over targets with activity.
ZIPCODE_FIELDS = ('tier2:inspected_points_count', 'tier3:inspected_points_count')
LANDMARK_FIELDS = (
    "tier2:failed_dns_count", "tier2:failed_asn_count",
    "tier2:cdn_count", "tier2:non_cdn_count",
    "tier3:failed_dns_count", "tier3:failed_asn_count",
    "tier3:cdn_count", "tier3:non_cdn_count")
TRACEROUTE_FIELDS = ('tier2:traceroutes', 'tier3:traceroutes')

zipcodes_counts = []
landmarks_counts = []
traceroutes_counts = []
for _, d in data.items():
    zipcodes_count = sum(d[f] for f in ZIPCODE_FIELDS if f in d)
    if zipcodes_count != 0:
        zipcodes_counts.append(zipcodes_count)

    landmarks_count = sum(d[f] for f in LANDMARK_FIELDS if f in d)
    if landmarks_count != 0:
        landmarks_counts.append(landmarks_count)

    traceroutes_count = sum(len(d[f]) for f in TRACEROUTE_FIELDS if f in d)
    if traceroutes_count != 0:
        traceroutes_counts.append(traceroutes_count)

print(f"{np.median(zipcodes_counts)} Zipcode to check (median)")
print(f"{np.median(traceroutes_counts)} traceroutes to check (median)")
print(f"{np.median(landmarks_counts)} landmarks to check (median)")

# sum() replaces the hand-written accumulation loops; the first of those used
# `zip` as its loop variable, which rebound the builtin `zip` to an int for
# every cell executed afterwards.
total = sum(zipcodes_counts)
print(f"{total} Overpass queries")
total = sum(landmarks_counts)
print(f"{total} landmarks verification")
total = sum(traceroutes_counts)
print(f"{total} traceroutes")

920.0 Zipcode to check (median)
111.0 traceroutes to check (median)
2419.0 landmarks to check (median)
753428 Overpass queries
2755315 landmarks verification
143601 traceroutes


## Correlation within the same network

In [15]:
# For every target, correlate the value at index 4 of each traceroute record
# with the haversine distance between the record's (t[5], t[6]) position and
# the target's (lat_c, lon_c). Only records whose two IPs (t[1], t[2]) share an
# AS, respectively a BGP prefix, are used. One Pearson coefficient per target
# and per grouping is collected, then the medians are printed.
asn_coef_lst = []
bgp_coef_lst = []
asndb = pyasn.pyasn(IP_TO_ASN_FILE)
bgp_prefixes = get_all_bgp_prefixes()

for _, d in data.items():
    same_bgp_x = []
    same_bgp_y = []
    same_asn_x = []
    same_asn_y = []
    for f in ['tier2:traceroutes', 'tier3:traceroutes']:
        if f in d:
            for t in d[f]:
                if t[4] < 0:
                    continue
                # Deliberately not named `distance`: that name is a helper
                # imported from scripts.utils.helpers, and rebinding it here
                # would replace the function with a float for later cells.
                landmark_distance = haversine(
                    (t[5], t[6]), (d['lat_c'], d['lon_c']))
                ipt = t[1]
                ipl = t[2]
                asnt = asndb.lookup(ipt)[0]
                asnl = asndb.lookup(ipl)[0]
                if asnl is not None and asnt is not None:
                    # A distance already recorded is skipped, so each distinct
                    # distance contributes one point per target.
                    if asnt == asnl and landmark_distance not in same_asn_y:
                        same_asn_y.append(landmark_distance)
                        same_asn_x.append(t[4])

                if is_same_bgp_prefix(ipt, ipl, bgp_prefixes):
                    if landmark_distance not in same_bgp_y:
                        same_bgp_y.append(landmark_distance)
                        same_bgp_x.append(t[4])
    # pearsonr needs at least two points.
    if len(same_asn_x) > 1:
        correlation = pearsonr(same_asn_x, same_asn_y)[0]
        asn_coef_lst.append(correlation)
    if len(same_bgp_x) > 1:
        correlation = pearsonr(same_bgp_x, same_bgp_y)[0]
        bgp_coef_lst.append(correlation)

print(f"{len(asn_coef_lst)} targets with correlation asn")
print(f"{len(bgp_coef_lst)} targets with correlation bgp")
print(f"{np.median(bgp_coef_lst)} median bgp correlation")
print(f"{np.median(asn_coef_lst)} median asn correlation")

NameError: name 'IP_TO_ASN_FILE_PATH' is not defined

## Table 1 of the paper

In [6]:
# NOTE(review): the results are read from a local "test.json" instead of the
# imported ROUND_BASED_ALGORITHM_FILE constant -- confirm which file is meant
# to back Table 1 before publishing numbers.
# The JSON keys are converted to int.
round_based_algorithm_results = {
    int(vps_key): vps_results
    for vps_key, vps_results in load_json("test.json").items()
}

In [7]:
# For each key of the round-based results (ascending), print the key and
# 3 * sum(r[2] + key) over the results whose r[2] is not None.
for vps_budget, budget_results in sorted(round_based_algorithm_results.items()):
    vps_budget = int(vps_budget)
    vps_used_total = 0
    for result in budget_results:
        if result[2] is not None:
            vps_used_total += result[2] + vps_budget
    print(vps_budget, 3 * vps_used_total)

10 5485404
100 4280667
300 3029640
500 2640603
1000 2639484


## Number of landmarks within a certain radius

### Table 2 of the paper

In [16]:
# Collect, per target, the inputs of Table 2:
#   * landmark totals (valid vs. rejected) over tier 2 and tier 3;
#   * the share of traceroute records whose two IPs are in the same AS,
#     the same /24 and the same BGP prefix;
#   * the haversine distance from the target's (lat_c, lon_c) to every landmark.
data = load_json(ANALYZABLE_FILE)

asndb = pyasn.pyasn(IP_TO_ASN_FILE)
bgp_prefixes = get_all_bgp_prefixes()

valid_landmarks_count = 0
unvalid_landmarks_count = 0
same_asn_lst = []
same_24_lst = []
same_bgp_lst = []
distances_to_landmarks = []
all_landmarks = []

for target in data.values():
    # --- landmark totals -------------------------------------------------
    # A tier only contributes when all three of its counters are present.
    all_landmarks.append(0)
    for tier in ('tier2', 'tier3'):
        valid_key = f'{tier}:landmark_count'
        cdn_key = f'{tier}:cdn_count'
        header_key = f'{tier}:failed_header_test_count'
        if cdn_key in target and valid_key in target and header_key in target:
            all_landmarks[-1] += target[valid_key] + \
                target[cdn_key] + target[header_key]
            valid_landmarks_count += target[valid_key]
            unvalid_landmarks_count += target[cdn_key] + target[header_key]

    # --- network proximity of the two IPs of every traceroute record -----
    same_asn = diff_asn = 0
    same_24 = diff_24 = 0
    same_bgp = diff_bgp = 0
    for traces_key in ('tier2:traceroutes', 'tier3:traceroutes'):
        if traces_key not in target:
            continue
        for trace in target[traces_key]:
            ipt, ipl = trace[1], trace[2]

            # Records for which either ASN lookup returns None are not counted.
            asnt = asndb.lookup(ipt)[0]
            asnl = asndb.lookup(ipl)[0]
            if asnl is not None and asnt is not None:
                if asnt == asnl:
                    same_asn += 1
                else:
                    diff_asn += 1

            net_t = ip_network(ipt+"/24", strict=False).network_address
            net_l = ip_network(ipl+"/24", strict=False).network_address
            if net_t == net_l:
                same_24 += 1
            else:
                diff_24 += 1

            if is_same_bgp_prefix(ipt, ipl, bgp_prefixes):
                same_bgp += 1
            else:
                diff_bgp += 1

    # --- distance from the target to each landmark -------------------------
    target_geo = (target['lat_c'], target['lon_c'])
    distances = [
        haversine(target_geo, (landmark[2], landmark[3]))
        for landmarks_key in ('tier2:landmarks', 'tier3:landmarks')
        if landmarks_key in target
        for landmark in target[landmarks_key]
    ]
    distances_to_landmarks.append(distances)

    # --- per-target ratios (only when at least one record was counted) -----
    asn_total = same_asn + diff_asn
    if asn_total != 0:
        same_asn_lst.append(same_asn/asn_total)

    total_24 = same_24 + diff_24
    if total_24 != 0:
        same_24_lst.append(same_24/total_24)
        if same_24 != 0:
            print(
                f"Found {target['target_ip']} with a landmark in the same /24")

    bgp_total = diff_bgp + same_bgp
    if bgp_total != 0:
        same_bgp_lst.append(same_bgp/bgp_total)

Found 78.128.211.119 with a landmark in the same /24
Found 77.109.180.62 with a landmark in the same /24
Found 103.143.136.43 with a landmark in the same /24


In [17]:
# A ratio of exactly 0 means none of the target's counted traceroute records
# had both IPs in the same AS / /24 / BGP prefix.
only_outside_asn = sum(1 for ratio in same_asn_lst if ratio == 0)
only_outside_24 = sum(1 for ratio in same_24_lst if ratio == 0)
only_outside_bgp = sum(1 for ratio in same_bgp_lst if ratio == 0)

print(f"{valid_landmarks_count} total valid landmarks")
print(f"{unvalid_landmarks_count} unvalid landmarks")
print(f"{(valid_landmarks_count*100)/(valid_landmarks_count+unvalid_landmarks_count)}% valid landmarks")

print(f"{only_outside_asn} targets has all its landmarks outside its AS out of {len(same_asn_lst)} {only_outside_asn*100/(len(same_asn_lst))}%")
print(f"{only_outside_24} targets has all its landmarks outside its /24 out of {len(same_24_lst)} {only_outside_24*100/(len(same_24_lst))}%")
print(f"{only_outside_bgp} targets has all its landmarks outside its BGP prefix out of {len(same_bgp_lst)} {only_outside_bgp*100/(len(same_bgp_lst))}%")

65325 total valid landmarks
2519202 unvalid landmarks
2.5275417900451416% valid landmarks
537 targets has all its landmarks outside its AS out of 669 80.26905829596413%
674 targets has all its landmarks outside its /24 out of 677 99.55686853766618%
618 targets has all its landmarks outside its BGP prefix out of 677 91.2850812407681%


In [18]:
def _count_within(distances, radius):
    """Number of entries of `distances` that are <= `radius`."""
    return sum(1 for dist in distances if dist <= radius)


def _count_positive(counts):
    """Number of entries of `counts` that are strictly positive."""
    return sum(1 for count in counts if count > 0)


# One entry per target: how many of its landmarks fall within each radius
# (same unit as the values returned by haversine).
landmarks_all = [len(dists) for dists in distances_to_landmarks]
landmarks_less_1 = [_count_within(dists, 1) for dists in distances_to_landmarks]
landmarks_less_5 = [_count_within(dists, 5) for dists in distances_to_landmarks]
landmarks_less_10 = [_count_within(dists, 10) for dists in distances_to_landmarks]
landmarks_less_40 = [_count_within(dists, 40) for dists in distances_to_landmarks]
# Total number of landmarks within the largest radius, over all targets.
total_count_ping = sum(landmarks_less_40)

print(f"{total_count_ping} ping measurement to do")

# Number of targets having at least one landmark in each category.
lm_a_0 = _count_positive(all_landmarks)
lmv_a_0 = _count_positive(landmarks_all)
lm1_0 = _count_positive(landmarks_less_1)
lm5_0 = _count_positive(landmarks_less_5)
lm10_0 = _count_positive(landmarks_less_10)
lm40_0 = _count_positive(landmarks_less_40)

lm1_1 = sum(1 for count in landmarks_less_1 if count >= 1)
print(lm1_1)

len_all = len(data)
print(f"{lm_a_0} target have potentail landmarks or {lm_a_0/len_all}")
print(f"{lmv_a_0} target have valid landmarks or {lmv_a_0/len_all}")
print(f"{lm1_0} target with a landmark within 1 km or {lm1_0/len_all}")
print(f"{lm5_0} target with a landmark within 5 km or {lm5_0/len_all}")
print(f"{lm10_0} target with a landmark within 10 km or {lm10_0/len_all}")
print(f"{lm40_0} target with a landmark within 40 km or {lm40_0/len_all}")

39853 ping measurement to do
207
713 target have potentail landmarks or 0.9861687413554634
677 target have valid landmarks or 0.9363762102351314
207 target with a landmark within 1 km or 0.2863070539419087
419 target with a landmark within 5 km or 0.5795297372060858
464 target with a landmark within 10 km or 0.6417704011065007
552 target with a landmark within 40 km or 0.7634854771784232


In [21]:
# NOTE(review): get_min_rtt_per_src_dst_query_ping_table is not among the
# names imported in the notebook's import cell -- confirm where it comes from,
# otherwise this cell fails on a fresh kernel.
query = get_min_rtt_per_src_dst_query_ping_table(
    'geolocation_replication', 'targets_to_landmarks_pings', '', 1000000)
client = Client('127.0.0.1')
db_table = client.execute(query)

# Each row is read as (row[0], row[1]) -> row[2]; the pair is later looked up
# as (landmark ip, target ip) and the value compared against RTT thresholds.
print(len(db_table))
rtts = [row[2] for row in db_table]
remove_dict = {(row[0], row[1]): row[2] for row in db_table}
print(len(rtts))

error1 = []
error2 = []
error3 = []
error4 = []
error1ms = []
error2ms = []
error5ms = []
error10ms = []

NO_LANDMARK = 50000  # sentinel: no landmark qualified for this threshold
# RTT threshold -> list collecting one error per target for that threshold.
errors_per_threshold = ((1, error1ms), (2, error2ms),
                        (5, error5ms), (10, error10ms))

for target in data.values():
    errors = every_tier_result_and_errors(target)
    error1.append(errors['error1'])
    error2.append(errors['error2'])
    error3.append(errors['error3'])
    error4.append(errors['error4'])

    # Smallest landmark distance per threshold. A landmark qualifies when it
    # has no measured RTT in remove_dict or when that RTT is <= the threshold.
    closest = {max_rtt: NO_LANDMARK for max_rtt, _unused in errors_per_threshold}
    for landmarks_key in ('tier2:landmarks', 'tier3:landmarks'):
        if landmarks_key not in target:
            continue
        for l_ip, _unused, l_lat, l_lon in target[landmarks_key]:
            dist = haversine((l_lat, l_lon), (target['lat_c'], target['lon_c']))
            key_rtt = (l_ip, target['target_ip'])
            for max_rtt in closest:
                if dist < closest[max_rtt] and (
                        key_rtt not in remove_dict or remove_dict[key_rtt] <= max_rtt):
                    closest[max_rtt] = dist

    # Without a qualifying landmark, fall back to this target's error1.
    for max_rtt, error_list in errors_per_threshold:
        if closest[max_rtt] != NO_LANDMARK:
            error_list.append(closest[max_rtt])
        else:
            error_list.append(error1[-1])

for i in [1, 5, 10, 40, 9999999999]:
    c = sum(1 for err in error1ms if err <= i)
    print(f"{c} targets with landmarks (ping <= {i}) or {c/len(error1ms)}")

27026
27026
Tier1 Failed
119 targets with landmarks (ping <= 1) or 0.16459197786998617
356 targets with landmarks (ping <= 5) or 0.49239280774550487
423 targets with landmarks (ping <= 10) or 0.5850622406639004
517 targets with landmarks (ping <= 40) or 0.715076071922545
723 targets with landmarks (ping <= 9999999999) or 1.0
