In [2]:
import folium
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from shapely.geometry import Polygon
from sklearn.cluster import DBSCAN
from scipy.spatial import ConvexHull

from collections import defaultdict
from math import pi, cos
from dataclasses import dataclass
from loguru import logger

from georesolver.common.geoloc import distance
from georesolver.clickhouse.queries import load_targets, load_vps
from georesolver.common.files_utils import load_json, load_pickle, load_csv
from georesolver.common.ip_addresses_utils import get_prefix_from_ip
from georesolver.common.settings import PathSettings, ClickhouseSettings

path_settings = PathSettings()
clickhouse_settings = ClickhouseSettings()


@dataclass(frozen=True)
class ResultsScore:
    client_granularity: str
    answer_granularity: str
    scores: list
    inconsistent_mappings: list


targets = load_targets(clickhouse_settings.VPS_FILTERED_TABLE)
targets_coordinates = {}
for target in targets:
    targets_coordinates[target["addr"]] = (
        target["lat"],
        target["lon"],
        target["country_code"],
    )

vps = load_vps(clickhouse_settings.VPS_FILTERED_TABLE)
vps_coordinates = {}
vps_per_subnet = defaultdict(list)
for vp in vps:
    vps_coordinates[vp["addr"]] = (
        vp["lat"],
        vp["lon"],
        vp["country_code"],
    )

    subnet = get_prefix_from_ip(vp["addr"])
    vps_per_subnet[subnet].append(vp["addr"])


def plotDot(lat, lon, map, color="blue"):
    """input: series that contains a numeric named latitude and a numeric named longitude
    this function creates a CircleMarker and adds it to your this_map"""
    folium.CircleMarker(location=[lat, lon], radius=2, weight=5, color=color).add_to(
        map
    )


def find_exterior_points(points):
    # Create a numpy array from the list of points
    points_array = np.array(points)

    # Compute the convex hull
    hull = ConvexHull(points_array)

    # Extract the indices of exterior points
    exterior_indices = hull.vertices

    # Get the exterior points
    exterior_points = points_array[exterior_indices]

    return exterior_points, hull


def get_median_dist_error(target_results: dict, key: str) -> float:
    """return the median distance error"""
    return round(
        np.median([r[key]["d_error"] for r in target_results.values()]),
        2,
    )

[32m2025-04-14 15:17:56.969[0m | [34m[1mDEBUG   [0m | [36mgeoresolver.clickhouse.main[0m:[36mexecute[0m:[36m120[0m - [34m[1mquery=GetVPs; database=GeoResolver; table_name=vps_filtered  limit=None[0m
[32m2025-04-14 15:17:57.041[0m | [34m[1mDEBUG   [0m | [36mgeoresolver.clickhouse.main[0m:[36mexecute[0m:[36m120[0m - [34m[1mquery=GetVPs; database=GeoResolver; table_name=vps_filtered  limit=None[0m


In [3]:
eval_results = load_pickle(
    path_settings.RESULTS_PATH
    / "evaluation_1M_hostnames_answer_bgp_prefix_max_bgp_prefix_clusters.pickle"
)

RuntimeError: could not load json file: [Errno 2] No such file or directory: '/storage/hugo/georesolver/georesolver/common/../results/evaluation_1M_hostnames_answer_bgp_prefix_max_bgp_prefix_clusters.pickle'

In [None]:
single_cluster_results = {}

for target_addr, target_result in eval_results.items():
    if len(target_result["ecs_vps_per_cluster"]) > 2:
        continue

    no_ping_vp = target_result["ecs_vps"][0][0]
    vp_lat, vp_lon, vp_country_code = vps_coordinates[no_ping_vp]
    target_lat, target_lon, target_country_code = targets_coordinates[target_addr]

    d_error = distance(vp_lat, target_lat, vp_lon, target_lon)
    target_result["no_ping_vp_d_error"] = {}
    target_result["no_ping_vp_d_error"]["d_error"] = d_error

    if target_country_code == vp_country_code:
        target_result["correct_country_geoloc"] = True
    else:
        target_result["correct_country_geoloc"] = False

    single_cluster_results[target_addr] = target_result

# overall cluster analysis

In [None]:
logger.info(f"No ping geolocation over:: {len(eval_results)} targets")
no_ping_cluster_m_d = get_median_dist_error(eval_results, "no_ping_cluster_vp")
logger.info(f"No ping cluster:: median_error={round(no_ping_cluster_m_d, 2)} [km]")


logger.info(
    f"No ping geolocation over (single cluster):: {len(single_cluster_results)} targets"
)
single_no_ping_cluster_m_d = get_median_dist_error(
    single_cluster_results, "no_ping_vp_d_error"
)
logger.info(
    f"No ping cluster:: median_error={round(single_no_ping_cluster_m_d, 2)} [km]"
)

[32m2024-07-04 12:48:44.933[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mNo ping geolocation over:: 759 targets[0m
[32m2024-07-04 12:48:44.939[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mNo ping cluster:: median_error=104.24 [km][0m
[32m2024-07-04 12:48:44.941[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mNo ping geolocation over (single cluster):: 578 targets[0m
[32m2024-07-04 12:48:44.946[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mNo ping cluster:: median_error=34.81 [km][0m


# cluster country validation

In [None]:
correct_country_geoloc = 0
for target, target_result in single_cluster_results.items():
    if target_result["correct_country_geoloc"]:
        correct_country_geoloc += 1

correct_country_geoloc = round(
    correct_country_geoloc / len(single_cluster_results) * 100, 2
)
logger.info(
    f"Correct country geoloc with single cluster:: {correct_country_geoloc} [%]"
)

[32m2024-07-04 13:09:15.926[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mCorrect country geoloc with single cluster:: 97.06 [%][0m


# single cluster analysis

In [None]:
fig, ax1 = plt.subplots(1, 1)

x, y = ecdf([r["no_ping_cluster_vp"]["d_error"] for r in eval_results.values()])
ax1.plot(x, y, label=f"No ping cluster VP")

plt.xlabel("geolocation error [km]")
plt.ylabel("proportion of targets")
plt.legend(loc="upper left", fontsize=8)
plt.xscale("log")
plt.grid()
plt.title(
    f"CDF: Geolocation error \n maximum BGP prefix per CDN hostname selection",
    fontsize=13,
)
plt.savefig(path_settings.FIGURE_PATH / "geoloc_error_max_bgp_prefix.pdf")
plt.show()

In [None]:
granularity = "answer_bgp_prefix"
results = eval_results[granularity]

fig, ax1 = plt.subplots(1, 1)

x, y = ecdf([len(r["filtered_vps_per_cluster"]) for r in results.values()])
ax1.plot(x, y, label="ECS cluster selection")

avg_cost_cluster = round(
    np.mean([[len(r["filtered_vps_per_cluster"]) for r in results.values()]]),
    2,
)
median_cost_cluster = round(
    np.median([[len(r["filtered_vps_per_cluster"]) for r in results.values()]]),
    2,
)

logger.info(f"Avg Measurement cost cluster selection = {avg_cost_cluster}")
logger.info(f"Median Measurement cost cluster selection = {median_cost_cluster}")

plt.xlabel("Measurement cost")
plt.ylabel("proportion of targets")
plt.legend(loc="upper left", fontsize=8)
plt.grid()
plt.title(
    f"CDF: Measurement cost",
    fontsize=13,
)
# plt.savefig(path_settings.FIGURE_PATH / "measurement_cost_cluster_selection.pdf")
plt.show()

KeyError: 'answer_bgp_prefix'