In [1]:
import json
import pandas as pd

from default import ASNS_TYPE_CAIDA, ASNS_TYPE_STANFORD, REPRO_PROBES_AND_ANCHORS_FILE, REPRO_ANCHORS_FILE, REPRO_PROBES_FILE

# Load datasets

In [2]:
def _load_json(path):
    """Open ``path`` for reading and return its parsed JSON content."""
    with path.open("r") as handle:
        return json.load(handle)


# AS-number -> category mappings, one per classification source.
asns_categories_caida = _load_json(ASNS_TYPE_CAIDA)
asns_categories_stanford = _load_json(ASNS_TYPE_STANFORD)

# Vantage-point datasets: the combined file first, then probes and anchors separately.
probes_and_anchors = _load_json(REPRO_PROBES_AND_ANCHORS_FILE)
probes = _load_json(REPRO_PROBES_FILE)
anchors = _load_json(REPRO_ANCHORS_FILE)

In [3]:
def get_anchor_as_category(asns_category: dict, ripe_vps_dataset: list) -> list:
    """Return one ``{"id", "category"}`` record per vantage point.

    Args:
        asns_category: mapping from AS number (as a string key) to its category.
        ripe_vps_dataset: iterable of vantage-point records; each record is
            indexed by ``"id"`` and, when available, ``"asn_v4"``.

    Returns:
        A list with one dict per input record, in input order. The category is
        ``"Unknown"`` when the record has no ``"asn_v4"`` key or when its AS
        number is absent from ``asns_category``.

    Raises:
        KeyError: if a record has no ``"id"`` key.
    """
    ripe_categories = []

    for ripe_vp in ripe_vps_dataset:
        try:
            # Mapping keys are strings, so the AS number is stringified first.
            category = asns_category[str(ripe_vp["asn_v4"])]
        except KeyError:
            # Covers both a missing "asn_v4" field and an unmapped AS number.
            category = "Unknown"

        ripe_categories.append({"id": ripe_vp["id"], "category": category})

    return ripe_categories

def get_categories_percentage(categories_df: pd.DataFrame) -> dict:
    """Compute and print the share of each category in ``categories_df``.

    Args:
        categories_df: frame with a ``"category"`` column, one row per item.

    Returns:
        A dict mapping each distinct category (in order of first appearance)
        to its percentage of the rows, as a float between 0 and 100. An empty
        frame yields an empty dict.

    Side effects:
        Prints one ``"<category> : <count> (<percentage>%)"`` line per category.
    """
    category_repartition = dict()

    total = len(categories_df)
    if total == 0:
        # Nothing to report; also avoids tripping the 100% sanity check below.
        return category_repartition

    # Count every category in a single pass instead of re-filtering the frame
    # for each one.
    counts = categories_df["category"].value_counts()

    # Iterate over unique() so the report keeps first-appearance order.
    for category in categories_df["category"].unique():
        count = int(counts.get(category, 0))
        percentage = count * 100 / total
        category_repartition[category] = percentage

        print(f"{category} : {count} ({round(percentage,1)}%)")

    # Sanity check: the shares must add up to the whole dataset.
    assert round(sum(category_repartition.values())) == 100, (
        "category percentages do not sum to 100"
    )

    return category_repartition

# Get targets type

In [4]:
# Classify every dataset (anchors, probes, probes + anchors) against the CAIDA
# mapping first, then against the Stanford mapping.
(
    category_caida_anchors,
    category_caida_probes,
    category_caida_probes_and_anchors,
) = (
    get_anchor_as_category(asns_categories_caida, vps_dataset)
    for vps_dataset in (anchors, probes, probes_and_anchors)
)

(
    category_stanford_anchors,
    category_stanford_probes,
    category_stanford_probes_and_anchors,
) = (
    get_anchor_as_category(asns_categories_stanford, vps_dataset)
    for vps_dataset in (anchors, probes, probes_and_anchors)
)

# Turn each list of {"id", "category"} records into a two-column DataFrame.
(
    caida_df_anchors,
    caida_df_probes,
    caida_df_probes_and_anchors,
) = (
    pd.DataFrame(records, columns=["id", "category"])
    for records in (
        category_caida_anchors,
        category_caida_probes,
        category_caida_probes_and_anchors,
    )
)

(
    stanford_df_anchors,
    stanford_df_probes,
    stanford_df_probes_and_anchors,
) = (
    pd.DataFrame(records, columns=["id", "category"])
    for records in (
        category_stanford_anchors,
        category_stanford_probes,
        category_stanford_probes_and_anchors,
    )
)

# CAIDA categories

In [5]:
# Print the CAIDA category breakdown for each dataset in turn. The result name
# is rebound on every pass, so after the cell runs it holds the breakdown of
# the last dataset (probes and anchors).
for _heading, _categories_df in (
    ("Anchors results: \n", caida_df_anchors),
    ("Probes results: \n", caida_df_probes),
    ("Probes and anchors results: \n", caida_df_probes_and_anchors),
):
    print(_heading)
    ripe_vps_categories_caida = get_categories_percentage(_categories_df)
    print()

Anchors results: 

Content : 229 (31.7%)
Access : 211 (29.2%)
Transit/Access : 197 (27.2%)
Enterprise : 55 (7.6%)
tier-1 : 6 (0.8%)
Unknown : 25 (3.5%)

Probes results: 

Access : 9124 (75.2%)
Transit/Access : 1005 (8.3%)
Enterprise : 410 (3.4%)
Unknown : 312 (2.6%)
Content : 1112 (9.2%)
tier-1 : 166 (1.4%)

Probes and anchors results: 

Access : 9347 (72.4%)
Transit/Access : 1221 (9.5%)
Enterprise : 472 (3.7%)
Unknown : 339 (2.6%)
Content : 1361 (10.5%)
tier-1 : 174 (1.3%)



# Stanford categories

In [6]:
# Print the Stanford category breakdown for each dataset in turn.
# NOTE(review): the result is stored under `ripe_vps_categories_caida` even
# though it holds Stanford data, overwriting the CAIDA breakdown computed
# earlier. The name is kept as-is here; consider a dedicated Stanford variable
# once it is confirmed that no later cell relies on this binding.
for _heading, _categories_df in (
    ("Anchors results: \n", stanford_df_anchors),
    ("Probes results: \n", stanford_df_probes),
    ("Probes and anchors results: \n", stanford_df_probes_and_anchors),
):
    print(_heading)
    ripe_vps_categories_caida = get_categories_percentage(_categories_df)
    print()

Anchors results: 

Computer and Information Technology : 521 (72.1%)
Education and Research : 38 (5.3%)
Community Groups and Nonprofits : 33 (4.6%)
Health Care Services : 2 (0.3%)
Finance and Insurance : 6 (0.8%)
Unknown : 53 (7.3%)
Media, Publishing, and Broadcasting : 21 (2.9%)
Service : 25 (3.5%)
Construction and Real Estate : 5 (0.7%)
Travel and Accommodation : 2 (0.3%)
Government and Public Administration : 3 (0.4%)
Retail Stores, Wholesale, and E-commerce Sites : 5 (0.7%)
Utilities (Excluding Internet Service) : 1 (0.1%)
Manufacturing : 2 (0.3%)
Other : 4 (0.6%)
Museums, Libraries, and Entertainment : 1 (0.1%)
Freight, Shipment, and Postal Services : 1 (0.1%)

Probes results: 

Computer and Information Technology : 10028 (82.7%)
Community Groups and Nonprofits : 129 (1.1%)
Unknown : 842 (6.9%)
Education and Research : 352 (2.9%)
Construction and Real Estate : 60 (0.5%)
Manufacturing : 25 (0.2%)
Service : 300 (2.5%)
Media, Publishing, and Broadcasting : 183 (1.5%)
Other : 14 (0.1%