In [2]:
import json
import pandas as pd

from default import ASNS_TYPE_CAIDA, ASNS_TYPE_STANFORD, REPRO_PROBES_AND_ANCHORS_FILE, REPRO_ANCHORS_FILE

# Load datasets

In [4]:
# JSON text is UTF-8 by specification, so the encoding is passed explicitly:
# without it, decoding falls back to the platform's locale encoding and the
# same file may load differently (or fail) on another machine.

# AS number (as a string) -> AS category, CAIDA classification.
with ASNS_TYPE_CAIDA.open("r", encoding="utf-8") as f:
    asns_categories_caida = json.load(f)

# AS number (as a string) -> AS category, Stanford classification.
with ASNS_TYPE_STANFORD.open("r", encoding="utf-8") as f:
    asns_categories_stanford = json.load(f)

# NOTE(review): presumably the full set of RIPE Atlas probes and anchors;
# it is not used in the visible cells — confirm it is still needed.
with REPRO_PROBES_AND_ANCHORS_FILE.open("r", encoding="utf-8") as f:
    probes_and_anchors = json.load(f)

# Anchor records; the cells below read the "id" and "asn_v4" key of each one.
with REPRO_ANCHORS_FILE.open("r", encoding="utf-8") as f:
    anchors = json.load(f)

In [5]:
def get_anchor_as_category(asns_category: dict, ripe_vps_dataset: list) -> list:
    """Return one AS-category record per RIPE vantage point.

    Args:
        asns_category: mapping from AS number (as a string) to category label.
        ripe_vps_dataset: iterable of vantage-point records (dicts). Each
            record must have an "id" key and may have an "asn_v4" key.

    Returns:
        A list with one ``{"id": ..., "category": ...}`` dict per input record,
        in input order. The category is "Unknown" when the record has no
        (or a null) "asn_v4", or when that ASN is absent from
        ``asns_category``.

    Raises:
        KeyError: if a record has no "id" key.
    """
    ripe_categories = []

    for ripe_vp in ripe_vps_dataset:
        # Looked up outside any try/except so that a malformed record fails
        # with a single, clear KeyError instead of one raised inside a handler.
        vp_id = ripe_vp["id"]

        # The mapping is keyed by strings, hence the str() conversion; a
        # missing or null ASN can never match and is reported as "Unknown".
        asn = ripe_vp.get("asn_v4")
        if asn is None:
            category = "Unknown"
        else:
            category = asns_category.get(str(asn), "Unknown")

        ripe_categories.append({"id": vp_id, "category": category})

    return ripe_categories

def get_categories_percentage(categories_df: pd.DataFrame) -> dict:
    """Print and return the share, in percent, of rows in each category.

    Args:
        categories_df: frame with an "id" and a "category" column.

    Returns:
        Mapping from category label to its percentage of all rows, with keys
        in order of first appearance in the "category" column.

    Raises:
        AssertionError: if the percentages do not round to 100 in total.
    """
    labels = categories_df["category"]
    shares = {}

    for label in labels.unique():
        # Number of rows carrying this label; reused for the share and the report.
        n_rows = len(categories_df[labels == label])
        share = n_rows * 100 / len(categories_df["id"])
        shares[label] = share

        print(f"{label} : {n_rows} ({round(share,1)}%)")

    # Sanity check: every row must have been counted exactly once.
    assert round(sum(shares.values())) == 100

    return shares

# Get the AS category of each target

In [6]:
# CAIDA classification: one {"id", "category"} record per anchor, then tabulated.
anchors_category_caida = get_anchor_as_category(asns_categories_caida, anchors)
caida_df = pd.DataFrame(data=anchors_category_caida, columns=["id", "category"])

# Stanford classification: same anchors, looked up in the Stanford mapping.
anchors_category_stanford = get_anchor_as_category(asns_categories_stanford, anchors)
stanford_df = pd.DataFrame(data=anchors_category_stanford, columns=["id", "category"])

# CAIDA categories

In [7]:
# Header line for the per-category report that get_categories_percentage prints.
print("AS category repartition CAIDA:")
# Keeps the returned {category: percentage} mapping.
# NOTE(review): despite the `ripe_vps_` prefix, caida_df is built from `anchors` only.
ripe_vps_categories_caida = get_categories_percentage(caida_df)

AS category repartition CAIDA:
Content : 229 (31.7%)
Access : 211 (29.2%)
Transit/Access : 197 (27.2%)
Enterprise : 55 (7.6%)
tier-1 : 6 (0.8%)
Unknown : 25 (3.5%)


# Stanford categories

In [8]:
# Header line for the per-category report that get_categories_percentage prints.
print("AS category repartition STANFORD:")
# Keeps the returned {category: percentage} mapping.
# NOTE(review): despite the `ripe_vps_` prefix, stanford_df is built from `anchors` only.
ripe_vps_categories_stanford = get_categories_percentage(stanford_df)

AS category repartition STANFORD:
Computer and Information Technology : 521 (72.1%)
Education and Research : 38 (5.3%)
Community Groups and Nonprofits : 33 (4.6%)
Health Care Services : 2 (0.3%)
Finance and Insurance : 6 (0.8%)
Unknown : 53 (7.3%)
Media, Publishing, and Broadcasting : 21 (2.9%)
Service : 25 (3.5%)
Construction and Real Estate : 5 (0.7%)
Travel and Accommodation : 2 (0.3%)
Government and Public Administration : 3 (0.4%)
Retail Stores, Wholesale, and E-commerce Sites : 5 (0.7%)
Utilities (Excluding Internet Service) : 1 (0.1%)
Manufacturing : 2 (0.3%)
Other : 4 (0.6%)
Museums, Libraries, and Entertainment : 1 (0.1%)
Freight, Shipment, and Postal Services : 1 (0.1%)
