In [1]:
import json
import pandas as pd

from default import ASNS_TYPE_CAIDA, ASNS_TYPE_STANFORD, REPRO_PROBES_AND_ANCHORS_FILE, REPRO_ANCHORS_FILE

# Load datasets

In [2]:
# AS number -> category mappings, one per classification source.
with ASNS_TYPE_CAIDA.open("r") as caida_file:
    asns_categories_caida = json.load(caida_file)

with ASNS_TYPE_STANFORD.open("r") as stanford_file:
    asns_categories_stanford = json.load(stanford_file)

# RIPE Atlas vantage points: probes + anchors, then anchors only.
# NOTE(review): the anchors file is consumed below as a list of records
# carrying "id" and "asn_v4"; the probes-and-anchors schema is presumably
# the same -- confirm.
with REPRO_PROBES_AND_ANCHORS_FILE.open("r") as probes_and_anchors_file:
    probes_and_anchors = json.load(probes_and_anchors_file)

with REPRO_ANCHORS_FILE.open("r") as anchors_file:
    anchors = json.load(anchors_file)

In [None]:
def get_anchor_as_category(asns_category: dict, ripe_vps_dataset: list) -> list:
    """Return one AS category record per vantage point.

    Args:
        asns_category: mapping from AS number (as a string) to its category.
        ripe_vps_dataset: iterable of vantage-point records; each record must
            provide an "id" and may provide an "asn_v4".

    Returns:
        A list with one ``{"id": ..., "category": ...}`` dict per input record,
        in input order. The category is "Unknown" when the record has no
        "asn_v4" key or when its AS number is absent from ``asns_category``.

    Raises:
        KeyError: if a record has no "id".
    """
    ripe_categories = []

    for ripe_vp in ripe_vps_dataset:
        # Read the id outside the try block so that a missing id is reported
        # as such instead of being mistaken for an unknown AS.
        vp_id = ripe_vp["id"]
        try:
            # Keys of the category mapping are strings, hence the str() cast.
            category = asns_category[str(ripe_vp["asn_v4"])]
        except KeyError:
            category = "Unknown"

        ripe_categories.append({"id": vp_id, "category": category})

    return ripe_categories

def get_categories_percentage(categories_df: pd.DataFrame) -> dict:
    """Compute and print the percentage of rows belonging to each category.

    Args:
        categories_df: frame with a "category" column, one row per entry.

    Returns:
        Mapping from category to its share of the rows, in percent (0-100),
        ordered by first appearance in the frame. An empty frame yields an
        empty mapping.

    Raises:
        AssertionError: if the percentages do not add up to 100 after
            rounding (e.g. when some rows have a missing category).
    """
    total = len(categories_df)
    if total == 0:
        # Nothing to report; the sanity check below only makes sense when
        # there is at least one row.
        return {}

    category_column = categories_df["category"]
    # Count every category in a single pass instead of re-filtering the
    # frame for each one.
    counts = category_column.value_counts()

    category_repartition = dict()
    # unique() keeps the order of first appearance, which fixes the print order.
    for category in category_column.unique():
        count = int(counts.get(category, 0))
        percentage = count * 100 / total
        category_repartition[category] = percentage

        print(f"{category} : {count} ({round(percentage,1)}%)")

    total_percentage = sum(category_repartition.values())
    assert round(total_percentage) == 100, (
        f"category percentages sum to {total_percentage}, expected 100"
    )

    return category_repartition

# Get target AS categories

In [None]:
# Classify every anchor twice: once per AS-category source (CAIDA, Stanford).
anchors_category_caida, anchors_category_stanford = (
    get_anchor_as_category(asns_categories, anchors)
    for asns_categories in (asns_categories_caida, asns_categories_stanford)
)

# Tabular view of the records, with a fixed column order.
caida_df, stanford_df = (
    pd.DataFrame(category_records, columns=["id", "category"])
    for category_records in (anchors_category_caida, anchors_category_stanford)
)

# CAIDA categories

In [8]:
# Print the share of anchors per AS category according to the CAIDA
# classification, and keep the resulting {category: percentage} mapping.
print("AS category repartition CAIDA:")
ripe_vps_categories_caida = get_categories_percentage(caida_df)

category = 'Access': 25.58%
category = 'Enterprise': 8.7%
category = 'Transit/Access': 29.03%
category = 'tier-1': 2.17%
category = 'Unknown': 4.99%
category = 'Content': 29.54%


# Stanford categories

In [9]:
# Print the share of anchors per AS category according to the Stanford
# classification, and keep the resulting {category: percentage} mapping.
print("AS category repartition STANFORD:")
ripe_vps_categories_stanford = get_categories_percentage(stanford_df)

category = 'Construction and Real Estate': 0.64%
category = 'Media, Publishing, and Broadcasting': 0.51%
category = 'Retail Stores, Wholesale, and E-commerce Sites': 1.28%
category = 'Computer and Information Technology': 73.79%
category = 'Freight, Shipment, and Postal Services': 0.13%
category = 'Museums, Libraries, and Entertainment': 0.13%
category = 'Other': 0.51%
category = 'Community Groups and Nonprofits': 4.48%
category = 'Travel and Accommodation': 0.26%
category = 'Service': 3.71%
category = 'Finance and Insurance': 0.77%
category = 'Education and Research': 5.5%
category = 'Unknown': 7.29%
category = 'Health Care Services': 0.13%
category = 'Manufacturing': 0.26%
category = 'Agriculture, Mining, and Refineries (Farming, Greenhouses, Mining, Forestry, and Animal Farming)': 0.0%
category = 'Utilities (Excluding Internet Service)': 0.26%
category = 'Government and Public Administration': 0.38%
