In [1]:
import json
import pandas as pd

from default import ASNS_TYPE_CAIDA, ASNS_TYPE_STANFORD, PROBES_AND_ANCHORS_FILE, ANCHORS_FILE

# Load datasets

In [2]:
def _load_json(path):
    """Read one JSON file and return the decoded Python object.

    Args:
        path: object exposing a pathlib-style ``open(mode, encoding=...)``
            method (presumably a ``pathlib.Path`` from the project's
            ``default`` module -- confirm there).

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    # Explicit UTF-8: without it the decoding depends on the platform locale,
    # which can break on non-ASCII content.
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)


# AS-number -> category mappings, one per classification dataset.
asns_types_caida = _load_json(ASNS_TYPE_CAIDA)
asns_types_stanford = _load_json(ASNS_TYPE_STANFORD)

# Measurement vantage points and targets.
probes_and_anchors = _load_json(PROBES_AND_ANCHORS_FILE)
anchors = _load_json(ANCHORS_FILE)

# Get target types

In [3]:

def get_anchor_as_category(asns_category: dict, targets: "list | None" = None) -> list:
    """Return one category record per anchor.

    Args:
        asns_category: mapping from AS number (as a string) to a category
            label.
        targets: anchors to classify, each a dict with an ``"id"`` key and,
            normally, an ``"asn_v4"`` key. When omitted, the notebook-level
            ``anchors`` list is used, which keeps existing one-argument calls
            working.

    Returns:
        A list of ``{"id": ..., "type": ...}`` dicts, in input order. ``"type"``
        is ``"Unknown"`` when the anchor has no ``"asn_v4"`` key or when its
        AS number is not a key of ``asns_category``.

    Raises:
        KeyError: if an anchor has no ``"id"`` key.
    """
    # `is None` rather than truthiness so an explicit empty list is honoured.
    if targets is None:
        targets = anchors

    records = []
    for anchor in targets:
        try:
            # Mapping keys are strings, so the ASN is stringified for lookup.
            category = asns_category[str(anchor["asn_v4"])]
        except KeyError:
            category = "Unknown"
        records.append({"id": anchor["id"], "type": category})
    return records

# Classify every anchor once per AS-classification dataset.
anchors_category = {
    "caida": get_anchor_as_category(asns_types_caida),
    "stanford": get_anchor_as_category(asns_types_stanford),
}

# Count the anchors that could not be mapped to a category in each dataset.
unmapped_caida = sum(
    1 for record in anchors_category["caida"] if record["type"] == "Unknown"
)
unmapped_stanford = sum(
    1 for record in anchors_category["stanford"] if record["type"] == "Unknown"
)

print(f"unmapped target caida: {unmapped_caida}")
print(f"unmapped target stanford: {unmapped_stanford}")

unmapped target caida: 39
unmapped target stanford: 57


In [4]:
# Both frames share the same layout: the anchor id and its category label.
record_columns = ["id", "type"]

caida_df = pd.DataFrame(anchors_category["caida"], columns=record_columns)
stanford_df = pd.DataFrame(anchors_category["stanford"], columns=record_columns)

In [5]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called bare; wrapping it in print() emitted a stray "None" line.
caida_df.info()
print(caida_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      782 non-null    int64 
 1   type    782 non-null    object
dtypes: int64(1), object(1)
memory usage: 12.3+ KB
None
     id            type
0  6025         Content
1  6031          Access
2  6039  Transit/Access
3  6042  Transit/Access
4  6043  Transit/Access


In [6]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called bare; wrapping it in print() emitted a stray "None" line.
stanford_df.info()
print(stanford_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782 entries, 0 to 781
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      782 non-null    int64 
 1   type    782 non-null    object
dtypes: int64(1), object(1)
memory usage: 12.3+ KB
None
     id                                 type
0  6025  Computer and Information Technology
1  6031  Computer and Information Technology
2  6039  Computer and Information Technology
3  6042  Computer and Information Technology
4  6043               Education and Research


In [7]:
# Distinct category labels used by each dataset, plus the "Unknown" label
# given to anchors whose AS number is missing from the mapping.
caida_categories = {*asns_types_caida.values(), "Unknown"}
stanford_categories = {*asns_types_stanford.values(), "Unknown"}

print(f"{caida_categories = }")
print(f"{stanford_categories = }")

caida_categories = {'Access', 'Enterprise', 'Transit/Access', 'tier-1', 'Unknown', 'Content'}
stanford_categories = {'Construction and Real Estate', 'Media, Publishing, and Broadcasting', 'Retail Stores, Wholesale, and E-commerce Sites', 'Computer and Information Technology', 'Freight, Shipment, and Postal Services', 'Museums, Libraries, and Entertainment', 'Other', 'Community Groups and Nonprofits', 'Travel and Accommodation', 'Service', 'Finance and Insurance', 'Education and Research', 'Unknown', 'Health Care Services', 'Manufacturing', 'Agriculture, Mining, and Refineries (Farming, Greenhouses, Mining, Forestry, and Animal Farming)', 'Utilities (Excluding Internet Service)', 'Government and Public Administration'}


# CAIDA categories

In [8]:
# Share of anchors in each CAIDA category, as a percentage of all anchors.
caida_total = len(caida_df)
assert caida_total > 0, "caida_df is empty: cannot compute category percentages"

# Count every category in a single pass instead of filtering the frame once
# per category.
caida_counts = caida_df["type"].value_counts()

percentage_per_category_caida = dict()
for category in caida_categories:
    # Categories with no anchor are absent from the counts, hence the 0 default.
    percentage = int(caida_counts.get(category, 0)) * 100 / caida_total
    percentage_per_category_caida[category] = percentage

    print(f"{category = }: {round(percentage, 2)}%")

# The shares must add up to 100 within floating-point error. The previous
# round(...) == 100 check accepted anything in 99.5..100.5 and could hide
# anchors whose type is not in caida_categories.
caida_total_percentage = sum(percentage_per_category_caida.values())
assert abs(caida_total_percentage - 100) < 1e-6, (
    f"CAIDA percentages sum to {caida_total_percentage}, expected 100"
)

category = 'Access': 25.58%
category = 'Enterprise': 8.7%
category = 'Transit/Access': 29.03%
category = 'tier-1': 2.17%
category = 'Unknown': 4.99%
category = 'Content': 29.54%


# Stanford categories

In [9]:
# Share of anchors in each Stanford category, as a percentage of all anchors.
stanford_total = len(stanford_df)
assert stanford_total > 0, "stanford_df is empty: cannot compute category percentages"

# Count every category in a single pass instead of filtering the frame once
# per category.
stanford_counts = stanford_df["type"].value_counts()

percentage_per_category_stanford = dict()
for category in stanford_categories:
    # Categories with no anchor are absent from the counts, hence the 0 default.
    percentage = int(stanford_counts.get(category, 0)) * 100 / stanford_total
    percentage_per_category_stanford[category] = percentage

    print(f"{category = }: {round(percentage, 2)}%")

# The shares must add up to 100 within floating-point error. The previous
# round(...) == 100 check accepted anything in 99.5..100.5 and could hide
# anchors whose type is not in stanford_categories.
stanford_total_percentage = sum(percentage_per_category_stanford.values())
assert abs(stanford_total_percentage - 100) < 1e-6, (
    f"Stanford percentages sum to {stanford_total_percentage}, expected 100"
)

category = 'Construction and Real Estate': 0.64%
category = 'Media, Publishing, and Broadcasting': 0.51%
category = 'Retail Stores, Wholesale, and E-commerce Sites': 1.28%
category = 'Computer and Information Technology': 73.79%
category = 'Freight, Shipment, and Postal Services': 0.13%
category = 'Museums, Libraries, and Entertainment': 0.13%
category = 'Other': 0.51%
category = 'Community Groups and Nonprofits': 4.48%
category = 'Travel and Accommodation': 0.26%
category = 'Service': 3.71%
category = 'Finance and Insurance': 0.77%
category = 'Education and Research': 5.5%
category = 'Unknown': 7.29%
category = 'Health Care Services': 0.13%
category = 'Manufacturing': 0.26%
category = 'Agriculture, Mining, and Refineries (Farming, Greenhouses, Mining, Forestry, and Animal Farming)': 0.0%
category = 'Utilities (Excluding Internet Service)': 0.26%
category = 'Government and Public Administration': 0.38%
