## Dependency analysis

This notebook contains analysis of dependencies in Common Criteria certificates.

### Interesting occurrences I noticed while analysing the data
- 3970 certificates reference no other certificates: directly_affecting == NaN && indirectly_affecting == NaN
- 158 certificates are affected by at least one certificate and affect no certificates.
- 311 certificates are affected by at least one archived certificate.
- 16 BSI certificates affecting ANSSI certificates out of total 831 BSI certs.
- 38 ANSSI certificates affecting BSI certificates out of total 682 ANSSI certs.
- 25 certificates are cross-referenced
- Certificates with security level EAL6+ are directly affecting other certificates with levels: {'EAL6+': 38, 'EAL5+': 6, 'EAL4+': 5} (EAL6+ is directly affecting certificates with lower security levels)
- Most common security level among smart-cards is EAL5+ with 671 occurrences.
- Highest Smart Card BSI level: EAL6+, most common level: EAL4+
- Highest Smart Card ANSSI level: EAL7, most common level: EAL5+
- Lowest security level among smart cards in dataset: EAL1+ ['ATMEL AT90SC6464C Integrated circuit (reference AT568A9 rev. F)',
 'CT2000 embedded Component (reference ST16RFHD50/RSG-A)',
 'M/Chip Select v2.0.5.2 Application',
 'MODEUS electronic purse : MODEUS carrier card v1.1 (reference : ST16RF58/RSE+) and SAM TC/C v1.1 retailer security module (reference : ST19SF16FF/RVN)',
 "Oberthur B0' application v1.0.1 and GemClub v1.3 loaded on Javacard/VOP GemXpresso platform 211 V2",
 'Palmera Protect platform V2.0 JavaCard (SLE66CX320P/SB62 embedded component)',
 'VOP 2.0.1 / Javacard 2.1.1 JPH33V2 Operating system version 1 installed on Integrated circuit PHILIPS P8WE5033',
 'Javacard/VOP GemXpresso 211 platform (Philips Integrated circuit P8WE5032/MPH02)',
 'Javacard/VOP GemXpresso 211 platform V2 (Philips P8WE5032/MPH04 embedded component, A000000018434D Card Manager)',
 'S3C8975 for smart cards Integrated circuit',
 "'Mondex Purse 2' electronic purse version 0203 component SLE66CX160S, MULTOS V4.1N operating system)",
 "B4/B0' V2 bank application of the MONEO/CB hybrid card (reference : ST19SF16B RCL version B303/B002)",
 "Javacard/VOP GemXpresso 211 platform (Philips P8WE5032/MPH02 Integrated circuit ) with Oberthur B0' v0.32 and Visa VSDC v1.08 applets",
 'MONEO electronic wallet card carrier (ST19SF16B RCL v. B303) and PSAM retailer security module (ST19SF16B RCL v. C103)']


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import collections 
import datetime

from sec_certs.dataset import CCDataset
from typing import Tuple, List

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*";
# confirm the installed matplotlib still accepts this style name.
plt.style.use("seaborn-whitegrid")
sns.set_palette("deep")
sns.set_context("notebook")  # Set to "paper" for use in paper :)

In [None]:
# Build the dataset object, compute the dependency data on it and flatten it
# into the DataFrame that every later cell works on.
# NOTE(review): _compute_dependencies is a private method of CCDataset; confirm
# there is no public entry point that should be used instead.
dset: CCDataset = CCDataset.from_web_latest()
dset._compute_dependencies()
df = dset.to_pandas()

n_rows, n_cols = df.shape
print(f"Dataset has {n_rows} rows and {n_cols} columns.")

In [None]:
df.head()

In [None]:
df.info()

### How many active and archived certificates are in dataset?

In [None]:
df.status.value_counts()

### Which certificates are referenced at least by one certificate? 

In [None]:
def is_directly_affected_by(references):
    """Tell whether a reference cell holds a value.

    Returns False only when ``references`` is the ``np.nan`` object itself
    (identity comparison); every other value, including an empty collection,
    yields True.
    """
    return references is not np.nan

def count_directly_affected_by(references):
    """Return the number of entries in ``references``.

    ``np.nan`` (checked by identity) is passed through unchanged so that
    missing cells stay missing instead of being counted as zero.
    """
    return np.nan if references is np.nan else len(references)

# Work on a copy so the helper columns do not leak into the shared `df`.
directly_referencing_df = df.copy()
# Flag rows whose "directly_referencing" cell is not NaN.
directly_referencing_df["is_directly_referencing"] = directly_referencing_df["directly_referencing"].apply(is_directly_affected_by)
# Number of entries in "directly_referencing" (NaN stays NaN). The original read this
# column from an undefined `directly_affected_by_df`, which raised NameError.
directly_referencing_df["directly_referencing_sum"] = directly_referencing_df["directly_referencing"].apply(count_directly_affected_by)
directly_referencing_df.sort_values(by="directly_referencing_sum", ascending=False, inplace=True)
directly_referencing_df.head()

In [None]:
# Top panel: per-category counts split by the is_directly_referencing flag.
# Bottom panel: share of each category in the whole frame.
fig, (ax1, ax2) = plt.subplots(2,1, figsize=(12,20))
# Use the frame built in the previous cell; `directly_affected_by_df` was never defined.
normalized_serie = directly_referencing_df["category"].value_counts(normalize=True)
plt.rcParams.update({'font.size': 20})
sns.countplot(y="category", hue="is_directly_referencing", data=directly_referencing_df, ax=ax1).set_title("Which certificates are referenced at least by one certificate vs. which are affected by no certificates")
sns.barplot(y=normalized_serie.index, x=normalized_serie.values, ax=ax2).set_title("Normalized values for each category")
plt.show()

### Which certificates are referencing no other?

In [None]:
# Rows where both the direct and the indirect "affecting" cells are missing.
lacks_direct_affecting = df["directly_affecting"].isna()
lacks_indirect_affecting = df["indirectly_affecting"].isna()
no_affecting_df = df[lacks_direct_affecting & lacks_indirect_affecting]

print(f"There are total {len(no_affecting_df)} certificates referencing no other certificates.")

In [None]:
no_affecting_df.head()

### How many non-affecting certificates are affected by other certificates?

In [None]:
# Among the non-affecting rows, keep those where both "affected_by" cells are filled in.
has_direct_affected_by = no_affecting_df["directly_affected_by"].notna()
has_indirect_affected_by = no_affecting_df["indirectly_affected_by"].notna()
affected_but_no_affecting_df = no_affecting_df[has_direct_affected_by & has_indirect_affected_by]
print(f"There are total of {len(affected_but_no_affecting_df)} certificates which are affected by other certificates and affecting no certificates.")

### How many certificates are not affected by other certificates, nor affecting other certificates?

In [None]:
def is_no_affecting_nor_affected(directly_affecting, indirectly_affecting, directly_affected_by, indirectly_affected_by):
    """Return True when all four dependency cells are the ``np.nan`` object.

    Each argument is compared to ``np.nan`` by identity; a single non-NaN
    argument makes the result False.
    """
    dependency_cells = (directly_affecting, indirectly_affecting, directly_affected_by, indirectly_affected_by)
    return all(cell is np.nan for cell in dependency_cells)


# Columns handed to is_no_affecting_nor_affected, in its parameter order.
dependency_columns = ["directly_affecting", "indirectly_affecting", "directly_affected_by", "indirectly_affected_by"]

no_affecting_no_affected_df = df.copy()
# Row-wise flag: True when none of the four dependency cells holds a value.
no_affecting_no_affected_df["is_no_affecting_nor_affected"] = df.apply(
    lambda row: is_no_affecting_nor_affected(*(row[column] for column in dependency_columns)),
    axis=1,
)
no_affecting_no_affected_df.head()

In [None]:
# Two stacked panels: per-scheme counts split by the isolation flag (top)
# and each scheme's share of the whole frame (bottom).
fig, (counts_ax, shares_ax) = plt.subplots(nrows=2, ncols=1, figsize=(12, 20))
normalized_serie = no_affecting_no_affected_df["scheme"].value_counts(normalize=True)
plt.rcParams.update({"font.size": 20})

sns.countplot(y="scheme", hue="is_no_affecting_nor_affected", data=no_affecting_no_affected_df, ax=counts_ax)
counts_ax.set_title("Distribution of schemes which certs from categories are not affecting, nor affected by other certs")

sns.barplot(y=normalized_serie.index, x=normalized_serie.values, ax=shares_ax)
shares_ax.set_title("Normalized values for each scheme")
plt.show()

### Which certificates are dependent on the archived certificates?

In [None]:
archived_cert_id_list = df[df["cert_id"].notna() & (df["status"] == "archived")]["cert_id"].tolist()
# Set view of the list above, built once: membership tests become O(1)
# instead of scanning the whole list for every referenced ID.
archived_cert_ids = set(archived_cert_id_list)


def contains_archived_cert_dependency(affected_by):
    """Return True when at least one ID in ``affected_by`` is an archived certificate.

    ``affected_by`` is an iterable of certificate IDs, or ``np.nan`` (checked
    by identity) for a missing cell, in which case the result is False.
    """
    if affected_by is np.nan:
        return False

    return any(cert_id in archived_cert_ids for cert_id in affected_by)


depends_on_archived_df = df.copy()
depends_on_archived_df["depends_on_archived"] = depends_on_archived_df["directly_affected_by"].apply(contains_archived_cert_dependency)
total_records_dependent = sum(depends_on_archived_df["depends_on_archived"])
print(f"Total {total_records_dependent} certificates are affected by at least one archived certicates.")

In [None]:
# Top panel: per-category counts split by the depends_on_archived flag.
# Bottom panel: share of each category in the whole frame.
fig, (counts_ax, shares_ax) = plt.subplots(nrows=2, ncols=1, figsize=(12, 20))
normalized_serie = depends_on_archived_df["category"].value_counts(normalize=True)
plt.rcParams.update({"font.size": 20})

sns.countplot(y="category", hue="depends_on_archived", data=depends_on_archived_df, ax=counts_ax)
counts_ax.set_title("Distribution of categories among certificates dependent on archived certs.")

sns.barplot(y=normalized_serie.index, x=normalized_serie.values, ax=shares_ax)
shares_ax.set_title("Normalized values for each category")
plt.show()

In [None]:
# Top panel: per-scheme counts split by the depends_on_archived flag.
# Bottom panel: share of each scheme in the whole frame.
fig, (counts_ax, shares_ax) = plt.subplots(nrows=2, ncols=1, figsize=(12, 20))
normalized_serie = depends_on_archived_df["scheme"].value_counts(normalize=True)
plt.rcParams.update({"font.size": 20})

sns.countplot(y="scheme", hue="depends_on_archived", data=depends_on_archived_df, ax=counts_ax)
counts_ax.set_title("Distribution of schemes among certificates dependent on archived certs.")

sns.barplot(y=normalized_serie.index, x=normalized_serie.values, ax=shares_ax)
shares_ax.set_title("Normalized values for each scheme")
plt.show()

### How frequently are BSI certificates referencing ANSSI certs and vice versa?

In [None]:
from typing import Set


def is_bsi_cert(cert_id: str) -> bool:
    """Return True when ``cert_id`` starts with "bsi", ignoring case.

    A missing ID (the ``np.nan`` object, checked by identity) yields False.
    """
    return cert_id is not np.nan and cert_id.lower().startswith("bsi")


def is_anssi_cert(cert_id: str) -> bool:
    """Return True when ``cert_id`` starts with "anssi", ignoring case.

    A missing ID (the ``np.nan`` object, checked by identity) yields False.
    """
    return cert_id is not np.nan and cert_id.lower().startswith("anssi")


def is_affecting_anssi(directly_affecting: Set[str]) -> bool:
    """Return True when any ID in ``directly_affecting`` is an ANSSI certificate ID.

    A missing cell (the ``np.nan`` object, checked by identity) yields False.
    """
    if directly_affecting is np.nan:
        return False

    return any(is_anssi_cert(affected_id) for affected_id in directly_affecting)


def is_affecting_bsi(directly_affecting: Set[str]) -> bool:
    """Return True when any ID in ``directly_affecting`` is a BSI certificate ID.

    A missing cell (the ``np.nan`` object, checked by identity) yields False.
    """
    if directly_affecting is np.nan:
        return False

    return any(is_bsi_cert(affected_id) for affected_id in directly_affecting)
    
    
# Tag every row of the shared frame by issuing body, based on the cert_id prefix.
# These two columns are added to `df` itself and are reused by later cells.
df["is_bsi_cert"] = df["cert_id"].apply(is_bsi_cert)
df["is_anssi_cert"] = df["cert_id"].apply(is_anssi_cert)

# Independent copies so the extra flag columns below stay out of `df`.
bsi_df = df[df["is_bsi_cert"]].copy()
anssi_df = df[df["is_anssi_cert"]].copy()

# BSI rows that directly affect at least one ANSSI certificate, and vice versa.
bsi_df["is_affecting_anssi"] = bsi_df["directly_affecting"].apply(is_affecting_anssi)
bsi_affecting_anssi_df = bsi_df[bsi_df["is_affecting_anssi"]]

anssi_df["is_affecting_bsi"] = anssi_df["directly_affecting"].apply(is_affecting_bsi)
anssi_affecting_bsi_df = anssi_df[anssi_df["is_affecting_bsi"]]

bsi_total_records = len(bsi_df)
anssi_total_records = len(anssi_df)
bsi_affecting_records = len(bsi_affecting_anssi_df)
anssi_affecting_records = len(anssi_affecting_bsi_df)

print(f"There are {bsi_affecting_records} BSI certs affecting ANSSI certs out of total {bsi_total_records} BSI certs.")
print(f"There are {anssi_affecting_records} ANSSI certs affecting BSI certs out of total {anssi_total_records} ANSSI certs.")

# Fraction of each body's certificates that reach into the other body.
print(f"Success hit for BSI certs: {bsi_affecting_records / bsi_total_records}")
print(f"Success hit for ANSSI certs: {anssi_affecting_records / anssi_total_records}")

In [None]:
bsi_affecting_anssi_df.head()

### Which certificates are referencing each other? (= are cross-referenced)

In [None]:
def is_already_involved(cross_reference_list: List[Set[str]], certs_set: Set[str]) -> bool:
    """Return True when ``certs_set`` equals a set already stored in ``cross_reference_list``.

    Sets compare by content, so the order in which the two IDs were added does not matter.
    """
    return certs_set in cross_reference_list

def is_cert_affecting_other_cert(root_cert_id: str, affected_cert_id: str) -> bool:
    """Check whether ``affected_cert_id`` is listed in the root certificate's "directly_affecting" cell.

    The root certificate is looked up in the module-level ``cross_df``; the
    first row with a matching cert_id is used. Raises IndexError if
    ``root_cert_id`` is not present in ``cross_df``.
    """
    root_rows = cross_df[cross_df["cert_id"] == root_cert_id]
    directly_affected_ids = root_rows.iloc[0]["directly_affecting"]
    return affected_cert_id in directly_affected_ids

# Only rows with both an ID and a non-null "directly_affecting" cell can take part in a pair.
cross_df = df[(df["cert_id"].notna()) & (df["directly_affecting"].notna())]

# cert_id -> its "directly_affecting" collection, built once so that each pair
# check below is a dict lookup instead of a DataFrame filter.
# setdefault keeps the first row of a duplicated cert_id, which is the same row
# the `.iloc[0]` lookup in is_cert_affecting_other_cert would pick.
directly_affecting_by_id = {}
for cert_record in cross_df.itertuples():
    directly_affecting_by_id.setdefault(cert_record.cert_id, cert_record.directly_affecting)

cross_reference_list: List[Set[str]] = []
seen_pairs = set()  # frozensets of pairs already recorded, for O(1) duplicate checks

# Both loops follow row order, so pairs are discovered in the same order as
# with a nested scan over cross_df.
for cert_id, affecting in directly_affecting_by_id.items():
    for another_cert_id in directly_affecting_by_id:
        if another_cert_id == cert_id or another_cert_id not in affecting:
            continue

        # The reference has to go both ways to count as a cross reference.
        if cert_id not in directly_affecting_by_id[another_cert_id]:
            continue

        pair_key = frozenset((cert_id, another_cert_id))
        if pair_key not in seen_pairs:
            seen_pairs.add(pair_key)
            cross_reference_list.append({cert_id, another_cert_id})


print(f"Total of {len(cross_reference_list)} crossed referenced certificates.")
print(cross_reference_list)

### What are the EAL levels typically affecting a certificate? E.g. are certificates referencing EAL5 typically higher or same level?

In [None]:
df.head()

In [None]:
# Introduce security level EAL variable
# Levels ordered from lowest to highest. 'EAL6' has to be listed explicitly:
# pd.Categorical turns every value that is not among the categories into NaN.
eals = ['EAL1', 'EAL1+', 'EAL2', 'EAL2+', 'EAL3', 'EAL3+', 'EAL4', 'EAL4+', 'EAL5', 'EAL5+', 'EAL6', 'EAL6+', 'EAL7', 'EAL7+']
_eal_rank = {level: rank for rank, level in enumerate(eals)}


def _highest_eal(all_levels):
    """Return the highest-ranked entry of ``all_levels`` that starts with 'EAL'.

    Returns NaN when ``all_levels`` is not a set/list/tuple (e.g. a missing
    cell) or contains no 'EAL…' entry. Entries that start with 'EAL' but are
    not in ``eals`` rank below every known level.
    """
    if not isinstance(all_levels, (set, frozenset, list, tuple)):
        return np.nan

    eal_levels = [level for level in all_levels if level.startswith('EAL')]
    if not eal_levels:
        return np.nan

    # Pick by rank rather than by position: the order of the entries in the
    # source collection carries no meaning.
    return max(eal_levels, key=lambda level: _eal_rank.get(level, -1))


df['highest_security_level'] = df.security_level.map(_highest_eal)
df.highest_security_level = pd.Categorical(df.highest_security_level, categories=eals, ordered=True)

In [None]:
# Keep rows that have a recognised EAL, a non-null "directly_affecting" cell and an ID.
has_security_level = df["highest_security_level"].notna()
has_directly_affecting = df["directly_affecting"].notna()
has_cert_id = df["cert_id"].notna()

levels_df = df[has_security_level & has_directly_affecting & has_cert_id].copy()
levels_df.head()

In [None]:
from typing import Dict

def get_cert_id_security_level(cert_id: str) -> str:
    """Look up the "highest_security_level" of ``cert_id`` in the module-level ``df``.

    Returns None when no row carries that cert_id; otherwise the value stored
    in the first matching row (which may itself be NaN).
    """
    matching_rows = df[df["cert_id"] == cert_id]

    # An empty match means the main dataset has no record for this cert_id.
    return None if matching_rows.empty else matching_rows.iloc[0]["highest_security_level"]


def get_levels_of_affected_certs(affected_certs: Set[str]) -> Dict[str, int]:
    """Count the security levels of the certificates listed in ``affected_certs``.

    Returns a dict mapping security level -> number of IDs in ``affected_certs``
    that have this level. IDs without a usable level are left out.
    """
    result = {}

    for affected_cert_id in affected_certs:
        security_level = get_cert_id_security_level(affected_cert_id)

        # None: the ID has no row in the dataset. NaN: the row exists but has no
        # recognised EAL. Neither satisfies the levels_df condition, and a NaN
        # key would only pollute the counts.
        if security_level is None or pd.isna(security_level):
            continue

        result[security_level] = result.get(security_level, 0) + 1

    return result
        

# For every row, count the security levels of the certificates in its
# "directly_affecting" cell (one dict of level -> count per row).
levels_df["affecting_security_levels"] = levels_df["directly_affecting"].apply(get_levels_of_affected_certs)
levels_df.head(20)

In [None]:
# security level -> Counter of the security levels its certificates directly affect
result = {}

for security_level in eals:
    counter = collections.Counter()
    security_level_df = levels_df[levels_df["highest_security_level"] == security_level]["affecting_security_levels"]

    # Each cell is a dict of level -> count; Counter.update adds the counts up.
    for security_dict in security_level_df:
        counter.update(security_dict)

    print(f"Certs with security level {security_level} are directly affecting other certificates with levels: {dict(counter)}")

    result[security_level] = counter

In [None]:
plt.figure(figsize=(10,5))
plt.rcParams.update({'font.size': 20})

# One row per referencing level (in `result` order), one column per entry of
# `eals`; levels that never occur in a counter are shown as 0.
heatmap_result = [
    [counter.get(security_level_key, 0) for security_level_key in eals]
    for counter in result.values()
]

sns.set(style="whitegrid")
# Keep the Axes returned by heatmap in `ax`; chaining set_title would bind a Text object instead.
ax = sns.heatmap(heatmap_result, xticklabels=eals, yticklabels=eals, cmap="Greens")
ax.set_title("Security levels directly affected by certificates of each security level")
plt.ylabel("Specific security level")
plt.xlabel("Security levels affected by specific security level")

### Basic Analysis of most common category

In [None]:
# Restrict the analysis to the smart-card category.
is_smart_card_row = df["category"] == "ICs, Smart Cards and Smart Card-Related Devices and Systems"
cards_df = df[is_smart_card_row]
print(f"There are total {len(cards_df)} rows ICs, Smart Cards and Smart Card-Related Devices and Systems category.")

#### How many certificates are active/archived

In [None]:
# Summing a boolean mask counts the rows for which it is True.
archived_mask = cards_df["status"] == "archived"
active_mask = cards_df["status"] == "active"
total_archived_certs = archived_mask.sum()
total_active_certs = active_mask.sum()

print(f"There are total {total_archived_certs} archived records among smart-cards")
print(f"There are total {total_active_certs} active records among smart-cards")

#### Which manufacturer is the most common in this category?

In [None]:
# value_counts sorts by frequency, so the first index label is the most frequent manufacturer.
manufacturer_counts = cards_df["manufacturer"].value_counts()
most_common_smart_card_manufacturer = manufacturer_counts.index[0]
print(f"The most common manufacturer in smart cards category is: {most_common_smart_card_manufacturer}")

#### Analysis of security levels of smart-cards

In [None]:
# The most common security level among smart-cards
# value_counts sorts by frequency, so position 0 holds the most common level.
sec_level_counts = cards_df["highest_security_level"].value_counts()
most_common_sec_level = sec_level_counts.index[0]
# .iloc makes the positional access explicit; plain `[0]` on a non-integer
# index relies on a deprecated positional fallback.
sec_level_amount = sec_level_counts.iloc[0]

print(f"Most common security level among smart-cards is {most_common_sec_level} with {sec_level_amount} occurences.")

In [None]:
# The lowest common security level achieved in dataset
security_level_occurences = cards_df["highest_security_level"].value_counts()
# Levels that actually occur; value_counts may also list levels with a zero count.
filtered_sec_levels = [level for level, n_certs in security_level_occurences.items() if n_certs > 0]
# Position of each level in `eals` (lower position = lower level).
level_numbers = {level: position for position, level in enumerate(eals)}

lowest_smart_card_security_level = min(filtered_sec_levels, key=level_numbers.__getitem__, default=None)
lowest_security_level_int = (
    None if lowest_smart_card_security_level is None else level_numbers[lowest_smart_card_security_level]
)

print(f"Lowest security level among smart cards in dataset: {lowest_smart_card_security_level}")

In [None]:
# The highest common security level in smart-card dataset
# Relies on filtered_sec_levels and level_numbers from the previous cell.
highest_smart_card_security_level = max(filtered_sec_levels, key=level_numbers.__getitem__, default=None)
highest_security_level_int = (
    None if highest_smart_card_security_level is None else level_numbers[highest_smart_card_security_level]
)

print(f"Highest security level among smart cards in dataset: {highest_smart_card_security_level}")

#### View data with lowest security level (EAL1+)

In [None]:
# Smart-card rows whose level is exactly "EAL1+" (the level is hard-coded here,
# not taken from lowest_smart_card_security_level).
eal1_plus_df = cards_df[cards_df["highest_security_level"] == "EAL1+"]
eal1_plus_df.head()

In [None]:
eal1_plus_df["scheme"].value_counts()

#### View data with highest security level (EAL7)

In [None]:
# Smart-card rows at the highest level found above; the frame is named after
# EAL7 but filters on whatever highest_smart_card_security_level holds.
eal7_df = cards_df[cards_df["highest_security_level"] == highest_smart_card_security_level]
eal7_df.head()

In [None]:
eal7_df["scheme"].value_counts()

In [None]:
eal7_df[eal7_df["status"] == "active"]

#### BSI certs in smart cards dataset

In [None]:
# Smart-card rows flagged as BSI certificates by the is_bsi_cert column.
bsi_mask = cards_df["is_bsi_cert"]
bsi_smart_cards_df = cards_df[bsi_mask]

print(f"There is total of {len(bsi_smart_cards_df)} BSI records among smart cards")

#### Most common security levels among BSI smart card records

In [None]:
bsi_smart_cards_df["highest_security_level"].value_counts()

#### ANSSI certs in smart cards dataset


In [None]:
# Smart-card rows flagged as ANSSI certificates by the is_anssi_cert column.
anssi_mask = cards_df["is_anssi_cert"]
anssi_smart_cards_df = cards_df[anssi_mask]

print(f"There is total of {len(anssi_smart_cards_df)} records ANSSI among smart cards")

#### Most common security levels among ANSSI smart card records 

In [None]:
anssi_smart_cards_df["highest_security_level"].value_counts()

#### Smart cards which expire next year


In [None]:
# Calendar year following the moment this cell is run.
next_year = datetime.datetime.now().year + 1


def is_expiring_next_year(series_datetime):
    """Return True when the ``year`` attribute of ``series_datetime`` equals ``next_year``."""
    expiry_year = series_datetime.year
    return expiry_year == next_year


expires_next_year_mask = cards_df["not_valid_after"].apply(is_expiring_next_year)
cards_next_year_expires_df = cards_df[expires_next_year_mask]
cards_next_year_expires_df.head()

### Which schemes are directly affecting certs with other schemes

In [None]:
df["scheme"].value_counts()

In [None]:
# Rows with a non-null "directly_affecting" cell.
has_directly_affecting = df["directly_affecting"].notna()
scheme_df = df[has_directly_affecting]
print(f"Total of {len(scheme_df)} certs are directly affecting other certs.")

In [None]:
def get_scheme_from_cert_id(cert_id: str) -> str:
    """Return the scheme of the first row in the module-level ``df`` whose cert_id equals ``cert_id``.

    Returns None when no row carries that cert_id.
    """
    scheme_list = df[df["cert_id"] == cert_id]["scheme"].tolist()

    if not scheme_list:
        return None

    # Reuse the lookup above instead of filtering the whole frame a second time.
    return scheme_list[0]

In [None]:
CC_SCHEMES = ["US", "FR", "DE", "JP", "CA", "NL", "ES", "KR", "UK", "AU", "NO", "SE", "MY", "TR", "IT", "IN", "SG"]
# scheme -> Counter of the schemes of the certificates it directly affects.
# IDs that are not found in `df` are counted under the key None.
result = {}

for scheme in CC_SCHEMES:
    scheme_affecting_series = scheme_df[scheme_df["scheme"] == scheme]["directly_affecting"]
    counter = collections.Counter()

    for affecting_set in scheme_affecting_series:
        # Counter.update on an iterable adds one per element.
        counter.update(get_scheme_from_cert_id(cert_id) for cert_id in affecting_set)

    result[scheme] = counter

print(result)

In [None]:
plt.figure(figsize=(10,5))
plt.rcParams.update({'font.size': 20})
# One row per referencing scheme (in `result` order), one column per entry of
# CC_SCHEMES; schemes missing from a counter are shown as 0.
heatmap_result = []


for scheme, counter in result.items():
    print(scheme, counter)
    heatmap_result.append([counter.get(scheme_key, 0) for scheme_key in CC_SCHEMES])

print(heatmap_result)
sns.set(style="whitegrid")
# Keep the Axes returned by heatmap in `ax`; chaining set_title would bind a Text object instead.
ax = sns.heatmap(heatmap_result, xticklabels=CC_SCHEMES, yticklabels=CC_SCHEMES, cmap="Greens")
ax.set_title("Schemes directly affected by certificates of each scheme")
plt.ylabel("Specific cert scheme")
plt.xlabel("Schemes affected by specific scheme")

#### Dependencies among schemes


In [None]:
def return_unique_years_in_dataset():
    """Collect the distinct ``year`` values of "not_valid_before" in the module-level ``scheme_df``."""
    return {timestamp_record.year for timestamp_record in scheme_df["not_valid_before"]}

In [None]:
CC_SCHEMES = ["US", "FR", "DE", "JP", "CA", "NL", "ES", "KR", "UK", "AU", "NO", "SE", "MY", "TR", "IT", "IN", "SG"]

def discover_scheme_dependiencies_in_dataset(dataset):
    """Count, for every scheme in ``CC_SCHEMES``, the schemes of the certificates it directly affects.

    ``dataset`` must provide "scheme" and "directly_affecting" columns, and
    every "directly_affecting" cell must be iterable. Returns a dict mapping
    scheme -> Counter of affected schemes; IDs unknown to
    get_scheme_from_cert_id are counted under the key None.
    """
    scheme_counters = {}

    for scheme in CC_SCHEMES:
        affecting_cells = dataset[dataset["scheme"] == scheme]["directly_affecting"]
        affected_schemes = collections.Counter()

        for affecting_set in affecting_cells:
            # Counter.update on an iterable adds one per element.
            affected_schemes.update(get_scheme_from_cert_id(cert_id) for cert_id in affecting_set)

        scheme_counters[scheme] = affected_schemes

    return scheme_counters

discover_scheme_dependiencies_in_dataset(scheme_df)

In [None]:
CC_SCHEMES = ["US", "FR", "DE", "JP", "CA", "NL", "ES", "KR", "UK", "AU", "NO", "SE", "MY", "TR", "IT", "IN", "SG"]

def discover_scheme_dependiencies_in_dataset(year: int):
    """Count scheme-to-scheme dependencies for certificates whose "not_valid_before" falls in ``year``.

    Reads the module-level ``scheme_df``. Returns a dict mapping each scheme in
    ``CC_SCHEMES`` -> Counter of the schemes of the certificates it directly
    affects; IDs unknown to get_scheme_from_cert_id are counted under None.

    NOTE: this definition reuses the name of the dataset-based function defined
    earlier in the file and replaces it once this cell has run.
    """
    scheme_counters = {}

    for scheme in CC_SCHEMES:
        affected_schemes = collections.Counter()
        rows_of_scheme = scheme_df[scheme_df["scheme"] == scheme]

        for _, row in rows_of_scheme.iterrows():
            if row["not_valid_before"].year == year:
                # Counter.update on an iterable adds one per element.
                affected_schemes.update(get_scheme_from_cert_id(cert_id) for cert_id in row["directly_affecting"])

        scheme_counters[scheme] = affected_schemes

    return scheme_counters

In [None]:
UNIQUE_YEARS = return_unique_years_in_dataset()
# year -> {scheme -> Counter of affected schemes} for certificates valid from that year
year_result = {}

for year in UNIQUE_YEARS:
    # The year filter is applied inside discover_scheme_dependiencies_in_dataset,
    # so no per-year slice of scheme_df is needed here.
    year_result[year] = discover_scheme_dependiencies_in_dataset(year)

print(year_result)