# References

This notebook contains analysis of certificate references in Common Criteria certificates.

The notebook has two parts, an analysis part and a network visualization part.
But first some common initialization and data loading.

In [1]:
import networkx as nx
import networkx.algorithms.community as nx_comm
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from sec_certs.dataset.common_criteria import CCDataset
import pandas as pd
import seaborn as sns
import numpy as np
from pysankey import sankey

%matplotlib inline

matplotlib.use("pgf")
sns.set_theme(style='white')
plt.rcParams["axes.linewidth"] = 0.5
plt.rcParams["legend.fontsize"] = 6.5
plt.rcParams["xtick.labelsize"] = 8
plt.rcParams["ytick.labelsize"] = 8
plt.rcParams["ytick.left"] = True
plt.rcParams['ytick.major.size'] = 5
plt.rcParams['ytick.major.width'] = 0.5
plt.rcParams['ytick.major.pad'] = 0
plt.rcParams["xtick.bottom"] = True
plt.rcParams['xtick.major.size'] = 5
plt.rcParams['xtick.major.width'] = 0.5
plt.rcParams['xtick.major.pad'] = 0
plt.rcParams["pgf.texsystem"] = "pdflatex"
plt.rcParams["font.family"] = "serif"
plt.rcParams["text.usetex"] = True
plt.rcParams["pgf.rcfonts"] = False
plt.rcParams["axes.titlesize"] = 8
plt.rcParams["legend.handletextpad"] = 0.3
plt.rcParams['lines.markersize'] = 4
plt.rcParams['savefig.pad_inches'] = 0.01
sns.set_palette("deep")

#plt.style.use("seaborn-whitegrid")
#sns.set_palette("deep")
#sns.set_context("notebook")  # Set to "paper" for use in paper :)

#plt.rcParams['figure.figsize'] = (10, 6)

In [2]:
# Initialize
dset = CCDataset.from_json("../cc_09_10_2022/cc_new.json")

In [3]:
df = dset.to_pandas()
df_id_rich = df.loc[df.cert_id.notnull()].copy()

## Reference analysis


### Count numbers of reference-rich certificates

- From the numbers follows that whenever a certificate is directly referencing some else, it also indirectly references some else
- We have more outgoing references than ingoing references, which kinda makes sense. You don't have to be aware that some other cert references you

In [4]:
df["has_outgoing_direct_references"] = df.directly_referencing.notnull()
df["has_incoming_direct_references"] = df.directly_referenced_by.notnull()
df["has_outgoing_indirect_references"] = df.indirectly_referencing.notnull()
df["has_incoming_indirect_references"] = df.indirectly_referenced_by.notnull()

#df.loc[:, ["directly_referenced_by", "indirectly_referenced_by", "directly_referencing", "indirectly_referencing"]].notnull().describe()

print(f"\\newcommand{{\\numCcAllDirectReferencing}}{{{df.has_outgoing_direct_references.sum()}}}")
print(f"\\newcommand{{\\numCcAllNotDirectReferencing}}{{{len(df) - df.has_outgoing_direct_references.sum()}}}")

df_id_rich["has_outgoing_direct_references"] = df_id_rich.directly_referencing.notnull()
df_id_rich["has_incoming_direct_references"] = df_id_rich.directly_referenced_by.notnull()
df_id_rich["has_outgoing_indirect_references"] = df_id_rich.indirectly_referencing.notnull()
df_id_rich["has_incoming_indirect_references"] = df_id_rich.indirectly_referenced_by.notnull()

print(f"\\newcommand{{\\numCcWithIdDirectReferencing}}{{{df_id_rich.has_outgoing_direct_references.sum()}}}")
print(f"\\newcommand{{\\numCcWithIdNotDirectReferencing}}{{{len(df_id_rich) - df_id_rich.has_outgoing_direct_references.sum()}}}")

#df_id_rich.loc[:, ["directly_referenced_by", "indirectly_referenced_by", "directly_referencing", "indirectly_referencing"]].notnull().describe()

\newcommand{\numCcAllDirectReferencing}{1497}
\newcommand{\numCcAllNotDirectReferencing}{3632}
\newcommand{\numCcWithIdDirectReferencing}{1497}
\newcommand{\numCcWithIdNotDirectReferencing}{3556}


In [5]:
print(f"\\newcommand{{\\numCCActiveDirectReferencing}}{{{df_id_rich.loc[df_id_rich.status == 'active'].has_outgoing_direct_references.sum()}}}")

archived_cert_id_list = set(df_id_rich[df_id_rich.status == "archived"].cert_id)
def contains_archived_cert_dependency(referencing):
    if referencing is np.nan:
        return False
    
    return bool(archived_cert_id_list.intersection(referencing))
print(f"\\newcommand{{\\numCCActiveDirectReferencingArchived}}{{{df_id_rich[df_id_rich.status == 'active'].directly_referencing.apply(contains_archived_cert_dependency).sum()}}}")

\newcommand{\numCCActiveDirectReferencing}{545}
\newcommand{\numCCActiveDirectReferencingArchived}{165}


### Plot direct references per category

In [6]:
figure, axes = plt.subplots(1, 2)
figure.set_size_inches(16, 10)
figure.set_tight_layout(True)

col_to_depict = ["has_outgoing_direct_references", "has_incoming_direct_references"]

for index, col in enumerate(col_to_depict):
    countplot = sns.countplot(data=df, x="category", hue=col, ax=axes[index])
    countplot.set(
        xlabel="Category",
        ylabel="Outgoing direct references",
        title=f"Countplot of {' '.join(col.split('_'))}",
    )
    countplot.tick_params(axis="x", rotation=90)
    countplot.legend(title=' '.join(col.split('_')), bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)


In [7]:
cert_id_to_category_mapping = dict(zip(df.cert_id, df.category))
cert_id_to_category_mapping[np.NaN] = "No references"

exploded = df_id_rich.loc[:, ["category", "directly_referencing"]].explode("directly_referencing")

exploded["ref_category"] = exploded.directly_referencing.map(cert_id_to_category_mapping)
exploded = exploded.loc[exploded.ref_category.notnull()]

exploded_with_refs = exploded.loc[exploded.ref_category != "No references"]
print(f"\\newcommand{{\\numCCDirectRefsSameCategory}}{{{(exploded_with_refs.category == exploded_with_refs.ref_category).sum()}}}")
print(f"\\newcommand{{\\numCCDirectRefsOtherCategory}}{{{(exploded_with_refs.category != exploded_with_refs.ref_category).sum()}}}")
print(f"\\newcommand{{\\numCCDirectRefs}}{{{len(exploded_with_refs)}}}")
print(f"\\newcommand{{\\numCCDirectRefsFromSmartcards}}{{{(exploded_with_refs.category == 'ICs, Smart Cards and Smart Card-Related Devices and Systems').sum()}}}")

all_categories = set(exploded.category.unique()) | set(exploded.ref_category.unique())
colors = list(sns.color_palette("hls", len(all_categories), as_cmap=False).as_hex())
color_dict = dict(zip(all_categories, colors))

figure, axes = plt.subplots(1, 1)
figure.set_size_inches(24, 10)
figure.set_tight_layout(True)

sankey(exploded.category, exploded.ref_category, colorDict=color_dict, leftLabels=list(exploded.category.unique()), rightLabels=list(exploded.ref_category.unique()), fontsize=12, ax=axes)

figure.savefig("category_references.pdf", bbox_inches="tight")
figure.savefig("category_references.pgf", bbox_inches="tight")
plt.close(figure)

\newcommand{\numCCDirectRefsSameCategory}{2123}
\newcommand{\numCCDirectRefsOtherCategory}{192}
\newcommand{\numCCDirectRefs}{2315}
\newcommand{\numCCDirectRefsFromSmartcards}{1886}


### Plot direct references per scheme

In [8]:
figure, axes = plt.subplots(1, 2)
figure.set_size_inches(14, 4)
figure.set_tight_layout(True)

col_to_depict = ["has_outgoing_direct_references", "has_incoming_direct_references"]

for index, col in enumerate(col_to_depict):
    countplot = sns.countplot(data=df, x="scheme", hue=col, ax=axes[index])
    countplot.set(
        xlabel="Category",
        ylabel="Outgoing direct references",
        title=f"Countplot of {' '.join(col.split('_'))}",
    )
    countplot.tick_params(axis="x", rotation=90)
    countplot.legend(title=' '.join(col.split('_')), bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

### Number of certificates referencing archived certificates

In [9]:
def references_archived_cert(references):
    if pd.isnull(references):
        return False

    return any([x in cert_ids] for x in references)

cert_ids = set(df.loc[((df.cert_id.notnull()) & (df.status == "archived")), "cert_id"].tolist())
df["references_archived_cert"] = df.directly_referenced_by.map(references_archived_cert)

print(f"Number of certificates that reference some archived certificate: {df.loc[df.references_archived_cert].shape[0]}")

col_to_depict = ["category", "scheme"]

figure, axes = plt.subplots(1, 2)
figure.set_size_inches(14, 8)
figure.set_tight_layout(True)

for index, col in enumerate(col_to_depict):
    countplot = sns.countplot(data=df, x=col, hue="references_archived_cert", ax=axes[index])
    countplot.set(
        xlabel=col,
        ylabel="Outgoing direct references",
        title="Countplot of certificates that reference some archived certificate",
    )
    countplot.tick_params(axis="x", rotation=90)
    countplot.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

Number of certificates that reference some archived certificate: 929


### Count scheme references

In [10]:
cert_id_to_scheme_mapping = dict(zip(df.cert_id, df.scheme))

df_ref_rich = df_id_rich.loc[df.directly_referencing.notnull()]
exploded = df_ref_rich.loc[:, ["scheme", "directly_referencing"]].explode("directly_referencing")

exploded["ref_scheme"] = exploded.directly_referencing.map(cert_id_to_scheme_mapping)
exploded = exploded.loc[exploded.ref_scheme.notnull()]

all_schemes = set(exploded.scheme.unique()) | set(exploded.ref_scheme.unique())
colors = list(sns.color_palette("hls", len(all_schemes), as_cmap=False).as_hex())
color_dict = dict(zip(all_schemes, colors))

figure, axes = plt.subplots(1, 1)
figure.set_size_inches(4, 5)
figure.set_tight_layout(True)

sankey(exploded.scheme, exploded.ref_scheme, colorDict=color_dict, leftLabels=list(exploded.scheme.unique()), rightLabels=list(exploded.ref_scheme.unique()), fontsize=7, ax=axes)

figure.savefig("scheme_references.pdf", bbox_inches="tight")
figure.savefig("scheme_references.pgf", bbox_inches="tight")
plt.close(figure)

In [11]:
print(f"\\newcommand{{\\numCCUSReferencing}}{{{len(df_id_rich.loc[(df_id_rich.scheme == 'US') & (df_id_rich.directly_referencing.notnull())])}}}")
print(f"\\newcommand{{\\numCCUS}}{{{len(df_id_rich.loc[(df_id_rich.scheme == 'US')])}}}")

\newcommand{\numCCUSReferencing}{4}
\newcommand{\numCCUS}{959}


### Temporal evolution of references

Shows plot with relative number of certificates for a given year that reference some other certificate

In [12]:
df_temporal = df.loc[df.year_from < 2022].groupby(["year_from"])["directly_referencing"].count().reset_index().set_index("year_from")
n_issued_certs = df.groupby("year_from").name.count().reset_index().rename(columns={"name": "n_certs"}).set_index("year_from")
df_temporal.directly_referencing = 100 * df_temporal.directly_referencing / n_issued_certs.n_certs

line = sns.lineplot(data=df_temporal, x="year_from", y="directly_referencing")
line.yaxis.set_major_formatter(mtick.PercentFormatter())

### Cross references

In [13]:
# Plotting w.r.t. scheme and category (both are interesting)

## Reference network visualization

In [14]:
certs_with_ids = {cert.heuristics.cert_id: cert for cert in dset if cert.heuristics.cert_id}

print(f"Certificates in dataset: {len(dset)}")
print(f"Certificates with extracted IDs: {len(certs_with_ids)}")

Certificates in dataset: 5129
Certificates with extracted IDs: 4979


### Certificate report references

In [None]:
refs_cr = nx.DiGraph()
for cert_id, cert in certs_with_ids.items():
    refs_cr.add_node(cert_id, cert=cert)
for cert_id, cert in certs_with_ids.items():
    if cr_refs := cert.heuristics.report_references.directly_referencing:
        for ref_id in cr_refs:
            if ref_id in certs_with_ids:
                refs_cr.add_edge(cert_id, ref_id, type=("cr",))
print(f"References in certificate reports: {len(refs_cr.edges)}")

### Security target references

In [None]:
refs_st = nx.DiGraph()
for cert_id, cert in certs_with_ids.items():
    refs_st.add_node(cert_id, cert=cert)
for cert_id, cert in certs_with_ids.items():
    if st_refs := cert.heuristics.st_references.directly_referencing:
        for ref_id in st_refs:
            if ref_id in certs_with_ids:
                refs_st.add_edge(cert_id, ref_id, type=("st",))
print(f"References in security targets: {len(refs_st.edges)}")

### Combined references

In [None]:
refs = nx.DiGraph()
for cert_id, cert in certs_with_ids.items():
    refs.add_node(cert_id, cert=cert)

for cert_id, cert in certs_with_ids.items():
    cr_refs = cert.heuristics.report_references.directly_referencing
    st_refs = cert.heuristics.st_references.directly_referencing
    cr_refs = set(cr_refs) if cr_refs is not None else set()
    st_refs = set(st_refs) if st_refs is not None else set()
    both = cr_refs.union(st_refs)
    for ref in both:
        if ref not in certs_with_ids:
            continue
        if ref in cr_refs and ref not in st_refs:
            refs.add_edge(cert_id, ref, type=("cr", ))
        elif ref in st_refs and ref not in cr_refs:
            refs.add_edge(cert_id, ref, type=("st", ))
        else:
            refs.add_edge(cert_id, ref, type=("cr", "st"))
print(f"Combined references (not double counted): {len(refs.edges)}")

### Certificate overview
Enter the certificate you are interested in below and see its reference graph component.

In [None]:
cert_id = "ANSSI-CC-2019/02"

In [None]:
cert = certs_with_ids.get(cert_id)
if cert is None:
    print(f"Certificate with id {cert_id} is not present in the dataset.")

for component in nx.weakly_connected_components(refs):
    if cert_id in component:
        break

view = nx.subgraph_view(refs, lambda node: node in component)
print(f"Certificate with id {cert_id}:")
print(f" - is in a component with {len(view.nodes)} certificates and {len(view.edges)} references.")
print(f" - references {list(view[cert_id].keys())}")
print(f" - is referenced by {list(view.predecessors(cert_id))}")
print(f" - its page is at https://seccerts.org/cc/{cert.dgst}/")

In [None]:
nx.draw(view, pos=nx.planar_layout(view), with_labels=True)

## Some graph metrics
From <https://dataground.io/2021/09/29/simple-graph-metrics-networkx-for-beginners/> and
<https://theslaps.medium.com/centrality-metrics-via-networkx-python-e13e60ba2740>.
Also good <https://www.geeksforgeeks.org/network-centrality-measures-in-a-graph-using-networkx-python/>

In [None]:
print(f"Density = {nx.density(refs)}")
print(f"Transitivity = {nx.transitivity(refs)}")

In [None]:
print("Degree centrality <Popularity> (top 20):")
degree_centrality_vals = [(node, val) for node, val in nx.degree_centrality(refs).items()]
degree_centrality_vals.sort(key=lambda pair: pair[1], reverse=True)
for pair in degree_centrality_vals[:20]:
    print(f"\t{pair[0]} = {pair[1]}")

In [None]:
print("Eigenvector centrality <Influence> (top 20):")
eigenvector_centrality_vals = [(node, val) for node, val in nx.eigenvector_centrality(refs).items()]
eigenvector_centrality_vals.sort(key=lambda pair: pair[1], reverse=True)
for pair in eigenvector_centrality_vals[:20]:
    print(f"\t{pair[0]} = {pair[1]}")

In [None]:
print("Closeness centrality <Centralness> (top 20):")
closeness_centrality_vals = [(node, val) for node, val in nx.closeness_centrality(refs).items()]
closeness_centrality_vals.sort(key=lambda pair: pair[1], reverse=True)
for pair in closeness_centrality_vals[:20]:
    print(f"\t{pair[0]} = {pair[1]}")

In [None]:
print("Betweenness centrality <Bridge> (top 20):")
betweenness_centrality_vals = [(node, val) for node, val in nx.betweenness_centrality(refs).items()]
betweenness_centrality_vals.sort(key=lambda pair: pair[1], reverse=True)
for pair in betweenness_centrality_vals[:20]:
    print(f"\t{pair[0]} = {pair[1]}")

In [None]:
component_lengths = list(filter(lambda comp_len: comp_len > 1, map(len, nx.weakly_connected_components(refs))))
component_lengths.sort(reverse=True)
print(component_lengths)

In [None]:
big_boy = refs.subgraph(max(nx.weakly_connected_components(refs), key=len))
communities = list(nx_comm.greedy_modularity_communities(big_boy))
print(len(communities))

In [None]:
for com in communities:
    for i in sorted(com):
        print(f"\t{i}")