# FIPS References

This notebook provides insights into the references of FIPS certificates.

In [1]:
import sys
import pandas as pd
import seaborn as sns
import logging
import networkx as nx
import matplotlib
import matplotlib.pyplot as plt

from sec_certs.dataset.fips import FIPSDataset
from collections import Counter
from pprint import pprint

# Typing imports
from pandas.core.frame import DataFrame, Series

#%matplotlib inline
#matplotlib.use("pgf")

# Global plot styling. The seaborn theme is applied first so that the explicit
# matplotlib overrides below take precedence over whatever the theme sets.
sns.set_theme(style='white')
plt.rcParams.update(
    {
        # Axes and legend
        "axes.linewidth": 0.5,
        "legend.fontsize": 6.5,
        # Tick labels
        "xtick.labelsize": 8,
        "ytick.labelsize": 8,
        # Y ticks
        "ytick.left": True,
        "ytick.major.size": 5,
        "ytick.major.width": 0.5,
        "ytick.major.pad": 0,
        # X ticks
        "xtick.bottom": True,
        "xtick.major.size": 5,
        "xtick.major.width": 0.5,
        "xtick.major.pad": 0,
        # "pgf.texsystem": "pdflatex",
        "font.family": "serif",
        # "text.usetex": True,
        # "pgf.rcfonts": False,
        "axes.titlesize": 10,
        "legend.handletextpad": 0.3,
        "lines.markersize": 4,
        "savefig.pad_inches": 0.01,
    }
)
sns.set_palette("deep")

In [None]:
# Fetch the latest FIPS dataset snapshot; this downloads data, so network access is needed.
dset = FIPSDataset.from_web_latest()

Downloading FIPS Dataset:   4%|█▌                                    | 2.44M/57.2M [00:05<02:50, 338kB/s]

In [None]:
# Convert the dataset into a pandas DataFrame; every later cell works on this frame.
df = dset.to_pandas()

In [None]:
def get_references_count(references: set[str] | float) -> int:
    return 0 if pd.isna(references) else len(references)

In [None]:
# Each reference column (a set of certificate IDs or a missing value) gets a
# companion integer column holding the size of that set.
_REFERENCE_COUNT_COLUMNS = {
    "module_directly_referencing": "outgoing_direct_references_count",
    "module_directly_referenced_by": "incoming_direct_references_count",
    "module_indirectly_referencing": "outgoing_indirect_references_count",
    "module_indirectly_referenced_by": "incoming_indirect_references_count",
}

for _source_column, _count_column in _REFERENCE_COUNT_COLUMNS.items():
    df[_count_column] = df[_source_column].apply(get_references_count)

## Analysis of FIPS references

In [None]:
# Columns needed by the reference analysis; everything else is dropped from `df`.
REFS_COLUMNS: list[str] = [
    # Certificate metadata
    "cert_id",
    "status",
    "standard",
    "type",
    "level",
    "embodiment",
    "year_from",
    "related_cves",
    # Module reference columns
    "module_directly_referenced_by", 
    "module_indirectly_referenced_by",
    "module_directly_referencing",
    "module_indirectly_referencing",
    # Policy reference columns
    "policy_directly_referenced_by",
    "policy_indirectly_referenced_by",
    "policy_directly_referencing",
    "policy_indirectly_referencing",
    # Reference counts derived in the previous cell
    "outgoing_direct_references_count",
    "incoming_direct_references_count",
    "outgoing_indirect_references_count",
    "incoming_indirect_references_count"
]
# NOTE: rebinds `df` to the narrowed frame; the full frame is still available via dset.to_pandas().
df = df[REFS_COLUMNS]

### References EDA

#### Direct module references

In [None]:
# Summary statistics of the direct reference count, restricted to certificates with at least one reference.
df.loc[df["outgoing_direct_references_count"] > 0, "outgoing_direct_references_count"].describe()

#### Indirect module references

In [None]:
# Summary statistics of the indirect reference count, restricted to certificates with at least one reference.
df.loc[df["outgoing_indirect_references_count"] > 0, "outgoing_indirect_references_count"].describe()

#### Direct references per status

In [None]:
# Certificates that directly reference at least one other certificate.
_has_outgoing_refs = df["outgoing_direct_references_count"] > 0
refs_direct_eda_df: DataFrame = df[_has_outgoing_refs]

# Entries with a zero count are dropped so they do not show up in the pie.
_referencing_status_counts = refs_direct_eda_df["status"].value_counts()
_referencing_status_counts[_referencing_status_counts > 0].plot.pie(autopct="%.0f%%")

plt.title("Status distribution among directly referencing certificates")
plt.show()

#### Direct references per type


In [None]:
# Share of each certificate type among directly referencing certificates;
# entries with a zero count are dropped so they do not show up in the pie.
_referencing_type_counts = refs_direct_eda_df["type"].value_counts()
_referencing_type_counts[_referencing_type_counts > 0].plot.pie(autopct="%.0f%%")

plt.title("Certificate type distribution among directly referencing certificates")
plt.show()

#### Direct references per level

In [None]:
# Share of each security level among directly referencing certificates;
# entries with a zero count are dropped so they do not show up in the pie.
_referencing_level_counts = refs_direct_eda_df["level"].value_counts()
_referencing_level_counts[_referencing_level_counts > 0].plot.pie(autopct="%.0f%%")

plt.title("Certificate level distribution among directly referencing certificates")
plt.show()

#### Direct references per embodiment

In [None]:
# Share of each embodiment among directly referencing certificates; entries
# with a zero count are dropped so they do not show up in the pie.
# The plotting call's return value (a matplotlib Axes, not a DataFrame) is
# intentionally not stored: nothing consumes it.
refs_direct_eda_df["embodiment"].value_counts().loc[lambda counts: counts > 0].plot.pie(autopct="%.0f%%")

plt.title("Certificate embodiment distribution among directly referencing certificates")
plt.show()

#### Time analysis of the mean number of references per year

In [None]:
# Year and direct reference count of every certificate with at least one direct reference.
_is_referencing = df["outgoing_direct_references_count"] > 0
referencing_df = df.loc[_is_referencing, ["year_from", "outgoing_direct_references_count"]]

# Only numeric columns take part in the per-year mean.
_numeric_referencing_df = referencing_df.select_dtypes(include=["number"])
refs_grouped_by = _numeric_referencing_df.groupby(["year_from"]).mean()
line = sns.lineplot(data=refs_grouped_by, x="year_from", y="outgoing_direct_references_count")

plt.title("Time analysis of references mean per year")
plt.show()

#### Distribution of direct reference counts

In [None]:
YEAR: int = 2016  # only certificates with year_from >= YEAR are included

_references_something = df["outgoing_direct_references_count"] > 0
_issued_since_year = df["year_from"] >= YEAR
modern_refs = df[_references_something & _issued_since_year]
sns.histplot(modern_refs["outgoing_direct_references_count"], kde=True)

plt.title(f"Histogram of references count since {YEAR}")
plt.show()

#### Distribution of embodiments among certificates with at least one assigned CVE

In [None]:
# Certificates whose "related_cves" value is not missing.
_has_related_cves = df["related_cves"].notna()
cve_df: DataFrame = df[_has_related_cves]

embodiments_counts = cve_df["embodiment"].value_counts()
sns.barplot(x=embodiments_counts.index, y=embodiments_counts.values)

plt.xlabel("Embodiment")
plt.ylabel("Count")
plt.title("Distribution of embodiments of certificates with at least one CVE")
plt.show()

#### Distribution of levels among certificates with at least one assigned CVE

In [None]:
# Number of CVE-affected certificates per security level.
level_counts = cve_df["level"].value_counts()
_level_labels, _level_heights = level_counts.index, level_counts.values
sns.barplot(x=_level_labels, y=_level_heights)

plt.xlabel("Level")
plt.ylabel("Count")
plt.title("Distribution of levels of certificates with at least one CVE")
plt.show()

#### Distribution of types among certificates with at least one assigned CVE

In [None]:
# Number of CVE-affected certificates per certificate type.
type_counts = cve_df["type"].value_counts()
_type_labels, _type_heights = type_counts.index, type_counts.values
sns.barplot(x=_type_labels, y=_type_heights)

plt.xlabel("Type")
plt.ylabel("Count")
plt.title("Distribution of types of certificates with at least one CVE")
plt.xticks(rotation=90)  # rotate the tick labels to vertical
plt.show()

#### Distribution of statuses among certificates with at least one assigned CVE

In [None]:
# Number of CVE-affected certificates per certificate status.
status_counts = cve_df["status"].value_counts()
_status_labels, _status_heights = status_counts.index, status_counts.values
sns.barplot(x=_status_labels, y=_status_heights)

plt.xlabel("Status")
plt.ylabel("Count")
plt.title("Distribution of statuses of certificates with at least one CVE")
plt.show()

### Statistics on how many certificates are referenced and have at least one CVE assigned

In [None]:
# Certificates that have at least one CVE assigned and are directly referenced by someone.
cve_ref_df: DataFrame = df[(df["related_cves"].notna()) & (df["incoming_direct_references_count"] > 0)]

# Sum of incoming reference counts. A certificate that references several
# vulnerable certificates is counted once per reference, so this is a number
# of references, not of distinct certificates.
total_references: int = cve_ref_df["incoming_direct_references_count"].sum()

# Distinct certificates referencing at least one vulnerable certificate. Every
# row has a non-empty set here because incoming_direct_references_count > 0.
referencing_vulnerable_cert_ids: set[str] = set().union(*cve_ref_df["module_directly_referenced_by"])

print(f"Total number of certificates referencing at least one certificate with at least one assigned CVE: {len(referencing_vulnerable_cert_ids)}")
print(f"Total number of direct references to certificates with at least one assigned CVE: {total_references}")

#### The certificate with at least one CVE assigned and maximum of references from other certs

In [None]:
# Largest incoming direct reference count among the vulnerable, referenced certificates.
_vulnerable_incoming_counts = cve_ref_df["incoming_direct_references_count"]
maximum_references: int = _vulnerable_incoming_counts.max()
print(f"Maximum amount of references from the other certificates referencing the vulnerable certificate: {maximum_references}")

In [None]:
# Show the vulnerable certificate(s) that reach the maximum incoming reference count.
cve_ref_df.loc[cve_ref_df["incoming_direct_references_count"] == maximum_references]

### Active certificates which reference at least one historical cert

In [None]:
def get_cert_property(df: DataFrame, cert_id: int, column: str) -> str | None:
    if column not in df.columns:
        raise ValueError(f"Dataset does not have column '{column}'")
    
    sub_df = df[df["cert_id"] == int(cert_id)]
    
    if not sub_df.shape[0]:  # Certificate is not in the dataset
        print(f"Cert ID: {cert_id} not in dataset")
        return None
    
    if sub_df.shape[0] > 1:  # There are more than one occurence with same ID
        print(f"Error Cert ID: {cert_id} has {sub_df.shape[0]} occurrences.")
        return None
    
    return sub_df.iloc[0][column]

In [None]:
# NOTE: "cert_id" field is represented as integer, however "module_directly_referencing" is set of strings,
# so the historical IDs are converted to strings before the set intersection.
historical_cert_ids: set[str] = set(df[df["status"] == "historical"]["cert_id"].apply(str))
active_referencing_certs: DataFrame = df[(df["status"] == "active") & (df["outgoing_direct_references_count"] > 0)]

# IDs of active certificates whose direct references contain at least one historical certificate.
active_certs_referencing_historical: list[int] = [
    active_cert_id
    for active_cert_id, referenced_ids in zip(
        active_referencing_certs["cert_id"], active_referencing_certs["module_directly_referencing"]
    )
    if referenced_ids & historical_cert_ids
]

# Guard against division by zero when there is no active referencing certificate.
n_active_referencing: int = len(active_referencing_certs)
hit_ratio: float = (
    len(active_certs_referencing_historical) / n_active_referencing if n_active_referencing else 0.0
)
print(f"Total active certificates referencing at least one historical: {len(active_certs_referencing_historical)}")
print(f"Total active certificates referencing at least one certificate: {n_active_referencing}")
print(f"Hit ratio: {round(hit_ratio, 2)}")

#### Active certificates referencing at least one historical certificate with an assigned CVE

In [None]:
# Pairs of (active certificate ID, referenced historical certificate ID with a CVE).
active_cert_referencing_historical_with_cves: list[tuple[int, int]] = []

for _, cert in df[df["cert_id"].isin(active_certs_referencing_historical)].iterrows():
    cert_id = cert["cert_id"]

    # An active certificate may reference both active and historical
    # certificates; only the historical ones belong in this report.
    historical_references = cert["module_directly_referencing"] & historical_cert_ids

    # Sorted for a reproducible output order (set iteration order is arbitrary).
    for referenced_cert_id in sorted(historical_references, key=int):
        referenced_cert_id_int = int(referenced_cert_id)
        related_cves = get_cert_property(df, referenced_cert_id_int, "related_cves")

        # A missing value (NaN / None) means no CVE is assigned to the referenced certificate.
        if not pd.isna(related_cves):
            print(f"Active certificate {cert_id} is referencing historical certificate {referenced_cert_id} with assigned CVE")
            active_cert_referencing_historical_with_cves.append((cert_id, referenced_cert_id_int))

active_cert_referencing_historical_with_cves

### Certificates with higher levels referencing certificates with lower levels

In [None]:
# Certificates that directly reference at least one other certificate.
referencing_certs: DataFrame = df[df["outgoing_direct_references_count"] > 0]

# Distinct levels present in the dataset. Missing levels are skipped because
# int(NaN) raises ValueError.
unique_levels: list[int] = sorted(int(level) for level in df["level"].dropna().unique())

# Level -> IDs (as strings, matching the reference sets) of certificates with that level.
cert_level_ids: dict[int, set[str]] = {
    level: set(df.loc[df["level"] == level, "cert_id"].apply(str)) for level in unique_levels
}

In [None]:
def get_cert_ids_referencing_lower_level_cert(level_referencing_certs_df: DataFrame, lower_cert_ids: set[str]) -> list[int]:
    """Return IDs of certificates that directly reference any of ``lower_cert_ids``.

    Args:
        level_referencing_certs_df: Frame with "cert_id" and
            "module_directly_referencing" (a set of ID strings per row) columns.
        lower_cert_ids: Certificate IDs, as strings, to look for in the references.

    Returns:
        "cert_id" values of the rows whose reference set shares at least one ID
        with ``lower_cert_ids``, in row order.
    """
    return [
        row["cert_id"]
        for _, row in level_referencing_certs_df.iterrows()
        if row["module_directly_referencing"] & lower_cert_ids
    ]

#### Level 2 referencing at least one certificate of level 1

In [None]:
LEVEL2: int = 2
# .get() keeps the cell runnable when the dataset contains no level 1 certificate.
below_level2_cert_ids: set[str] = cert_level_ids.get(1, set())
level2_ref_certs = referencing_certs[referencing_certs["level"] == LEVEL2]
level2_referencing_lower_level = get_cert_ids_referencing_lower_level_cert(level2_ref_certs, below_level2_cert_ids)
# Guard against division by zero when no level 2 certificate references anything.
ratio: float = len(level2_referencing_lower_level) / level2_ref_certs.shape[0] if level2_ref_certs.shape[0] else 0.0

print(f"Total amount of certificates with level {LEVEL2} referencing at least one certificate: {level2_ref_certs.shape[0]}")
print(f"Total amount of certicates with reference at least one certificate with lower level: {len(level2_referencing_lower_level)}")
print(f"Hit ratio: {round(ratio, 2)}")

#### Level 3 referencing at least one certificate of level 1 or 2

In [None]:
LEVEL3: int = 3
# Everything below level 3 = level 1 IDs (from the previous cell) plus level 2 IDs.
# .get() keeps the cell runnable when the dataset contains no level 2 certificate.
below_level3_cert_ids = below_level2_cert_ids | cert_level_ids.get(2, set())
level3_ref_certs = referencing_certs[referencing_certs["level"] == LEVEL3]
level3_referencing_lower_level = get_cert_ids_referencing_lower_level_cert(level3_ref_certs, below_level3_cert_ids)
# Guard against division by zero when no level 3 certificate references anything.
ratio: float = len(level3_referencing_lower_level) / level3_ref_certs.shape[0] if level3_ref_certs.shape[0] else 0.0

print(f"Total amount of certificates with level {LEVEL3} referencing at least one certificate: {level3_ref_certs.shape[0]}")
print(f"Total amount of certicates with reference at least one certificate with lower level: {len(level3_referencing_lower_level)}")
print(f"Hit ratio: {round(ratio, 2)}")

#### Level 4 referencing at least one certificate of level 1, 2 or 3

In [None]:
LEVEL4: int = 4
# Everything below level 4 = IDs below level 3 (from the previous cell) plus level 3 IDs.
# .get() keeps the cell runnable when the dataset contains no level 3 certificate.
below_level4_cert_ids = below_level3_cert_ids | cert_level_ids.get(3, set())
level4_ref_certs = referencing_certs[referencing_certs["level"] == LEVEL4]
level4_referencing_lower_level = get_cert_ids_referencing_lower_level_cert(level4_ref_certs, below_level4_cert_ids)
# Guard against division by zero when no level 4 certificate references anything.
ratio: float = len(level4_referencing_lower_level) / level4_ref_certs.shape[0] if level4_ref_certs.shape[0] else 0.0

print(f"Total amount of certificates with level {LEVEL4} referencing at least one certificate: {level4_ref_certs.shape[0]}")
# The lower-level result was computed but never reported; report it as the level 2 and 3 cells do.
print(f"Total amount of certificates with reference at least one certificate with lower level: {len(level4_referencing_lower_level)}")
print(f"Hit ratio: {round(ratio, 2)}")

### Distribution of references among embodiments

In [None]:
def get_embodiment_references(df: DataFrame, embodiment: str) -> dict[str, int]:
    """Count, per referenced embodiment, the direct references made by one embodiment.

    Args:
        df: Frame with "cert_id", "embodiment", "module_directly_referencing"
            and "outgoing_direct_references_count" columns.
        embodiment: Embodiment of the referencing certificates to inspect.

    Returns:
        Mapping of referenced-certificate embodiment to the number of direct
        references pointing at certificates of that embodiment. References
        whose embodiment cannot be determined are not counted.
    """
    result: dict[str, int] = {}
    sub_df = df[(df["embodiment"] == embodiment) & (df["outgoing_direct_references_count"] > 0)]

    for references in sub_df["module_directly_referencing"]:
        for cert_id in references:
            referenced_embodiment = get_cert_property(df, cert_id, "embodiment")

            # get_cert_property returns None for an unknown or ambiguous ID;
            # skipping it (and any missing value) keeps None/NaN out of the keys.
            if pd.isna(referenced_embodiment):
                continue

            result[referenced_embodiment] = result.get(referenced_embodiment, 0) + 1

    return result

In [None]:
# Referencing embodiment -> {referenced embodiment -> number of direct references}.
final_embodiment_statistics: dict[str, dict[str, int]] = {
    embodiment: get_embodiment_references(df, embodiment) for embodiment in df["embodiment"].unique()
}

pprint(final_embodiment_statistics)

In [None]:
# Rows: referencing embodiment; columns: referenced embodiment. Pairs that
# never occur are filled with 0 before the cast to int.
_embodiment_counts = pd.DataFrame(final_embodiment_statistics).fillna(0).astype(int)
embodiment_df = _embodiment_counts.transpose()
embodiment_df.plot(kind="bar", stacked=True)

plt.title("Direct references among FIPS embodiments")
plt.xlabel("Referencing cert embodiment")
plt.ylabel("Number of referenced certificates")
plt.show()

### Distribution of references among types

In [None]:
def get_type_references(df: DataFrame, cert_type: str) -> dict[str, int]:
    """Count, per referenced certificate type, the direct references made by one type.

    Args:
        df: Frame with "cert_id", "type", "module_directly_referencing" and
            "outgoing_direct_references_count" columns.
        cert_type: Type of the referencing certificates to inspect.

    Returns:
        Mapping of referenced-certificate type to the number of direct
        references pointing at certificates of that type. References whose
        type cannot be determined are not counted.
    """
    result: dict[str, int] = {}
    sub_df = df[(df["type"] == cert_type) & (df["outgoing_direct_references_count"] > 0)]

    for references in sub_df["module_directly_referencing"]:
        for cert_id in references:
            referenced_type = get_cert_property(df, cert_id, "type")

            # get_cert_property returns None for an unknown or ambiguous ID;
            # skipping it (and any missing value) keeps None/NaN out of the keys.
            if pd.isna(referenced_type):
                continue

            result[referenced_type] = result.get(referenced_type, 0) + 1

    return result

In [None]:
# Referencing type -> {referenced type -> number of direct references}.
final_type_statistics: dict[str, dict[str, int]] = {
    cert_type: get_type_references(df, cert_type) for cert_type in df["type"].unique()
}

pprint(final_type_statistics)

In [None]:
# Rows: referencing type; columns: referenced type. Pairs that never occur
# are filled with 0 before the cast to int.
_type_counts_matrix = pd.DataFrame(final_type_statistics).fillna(0).astype(int)
cert_type_df = _type_counts_matrix.transpose()
cert_type_df.plot(kind="bar", stacked=True)

plt.title("Direct references among FIPS certificate types")
plt.xlabel("Referencing cert type")
plt.ylabel("Number of referenced certificates")
plt.show()

### Temporal evolution of references

In [None]:
def convert_refences_count_to_relative(number: int, n_issued_certs: int) -> float:
    """Express ``number`` as a percentage of ``n_issued_certs``.

    Args:
        number: Absolute count.
        n_issued_certs: Total that corresponds to 100 %.

    Returns:
        ``100 * number / n_issued_certs``.
    """
    scaled_count = 100 * number
    return scaled_count / n_issued_certs

THRESHOLD_YEAR: int = 2023  # only certificates with year_from < THRESHOLD_YEAR are included

temporal_df: DataFrame = df[(df["outgoing_direct_references_count"] > 0) & (df["year_from"] < THRESHOLD_YEAR)]
total_referencing_certs: int = temporal_df.shape[0]

# Number of referencing certificates per year (index: year_from).
yearly_counts: Series = temporal_df.groupby("year_from").size()

# Years in which no referencing certificate exists are added with a count of 0,
# so the line drops to zero there instead of interpolating across the gap.
all_years = range(int(yearly_counts.index.min()), int(yearly_counts.index.max()) + 1)
converted_df: DataFrame = (
    yearly_counts.reindex(all_years, fill_value=0)
    .rename_axis("year_from")
    .reset_index(name="year_references")
)

# Share of each year in the total number of referencing certificates, in percent.
converted_df["percentage"] = converted_df["year_references"].apply(
    convert_refences_count_to_relative, args=(total_referencing_certs,)
)
sns.lineplot(data=converted_df, x="year_from", y="percentage")

plt.title("Temporal evolution of referenced certificate in time (percentages)")
plt.show()

### Cross-references among certificates

Certificate A directly references certificate B and, at the same time, certificate B directly references certificate A.
In other words, a non-empty intersection of a certificate's `module_directly_referenced_by` and `module_directly_referencing` sets reveals a cross-reference in the dataset.

In [None]:
# Each cross-referencing pair is stored once, as (cert_id, other_cert_id).
result: list[tuple[int, int]] = []

# Only certificates with both incoming and outgoing references can take part in a cross-reference.
_has_incoming = df["incoming_direct_references_count"] > 0
_has_outgoing = df["outgoing_direct_references_count"] > 0
cross_references_df: DataFrame = df[_has_incoming & _has_outgoing]

for _, cert in cross_references_df.iterrows():
    cert_id = cert["cert_id"]
    # IDs found in both sets reference this certificate and are referenced by it.
    intersection: set[str] = cert["module_directly_referenced_by"] & cert["module_directly_referencing"]

    for another_cert_id in intersection:
        mirrored_pair = (int(another_cert_id), cert_id)
        if mirrored_pair in result:
            continue  # the pair was already recorded from the other certificate's side
        result.append((cert_id, int(another_cert_id)))

print(result)
# TODO - Investigate how are 3382, 3383, 3384 related

In [None]:
# Rows of the three certificates named in the TODO of the previous cell.
df.loc[df["cert_id"].isin([3382, 3383, 3384])]

### Referenced certs, which are not referencing any other cert

In [None]:
# Certificates with incoming direct references but no outgoing ones.
_no_outgoing_refs = df["outgoing_direct_references_count"] == 0
_some_incoming_refs = df["incoming_direct_references_count"] > 0
not_referencing_df: DataFrame = df[_no_outgoing_refs & _some_incoming_refs]
print(f"In the dataset is total of {len(not_referencing_df)} certificates which are referenced by at least one cert and not referencing any other certs.")

#### Status of the certificates

In [None]:
# Status share among referenced, not referencing certificates; entries with a
# zero count are dropped so they do not show up in the pie.
_not_referencing_status_counts = not_referencing_df["status"].value_counts()
_not_referencing_status_counts[_not_referencing_status_counts > 0].plot.pie(autopct="%.0f%%")

plt.title("Distribution of statuses across referenced, not referencing certs")
plt.show()

#### Histogram of CVEs

In [None]:
number_of_cves: int = 10  # how many of the most frequent CVEs to plot
counter: Counter = Counter()

# Count in how many of the referenced, not referencing certificates each CVE
# appears; rows without any CVE (missing value) are skipped.
for cve_set in not_referencing_df["related_cves"].dropna():
    counter.update(cve_set)

# most_common() yields (CVE, count) pairs ordered by descending count. Building
# the frame from it works for an empty counter too, where
# DataFrame.from_dict(...).reset_index() has a single column and the two-name
# column assignment fails.
not_referencing_cve_df: DataFrame = pd.DataFrame(counter.most_common(), columns=["CVE", "count"])

if not_referencing_cve_df.empty:
    print("No CVE is assigned to any referenced, not referencing certificate.")
else:
    sns.barplot(x="CVE", y="count", data=not_referencing_cve_df.head(number_of_cves))

    plt.title(f"The most common {number_of_cves} CVEs in the dataset")
    plt.xticks(rotation=90)
    plt.show()

### Certificates which are referencing, but not referenced by any other certificate

In [None]:
# Certificates with outgoing direct references but no incoming ones.
_some_outgoing_refs = df["outgoing_direct_references_count"] > 0
_no_incoming_refs = df["incoming_direct_references_count"] == 0
not_referenced_df: DataFrame = df[_some_outgoing_refs & _no_incoming_refs]
print(f"There are total of {len(not_referenced_df)} certs, which are referencing at least one other cert and referenced by no other cert")

In [None]:
# Status share among referencing, not referenced certificates; entries with a
# zero count are dropped so they do not show up in the pie.
_not_referenced_status_counts = not_referenced_df["status"].value_counts()
_not_referenced_status_counts[_not_referenced_status_counts > 0].plot.pie(autopct="%.0f%%")

plt.title("Cert status distribution of referencing, no referenced by certs")
plt.show()

#### Histogram of CVEs

In [None]:
number_of_cves: int = 10  # how many of the most frequent CVEs to plot
counter: Counter = Counter()
# Referencing, not referenced certificates that have a CVE set assigned.
cve_rich_certs_df: DataFrame = not_referenced_df[not_referenced_df["related_cves"].notna()]

for cve_set in cve_rich_certs_df["related_cves"]:
    counter.update(cve_set)

# most_common() yields (CVE, count) pairs ordered by descending count. Building
# the frame from it works for an empty counter too, where
# DataFrame.from_dict(...).reset_index() has a single column and the two-name
# column assignment fails.
not_referenced_by_df: DataFrame = pd.DataFrame(counter.most_common(), columns=["CVE", "count"])

if not_referenced_by_df.empty:
    print("No CVE is assigned to any referencing, not referenced certificate.")
else:
    sns.barplot(x="CVE", y="count", data=not_referenced_by_df.head(number_of_cves))

    # Titled so the figure is understandable on its own.
    plt.title(f"The most common {number_of_cves} CVEs among referencing, not referenced certs")
    plt.xticks(rotation=90)
    plt.show()

### Certificate overview

In [None]:
# Node colours of the certificate overview graph drawn below.
CHOSEN_CERT_COLOR: str = "lightgreen"  # the certificate selected by the user
REFERENCING_COLOR: str = "lightblue"  # certificates the chosen certificate references
REFERENCED_BY_COLOR: str = "red"  # certificates that reference the chosen certificate

In [None]:
# Enter the certificate ID you are interested in. The overview cell below only
# accepts certificates that directly reference at least one other certificate.
cert_id: int = 4512

In [None]:
# TODO - Enrich graph with additional information (e.g. assigned CVEs to certificate, active/historical)
refs_df: DataFrame = df[df["outgoing_direct_references_count"] > 0]

if not refs_df[refs_df["cert_id"] == cert_id].shape[0]:
    print("Cert ID is not in the dataset, or does not references any other certificates")
    sys.exit()

cert_id_series: Series = refs_df[refs_df["cert_id"] == cert_id].iloc[0]
cert_dgst: str = refs_df[refs_df["cert_id"] == cert_id].index[0]
color_map: list[str] = [CHOSEN_CERT_COLOR]
graph: nx.DiGraph = nx.DiGraph()
graph.add_node(cert_id)

# Display which certificates are directly referenced by the chosen certificate
for referenced_cert_id in cert_id_series["module_directly_referencing"]:
    graph.add_node(referenced_cert_id)
    graph.add_edge(cert_id, referenced_cert_id)
    color_map.append(REFERENCING_COLOR)


# Display which certificates are directly referencing the chosen certificate
for referencing_cert_id in cert_id_series["module_directly_referenced_by"]:
    graph.add_node(referencing_cert_id)
    graph.add_edge(referencing_cert_id, cert_id)
    color_map.append(REFERENCED_BY_COLOR)


pos = nx.circular_layout(graph)
nx.draw(graph, pos, arrows=True, node_color=color_map, with_labels=True, node_size=900)

In [None]:
# Textual summary of the certificate drawn in the graph above.
_outgoing_refs = cert_id_series["module_directly_referencing"]
_incoming_refs = cert_id_series["module_directly_referenced_by"]
_summary_lines = [
    f"Certificate with id {cert_id}:",
    f" - references the certificates {_outgoing_refs}",
    f" - is referenced by certificates {_incoming_refs}",
    f" - its page is at https://seccerts.org/fips/{cert_dgst}/",
]
print("\n".join(_summary_lines))

### Graph of the most referenced certificate

In [None]:
# Node colour per certificate status, used by the graph cells below.
STATUS_TO_COLOUR_MAPPING: dict[str, str] = {
    "active": "lightblue",
    "historical": "red",
    "revoked": "blue",
}

In [None]:
# TODO - Enrich the graph nodes with assigned CVEs
graph = nx.DiGraph()
node_colors: list[str] = []
max_referenced_by_num: int = df["incoming_direct_references_count"].max()
most_referenced_certificate: Series = df[df["incoming_direct_references_count"] == max_referenced_by_num].iloc[0]

origin_cert_id: int = most_referenced_certificate["cert_id"]
origin_cert_status: str = most_referenced_certificate["status"]
graph.add_node(origin_cert_id)
node_colors.append(STATUS_TO_COLOUR_MAPPING[origin_cert_status])

for cert_id_str in most_referenced_certificate["module_directly_referenced_by"]:
    cert_id_int = int(cert_id_str)
    graph.add_node(cert_id_int)
    graph.add_edge(cert_id_int, origin_cert_id)
    cert_status: str = get_cert_property(df, cert_id_int, "status")
    node_colors.append(STATUS_TO_COLOUR_MAPPING[cert_status])

fig, ax = plt.subplots(figsize=(15, 15))
pos = nx.circular_layout(graph)
nx.draw(graph, pos, node_color=node_colors, arrows=True, with_labels=True, node_size=1000)
plt.show()

### Graph of the most referencing certificate

In [None]:
# TODO - Enrich the graph nodes with assigned CVEs
graph = nx.DiGraph()
node_colors: list[str] = []
max_referencing_num: int = df["outgoing_direct_references_count"].max()
most_referencing_cert: Series = df[df["outgoing_direct_references_count"] == max_referencing_num].iloc[0]
origin_cert_id: int = most_referencing_cert["cert_id"]
origin_cert_status: str = most_referencing_cert["status"]
node_colors.append(STATUS_TO_COLOUR_MAPPING[origin_cert_status])


for cert_id_str in most_referencing_cert["module_directly_referencing"]:
    cert_id_int = int(cert_id_str)
    graph.add_node(cert_id_int)
    graph.add_edge(origin_cert_id, cert_id_int)
    cert_status: str = get_cert_property(df, cert_id_int, "status")
    node_colors.append(STATUS_TO_COLOUR_MAPPING[cert_status])

    
fig, ax = plt.subplots(figsize=(10, 10))
pos = nx.circular_layout(graph)
nx.draw(graph, pos, node_color=node_colors, arrows=True, with_labels=True, node_size=1000)
plt.show()