Skip to content

Commit

Permalink
Merge pull request #233 from crocs-muni:fix/group-fine-grained-cwes
Browse files Browse the repository at this point in the history
new func get_coarse_grained_cwes()
  • Loading branch information
adamjanovsky committed Jun 15, 2022
2 parents 5ea7bfc + 8dcc3f3 commit 304b289
Show file tree
Hide file tree
Showing 2 changed files with 450 additions and 175 deletions.
550 changes: 379 additions & 171 deletions notebooks/cc/vulnerabilities.ipynb

Large diffs are not rendered by default.

75 changes: 71 additions & 4 deletions sec_certs/pandas_helpers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from __future__ import annotations

import copy
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from dataclasses import dataclass
from pathlib import Path
from shutil import copyfile
from typing import Final, List, Optional, Set, Tuple, Union
from typing import Any, Final, List, Optional, Set, Tuple, Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -222,7 +223,9 @@ def expand_cc_df_with_cve_cols(cc_df: pd.DataFrame, cve_dset: CVEDataset) -> pd.
return df


def prepare_cwe_df(cc_df: pd.DataFrame, cve_dset: CVEDataset) -> Tuple[pd.DataFrame, pd.DataFrame]:
def prepare_cwe_df(
cc_df: pd.DataFrame, cve_dset: CVEDataset, fine_grained: bool = False
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
This function does the following:
1. Filter CC DF to columns relevant for CWE examination (eal, related_cves, category)
Expand All @@ -233,6 +236,7 @@ def prepare_cwe_df(cc_df: pd.DataFrame, cve_dset: CVEDataset) -> Tuple[pd.DataFr
:param pd.DataFrame cc_df: DataFrame obtained from CCDataset, should be limited to rows with >0 vulnerabilities
:param CVEDataset cve_dset: CVEDataset instance to retrieve CWE data from
:param bool fine_grained: If set to True, CWEs won't be merged into weaknesses of higher abstraction
:return Tuple[pd.DataFrame, pd.DataFrame]: returns two dataframes:
- DF obtained from CC Dataset, fully exploded to CWEs
- DF obtained from CWE webpage, contains IDs, names, types, urls of all CWEs
Expand Down Expand Up @@ -273,18 +277,36 @@ def prepare_cwe_df(cc_df: pd.DataFrame, cve_dset: CVEDataset) -> Tuple[pd.DataFr

weaknesses = root.find("{http://cwe.mitre.org/cwe-6}Weaknesses")
categories = root.find("{http://cwe.mitre.org/cwe-6}Categories")
dct: dict[str, List[Optional[str]]] = {"cwe_id": [], "cwe_name": [], "cwe_description": [], "type": []}
dct: dict[str, Any] = {
"cwe_id": [],
"cwe_name": [],
"cwe_description": [],
"type": [],
"child_of": [],
}

assert weaknesses
for weakness in weaknesses:
assert weakness
description = weakness.find("{http://cwe.mitre.org/cwe-6}Description")
related_weaknesses = weakness.find("{http://cwe.mitre.org/cwe-6}Related_Weaknesses")

dct["cwe_id"].append("CWE-" + weakness.attrib["ID"])
dct["cwe_name"].append(weakness.attrib["Name"])
dct["cwe_description"].append(description.text if description is not None else None)
dct["type"].append("weakness")

if related_weaknesses:
dct["child_of"].append(
{
"CWE-" + x.attrib["CWE_ID"]
for x in related_weaknesses
if x.tag == "{http://cwe.mitre.org/cwe-6}Related_Weakness" and x.attrib["Nature"] == "ChildOf"
}
)
else:
dct["child_of"].append(np.nan)

assert categories
for category in categories:
assert category
Expand All @@ -294,12 +316,57 @@ def prepare_cwe_df(cc_df: pd.DataFrame, cve_dset: CVEDataset) -> Tuple[pd.DataFr
dct["cwe_name"].append(category.attrib["Name"])
dct["cwe_description"].append(summary.text if summary is not None else None)
dct["type"].append("category")
dct["child_of"].append(np.nan)

cwe_df = pd.DataFrame(dct).set_index("cwe_id")
cwe_df["url"] = cwe_df.index.map(lambda x: "https://cwe.mitre.org/data/definitions/" + x.split("-")[1] + ".html")
cwe_df = cwe_df.replace(r"\n", " ", regex=True)

return df_cwe_relevant, cwe_df
if fine_grained:
return df_cwe_relevant, cwe_df
else:
return get_coarse_grained_cwes(df_cwe_relevant, cwe_df), cwe_df


def get_coarse_grained_cwes(fine_grained_df: pd.DataFrame, cwe_df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace every CWE in `fine_grained_df` with its top-most ancestors that also appear in `fine_grained_df`.

    NVD contains CWEs at different levels of abstraction, which makes it difficult to compare between them.
    For instance, CWE-20, CWE-119 and CWE-787 may all appear in the dataset, although CWE-787 is a child of CWE-119,
    which in turn is a child of CWE-20. Statistics on the most prevalent CWEs only make sense once such identifiers
    are aligned to the top-most level.

    A CWE is replaced by all of its `child_of` parents that occur in `fine_grained_df.cwe_id`, and the replacement is
    repeated on those parents until a CWE without any parent in the dataset is reached. Parents that do not occur in
    the dataset are never followed. A CWE with several such ancestors yields several rows in the output.

    CWE identifiers that have no row in `cwe_df`, or whose `child_of` entry is not a collection (e.g. NaN), are
    treated as having no parents and are kept as they are. Should the parent relation contain a cycle, the edge
    closing the cycle is ignored so that the resolution always terminates.

    :param pd.DataFrame fine_grained_df: First element of the output of `prepare_cwe_df` function
    :param pd.DataFrame cwe_df: Second element of the output of `prepare_cwe_df` function
    :return pd.DataFrame: Copy of `fine_grained_df`, fully exploded to coarse-grained CWEs
    """
    observed_cwes = fine_grained_df.cwe_id.unique()
    cwes_in_dataset = set(observed_cwes)
    parent_dict = cwe_df.child_of.to_dict()
    # Memoised result of the resolution: cwe -> set of its top-most ancestors present in the dataset.
    resolved: dict = {}

    def _parents_in_dataset(cwe):
        """Return the parents of `cwe` that appear in the dataset (empty set when there are none)."""
        parents = parent_dict.get(cwe)
        # A type check is used instead of `parents is not np.nan`: the identity test only recognises the very same
        # NaN object, and any other non-collection value would fail on iteration.
        if not isinstance(parents, (set, frozenset, list, tuple)):
            return set()
        return {parent for parent in parents if parent in cwes_in_dataset}

    def _resolve(cwe, in_progress):
        """Return the top-most in-dataset ancestors of `cwe`; `in_progress` holds the CWEs on the current path."""
        if cwe in resolved:
            return resolved[cwe]

        in_progress.add(cwe)
        roots = set()
        for parent in sorted(_parents_in_dataset(cwe), key=str):
            # Skipping CWEs that are already on the path breaks cycles instead of recursing forever.
            if parent not in in_progress:
                roots |= _resolve(parent, in_progress)
        in_progress.discard(cwe)

        # No usable parent means that the CWE itself is already at the top-most level.
        resolved[cwe] = roots or {cwe}
        return resolved[cwe]

    # Sorted lists (instead of sets) make the row order produced by `explode` deterministic.
    mapping = {cwe: sorted(_resolve(cwe, set()), key=str) for cwe in observed_cwes}

    new_df = fine_grained_df.copy()
    new_df["cwe_id"] = new_df["cwe_id"].map(mapping)

    return new_df.explode(column="cwe_id")


def get_top_n_cwes(
Expand Down

0 comments on commit 304b289

Please sign in to comment.