In [11]:
import pandas as pd
import numpy as np
import os, sys
import json
import logging
from localutils.errorhandler import error_handler

In [12]:
logger = logging.getLogger(__name__)

In [13]:
def load_nvd_cve_data(start_year: int, end_year: int, directory: str) -> pd.DataFrame:
    data = []
    cves_list = []
    for file in os.listdir(directory):
        # check if file is .json and is within the range of years
        if file.endswith(".json") and int(file.split("-")[2].split(".")[0]) in range(start_year, end_year + 1):
            file_path = os.path.join(directory, file)
            with open(file_path, 'r') as file:
                data = json.load(file)
                logger.info(f"Loaded {file_path}")
                for _, cve_data in enumerate(data.get("CVE_Items", [])):
                    if cve_data.get("impact"):
                        for key in cve_data.get("impact"):
                            if key == "baseMetricV3":
                                cve_id = cve_data.get("cve").get("CVE_data_meta").get("ID", "") if cve_data.get("cve") else ""
                                description = cve_data.get("cve").get("description").get("description_data")[0].get("value", "")
                                published_date = cve_data.get("publishedDate", "")
                                last_modified_date = cve_data.get("lastModifiedDate", "")
                                problemtype_data = cve_data.get("cve", {}).get("problemtype", {}).get("problemtype_data", [{}])[0].get("description", [])
                                cwe_id = problemtype_data[0].get("value", "") if problemtype_data else ""
                                cvss_version = cve_data.get("impact").get("baseMetricV3").get("cvssV3").get("version") 
                                severity = cve_data.get("impact").get("baseMetricV3").get("cvssV3").get("baseSeverity")
                                base_score = cve_data.get("impact").get("baseMetricV3").get("cvssV3").get("baseScore")
                                exploitability_score = cve_data.get("impact").get("baseMetricV3").get("exploitabilityScore")
                                vector_string = cve_data.get("impact").get("baseMetricV3").get("cvssV3").get("vectorString")
                                attack_vector = cve_data.get("impact").get("baseMetricV3").get("cvssV3").get("attackVector")
                                attack_complexity = cve_data.get("impact").get("baseMetricV3").get("cvssV3").get("attackComplexity")
                                authentication = cve_data.get("impact").get("baseMetricV3").get("cvssV3").get("privilegesRequired")
                                user_interaction = cve_data.get("impact").get("baseMetricV3").get("cvssV3").get("userInteraction")
                                confidentiality_impact = cve_data.get("impact").get("baseMetricV3").get("cvssV3").get("confidentialityImpact")
                                integrity_impact = cve_data.get("impact").get("baseMetricV3").get("cvssV3").get("integrityImpact")
                                availability_impact = cve_data.get("impact").get("baseMetricV3").get("cvssV3").get("availabilityImpact")
                                break
                            elif key == "baseMetricV2":
                                cve_id = cve_data.get("cve").get("CVE_data_meta").get("ID", "") if cve_data.get("cve") else ""
                                description = cve_data.get("cve").get("description").get("description_data")[0].get("value", "")
                                published_date = cve_data.get("publishedDate", "")
                                last_modified_date = cve_data.get("lastModifiedDate", "")
                                problemtype_data = cve_data.get("cve", {}).get("problemtype", {}).get("problemtype_data", [{}])[0].get("description", [])
                                cwe_id = problemtype_data[0].get("value", "") if problemtype_data else ""
                                cvss_version = cve_data.get("impact").get("baseMetricV2").get("cvssV2").get("version") 
                                severity = cve_data.get("impact").get("baseMetricV2").get("severity")
                                base_score = cve_data.get("impact").get("baseMetricV2").get("cvssV2").get("baseScore")
                                exploitability_score = cve_data.get("impact").get("baseMetricV2").get("exploitabilityScore")
                                vector_string = cve_data.get("impact").get("baseMetricV2").get("cvssV2").get("vectorString")
                                attack_vector = cve_data.get("impact").get("baseMetricV2").get("cvssV2").get("accessVector")
                                attack_complexity = cve_data.get("impact").get("baseMetricV2").get("cvssV2").get("accessComplexity")
                                authentication = cve_data.get("impact").get("baseMetricV2").get("cvssV2").get("authentication")
                                user_interaction = cve_data.get("impact").get("baseMetricV2").get("userInteractionRequired")
                                confidentiality_impact = cve_data.get("impact").get("baseMetricV2").get("cvssV2").get("confidentialityImpact")
                                integrity_impact = cve_data.get("impact").get("baseMetricV2").get("cvssV2").get("integrityImpact")
                                availability_impact = cve_data.get("impact").get("baseMetricV2").get("cvssV2").get("availabilityImpact")
                                break
                    cves_list.append((cve_id, description, published_date, last_modified_date, cvss_version, cwe_id, vector_string, attack_vector, attack_complexity, authentication, user_interaction, base_score, severity, exploitability_score, confidentiality_impact, integrity_impact, availability_impact))

    cves_df = pd.DataFrame(cves_list, columns=["cve_id", "description", "published_date", "last_modified_date", "cvss_version", "cwe_id", "cvss_vector", "attack_vector", "attack_complexity", "privileges_required", "user_interaction", "base_score", "base_severity", "exploitability_score", "confidentiality_impact", "integrity_impact", "availability_impact"])
    return cves_df

In [14]:
df = load_nvd_cve_data(2015, 2025, "data/download/NVDCVE")
df.drop(df[df['cve_id'] == 'not_found'].index, inplace=True)

In [5]:
df["cvss_version"].value_counts()

cvss_version
3.1    120663
3.0     40718
2.0      9447
Name: count, dtype: int64

In [6]:
df = pd.read_csv("output/cves_with_epss_kevs_controlled2015-2025_2025-03-01.csv")

  df = pd.read_csv("output/cves_with_epss_kevs_controlled2015-2025_2025-03-01.csv")


In [16]:
import requests
from localutils.errorhandler import error_handler

# function which returns a position in the "metrics" list where the "name" key matches the given value
def get_metric_position_of_other(metrics_list):
    for i, metric in enumerate(metrics_list):
        if "other" in metric:
            return i
    return None

# make output flattened
def flatten_vulnrichment_output(vulnrichment_output):
    if vulnrichment_output is None:
        return None
    flattened_output = {}
    for keyval in vulnrichment_output:
        if isinstance(keyval, dict):
            for k, v in keyval.items():
                flattened_output[k] = v
        else:
            return None
    return flattened_output

# function for downloading given JSON file for a given CVE ID from the CISAGOV vulnrichment repository

def cve_vulnrichment(cve_id):
    logger.info(f"Processing cve_id -> {cve_id}")
    directory = "data/download/vulnrichment"
    year = cve_id.split("-")[1]  # Example: "2021"
    number = int(cve_id.split("-")[2])  # Example: "1891" â†’ 1891
    thousands_group = f"{(number // 1000)}xxx"  # Calculate folder name, e.g., 1xxx
    cve_dir = f"{directory}/{year}/{thousands_group}"  # Example: "data/download/vulnrichment/2021/1xxx"

    # Construct file_path for the JSON file
    file_path = f"{cve_dir}/{cve_id}.json"
    logger.info(f"File path: {file_path}")

    # check if the file already exists
    if os.path.exists(file_path):
        logger.info(f"Processing data in {file_path}")
        # read the file and return the options
        with open(file_path, "r") as file:
            cve = json.load(file)
            if cve.get("cveMetadata", {}).get("state") != "REJECTED":
                adp_list = cve.get("containers", []).get("adp", [])
                for i, item in enumerate(adp_list):
                    if "CISA ADP Vulnrichment" in item.get("title"):
                        adp_position = i
                other = adp_list[adp_position].get("metrics", {})
                position = get_metric_position_of_other(adp_list[adp_position].get("metrics", {}))
                logger.info(f"Found positions: {adp_position}, {position}")
                return other[position].get("other").get("content").get("options")
            else:
                return [{"Exploitation": None}, {"Automatable": None}, {"Technical Impact": None}]
        
    # check if the file does not exist and return None
    else: 
            return [{"Exploitation": None}, {"Automatable": None}, {"Technical Impact": None}]

def update_row_with_details(row):
    details = flatten_vulnrichment_output(cve_vulnrichment(row['cve_id']))  # Fetch details for the current row's cve_id
    if not details:
        return 
    for key, value in details.items():
        row[key] = value  # Add each detail as a new column to the row
    return row

In [17]:
df = df.apply(lambda row: update_row_with_details(row) if isinstance(row["cve_id"], str) else None, axis=1)

In [None]:
df