<a href="https://colab.research.google.com/github/ds7389/CS_6233-Introduction_to_OS-Rearch_Project/blob/main/Pull_NIST_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

*Disclaimer:*
"This product uses data from the NVD API but is not endorsed or certified by the NVD."


All NIST publications are available in the public domain according to Title 17 of the United States Code, however services which utilize or access the NVD are asked to display the following notice prominently within the application:

"This product uses data from the NVD API but is not endorsed or certified by the NVD."

You may use the NVD name to identify the source of the data. You may not use the NVD name, to imply endorsement of any product, service, or entity, not-for-profit, commercial or otherwise. For information on how to cite the NVD, including the database's Digital Object Identifier (DOI), please consult NIST's Public Data Repository.

## Setup Code

In [2]:
###--- Mount Drive ---###
# Mount Google Drive at /content/drive so that the file paths built later in
# this notebook (under /content/drive/MyDrive) can be read and written.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
###-- Libraries ---###
import json
import requests
import os
import time
import datetime
import calendar # Import the calendar module to get the number of days in a month
import pandas as pd




In [3]:
%pip install xlsxwriter



## General Helper Functions

In [4]:
################################################################################
## Helper Function to create count of entries in a data dict
################################################################################


def count_entries(data_dict: dict) -> tuple[dict, int]:
  """
  Tally how many entries each year holds, along with the overall total.

  Args:
      data_dict (dict): Mapping whose keys are years and whose values are
          lists of entries (any value supporting ``len()`` works).

  Returns:
      tuple: A two-item tuple of
          - dict: the same keys as ``data_dict``, each mapped to its entry count.
          - int: the sum of all per-year counts (0 for an empty mapping).
  """
  # Size of each year's collection, keyed exactly as in the input.
  per_year = {key: len(items) for key, items in data_dict.items()}
  # The grand total is just the sum of the per-year sizes.
  grand_total = sum(per_year.values())
  return per_year, grand_total

################################################################################
### Helper function to print entries total and per year
################################################################################

def print_entries_total_and_per_year(data_dict: dict):
  """
  Print a per-year breakdown of entry counts, followed by the overall total.

  Args:
      data_dict (dict): Mapping whose keys are years and whose values are
          lists of entries; it is passed straight to ``count_entries``.

  Returns:
      None. The report is written to standard output only.
  """
  per_year, grand_total = count_entries(data_dict)

  # Assemble the report line by line, then emit it in one go. Joining with
  # newlines yields the same text as printing each line separately.
  report = ["\nNumber of entries per year:"]
  report.extend(f"Year {year}: {count} entries" for year, count in per_year.items())
  report.append(f"\nTotal number of entries: {grand_total}")
  print("\n".join(report))





In [6]:
################################################################################
### Helper function to create fully qualified paths to file
################################################################################

def path_to_file_in_top_level(file_name: str) -> str:
  """
  Build the path of a file located in the top-level Google Drive folder.

  Args:
      file_name (str): Name of the file; it is joined onto the Drive
          top-level folder with ``os.path.join``.

  Returns:
      str: The fully qualified path to the file.
  """
  drive_top_level = '/content/drive/MyDrive'
  return os.path.join(drive_top_level, file_name)



# Quick check of the helper: show the path it builds for a sample file name.
print(path_to_file_in_top_level("nvd_vulnerability.json"))



/content/drive/MyDrive/nvd_vulnerability.json


## Project Controls

In [6]:
###---  Control Variables ---###
DEBUG = False               # Set to True if you need to debug functionality
ORIGINAL_DATA = False       # Set to True to re-pull the original dataset; guards the API-pull cell and the JSON-save cell.

In [7]:
# Path of the project's JSON dataset inside the top-level Google Drive folder.
# The save cell writes nvd_data to this path and the load cell reads it back.
# You can change 'nvd_vulnerabilities.json' to your desired filename.
json_file_path = path_to_file_in_top_level("nvd_vulnerabilities.json")


# Show the resolved path so it can be verified before any file I/O happens.
print(json_file_path)

/content/drive/MyDrive/nvd_vulnerabilities.json


## Pull Data via API

In [8]:
################################################################################
###  API Call to pull full data set from NIST and store in dict nvd_data
################################################################################

###--- Libraries for this Cell ---###
#import datetime
#import time
#import os
#import calendar # Import the calendar module to get the number of days in a month

if(ORIGINAL_DATA):
  # Everything in this cell runs only when a fresh pull of the dataset is requested.
  BASE_URL = "https://services.nvd.nist.gov"
  ENDPOINT = "/rest/json/cves/2.0"  # appended to BASE_URL to form the request URL
  YEARS = list(range(2020, 2026))  # publication years to pull: 2020 through 2025 inclusive
  # Optional API key, read from the NVD_API_KEY environment variable (None when unset;
  # requests are then sent without an apiKey header).
  # NOTE(review): Colab "Secrets" are normally read with google.colab.userdata.get(),
  # not os.environ -- confirm the key is actually exported to the environment.
  API_KEY = os.environ.get('NVD_API_KEY')

  def get_vulnerabilities_by_month(year, month, request_timeout=60):
      """Retrieves vulnerability data for a given month from the NVD API.

      Pages through the CVE endpoint with ``startIndex``/``resultsPerPage``
      until the number of collected records reaches the ``totalResults``
      value reported by the API.

      Args:
          year (int): Calendar year of the publication window.
          month (int): Calendar month (1-12) of the publication window.
          request_timeout (float): Seconds to wait on each HTTP request before
              it is abandoned. Without a timeout a stalled connection would
              block the whole pull indefinitely.

      Returns:
          list: The ``vulnerabilities`` records collected for the month. The
          list may be incomplete (or empty) when a request fails or the API
          returns an empty page early: paging stops for this month and the
          problem is printed rather than raised.
      """
      # Publication window: first instant of the month through its last second.
      start_date = datetime.datetime(year, month, 1, 0, 0, 0).strftime('%Y-%m-%dT%H:%M:%S.000Z')
      last_day = calendar.monthrange(year, month)[1]  # number of days in this month
      end_date = datetime.datetime(year, month, last_day, 23, 59, 59).strftime('%Y-%m-%dT%H:%M:%S.999Z')

      all_vulnerabilities = []
      start_index = 0
      results_per_page = 2000 # NVD API allows up to 2000 results per page

      # Send the apiKey header only when a key is configured.
      headers = {'apiKey': API_KEY} if API_KEY else {}
      # The URL never changes between pages, so build it once.
      request_url = f"{BASE_URL}{ENDPOINT}"

      while True:
          params = {
              'pubStartDate': start_date,
              'pubEndDate': end_date,
              'resultsPerPage': results_per_page,
              'startIndex': start_index
          }

          try:
              print(f"Fetching data for {year}-{month:02d}, startIndex {start_index}...")
              response = requests.get(
                  request_url,
                  params=params,
                  headers=headers,
                  timeout=request_timeout,
              )
              response.raise_for_status()  # Raise an exception for bad status codes
              data = response.json()
          except requests.exceptions.RequestException as e:
              print(f"Error fetching data for {year}-{month:02d}, startIndex {start_index}: {e}")
              # Best effort: give up on this month and let the caller move on
              # to the next one with whatever was collected so far.
              break

          vulnerabilities = data.get('vulnerabilities', [])
          total_results = data.get('totalResults', 0)
          all_vulnerabilities.extend(vulnerabilities)

          if not vulnerabilities:
              # An empty page cannot advance startIndex; without this guard the
              # loop would repeat the same request forever.
              if start_index < total_results:
                  print(f"Warning: empty page for {year}-{month:02d} at startIndex {start_index} "
                        f"({total_results} results reported); stopping early.")
              break

          start_index += len(vulnerabilities)
          if start_index >= total_results:
              break  # All data retrieved

          time.sleep(6) # Add a delay to avoid hitting API limits

      return all_vulnerabilities

  # Retrieve data for each year by iterating through months.
  # nvd_data maps each year in YEARS (an int key) to the list of records for that year.
  nvd_data = {}
  for year in YEARS:
      print(f"\n--- Fetching data for year {year} ---")
      year_vulnerabilities = []
      # Past years are fetched in full (12 months); for the current calendar
      # year only the months up to and including the current one are fetched.
      end_month = 12
      if year == datetime.datetime.now().year:
        end_month = datetime.datetime.now().month


      for month in range(1, end_month + 1):
          # A month whose request failed contributes only what was fetched before the error.
          month_data = get_vulnerabilities_by_month(year, month)
          year_vulnerabilities.extend(month_data)
          # Add a longer delay between months to be safe
          time.sleep(10)

      nvd_data[year] = year_vulnerabilities
      print(f"Finished fetching data for year {year}. Retrieved {len(nvd_data[year])} vulnerabilities.")
      # Add a longer delay between years
      time.sleep(20)

  # The retrieved data is stored in the 'nvd_data' dictionary.
  # Print a summary of the retrieved data: one line per year, then the grand total.
  print("\n--- Summary of Retrieved Data ---")
  total_vulnerabilities = 0
  # Note: this loop rebinds the notebook-level names `year` and `data`.
  for year, data in nvd_data.items():
      print(f"Year {year}: {len(data)} vulnerabilities")
      total_vulnerabilities += len(data)
  print(f"Total vulnerabilities retrieved: {total_vulnerabilities}")


In [9]:
################################################################################
###  Save nvd_data dict as json file.
################################################################################
"""
This only needs to be run when the original data is pulled.
"""

# Runs only after a fresh pull: nvd_data is created by the API-pull cell,
# which is guarded by the same ORIGINAL_DATA flag.
if(ORIGINAL_DATA):

  # Save the nvd_data dictionary to a JSON file (overwrites any existing file).
  # JSON object keys are always strings, so the integer year keys used during
  # the pull come back as strings when this file is loaded again.
  with open(json_file_path, 'w') as f:
      json.dump(nvd_data, f, indent=4)

  print(f"Successfully saved NVD vulnerability data to {json_file_path}")

## Load Data from file to dict "nvd_data"

In [10]:
################################################################################
### Load the saved data from Google Drive and store as nvd_data
################################################################################

###--- Libraries for this Cell ---###
#import json

## Define the file path in Google Drive where the JSON data is saved
#json_file_path = '/content/drive/MyDrive/CY-GY_6233/Reserach_Paper/DATA/NIST_NVD/nvd_vulnerabilities.json'

# Load the JSON data from the file into the nvd_data variable
# Read the saved dataset back into nvd_data. This cell is not guarded by
# ORIGINAL_DATA, so it replaces whatever nvd_data held before it runs.
try:
    with open(json_file_path, 'r') as f:
        nvd_data = json.load(f)
    print(f"Successfully loaded data from {json_file_path}")
except FileNotFoundError:
    print(f"Error: The file {json_file_path} was not found.")
    nvd_data = None # Set nvd_data to None if the file is not found
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {json_file_path}. Check the file format.")
    nvd_data = None # Set nvd_data to None if there's a JSON decoding error

# Shows the loaded type; NoneType here means one of the error branches above
# ran, and later cells that use nvd_data need to account for that.
print(type(nvd_data))

Successfully loaded data from /content/drive/MyDrive/nvd_vulnerabilities.json
<class 'dict'>
