**Fetch CVE Data Using API**

In [None]:
import requests
import json

# NVD CVE API 2.0 endpoint. The parameters request up to 2000 records
# starting at index 0; no further pages are fetched by this cell.
api_url = "https://services.nvd.nist.gov/rest/json/cves/2.0"
params = {
    "resultsPerPage": 2000,
    "startIndex": 0
}

# Fetch data from the API.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception here
# instead of letting it reach .json() and the file write below.
response = requests.get(api_url, params=params, timeout=60)
response.raise_for_status()
data = response.json()

# Saving the data to a JSON file
with open('cve_data.json', 'w') as f:
    json.dump(data, f, indent=4)


**Data Pre-Processing**

In [None]:
import json
import re

# Reload the API response that the fetch cell saved to cve_data.json.
with open('cve_data.json') as f:
    data = json.load(f)

def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    # str.split() with no separator splits on whitespace runs and discards the
    # empty leading/trailing pieces, so re-joining with ' ' normalises the text.
    return ' '.join(text.split())

# Snapshot the raw description strings (one list per vulnerability) before
# they are overwritten with their cleaned form.
original_descriptions = [
    [desc['value'] for desc in vuln.get('cve', {}).get('descriptions', [])]
    for vuln in data.get('vulnerabilities', [])
]

# Preprocessing descriptions: normalise whitespace in place.
for vuln in data.get('vulnerabilities', []):
    descriptions = vuln.get('cve', {}).get('descriptions', [])
    for desc in descriptions:
        desc['value'] = clean_text(desc['value'])

# Show the first five entries, raw text next to cleaned text.
top_entries = data.get('vulnerabilities', [])[:5]

for i, vuln in enumerate(top_entries):
    print(f"Entry {i + 1}:")

    print("Original Descriptions:")
    for orig_desc in original_descriptions[i]:
        print(f" - {orig_desc}")

    print("Cleaned Descriptions:")
    for desc in vuln.get('cve', {}).get('descriptions', []):
        print(f" - {desc['value']}")

    print()


Entry 1:
Original Descriptions:
 - The debug command in Sendmail is enabled, allowing attackers to execute commands as root.
 - El comando de depuración de Sendmail está activado, permitiendo a atacantes ejecutar comandos como root.
Cleaned Descriptions:
 - The debug command in Sendmail is enabled, allowing attackers to execute commands as root.
 - El comando de depuración de Sendmail está activado, permitiendo a atacantes ejecutar comandos como root.

Entry 2:
Original Descriptions:
 - CWD ~root command in ftpd allows root access.
Cleaned Descriptions:
 - CWD ~root command in ftpd allows root access.

Entry 3:
Original Descriptions:
 - Buffer overflow in passwd in BSD based operating systems 4.3 and earlier allows local users to gain root privileges by specifying a long shell or GECOS field.
Cleaned Descriptions:
 - Buffer overflow in passwd in BSD based operating systems 4.3 and earlier allows local users to gain root privileges by specifying a long shell or GECOS field.

Entry 4:
Or

**Tokenizing Vulnerability Descriptions for NLP Processing**

In [None]:
import json
import spacy

# Start from the JSON file on disk (in-memory edits from earlier cells are not used).
with open('cve_data.json') as f:
    data = json.load(f)

nlp = spacy.load('en_core_web_sm')

def tokenize_text(text):
    """Return the surface text of every token spaCy produces for ``text``."""
    return [token.text for token in nlp(text)]

# One pass per vulnerability: remember the description strings for the
# printout and attach the token list to each description record.
original_descriptions = []
for vuln in data.get('vulnerabilities', []):
    descriptions = vuln.get('cve', {}).get('descriptions', [])
    original_descriptions.append([desc['value'] for desc in descriptions])
    for desc in descriptions:
        desc['tokens'] = tokenize_text(desc['value'])

# Show the first five entries, description text next to its tokens.
top_entries = data.get('vulnerabilities', [])[:5]

for i, vuln in enumerate(top_entries):
    print(f"Entry {i + 1}:")

    print("Original Descriptions:")
    for orig_desc in original_descriptions[i]:
        print(f" - {orig_desc}")

    print("Tokenized Descriptions:")
    for desc in vuln.get('cve', {}).get('descriptions', []):
        print(f" - {desc['tokens']}")

    print()


Entry 1:
Original Descriptions:
 - The debug command in Sendmail is enabled, allowing attackers to execute commands as root.
 - El comando de depuración de Sendmail está activado, permitiendo a atacantes ejecutar comandos como root.
Tokenized Descriptions:
 - ['The', 'debug', 'command', 'in', 'Sendmail', 'is', 'enabled', ',', 'allowing', 'attackers', 'to', 'execute', 'commands', 'as', 'root', '.']
 - ['El', 'comando', 'de', 'depuración', 'de', 'Sendmail', 'está', 'activado', ',', 'permitiendo', 'a', 'atacantes', 'ejecutar', 'comandos', 'como', 'root', '.']

Entry 2:
Original Descriptions:
 - CWD ~root command in ftpd allows root access.
Tokenized Descriptions:
 - ['CWD', '~root', 'command', 'in', 'ftpd', 'allows', 'root', 'access', '.']

Entry 3:
Original Descriptions:
 - Buffer overflow in passwd in BSD based operating systems 4.3 and earlier allows local users to gain root privileges by specifying a long shell or GECOS field.
Tokenized Descriptions:
 - ['Buffer', 'overflow', 'in', 'p

In [None]:
!pip install nltk
!pip install hmmlearn


Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading hmmlearn-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (164 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.6/164.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hmmlearn
Successfully installed hmmlearn-0.3.3


**Feature Extraction: POS Tagging**

In [None]:
from hmmlearn import hmm
import numpy as np
import nltk

# Download required NLTK resources
nltk.download('brown')
nltk.download('universal_tagset')
from nltk.corpus import brown

# Prepare training data: Brown corpus sentences with universal POS tags,
# lower-cased so that lookups at tagging time are case-insensitive.
tagged_sentences = brown.tagged_sents(tagset='universal')
train_data = [[(word.lower(), tag) for word, tag in sent] for sent in tagged_sentences]

# Sorted so the tag/word -> index mappings are the same on every run.
tag_list = sorted({tag for sent in train_data for _, tag in sent})
tag2idx = {tag: idx for idx, tag in enumerate(tag_list)}
word_list = sorted({word for sent in train_data for word, _ in sent})
word2idx = {word: idx for idx, word in enumerate(word_list)}

# One extra observation symbol is reserved for words that never occur in the
# training corpus, so they are not confused with a real vocabulary entry.
UNK_IDX = len(word_list)
EMISSION_SMOOTHING = 1e-3  # small pseudo-count so no emission probability is exactly 0

word_sequences = [[word2idx[word] for word, _ in sent] for sent in train_data]
tag_sequences = [[tag2idx[tag] for _, tag in sent] for sent in train_data]

lengths = [len(seq) for seq in word_sequences]
X = np.concatenate(word_sequences).astype(int).reshape(-1, 1)
y = np.concatenate(tag_sequences).astype(int)

n_tags = len(tag_list)
n_symbols = len(word_list) + 1  # vocabulary + UNK

# Supervised estimation: the corpus is labelled, so the HMM parameters are
# computed directly from tag counts. Hidden state k therefore *is*
# tag_list[k]; running unsupervised EM via fit() would give states with no
# fixed relation to the tag set.
start_counts = np.ones(n_tags)                  # add-one smoothing
transition_counts = np.ones((n_tags, n_tags))   # add-one smoothing
emission_counts = np.full((n_tags, n_symbols), EMISSION_SMOOTHING)

for tag_ids in tag_sequences:
    if not tag_ids:
        continue
    start_counts[tag_ids[0]] += 1
    for prev_tag, next_tag in zip(tag_ids, tag_ids[1:]):
        transition_counts[prev_tag, next_tag] += 1

np.add.at(emission_counts, (y, X[:, 0]), 1)

# Estimate how each tag emits an unknown word from the words seen exactly once
# in training: rare words are the best available proxy for unseen ones.
word_freq = np.bincount(X[:, 0], minlength=n_symbols)
seen_once = word_freq[X[:, 0]] == 1
emission_counts[:, UNK_IDX] += np.bincount(y[seen_once], minlength=n_tags)

# Build the HMM with the estimated parameters. CategoricalHMM is the hmmlearn
# model for one discrete symbol per time step.
hmm_tagger = hmm.CategoricalHMM(n_components=n_tags)
hmm_tagger.n_features = n_symbols
hmm_tagger.startprob_ = start_counts / start_counts.sum()
hmm_tagger.transmat_ = transition_counts / transition_counts.sum(axis=1, keepdims=True)
hmm_tagger.emissionprob_ = emission_counts / emission_counts.sum(axis=1, keepdims=True)
model = hmm_tagger  # original name kept; pos_tag_hmm uses hmm_tagger so a later rebinding of `model` cannot break it

# POS tagging function using HMM
def pos_tag_hmm(text):
    """Tag each whitespace-separated word of ``text`` with a universal POS tag.

    Args:
        text: Input string; it is split on whitespace, so punctuation stays
            attached to the neighbouring word.

    Returns:
        A list of ``(word, tag)`` tuples in input order, or ``[]`` when
        ``text`` contains no words.
    """
    words = text.split()
    if not words:
        # decode() cannot handle an empty observation sequence.
        return []
    encoded_words = np.array([[word2idx.get(word.lower(), UNK_IDX)] for word in words])
    _, state_sequence = hmm_tagger.decode(encoded_words, algorithm="viterbi")
    return [(word, tag_list[state]) for word, state in zip(words, state_sequence)]

# Apply POS tagging to each description and print the output
for vuln in data.get('vulnerabilities', []):
    for desc in vuln.get('cve', {}).get('descriptions', []):
        pos_tags = pos_tag_hmm(desc['value'])
        desc['pos_tags'] = pos_tags
        print(f"Description: {desc['value']}")
        print(f"POS Tags: {pos_tags}")
        print("-" * 50)


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
POS Tags: [('ICMP', 'X'), ('information', 'ADJ'), ('such', 'X'), ('as', 'ADJ'), ('(1)', 'X'), ('netmask', 'ADJ'), ('and', 'X'), ('(2)', 'ADJ'), ('timestamp', 'X'), ('is', 'ADJ'), ('allowed', 'X'), ('from', 'ADJ'), ('arbitrary', 'X'), ('hosts.', 'ADJ')]
--------------------------------------------------
Description: Información ICMP como (1) máscara de red y (2) marca de tiempo está permitida desde hosts arbitrarios.
POS Tags: [('Información', 'X'), ('ICMP', 'ADJ'), ('como', 'X'), ('(1)', 'ADJ'), ('máscara', 'X'), ('de', 'ADJ'), ('red', 'X'), ('y', 'ADJ'), ('(2)', 'X'), ('marca', 'ADJ'), ('de', 'X'), ('tiempo', 'ADJ'), ('está', 'X'), ('permitida', 'ADJ'), ('desde', 'X'), ('hosts', 'ADJ'), ('arbitrarios.', 'X')]
--------------------------------------------------
Description: An attacker can write to syslog files from any location, causing a denial of service by filling up the logs, and hiding activities.
POS Tags: [('An', '

In [None]:
!pip install crfsuite
!pip install sklearn_crfsuite

Collecting crfsuite
  Downloading crfsuite-0.3.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (1.6 kB)
Collecting milksnake (from crfsuite)
  Downloading milksnake-0.1.6-py2.py3-none-any.whl.metadata (4.0 kB)
Downloading crfsuite-0.3.1-py2.py3-none-manylinux2010_x86_64.whl (957 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m957.4/957.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading milksnake-0.1.6-py2.py3-none-any.whl (11 kB)
Installing collected packages: milksnake, crfsuite
Successfully installed crfsuite-0.3.1 milksnake-0.1.6
Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86

**Named Entity Recognition**

In [None]:
import json
import spacy

# Start from the JSON file on disk (in-memory edits from earlier cells are not used).
with open('cve_data.json') as f:
    data = json.load(f)

nlp = spacy.load('en_core_web_sm')

def extract_entities(text):
    """Return ``(entity text, entity label)`` pairs spaCy finds in ``text``."""
    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities

# One pass per vulnerability: remember the description strings for the
# printout and attach the entity list to each description record.
original_descriptions = []
for vuln in data.get('vulnerabilities', []):
    descriptions = vuln.get('cve', {}).get('descriptions', [])
    original_descriptions.append([desc['value'] for desc in descriptions])
    for desc in descriptions:
        desc['entities'] = extract_entities(desc['value'])

# Show the first five entries, description text next to its entities.
top_entries = data.get('vulnerabilities', [])[:5]

for i, vuln in enumerate(top_entries):
    print(f"Entry {i + 1}:")

    print("Original Descriptions:")
    for orig_desc in original_descriptions[i]:
        print(f" - {orig_desc}")

    print("Extracted Entities:")
    for desc in vuln.get('cve', {}).get('descriptions', []):
        print(f" - Entities: {desc['entities']}")

    print()


Entry 1:
Original Descriptions:
 - The debug command in Sendmail is enabled, allowing attackers to execute commands as root.
 - El comando de depuración de Sendmail está activado, permitiendo a atacantes ejecutar comandos como root.
Extracted Entities:
 - Entities: [('Sendmail', 'GPE')]
 - Entities: [('El comando de depuración de Sendmail', 'ORG'), ('como', 'ORG')]

Entry 2:
Original Descriptions:
 - CWD ~root command in ftpd allows root access.
Extracted Entities:
 - Entities: [('CWD', 'ORG')]

Entry 3:
Original Descriptions:
 - Buffer overflow in passwd in BSD based operating systems 4.3 and earlier allows local users to gain root privileges by specifying a long shell or GECOS field.
Extracted Entities:
 - Entities: [('BSD', 'ORG'), ('4.3', 'CARDINAL'), ('GECOS', 'PERSON')]

Entry 4:
Original Descriptions:
 - Vulnerability in restore in SunOS 4.0.3 and earlier allows local users to gain privileges.
Extracted Entities:
 - Entities: [('4.0.3', 'TIME')]

Entry 5:
Original Descriptions:


**Extracting Syntactic Dependencies from Vulnerability Descriptions**

In [None]:
import json
import spacy

# Start from the JSON file on disk (in-memory edits from earlier cells are not used).
with open('cve_data.json') as f:
    data = json.load(f)

nlp = spacy.load('en_core_web_sm')

def extract_dependencies(text):
    """Return ``(token, dependency label, head token)`` triples for ``text``."""
    triples = []
    for token in nlp(text):
        triples.append((token.text, token.dep_, token.head.text))
    return triples

# One pass per vulnerability: remember the description strings for the
# printout and attach the dependency triples to each description record.
original_descriptions = []
for vuln in data.get('vulnerabilities', []):
    descriptions = vuln.get('cve', {}).get('descriptions', [])
    original_descriptions.append([desc['value'] for desc in descriptions])
    for desc in descriptions:
        desc['dependencies'] = extract_dependencies(desc['value'])

# Show the first five entries, description text next to its dependencies.
top_entries = data.get('vulnerabilities', [])[:5]

for i, vuln in enumerate(top_entries):
    print(f"Entry {i + 1}:")

    print("Original Descriptions:")
    for orig_desc in original_descriptions[i]:
        print(f" - {orig_desc}")

    print("Extracted Dependencies:")
    for desc in vuln.get('cve', {}).get('descriptions', []):
        print(f" - Dependencies: {desc['dependencies']}")

    print()


Entry 1:
Original Descriptions:
 - The debug command in Sendmail is enabled, allowing attackers to execute commands as root.
 - El comando de depuración de Sendmail está activado, permitiendo a atacantes ejecutar comandos como root.
Extracted Dependencies:
 - Dependencies: [('The', 'det', 'command'), ('debug', 'amod', 'command'), ('command', 'nsubjpass', 'enabled'), ('in', 'prep', 'command'), ('Sendmail', 'pobj', 'in'), ('is', 'auxpass', 'enabled'), ('enabled', 'ROOT', 'enabled'), (',', 'punct', 'enabled'), ('allowing', 'advcl', 'enabled'), ('attackers', 'nsubj', 'execute'), ('to', 'aux', 'execute'), ('execute', 'ccomp', 'allowing'), ('commands', 'dobj', 'execute'), ('as', 'prep', 'execute'), ('root', 'pobj', 'as'), ('.', 'punct', 'enabled')]
 - Dependencies: [('El', 'compound', 'activado'), ('comando', 'compound', 'activado'), ('de', 'nmod', 'activado'), ('depuración', 'compound', 'activado'), ('de', 'compound', 'Sendmail'), ('Sendmail', 'compound', 'activado'), ('está', 'compound', '

**Text Classification**

In [None]:
!pip install transformers
!pip install torch




In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax
import torch

# Load pre-trained BERT model and tokenizer
# NOTE: only the BERT encoder weights come from the checkpoint. The 4-way
# classification head is newly initialised (see the warning this cell prints),
# so predictions are not meaningful until the model is fine-tuned on labelled
# severity data.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)  # Assuming 4 classes: Critical, High, Medium, Low


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_text(text):
    """Encode ``text`` into fixed-length BERT inputs.

    Uses the notebook-level ``tokenizer``. The text is padded or truncated to
    exactly 128 tokens (special tokens included).

    Args:
        text: The string to encode.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of PyTorch tensors.
    """
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        # `pad_to_max_length=True` is deprecated; this is its replacement.
        padding='max_length',
        # Requested explicitly so long descriptions are cut to max_length
        # without the "Truncation was not explicitly activated" warning.
        truncation=True,
        return_tensors='pt'
    )
    return inputs['input_ids'], inputs['attention_mask']

def classify_vulnerability(vuln):
    """Predict a severity label for one vulnerability record.

    The first entry of ``vuln['cve']['descriptions']`` is encoded with
    ``preprocess_text`` and passed through the notebook-level ``model``.

    Args:
        vuln: A vulnerability dict; missing keys and an empty descriptions
            list are both treated as an empty description.

    Returns:
        One of ``'Low'``, ``'Medium'``, ``'High'``, ``'Critical'``.
    """
    # `or [{}]` also covers an explicitly empty list, which the .get() default
    # alone does not (indexing [] would raise IndexError).
    descriptions = vuln.get('cve', {}).get('descriptions') or [{}]
    text = descriptions[0].get('value', "")
    input_ids, attention_mask = preprocess_text(text)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probs = softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()

    # Index order defines the meaning of each of the 4 output classes.
    labels = ['Low', 'Medium', 'High', 'Critical']
    return labels[pred]


In [None]:
# Label every vulnerability in place.
all_vulns = data.get('vulnerabilities', [])
for vuln in all_vulns:
    vuln['classification'] = classify_vulnerability(vuln)

# Show the predicted label of the first five entries.
for vuln in all_vulns[:5]:
    cve_record = vuln.get('cve', {})
    vuln_id = cve_record.get('id', 'Unknown ID')
    classification = vuln.get('classification', 'Not Classified')
    print(f"ID: {vuln_id}, Classification: {classification}")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


ID: CVE-1999-0095, Classification: Critical
ID: CVE-1999-0082, Classification: Critical
ID: CVE-1999-1471, Classification: Critical
ID: CVE-1999-1122, Classification: Medium
ID: CVE-1999-1467, Classification: Critical


In [None]:
import json
from collections import defaultdict

def store_vulnerabilities_by_classification(data):
    """
    Group vulnerabilities by their 'classification' label and write them to JSON.

    Two files are written to the current working directory:
      * classified_vulnerabilities.json - {label: [{'CVE_Id': ..., 'description': ...}, ...]}
      * vulnerability_counts.json       - {label: number of vulnerabilities}

    This function does not run the classifier: entries without a
    'classification' key are grouped under 'Not Classified'. The description
    stored is the first entry of the CVE's descriptions list, or "" when the
    list is missing or empty.
    """
    # Group vulnerabilities by classification
    classified_vulns = defaultdict(list)

    for vuln in data.get('vulnerabilities', []):
        cve = vuln.get('cve', {})
        cve_id = cve.get('id', 'Unknown ID')
        classification = vuln.get('classification', 'Not Classified')

        # `or [{}]` also covers an explicitly empty list, which would
        # otherwise raise IndexError on [0].
        descriptions = cve.get('descriptions') or [{}]
        description = descriptions[0].get('value', "")

        classified_vulns[classification].append({
            'CVE_Id': cve_id,
            'description': description
        })

    # Store all classifications in a single JSON file
    with open('classified_vulnerabilities.json', 'w', encoding='utf-8') as f:
        json.dump(classified_vulns, f, indent=2)
    print("Stored all vulnerabilities in classified_vulnerabilities.json")

    # Create and store vulnerability counts
    vulnerability_counts = {
        classification: len(vulns)
        for classification, vulns in classified_vulns.items()
    }

    with open('vulnerability_counts.json', 'w', encoding='utf-8') as f:
        json.dump(vulnerability_counts, f, indent=2)
    print("Stored vulnerability counts in vulnerability_counts.json")



# Write the grouped vulnerabilities and the per-label counts to JSON files.
# `data` must already carry the 'classification' labels set by the previous cell.
store_vulnerabilities_by_classification(data)

Stored all vulnerabilities in classified_vulnerabilities.json
Stored vulnerability counts in vulnerability_counts.json


**Generating Similarity-Based Vulnerability Recommendations Using BERT Embeddings**

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Load BERT tokenizer and model for embedding generation
# NOTE: this rebinds the notebook-level `tokenizer` that preprocess_text also
# uses; both cells load the same 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
embedding_model = BertModel.from_pretrained('bert-base-uncased')

def get_embedding(text):
    """Return a mean-pooled BERT vector for ``text`` as a NumPy array."""
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        model_output = embedding_model(**encoded)
    # Average the final-layer hidden states over dim=1, then drop the
    # singleton dimensions before converting to NumPy.
    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze().numpy()

# Generate embeddings for all descriptions and calculate similarity
def generate_similar_vulnerabilities(data, top_n=5):
    """Attach the most similar vulnerabilities to every entry of ``data``.

    Each vulnerability's first description is embedded with ``get_embedding``
    and all embeddings are compared pairwise with cosine similarity.

    Args:
        data: Dict with a 'vulnerabilities' list; it is modified in place.
        top_n: Number of neighbours to store per entry (default 5).

    Returns:
        The same ``data`` object, where each vulnerability has a
        'related_vulnerabilities' list of ``{"id": <CVE id>}`` dicts ordered
        from most to least similar. The entry itself is never listed.
    """
    vulnerabilities = data['vulnerabilities']
    if not vulnerabilities:
        # cosine_similarity cannot be computed on an empty collection.
        return data

    embeddings = []
    for vuln in vulnerabilities:
        # `or [{}]` also covers an explicitly empty descriptions list.
        descriptions = vuln.get('cve', {}).get('descriptions') or [{}]
        embeddings.append(get_embedding(descriptions[0].get('value', "")))

    similarity_matrix = cosine_similarity(embeddings)

    # Add top similar vulnerabilities to each entry
    for i, vuln in enumerate(vulnerabilities):
        ranked = similarity_matrix[i].argsort()[::-1]
        # Exclude the entry by index rather than assuming it ranks first:
        # entries with identical descriptions tie with it at the top.
        similar_indices = [j for j in ranked if j != i][:top_n]
        vuln['related_vulnerabilities'] = [
            {"id": vulnerabilities[j].get('cve', {}).get('id', 'Unknown ID')}
            for j in similar_indices
        ]

    return data

# Update data with related vulnerabilities
# (the function edits `data` in place and returns the same object).
data = generate_similar_vulnerabilities(data)




In [None]:
# Print the stored neighbour IDs for every vulnerability.
separator = "-" * 50
for vuln in data['vulnerabilities']:
    cve_record = vuln.get('cve', {})
    vuln_id = cve_record.get('id', 'Unknown ID')
    related_vulns = vuln.get('related_vulnerabilities', [])

    print(f"Vulnerability ID: {vuln_id}")
    print("Top 5 Similar Vulnerabilities:")
    for related in related_vulns:
        print(f" - {related['id']}")
    print(separator)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Vulnerability ID: CVE-1999-1457
Top 5 Similar Vulnerabilities:
 - CVE-1999-1147
 - CVE-1999-1321
 - CVE-2000-0472
 - CVE-1999-1046
 - CVE-1999-1177
--------------------------------------------------
Vulnerability ID: CVE-1999-1508
Top 5 Similar Vulnerabilities:
 - CVE-1999-1189
 - CVE-1999-1130
 - CVE-1999-1292
 - CVE-1999-1020
 - CVE-1999-0490
--------------------------------------------------
Vulnerability ID: CVE-1999-1549
Top 5 Similar Vulnerabilities:
 - CVE-2000-0353
 - CVE-2000-0266
 - CVE-2000-1205
 - CVE-2000-0209
 - CVE-1999-1437
--------------------------------------------------
Vulnerability ID: CVE-1999-0793
Top 5 Similar Vulnerabilities:
 - CVE-2000-0242
 - CVE-1999-0877
 - CVE-1999-0891
 - CVE-1999-0870
 - CVE-1999-0802
--------------------------------------------------
Vulnerability ID: CVE-1999-1092
Top 5 Similar Vulnerabilities:
 - CVE-1999-1353
 - CVE-1999-1236
 - CVE-2000-0361
 - CVE-1999-1475
 - CVE-1

In [None]:
import json
from collections import defaultdict

def get_cve_description(data, cve_id):
    """
    Look up the English description of a CVE in ``data``.

    Args:
        data: Dict with a 'vulnerabilities' list of NVD-style records.
        cve_id: The CVE identifier to search for.

    Returns:
        * the 'value' of the English ('lang' == 'en') description, or
          "No description found" if that entry has no 'value';
        * "" when the CVE exists but has no English description;
        * "CVE ID not found" when no record has this ID.
    """
    id_seen = False
    for vuln in data.get('vulnerabilities', []):
        cve = vuln.get('cve', {})
        if cve.get('id') != cve_id:
            continue
        id_seen = True
        for description in cve.get('descriptions', []):
            if description.get('lang') == 'en':
                return description.get('value', "No description found")
    # A known CVE without English text falls back to an empty string rather
    # than being reported as an unknown ID.
    return "" if id_seen else "CVE ID not found"

def store_related_vulnerabilities(data):
    """
    Write each CVE's related vulnerabilities to related_vulnerabilities.json.

    The file maps every CVE ID to a list of ``{'id', 'description'}`` dicts,
    one per entry of that vulnerability's 'related_vulnerabilities' list. The
    related entries only carry an ID, so each description is looked up in
    ``data`` with ``get_cve_description``.

    Debug output (the first vulnerability's structure and the related entries
    of the first two vulnerabilities) is printed to stdout.
    """
    related_vulns_map = defaultdict(list)
    processed_count = 0

    # Debug: Print structure of first vulnerability
    if data.get('vulnerabilities'):
        first_vuln = data['vulnerabilities'][0]
        print("\nFirst vulnerability structure:")
        print(json.dumps(first_vuln, indent=2))
        if first_vuln.get('related_vulnerabilities'):
            print("\nFirst related vulnerability structure:")
            print(json.dumps(first_vuln['related_vulnerabilities'][0], indent=2))

    for vuln in data.get('vulnerabilities', []):
        vuln_id = vuln.get('cve', {}).get('id', 'Unknown ID')
        related_vulns = vuln.get('related_vulnerabilities', [])

        for related in related_vulns:
            # Get the description for the related CVE ID
            related_cve_id = related.get('id')
            related_cve_description = get_cve_description(data, related_cve_id)

            # Print debug info for the first two vulnerabilities only
            if processed_count < 2:
                print(f"\nDebug - Related vulnerability for {vuln_id}:")
                print(f"Related ID: {related_cve_id}")
                print(f"Full related data: {json.dumps(related, indent=2)}")
                print(f"Found description for related CVE: {related_cve_description}")

            related_vulns_map[vuln_id].append({
                'id': related_cve_id,
                'description': related_cve_description
            })

        processed_count += 1

    # Store in JSON file
    output_file = 'related_vulnerabilities.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(related_vulns_map, f, indent=2)


# Write related_vulnerabilities.json; `data` must already contain the
# 'related_vulnerabilities' lists added by generate_similar_vulnerabilities.
store_related_vulnerabilities(data)



First vulnerability structure:
{
  "cve": {
    "id": "CVE-1999-0095",
    "sourceIdentifier": "cve@mitre.org",
    "published": "1988-10-01T04:00:00.000",
    "lastModified": "2019-06-11T20:29:00.263",
    "vulnStatus": "Modified",
    "cveTags": [],
    "descriptions": [
      {
        "lang": "en",
        "value": "The debug command in Sendmail is enabled, allowing attackers to execute commands as root."
      },
      {
        "lang": "es",
        "value": "El comando de depuraci\u00f3n de Sendmail est\u00e1 activado, permitiendo a atacantes ejecutar comandos como root."
      }
    ],
    "metrics": {
      "cvssMetricV2": [
        {
          "source": "nvd@nist.gov",
          "type": "Primary",
          "cvssData": {
            "version": "2.0",
            "vectorString": "AV:N/AC:L/Au:N/C:C/I:C/A:C",
            "accessVector": "NETWORK",
            "accessComplexity": "LOW",
            "authentication": "NONE",
            "confidentialityImpact": "COMPLETE",
     

**Extracting Key Noun Phrases from Vulnerability Descriptions**

In [None]:
def extract_noun_phrases(text):
    """Return the text of each noun chunk spaCy finds in ``text``."""
    phrases = []
    for chunk in nlp(text).noun_chunks:
        phrases.append(chunk.text)
    return phrases

# Attach the extracted noun phrases to every description record, in place.
for vuln in data['vulnerabilities']:
    descriptions = vuln.get('cve', {}).get('descriptions', [])
    for desc in descriptions:
        desc.update(key_phrases=extract_noun_phrases(desc['value']))


In [None]:
# List the key phrases of every description, grouped by vulnerability.
separator = "-" * 50
for vuln in data['vulnerabilities']:
    cve_record = vuln.get('cve', {})
    vuln_id = cve_record.get('id', 'Unknown ID')
    print(f"Vulnerability ID: {vuln_id}")

    for desc in cve_record.get('descriptions', []):
        print(f"Description: {desc['value']}")
        print("Key Phrases:")
        for phrase in desc.get('key_phrases', []):
            print(f" - {phrase}")
        print(separator)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 - service
 - malformed FTP commands
 - which
 - Apache
 - core
--------------------------------------------------
Vulnerability ID: CVE-1999-1294
Description: Office Shortcut Bar (OSB) in Windows 3.51 enables backup and restore permissions, which are inherited by programs such as File Manager that are started from the Shortcut Bar, which could allow local users to read folders for which they do not have permission.
Key Phrases:
 - Office Shortcut Bar
 - (OSB
 - Windows
 - backup and restore permissions
 - which
 - programs
 - File Manager
 - that
 - the Shortcut Bar
 - which
 - local users
 - folders
 - which
 - they
--------------------------------------------------
Vulnerability ID: CVE-1999-1300
Description: Vulnerability in accton in Cray UNICOS 6.1 and 6.0 allows local users to read arbitrary files and modify system accounting configuration.
Key Phrases:
 - Vulnerability
 - accton
 - Cray
 - UNICOS
 - local users
 -

In [None]:
import json
from collections import defaultdict

def extract_keywords_and_map(data):
    """
    Map each key phrase to the CVEs whose descriptions contain it.

    Reads the 'key_phrases' list stored on every description record (records
    without one contribute nothing) and writes keywords_to_cve_mapping.json to
    the current working directory. The file maps each phrase to a list of
    ``{'cve_id', 'description'}`` dicts. A phrase that occurs several times in
    the same description is recorded once for that description.
    """
    keyword_to_cve_map = defaultdict(list)

    for vuln in data.get('vulnerabilities', []):
        vuln_id = vuln.get('cve', {}).get('id', 'Unknown ID')

        # Check all descriptions for keywords
        for desc in vuln.get('cve', {}).get('descriptions', []):
            description = desc.get('value', '')

            # dict.fromkeys drops repeated phrases while keeping first-seen
            # order, so one description never adds duplicate entries.
            for phrase in dict.fromkeys(desc.get('key_phrases', [])):
                keyword_to_cve_map[phrase].append({
                    'cve_id': vuln_id,
                    'description': description
                })

    # Save the keyword-to-CVE mapping to a JSON file
    output_file = 'keywords_to_cve_mapping.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(keyword_to_cve_map, f, indent=2)

    print(f"Mapping saved to {output_file}")

# `data` must already carry the per-description 'key_phrases' lists added by
# the noun-phrase cell; descriptions without them contribute nothing.
extract_keywords_and_map(data)


Mapping saved to keywords_to_cve_mapping.json


**Semantic Search for Vulnerabilities Based on User Query**

In [None]:
def search_vulnerabilities(query, data, top_n=5):
    """Search for vulnerabilities that are semantically similar to a query.

    The query and the first description of every vulnerability are embedded
    with ``get_embedding`` and ranked by cosine similarity.

    Args:
        query: Free-text search string.
        data: Dict with a 'vulnerabilities' list.
        top_n: Maximum number of results to return (default 5).

    Returns:
        A list of ``{"id", "description"}`` dicts, best match first. The list
        is empty when there are no vulnerabilities or ``top_n`` is not
        positive.
    """
    vulnerabilities = data['vulnerabilities']
    if not vulnerabilities or top_n <= 0:
        return []

    # Resolve each first description once; `or [{}]` also covers an
    # explicitly empty descriptions list.
    descriptions = [
        (vuln.get('cve', {}).get('descriptions') or [{}])[0].get('value', "")
        for vuln in vulnerabilities
    ]

    query_embedding = get_embedding(query)
    vuln_embeddings = [get_embedding(text) for text in descriptions]

    # Calculate similarity scores
    scores = cosine_similarity([query_embedding], vuln_embeddings)[0]
    # Reverse first, then slice: slicing with [-top_n:] would return every
    # entry when top_n is 0.
    top_indices = scores.argsort()[::-1][:top_n]

    # Fetch top N vulnerabilities
    return [
        {
            "id": vulnerabilities[idx].get('cve', {}).get('id', 'Unknown ID'),
            "description": descriptions[idx],
        }
        for idx in top_indices
    ]

# Example usage of semantic search
# Every call re-embeds all vulnerability descriptions, so this cell does as
# much BERT work as the similarity cell above.
query = "Show critical vulnerabilities affecting Linux servers"
search_results = search_vulnerabilities(query, data)
print("Search Results:", search_results)


Search Results: [{'id': 'CVE-2000-0373', 'description': 'Vulnerabilities in the KDE kvt terminal program allow local users to gain root privileges.'}, {'id': 'CVE-1999-0918', 'description': 'Denial of service in various Windows systems via malformed, fragmented IGMP packets.'}, {'id': 'CVE-1999-1162', 'description': 'Vulnerability in passwd in SCO UNIX 4.0 and earlier allows attackers to cause a denial of service by preventing users from being able to log into the system.'}, {'id': 'CVE-1999-0092', 'description': 'Various vulnerabilities in the AIX portmir command allows local users to obtain root access.'}, {'id': 'CVE-2000-0173', 'description': 'Vulnerability in the EELS system in SCO UnixWare 7.1.x allows remote attackers to cause a denial of service.'}]


**Extract software names using a NER model**

In [None]:
# Hand-maintained list of well-known software / vendor names.
# NOTE(review): nothing in the cells visible here reads this list - the
# extraction functions below rely on spaCy "PRODUCT" entities and CPE strings
# instead. Confirm whether it is still needed.
software_keywords = [
    "Windows", "Linux", "Ubuntu", "Debian", "Fedora", "Red Hat", "CentOS", "macOS",
    "Apache", "Nginx", "IIS", "MySQL", "PostgreSQL", "MongoDB", "Oracle", "SQLite",
    "Microsoft Office", "Adobe Acrobat", "Photoshop", "Illustrator", "Docker", "Kubernetes",
    "OpenSSL", "Java", "Python", "PHP", "Node.js", "Ruby", "Perl", "WordPress", "Joomla",
    "Drupal", "Android", "iOS", "Chrome", "Firefox", "Safari", "Edge", "Internet Explorer",
    "VMware", "Hyper-V", "VirtualBox", "Cisco", "Palo Alto", "Fortinet", "Checkpoint",
    "Salesforce", "SAP", "Workday", "Zoom", "Slack", "GitLab", "GitHub", "Bitbucket",
    "TensorFlow", "PyTorch", "OpenCV", "Elasticsearch", "Redis", "RabbitMQ", "Kafka",
    "Spark", "Hadoop", "Cloudera", "MapR", "Confluent", "Wireshark", "Nmap", "Metasploit"
]


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
!python -m spacy download en_core_web_lg


Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

# Load a larger pre-trained SpaCy model
nlp = spacy.load('en_core_web_lg')

# Function to extract software names
def extract_software_name(text):
    """Return the text of every spaCy entity labelled PRODUCT in ``text``.

    Args:
        text: Free-text vulnerability description.

    Returns:
        List of entity strings in document order, or ``["Unknown Software"]``
        when the model tags no PRODUCT entity.
    """
    products = []
    for entity in nlp(text).ents:
        if entity.label_ == "PRODUCT":
            products.append(entity.text)
    if not products:
        return ["Unknown Software"]
    return products

# Annotate every description of every CVE in place: each description dict gains a
# 'software_name' key holding the list returned by extract_software_name for its text.
for vuln in data['vulnerabilities']:
    for desc in vuln.get('cve', {}).get('descriptions', []):
        desc['software_name'] = extract_software_name(desc['value'])


In [None]:
import spacy
import re

# Load a larger pre-trained SpaCy model
nlp = spacy.load('en_core_web_lg')

# Function to extract software names using the CPE match and descriptions
def extract_software_name(vuln):
    """Collect candidate software names for one vulnerability record.

    Names are gathered from two places:

    * spaCy entities labelled ``PRODUCT`` in the record's English descriptions;
    * the *product* field of every vulnerable application CPE
      (``cpe:2.3:a:<vendor>:<product>:<version>:...``) in ``cve.configurations``.

    Args:
        vuln: One entry of ``data['vulnerabilities']`` (a dict with a ``cve`` key).

    Returns:
        Sorted list of unique names; an empty list when nothing was found.
    """
    software_names = set()
    cve = vuln.get('cve', {})

    for desc in cve.get('descriptions', []):
        # `nlp` is an English pipeline (en_core_web_lg), so skip translated
        # descriptions (e.g. lang == "es") instead of running NER on them.
        # Entries without a 'lang' key are treated as English.
        if desc.get('lang', 'en') != 'en':
            continue
        doc = nlp(desc.get('value', ""))
        software_names.update(ent.text for ent in doc.ents if ent.label_ == "PRODUCT")

    # In a CPE 2.3 string the vendor precedes the product. Match the vendor
    # without capturing it so that group(1) is the product. The "\\." branch
    # keeps backslash-escaped characters (such as an escaped colon) in a field.
    cpe_product = re.compile(r"cpe:2\.3:a:(?:[^:\\]|\\.)*:((?:[^:\\]|\\.)+):")
    for config in cve.get('configurations', []):
        for node in config.get('nodes', []):
            for cpe in node.get('cpeMatch', []):
                if cpe.get('vulnerable') is not True:
                    continue
                match = cpe_product.match(cpe.get('criteria', ''))
                if not match:
                    continue
                raw_product = match.group(1)
                # "*" (any) and "-" (not applicable) are wildcards, not names.
                if raw_product in ('*', '-'):
                    continue
                product = re.sub(r"\\(.)", r"\1", raw_product)
                software_names.add(product.replace('_', ' ').capitalize())

    # Sorted so repeated runs (and the JSON files built from them) are stable.
    return sorted(software_names)

# Store the extracted names on each vulnerability record under 'software_name'
# (mutates the entries of data['vulnerabilities'] in place).
for vuln in data['vulnerabilities']:
    vuln['software_name'] = extract_software_name(vuln)

# Print the CVE id and extracted names for every record, separated by a dashed rule.
for vuln in data['vulnerabilities']:
    print(f"Vulnerability ID: {vuln.get('cve', {}).get('id', 'Unknown ID')}")
    print(f"Extracted Software: {vuln.get('software_name', 'No software extracted')}")
    print("-" * 50)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Extracted Software: []
--------------------------------------------------
Vulnerability ID: CVE-1999-0027
Extracted Software: []
--------------------------------------------------
Vulnerability ID: CVE-1999-0028
Extracted Software: []
--------------------------------------------------
Vulnerability ID: CVE-1999-0029
Extracted Software: []
--------------------------------------------------
Vulnerability ID: CVE-1999-0030
Extracted Software: []
--------------------------------------------------
Vulnerability ID: CVE-1999-1182
Extracted Software: ['Delix']
--------------------------------------------------
Vulnerability ID: CVE-1999-0122
Extracted Software: []
--------------------------------------------------
Vulnerability ID: CVE-1999-0247
Extracted Software: ['Isc']
--------------------------------------------------
Vulnerability ID: CVE-1999-1208
Extracted Software: []
--------------------------------------------------
V

In [None]:
import json

def categorize_vulnerabilities_by_software(data):
    """Group CVE ids and descriptions by extracted software name and dump them to JSON.

    Each record in ``data['vulnerabilities']`` is passed to ``extract_software_name``;
    the result is stored back on the record under ``'software_name'`` and the
    record's id and first description are filed under every returned name.
    The mapping is written to ``software_categorized_vulnerabilities.json``.

    Args:
        data: Dict with a ``'vulnerabilities'`` list of records.
    """
    output_file = 'software_categorized_vulnerabilities.json'
    grouped = {}

    for record in data['vulnerabilities']:
        names = extract_software_name(record)
        record['software_name'] = names  # keep the names on the record itself

        for name in names:
            grouped.setdefault(name, []).append({
                'cve_id': record.get('cve', {}).get('id', 'Unknown ID'),
                'description': record.get('cve', {}).get('descriptions', [{}])[0].get('value', 'No description available')
            })

    # Persist the grouping, then report where it went.
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(grouped, handle, indent=2)

    print(f"Vulnerabilities categorized by software have been saved to '{output_file}'.")

# Run the categorization on the notebook-level `data` dict (must already be loaded);
# this writes 'software_categorized_vulnerabilities.json' and prints a confirmation line.
categorize_vulnerabilities_by_software(data)


Vulnerabilities categorized by software have been saved to 'software_categorized_vulnerabilities.json'.


In [None]:
# Recompute the software names for every record and store them under 'software_name'.
for vuln in data['vulnerabilities']:
    vuln['software_name'] = extract_software_name(vuln)

# Build the notebook-level mapping software name -> list of full vulnerability records.
software_categories = {}

for vuln in data['vulnerabilities']:
    software_names = vuln.get('software_name', [])
    for software in software_names:
        if software not in software_categories:
            software_categories[software] = []
        software_categories[software].append(vuln)

# Print each software name followed by the id and first description of its records.
for software, vulnerabilities in software_categories.items():
    print(f"Software: {software}")
    for vuln in vulnerabilities:
        print(f"  Vulnerability ID: {vuln.get('cve', {}).get('id', 'Unknown ID')}")
        print(f"  Description: {vuln.get('cve', {}).get('descriptions', [{}])[0].get('value', 'No description available')}")
        print("-" * 50)

Software: Eric allman
  Vulnerability ID: CVE-1999-0095
  Description: The debug command in Sendmail is enabled, allowing attackers to execute commands as root.
--------------------------------------------------
  Vulnerability ID: CVE-1999-0145
  Description: Sendmail WIZ command enabled, allowing root access.
--------------------------------------------------
  Vulnerability ID: CVE-1999-0203
  Description: In Sendmail, attackers can gain root privileges via SMTP by specifying an improper "mail from" address and an invalid "rcpt to" address that would cause the mail to bounce to a program.
--------------------------------------------------
  Vulnerability ID: CVE-1999-0131
  Description: Buffer overflow and denial of service in Sendmail 8.7.5 and earlier through GECOS field gives root access to local users.
--------------------------------------------------
  Vulnerability ID: CVE-1999-0206
  Description: MIME buffer overflow in Sendmail 8.8.0 and 8.8.1 gives root access.
-----------

In [None]:
import json

def categorize_vulnerabilities_by_software(data):
    """Group CVE ids and descriptions by software name and write them to JSON.

    Works in two passes over ``data['vulnerabilities']``: the first stores the
    result of ``extract_software_name`` on every record under ``'software_name'``;
    the second files each record's id and first description under each of those
    names. The mapping is written to ``cve_and_description.json``.

    Args:
        data: Dict with a ``'vulnerabilities'`` list of records.
    """
    output_file = 'cve_and_description.json'

    # Pass 1: annotate every record with its software names.
    for record in data['vulnerabilities']:
        record['software_name'] = extract_software_name(record)

    # Pass 2: group a compact {cve_id, description} entry under each name.
    grouped = {}
    for record in data['vulnerabilities']:
        for name in record.get('software_name', []):
            grouped.setdefault(name, []).append({
                'cve_id': record.get('cve', {}).get('id', 'Unknown ID'),
                'description': record.get('cve', {}).get('descriptions', [{}])[0].get('value', 'No description available')
            })

    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(grouped, handle, indent=2)

    print(f"Vulnerabilities (CVE ID and description) have been saved to '{output_file}'.")

# Run on the notebook-level `data` dict (must already be loaded); this writes
# 'cve_and_description.json' and prints a confirmation line.
categorize_vulnerabilities_by_software(data)


Vulnerabilities (CVE ID and description) have been saved to 'cve_and_description.json'.


In [None]:
def query_vulnerabilities_by_software(software_name, categories=None):
    """Print the vulnerabilities recorded for a software name, ignoring case.

    The lookup compares case-folded names, so multi-word or mixed-case keys
    such as "Windows NT" are found regardless of how the query is typed.

    Args:
        software_name: Name to look up.
        categories: Mapping of software name -> list of vulnerability records
            (dicts with a ``cve`` key). Defaults to the notebook-level
            ``software_categories``.
    """
    if categories is None:
        categories = software_categories

    wanted = software_name.strip().casefold()
    matches = [name for name in categories if name.casefold() == wanted]
    if not matches:
        print(f"No vulnerabilities found related to {software_name}.")
        return

    for name in matches:
        print(f"Vulnerabilities related to {name}:")
        for vuln in categories[name]:
            cve = vuln.get('cve', {})
            # `or [{}]` also covers a present-but-empty descriptions list.
            descriptions = cve.get('descriptions') or [{}]
            print(f"  Vulnerability ID: {cve.get('id', 'Unknown ID')}")
            print(f"  Description: {descriptions[0].get('value', 'No description available')}")
            print("-" * 50)

# Example usage: print every vulnerability filed under the software name "Linux".
query_vulnerabilities_by_software("Linux")


Vulnerabilities related to Linux:
  Vulnerability ID: CVE-1999-1229
  Description: Quake 2 server 3.13 on Linux does not properly check file permissions for the config.cfg configuration file, which allows local users to read arbitrary files via a symlink from config.cfg to the target file.
--------------------------------------------------
  Vulnerability ID: CVE-1999-0262
  Description: Hylafax faxsurvey CGI script on Linux allows remote attackers to execute arbitrary commands via shell metacharacters in the query string.
--------------------------------------------------
  Vulnerability ID: CVE-1999-1381
  Description: Buffer overflow in dbadmin CGI program 1.0.1 on Linux allows remote attackers to execute arbitrary commands.
--------------------------------------------------
  Vulnerability ID: CVE-1999-0403
  Description: A bug in Cyrix CPUs on Linux allows local users to perform a denial of service.
--------------------------------------------------
  Vulnerability ID: CVE-1999-07

**Function to query vulnerabilities by time period**

In [None]:
import spacy
import re
from datetime import datetime

# Load a larger pre-trained SpaCy model
nlp = spacy.load('en_core_web_lg')

# Function to extract software names using the CPE match and descriptions
def extract_software_name(vuln):
    """Collect candidate software names for one vulnerability record.

    Names are gathered from two places:

    * spaCy entities labelled ``PRODUCT`` in the record's English descriptions;
    * the *product* field of every vulnerable application CPE
      (``cpe:2.3:a:<vendor>:<product>:<version>:...``) in ``cve.configurations``.

    Args:
        vuln: One entry of ``data['vulnerabilities']`` (a dict with a ``cve`` key).

    Returns:
        Sorted list of unique names; an empty list when nothing was found.
    """
    software_names = set()
    cve = vuln.get('cve', {})

    for desc in cve.get('descriptions', []):
        # `nlp` is an English pipeline (en_core_web_lg), so skip translated
        # descriptions (e.g. lang == "es") instead of running NER on them.
        # Entries without a 'lang' key are treated as English.
        if desc.get('lang', 'en') != 'en':
            continue
        doc = nlp(desc.get('value', ""))
        software_names.update(ent.text for ent in doc.ents if ent.label_ == "PRODUCT")

    # In a CPE 2.3 string the vendor precedes the product. Match the vendor
    # without capturing it so that group(1) is the product. The "\\." branch
    # keeps backslash-escaped characters (such as an escaped colon) in a field.
    cpe_product = re.compile(r"cpe:2\.3:a:(?:[^:\\]|\\.)*:((?:[^:\\]|\\.)+):")
    for config in cve.get('configurations', []):
        for node in config.get('nodes', []):
            for cpe in node.get('cpeMatch', []):
                if cpe.get('vulnerable') is not True:
                    continue
                match = cpe_product.match(cpe.get('criteria', ''))
                if not match:
                    continue
                raw_product = match.group(1)
                # "*" (any) and "-" (not applicable) are wildcards, not names.
                if raw_product in ('*', '-'):
                    continue
                product = re.sub(r"\\(.)", r"\1", raw_product)
                software_names.add(product.replace('_', ' ').capitalize())

    # Sorted so repeated runs (and the JSON files built from them) are stable.
    return sorted(software_names)

# Function to extract the publication date from the CVE information
def extract_publication_date(vuln):
    """Parse ``cve.published`` into a naive ``datetime``.

    Accepts ISO-style timestamps with or without fractional seconds, e.g.
    ``"1999-01-01T05:00:00.000"`` or ``"1999-01-01T05:00:00"``.

    Args:
        vuln: One vulnerability record (a dict with a ``cve`` key).

    Returns:
        The parsed ``datetime``, or ``None`` when the field is missing, is not
        a string, or matches neither format.
    """
    published_date = vuln.get('cve', {}).get('published')
    for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S"):
        try:
            return datetime.strptime(published_date, fmt)
        except (ValueError, TypeError):
            # ValueError: text does not match this format; TypeError: value is
            # not a string (e.g. None). Try the next format, if any.
            continue
    return None

# Annotate every record in place with its extracted software names and its parsed
# publication date (a datetime, or None when parsing fails).
for vuln in data['vulnerabilities']:
    vuln['software_name'] = extract_software_name(vuln)
    vuln['publication_date'] = extract_publication_date(vuln)

# Function to query vulnerabilities by time period
def query_vulnerabilities_by_time_period(start_year, end_year, vulnerabilities=None):
    """Print the vulnerabilities published between two years, inclusive.

    Records are matched on the year of their ``'publication_date'`` value, so
    every timestamp inside ``end_year`` is included, up to the last
    fraction of a second on 31 December.

    Args:
        start_year: First year of the period.
        end_year: Last year of the period.
        vulnerabilities: Records to search; each is expected to carry a
            ``'publication_date'`` datetime (or None). Defaults to the
            notebook-level ``data['vulnerabilities']``.
    """
    if vulnerabilities is None:
        vulnerabilities = data['vulnerabilities']

    print(f"Vulnerabilities between {start_year} and {end_year}:")

    found_vulns = False
    for vuln in vulnerabilities:
        pub_date = vuln.get('publication_date', None)
        if not pub_date or not (start_year <= pub_date.year <= end_year):
            continue
        found_vulns = True
        cve = vuln.get('cve', {})
        # `or [{}]` also covers a present-but-empty descriptions list.
        descriptions = cve.get('descriptions') or [{}]
        print(f"Vulnerability ID: {cve.get('id', 'Unknown ID')}")
        print(f"Published: {cve.get('published', 'Unknown date')}")
        print(f"Description: {descriptions[0].get('value', 'No description available')}")
        print("-" * 50)

    if not found_vulns:
        print(f"No vulnerabilities found in the time period {start_year}-{end_year}.")


# Example usage: print every vulnerability whose publication date falls in 1999-2020.
query_vulnerabilities_by_time_period(1999, 2020)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Vulnerability ID: CVE-1999-0580
Published: 1999-01-01T05:00:00.000
Description: The HKEY_LOCAL_MACHINE key in a Windows NT system has inappropriate, system-critical permissions.
--------------------------------------------------
Vulnerability ID: CVE-1999-0581
Published: 1999-01-01T05:00:00.000
Description: The HKEY_CLASSES_ROOT key in a Windows NT system has inappropriate, system-critical permissions.
--------------------------------------------------
Vulnerability ID: CVE-1999-0583
Published: 1999-01-01T05:00:00.000
Description: There is a one-way or two-way trust relationship between Windows NT domains.
--------------------------------------------------
Vulnerability ID: CVE-1999-0584
Published: 1999-01-01T05:00:00.000
Description: A Windows NT file system is not NTFS.
--------------------------------------------------
Vulnerability ID: CVE-1999-0586
Published: 1999-01-01T05:00:00.000
Description: A network service is 

In [None]:
import spacy
import re
from datetime import datetime
import json

# Load a larger pre-trained SpaCy model
nlp = spacy.load('en_core_web_lg')

# Function to extract software names using the CPE match and descriptions
def extract_software_name(vuln):
    """Collect candidate software names for one vulnerability record.

    Names are gathered from two places:

    * spaCy entities labelled ``PRODUCT`` in the record's English descriptions;
    * the *product* field of every vulnerable application CPE
      (``cpe:2.3:a:<vendor>:<product>:<version>:...``) in ``cve.configurations``.

    Args:
        vuln: One entry of ``data['vulnerabilities']`` (a dict with a ``cve`` key).

    Returns:
        Sorted list of unique names; an empty list when nothing was found.
    """
    software_names = set()
    cve = vuln.get('cve', {})

    for desc in cve.get('descriptions', []):
        # `nlp` is an English pipeline (en_core_web_lg), so skip translated
        # descriptions (e.g. lang == "es") instead of running NER on them.
        # Entries without a 'lang' key are treated as English.
        if desc.get('lang', 'en') != 'en':
            continue
        doc = nlp(desc.get('value', ""))
        software_names.update(ent.text for ent in doc.ents if ent.label_ == "PRODUCT")

    # In a CPE 2.3 string the vendor precedes the product. Match the vendor
    # without capturing it so that group(1) is the product. The "\\." branch
    # keeps backslash-escaped characters (such as an escaped colon) in a field.
    cpe_product = re.compile(r"cpe:2\.3:a:(?:[^:\\]|\\.)*:((?:[^:\\]|\\.)+):")
    for config in cve.get('configurations', []):
        for node in config.get('nodes', []):
            for cpe in node.get('cpeMatch', []):
                if cpe.get('vulnerable') is not True:
                    continue
                match = cpe_product.match(cpe.get('criteria', ''))
                if not match:
                    continue
                raw_product = match.group(1)
                # "*" (any) and "-" (not applicable) are wildcards, not names.
                if raw_product in ('*', '-'):
                    continue
                product = re.sub(r"\\(.)", r"\1", raw_product)
                software_names.add(product.replace('_', ' ').capitalize())

    # Sorted so repeated runs (and the JSON files built from them) are stable.
    return sorted(software_names)

# Function to extract the publication date from the CVE information
def extract_publication_date(vuln):
    """Parse ``cve.published`` into a naive ``datetime``.

    Accepts ISO-style timestamps with or without fractional seconds, e.g.
    ``"1999-01-01T05:00:00.000"`` or ``"1999-01-01T05:00:00"``.

    Args:
        vuln: One vulnerability record (a dict with a ``cve`` key).

    Returns:
        The parsed ``datetime``, or ``None`` when the field is missing, is not
        a string, or matches neither format.
    """
    published_date = vuln.get('cve', {}).get('published')
    for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S"):
        try:
            return datetime.strptime(published_date, fmt)
        except (ValueError, TypeError):
            # ValueError: text does not match this format; TypeError: value is
            # not a string (e.g. None). Try the next format, if any.
            continue
    return None

# Annotate every record in place with its extracted software names and its parsed
# publication date (a datetime, or None when parsing fails).
for vuln in data['vulnerabilities']:
    vuln['software_name'] = extract_software_name(vuln)
    vuln['publication_date'] = extract_publication_date(vuln)

# Group a compact {cve_id, description} entry under each record's publication year.
vulnerabilities_by_year = {}

for vuln in data['vulnerabilities']:
    pub_date = vuln.get('publication_date', None)
    # Records whose date could not be parsed (None) are left out of the grouping.
    if pub_date:
        year = pub_date.year  # Extract the year from the publication date
        if year not in vulnerabilities_by_year:
            vulnerabilities_by_year[year] = []
        vulnerabilities_by_year[year].append({
            'cve_id': vuln.get('cve', {}).get('id', 'Unknown ID'),
            'description': vuln.get('cve', {}).get('descriptions', [{}])[0].get('value', 'No description available')
        })

# Save the grouping to JSON (json.dump writes the integer year keys as strings).
output_file = 'vulnerabilities_by_year.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(vulnerabilities_by_year, f, indent=2)

# Print a summary message
print(f"Vulnerabilities categorized by year have been saved to '{output_file}'.")


Vulnerabilities categorized by year have been saved to 'vulnerabilities_by_year.json'.


**Categorize vulnerabilities by criticality (CVSS v2 base score)**

In [None]:
import spacy
import re
from datetime import datetime

# Load a larger pre-trained SpaCy model
nlp = spacy.load('en_core_web_lg')

# Function to extract software names using the CPE match and descriptions
def extract_software_name(vuln):
    """Collect candidate software names for one vulnerability record.

    Names are gathered from two places:

    * spaCy entities labelled ``PRODUCT`` in the record's English descriptions;
    * the *product* field of every vulnerable application CPE
      (``cpe:2.3:a:<vendor>:<product>:<version>:...``) in ``cve.configurations``.

    Args:
        vuln: One entry of ``data['vulnerabilities']`` (a dict with a ``cve`` key).

    Returns:
        Sorted list of unique names; an empty list when nothing was found.
    """
    software_names = set()
    cve = vuln.get('cve', {})

    for desc in cve.get('descriptions', []):
        # `nlp` is an English pipeline (en_core_web_lg), so skip translated
        # descriptions (e.g. lang == "es") instead of running NER on them.
        # Entries without a 'lang' key are treated as English.
        if desc.get('lang', 'en') != 'en':
            continue
        doc = nlp(desc.get('value', ""))
        software_names.update(ent.text for ent in doc.ents if ent.label_ == "PRODUCT")

    # In a CPE 2.3 string the vendor precedes the product. Match the vendor
    # without capturing it so that group(1) is the product. The "\\." branch
    # keeps backslash-escaped characters (such as an escaped colon) in a field.
    cpe_product = re.compile(r"cpe:2\.3:a:(?:[^:\\]|\\.)*:((?:[^:\\]|\\.)+):")
    for config in cve.get('configurations', []):
        for node in config.get('nodes', []):
            for cpe in node.get('cpeMatch', []):
                if cpe.get('vulnerable') is not True:
                    continue
                match = cpe_product.match(cpe.get('criteria', ''))
                if not match:
                    continue
                raw_product = match.group(1)
                # "*" (any) and "-" (not applicable) are wildcards, not names.
                if raw_product in ('*', '-'):
                    continue
                product = re.sub(r"\\(.)", r"\1", raw_product)
                software_names.add(product.replace('_', ' ').capitalize())

    # Sorted so repeated runs (and the JSON files built from them) are stable.
    return sorted(software_names)

# Function to extract CVSS base score and categorize the vulnerability
def categorize_criticality(vuln):
    """Map a record's CVSS v2 base score to a criticality label.

    Only the first entry of ``cve.metrics.cvssMetricV2`` is read.

    Args:
        vuln: One vulnerability record (a dict with a ``cve`` key).

    Returns:
        ``'High'`` for a score >= 7.0, ``'Medium'`` for >= 4.0, ``'Low'``
        below that (including 0.0), or ``'Unknown'`` when the record has no
        CVSS v2 metric or no base score.
    """
    cvss_metric = vuln.get('cve', {}).get('metrics', {}).get('cvssMetricV2', [])
    if not cvss_metric:
        return 'Unknown'

    base_score = cvss_metric[0].get('cvssData', {}).get('baseScore', None)
    # Compare against None explicitly: a score of 0.0 is a real (Low) score,
    # not a missing one.
    if base_score is None:
        return 'Unknown'

    if base_score >= 7.0:
        return 'High'
    if base_score >= 4.0:
        return 'Medium'
    return 'Low'

# Annotate every record in place with its extracted software names and the
# criticality label returned by categorize_criticality.
for vuln in data['vulnerabilities']:
    vuln['software_name'] = extract_software_name(vuln)
    vuln['criticality'] = categorize_criticality(vuln)

# Function to display vulnerabilities by criticality level
def display_vulnerabilities_by_criticality(criticality):
    """Print every record in ``data['vulnerabilities']`` with the given criticality.

    A record matches when its ``'criticality'`` value (``'Unknown'`` if absent)
    equals ``criticality`` exactly. For each match the id, publication
    timestamp, first description, criticality and software names are printed;
    a single notice is printed when nothing matches.

    Args:
        criticality: Label to filter on, e.g. ``"Low"``.
    """
    print(f"Displaying vulnerabilities with {criticality} criticality level:")

    match_count = 0
    for record in data['vulnerabilities']:
        level = record.get('criticality', 'Unknown')
        if level != criticality:
            continue

        match_count += 1
        print(f"Vulnerability ID: {record.get('cve', {}).get('id', 'Unknown ID')}")
        print(f"Published: {record.get('cve', {}).get('published', 'Unknown date')}")
        print(f"Description: {record.get('cve', {}).get('descriptions', [{}])[0].get('value', 'No description available')}")
        print(f"Criticality: {level}")
        print(f"Software: {record.get('software_name', 'No software extracted')}")
        print("-" * 50)

    if match_count == 0:
        print(f"No vulnerabilities found with {criticality} criticality.")

# Example usage: print every vulnerability whose 'criticality' label is "Low".
display_vulnerabilities_by_criticality("Low")



Displaying vulnerabilities with Low criticality level:
Vulnerability ID: CVE-1999-1554
Published: 1990-10-31T05:00:00.000
Description: /usr/sbin/Mail on SGI IRIX 3.3 and 3.3.1 does not properly set the group ID to the group ID of the user who started Mail, which allows local users to read the mail of other users.
Criticality: Low
Software: []
--------------------------------------------------
Vulnerability ID: CVE-1999-1218
Published: 1993-02-18T05:00:00.000
Description: Vulnerability in finger in Commodore Amiga UNIX 2.1p2a and earlier allows local users to read arbitrary files.
Criticality: Low
Software: ['Amiga', '2.1p2a']
--------------------------------------------------
Vulnerability ID: CVE-1999-1137
Published: 1993-10-01T04:00:00.000
Description: The permissions for the /dev/audio device on Solaris 2.2 and earlier, and SunOS 4.1.x, allow any local user to read from the device, which could be used by an attacker to monitor conversations happening near a machine that has a microp

**Categorize vulnerabilities by weakness types**

In [None]:
import spacy

# Load the large pre-trained SpaCy model
nlp = spacy.load('en_core_web_lg')

# Function to extract weakness types using NLP
def extract_weakness_type(vuln):
    """Return the CWE identifiers listed in a record's ``weaknesses`` field.

    The description values are expected to already be identifier strings
    (e.g. ``"CWE-79"`` or ``"NVD-CWE-Other"``), so they are read directly
    rather than being passed through the NER model, which only kept an
    identifier when it happened to be tagged as an entity.

    Args:
        vuln: One vulnerability record (a dict with a ``cve`` key).

    Returns:
        Sorted list of unique values containing ``"CWE"``, or ``["Unknown"]``
        when the record lists none.
    """
    weakness_types = set()

    for weakness in vuln.get('cve', {}).get('weaknesses', []):
        for desc in weakness.get('description', []):
            # `or ""` guards against an explicit None value.
            value = (desc.get('value') or "").strip()
            if 'CWE' in value:
                weakness_types.add(value)

    return sorted(weakness_types) if weakness_types else ["Unknown"]

# Function to categorize vulnerabilities by weakness types
def categorize_vulnerabilities_by_weakness(data):
    """Group vulnerabilities by weakness type and print the groups.

    Every record in ``data['vulnerabilities']`` is filed under each value
    returned by ``extract_weakness_type`` for it, as a dict holding its id,
    its ``'software_name'`` value and its first description.

    Args:
        data: Dict with a ``'vulnerabilities'`` list of records.
    """
    grouped = {}

    for record in data['vulnerabilities']:
        for weakness in extract_weakness_type(record):
            grouped.setdefault(weakness, []).append({
                "vuln_id": record.get('cve', {}).get('id', 'Unknown ID'),
                "software": record.get('software_name', 'Unknown Software'),
                "description": record.get('cve', {}).get('descriptions', [{}])[0].get('value', 'No description')
            })

    # Nothing grouped at all: say so and stop.
    if not grouped:
        print("No vulnerabilities found.")
        return

    print("Displaying vulnerabilities grouped by their weaknesses:")
    for weakness, entries in grouped.items():
        print(f"\nWeakness: {weakness}")
        for entry in entries:
            print(f"  - Vulnerability ID: {entry['vuln_id']}")
            print(f"    Software: {entry['software']}")
            print(f"    Description: {entry['description']}")
            print("-" * 50)

# Example usage: group the records of the notebook-level `data` by weakness type and print every group.
categorize_vulnerabilities_by_weakness(data)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    Description: A system-critical Windows NT registry key has an inappropriate value.
--------------------------------------------------
  - Vulnerability ID: CVE-1999-0613
    Software: []
    Description: The rpc.sprayd service is running.
--------------------------------------------------
  - Vulnerability ID: CVE-1999-0614
    Software: ['Notes', 'ConsultIDs']
    Description: Rejected reason: DO NOT USE THIS CANDIDATE NUMBER.  ConsultIDs: None.  Reason: this candidate is solely about a configuration that does not directly introduce security vulnerabilities, so it is more appropriate to cover under the Common Configuration Enumeration (CCE).  Notes: the former description is: "The FTP service is running.
--------------------------------------------------
  - Vulnerability ID: CVE-1999-0615
    Software: ['Notes', 'ConsultIDs']
    Description: Rejected reason: DO NOT USE THIS CANDIDATE NUMBER.  ConsultIDs: None.  Rea

In [None]:
import spacy
import json

# Load the large pre-trained SpaCy model
nlp = spacy.load('en_core_web_lg')

# Function to extract weakness types using NLP
def extract_weakness_type(vuln):
    """Return the CWE identifiers listed in a record's ``weaknesses`` field.

    The description values are expected to already be identifier strings
    (e.g. ``"CWE-79"`` or ``"NVD-CWE-Other"``), so they are read directly
    rather than being passed through the NER model, which only kept an
    identifier when it happened to be tagged as an entity.

    Args:
        vuln: One vulnerability record (a dict with a ``cve`` key).

    Returns:
        Sorted list of unique values containing ``"CWE"``, or ``["Unknown"]``
        when the record lists none.
    """
    weakness_types = set()

    for weakness in vuln.get('cve', {}).get('weaknesses', []):
        for desc in weakness.get('description', []):
            # `or ""` guards against an explicit None value.
            value = (desc.get('value') or "").strip()
            if 'CWE' in value:
                weakness_types.add(value)

    return sorted(weakness_types) if weakness_types else ["Unknown"]

# Function to categorize vulnerabilities by weakness types and store them in a JSON
def categorize_vulnerabilities_by_weakness(data):
    """Group vulnerabilities by weakness type and write the groups to JSON.

    Every record in ``data['vulnerabilities']`` is filed under each value
    returned by ``extract_weakness_type`` for it, as a dict holding its id,
    its ``'software_name'`` value and its first description. The mapping is
    written to ``vulnerabilities_by_weakness.json``.

    Args:
        data: Dict with a ``'vulnerabilities'`` list of records.
    """
    output_file = 'vulnerabilities_by_weakness.json'
    grouped = {}

    for record in data['vulnerabilities']:
        for weakness in extract_weakness_type(record):
            grouped.setdefault(weakness, []).append({
                "vuln_id": record.get('cve', {}).get('id', 'Unknown ID'),
                "software": record.get('software_name', 'Unknown Software'),
                "description": record.get('cve', {}).get('descriptions', [{}])[0].get('value', 'No description')
            })

    # Persist the grouping, then report where it went.
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(grouped, handle, indent=2)

    print(f"Vulnerabilities categorized by weakness types have been saved to '{output_file}'.")

# Example usage: group the records of the notebook-level `data` by weakness type;
# this writes 'vulnerabilities_by_weakness.json' and prints a confirmation line.
categorize_vulnerabilities_by_weakness(data)


Vulnerabilities categorized by weakness types have been saved to 'vulnerabilities_by_weakness.json'.


**Categorize vulnerabilities by weakness types (excluding vulnerabilities with an unknown weakness)**

In [None]:
import spacy
import json

# Load the larger pre-trained SpaCy model
nlp = spacy.load('en_core_web_lg')

# Function to extract weakness types using NLP
def extract_weakness_type(vuln):
    """Return the CWE identifiers listed in a record's ``weaknesses`` field.

    The description values are expected to already be identifier strings
    (e.g. ``"CWE-79"`` or ``"NVD-CWE-Other"``), so they are read directly
    rather than being passed through the NER model, which only kept an
    identifier when it happened to be tagged as an entity.

    Args:
        vuln: One vulnerability record (a dict with a ``cve`` key).

    Returns:
        Sorted list of unique values containing ``"CWE"``, or ``["Unknown"]``
        when the record lists none.
    """
    weakness_types = set()

    for weakness in vuln.get('cve', {}).get('weaknesses', []):
        for desc in weakness.get('description', []):
            # `or ""` guards against an explicit None value.
            value = (desc.get('value') or "").strip()
            if 'CWE' in value:
                weakness_types.add(value)

    return sorted(weakness_types) if weakness_types else ["Unknown"]

# Function to categorize vulnerabilities by weakness types and store them in a JSON
def categorize_vulnerabilities_by_weakness(data):
    """Group vulnerabilities by known weakness type and write the groups to JSON.

    Records for which ``extract_weakness_type`` returns a list containing
    ``"Unknown"`` are left out. Every other record is filed under each of its
    weakness values as a dict holding its id, its ``'software_name'`` value and
    its first description. The mapping is written to
    ``vulnerabilities_by_weakness_filtered.json``.

    Args:
        data: Dict with a ``'vulnerabilities'`` list of records.
    """
    output_file = 'vulnerabilities_by_weakness_filtered.json'
    grouped = {}

    for record in data['vulnerabilities']:
        weaknesses = extract_weakness_type(record)

        # Only records with an identified weakness are grouped.
        if "Unknown" not in weaknesses:
            for weakness in weaknesses:
                grouped.setdefault(weakness, []).append({
                    "vuln_id": record.get('cve', {}).get('id', 'Unknown ID'),
                    "software": record.get('software_name', 'Unknown Software'),
                    "description": record.get('cve', {}).get('descriptions', [{}])[0].get('value', 'No description')
                })

    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(grouped, handle, indent=2)

    print(f"Vulnerabilities categorized by weakness types (excluding 'Unknown') have been saved to '{output_file}'.")

# Example usage: group the records of the notebook-level `data` by known weakness type;
# this writes 'vulnerabilities_by_weakness_filtered.json' and prints a confirmation line.
categorize_vulnerabilities_by_weakness(data)


Vulnerabilities categorized by weakness types (excluding 'Unknown') have been saved to 'vulnerabilities_by_weakness_filtered.json'.


**Categorize vulnerabilities based on impact type (Confidentiality, Integrity, or Availability)**

In [None]:
import spacy

# Load the larger pre-trained SpaCy model
nlp = spacy.load('en_core_web_lg')

# Function to extract the impact values from CVSS data
def extract_impact(vuln):
    """Read the confidentiality, integrity and availability impacts of a record.

    The values come from the ``cvssData`` of the entries in
    ``cve.metrics.cvssMetricV2``. When several entries exist, later ones
    overwrite earlier ones; an impact missing from an entry is recorded as
    ``"NONE"``.

    Args:
        vuln: One vulnerability record (a dict with a ``cve`` key).

    Returns:
        Dict with the keys ``confidentialityImpact``, ``integrityImpact`` and
        ``availabilityImpact``; all three are ``None`` when the record has no
        CVSS v2 metric.
    """
    impact_keys = ("confidentialityImpact", "integrityImpact", "availabilityImpact")
    impacts = dict.fromkeys(impact_keys)

    metrics = vuln.get('cve', {}).get('metrics', {}).get('cvssMetricV2', [])
    for metric in metrics:
        cvss_data = metric.get('cvssData', {})
        for key in impact_keys:
            impacts[key] = cvss_data.get(key, "NONE")

    return impacts

# Function to categorize vulnerabilities by Impact (Confidentiality, Integrity, Availability)
def categorize_vulnerabilities_by_impact(data, impact_type="confidentialityImpact", impact_value="COMPLETE"):
    """Group vulnerabilities by one CVSS v2 impact rating and print every group.

    NOTE(review): a later cell defines a function with the same name that writes
    JSON instead of printing; whichever cell ran last is the one in effect.

    Args:
        data: Mapping with a ``'vulnerabilities'`` list of vulnerability records.
        impact_type: Key of the rating to group by, as returned by
            ``extract_impact`` (``"confidentialityImpact"``,
            ``"integrityImpact"`` or ``"availabilityImpact"``).
        impact_value: Not read by this function; kept so existing calls that
            pass it keep working.

    Returns:
        None. The COMPLETE, PARTIAL and NONE groups are printed to stdout.

    Raises:
        KeyError: If ``data`` has no ``'vulnerabilities'`` key.
    """
    impact_groups = {
        "COMPLETE": [],
        "PARTIAL": [],
        "NONE": []
    }

    for vuln in data['vulnerabilities']:
        impact = extract_impact(vuln).get(impact_type, "NONE")

        cve = vuln.get('cve', {})
        # `or [{}]` also covers a present-but-empty (or null) descriptions list,
        # which previously raised IndexError/TypeError on `[0]`.
        descriptions = cve.get('descriptions') or [{}]

        # Anything that is not COMPLETE or PARTIAL (including a missing rating)
        # is filed under NONE; the stored "impact" keeps the raw value.
        bucket = impact if impact in ("COMPLETE", "PARTIAL") else "NONE"
        impact_groups[bucket].append({
            "vuln_id": cve.get('id', 'Unknown ID'),
            "software": vuln.get('software_name', 'Unknown Software'),
            "description": descriptions[0].get('value', 'No description'),
            "impact": impact
        })

    # Display vulnerabilities grouped by impact
    print(f"Displaying vulnerabilities grouped by {impact_type} Impact:\n")

    for impact_level in ["COMPLETE", "PARTIAL", "NONE"]:
        print(f"{impact_level} IMPACT:")
        for entry in impact_groups[impact_level]:
            print(f"  - Vulnerability ID: {entry['vuln_id']}")
            print(f"    Software: {entry['software']}")
            print(f"    Description: {entry['description']}")
            print(f"    Impact: {entry['impact']}")
            print("-" * 50)

# Example usage: group vulnerabilities by their Confidentiality impact and display
# every group (COMPLETE, PARTIAL and NONE); impact_value is passed but the function
# does not read it, so the output is not limited to COMPLETE.
categorize_vulnerabilities_by_impact(data, impact_type="confidentialityImpact", impact_value="COMPLETE")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    Software: ['Seattle lab software']
    Description: SLMail 3.1 and 3.2 allows local users to access any file in the NTFS file system when the Remote Administration Service (RAS) is enabled by setting a user's Finger File to point to the target file, then running finger on the user.
    Impact: PARTIAL
--------------------------------------------------
  - Vulnerability ID: CVE-1999-0386
    Software: ['Windows', 'Microsoft']
    Description: Microsoft Personal Web Server and FrontPage Personal Web Server in some Windows systems allows a remote attacker to read files on the server by using a nonstandard URL.
    Impact: PARTIAL
--------------------------------------------------
  - Vulnerability ID: CVE-1999-0414
    Software: []
    Description: In Linux before version 2.0.36, remote attackers can spoof a TCP connection and pass data to the application layer before fully establishing the connection.
    Impact: PARTIA

In [None]:
import spacy
import json

# Load the larger pre-trained SpaCy model
# NOTE(review): `nlp` is not referenced by any code in this cell, so this load
# looks redundant here — confirm no later cell depends on it before removing.
nlp = spacy.load('en_core_web_lg')

# Pull the three CVSS v2 impact ratings out of a vulnerability record
def extract_impact(vuln):
    """Collect confidentiality, integrity and availability impact for a record.

    Looks at every entry under ``cve -> metrics -> cvssMetricV2``; later entries
    replace the values taken from earlier ones.

    Args:
        vuln: Mapping describing a single vulnerability.

    Returns:
        dict keyed by ``confidentialityImpact``, ``integrityImpact`` and
        ``availabilityImpact``. Values are ``None`` when the record carries no
        ``cvssMetricV2`` entries, and ``"NONE"`` for any rating missing from an
        entry's ``cvssData``.
    """
    fields = ["confidentialityImpact", "integrityImpact", "availabilityImpact"]
    found = {name: None for name in fields}

    metrics_v2 = vuln.get('cve', {}).get('metrics', {}).get('cvssMetricV2', [])
    for metric in metrics_v2:
        details = metric.get('cvssData', {})
        found = {name: details.get(name, "NONE") for name in fields}

    return found

# Function to categorize vulnerabilities by impact and store them in JSON
def categorize_vulnerabilities_by_impact(data, impact_type="confidentialityImpact", impact_value="COMPLETE"):
    """Group vulnerabilities by one CVSS v2 impact rating and save them as JSON.

    Args:
        data: Mapping with a ``'vulnerabilities'`` list of vulnerability records.
        impact_type: Key of the rating to group by, as returned by
            ``extract_impact`` (``"confidentialityImpact"``,
            ``"integrityImpact"`` or ``"availabilityImpact"``).
        impact_value: Not read by this function; kept so existing calls that
            pass it keep working.

    Returns:
        None. The groups are written to ``vulnerabilities_by_impact.json`` in
        the current working directory and a confirmation line is printed.

    Raises:
        KeyError: If ``data`` has no ``'vulnerabilities'`` key.
    """
    impact_groups = {
        "COMPLETE": [],
        "PARTIAL": [],
        "NONE": []
    }

    for vuln in data['vulnerabilities']:
        impact = extract_impact(vuln).get(impact_type, "NONE")

        cve = vuln.get('cve', {})
        # `or [{}]` also covers a present-but-empty (or null) descriptions list,
        # which previously raised IndexError/TypeError on `[0]`.
        descriptions = cve.get('descriptions') or [{}]

        # Anything that is not COMPLETE or PARTIAL (including a missing rating)
        # is filed under NONE; the stored "impact" keeps the raw value.
        bucket = impact if impact in ("COMPLETE", "PARTIAL") else "NONE"
        impact_groups[bucket].append({
            "vuln_id": cve.get('id', 'Unknown ID'),
            "description": descriptions[0].get('value', 'No description'),
            "impact": impact
        })

    # Save the categorized data into a JSON file
    output_file = 'vulnerabilities_by_impact.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(impact_groups, f, indent=2)

    # Print a summary message
    print(f"Vulnerabilities categorized by {impact_type} impact have been saved to '{output_file}'.")

# Example usage: group vulnerabilities by their Confidentiality impact and store every
# group (COMPLETE, PARTIAL and NONE) in 'vulnerabilities_by_impact.json'; impact_value
# is passed but the function does not read it.
categorize_vulnerabilities_by_impact(data, impact_type="confidentialityImpact", impact_value="COMPLETE")


Vulnerabilities categorized by confidentialityImpact impact have been saved to 'vulnerabilities_by_impact.json'.


In [None]:
import json

# Function to extract the access vector and complexity from CVSS data
def extract_access_data(vuln):
    """Extract the CVSS v2 access vector and access complexity of a record.

    Values are read from every entry under ``cve -> metrics -> cvssMetricV2``
    (later entries overwrite earlier ones), stripped and lower-cased.

    Args:
        vuln: Mapping describing a single vulnerability.

    Returns:
        dict with the keys ``accessVector`` and ``accessComplexity``. A value
        that is missing, not a string, or belongs to a record without any
        ``cvssMetricV2`` entry is reported as the lower-case sentinel
        ``"unknown"``.
    """
    fields = ("accessVector", "accessComplexity")

    # The sentinel is lower-case like every other value this function returns,
    # so callers comparing against "unknown" also catch records with no CVSS v2
    # metrics (the previous "Unknown" default slipped past that comparison).
    access_info = {field: "unknown" for field in fields}

    # Get the CVSS metrics from the vulnerability
    cvss_metrics = vuln.get('cve', {}).get('metrics', {}).get('cvssMetricV2', [])

    for cvss in cvss_metrics:
        cvss_data = cvss.get('cvssData', {})
        for field in fields:
            raw = cvss_data.get(field, 'Unknown')
            # A null / non-string value would crash on .strip(); treat it as unknown.
            access_info[field] = raw.strip().lower() if isinstance(raw, str) else "unknown"

    return access_info

# Function to categorize vulnerabilities by Access Vector
def categorize_vulnerabilities_by_access_vector(data):
    """Group vulnerabilities by CVSS v2 access vector and save them as JSON.

    Records whose access vector is unknown are left out of the result.

    Args:
        data: Mapping with a ``'vulnerabilities'`` list of vulnerability records.

    Returns:
        None. The groups are written to
        ``vulnerabilities_by_access_vector.json`` in the current working
        directory and a confirmation line is printed.

    Raises:
        KeyError: If ``data`` has no ``'vulnerabilities'`` key.
    """
    access_vector_groups = {}

    for vuln in data['vulnerabilities']:
        access_vector = extract_access_data(vuln)["accessVector"]

        # Compare case-insensitively: the extractor's sentinel has been seen both
        # as "Unknown" and "unknown", and neither should become a group.
        if access_vector.lower() == "unknown":
            continue

        cve = vuln.get('cve', {})
        # `or [{}]` also covers a present-but-empty (or null) descriptions list,
        # which previously raised IndexError/TypeError on `[0]`.
        descriptions = cve.get('descriptions') or [{}]

        access_vector_groups.setdefault(access_vector, []).append({
            "vuln_id": cve.get('id', 'Unknown ID'),
            "description": descriptions[0].get('value', 'No description'),
        })

    # Store the categorized vulnerabilities in a JSON file
    output_file = 'vulnerabilities_by_access_vector.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(access_vector_groups, f, indent=2)

    # Print a summary message
    print(f"Vulnerabilities categorized by Access Vector have been saved to '{output_file}'.")

# Function to categorize vulnerabilities by Access Complexity
def categorize_vulnerabilities_by_access_complexity(data):
    """Group vulnerabilities by CVSS v2 access complexity and save them as JSON.

    Records whose access complexity is unknown are left out of the result.

    Args:
        data: Mapping with a ``'vulnerabilities'`` list of vulnerability records.

    Returns:
        None. The groups are written to
        ``vulnerabilities_by_access_complexity.json`` in the current working
        directory and a confirmation line is printed.

    Raises:
        KeyError: If ``data`` has no ``'vulnerabilities'`` key.
    """
    access_complexity_groups = {}

    for vuln in data['vulnerabilities']:
        access_complexity = extract_access_data(vuln)["accessComplexity"]

        # Compare case-insensitively: the extractor's sentinel has been seen both
        # as "Unknown" and "unknown", and neither should become a group.
        if access_complexity.lower() == "unknown":
            continue

        cve = vuln.get('cve', {})
        # `or [{}]` also covers a present-but-empty (or null) descriptions list,
        # which previously raised IndexError/TypeError on `[0]`.
        descriptions = cve.get('descriptions') or [{}]

        access_complexity_groups.setdefault(access_complexity, []).append({
            "vuln_id": cve.get('id', 'Unknown ID'),
            "description": descriptions[0].get('value', 'No description'),
        })

    # Store the categorized vulnerabilities in a JSON file
    output_file = 'vulnerabilities_by_access_complexity.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(access_complexity_groups, f, indent=2)

    # Print a summary message
    print(f"Vulnerabilities categorized by Access Complexity have been saved to '{output_file}'.")

# Categorizing and saving vulnerabilities separately by Access Vector and Access Complexity
# (each call writes its own JSON file and prints a confirmation line)
categorize_vulnerabilities_by_access_vector(data)
categorize_vulnerabilities_by_access_complexity(data)


Vulnerabilities categorized by Access Vector have been saved to 'vulnerabilities_by_access_vector.json'.
Vulnerabilities categorized by Access Complexity have been saved to 'vulnerabilities_by_access_complexity.json'.


**Categorize vulnerabilities by reference URL**

In [None]:
# Function to categorize vulnerabilities by reference URL
def categorize_vulnerabilities_by_references(data):
    """Group vulnerabilities by the URLs they reference and print every group.

    A vulnerability appears once under each non-empty ``url`` found in its
    ``cve -> references`` list; references without a URL are ignored.

    NOTE(review): a later cell defines a function with the same name that writes
    JSON instead of printing; whichever cell ran last is the one in effect.

    Args:
        data: Mapping with a ``'vulnerabilities'`` list of vulnerability records.

    Returns:
        None. The groups are printed to stdout in first-seen URL order.

    Raises:
        KeyError: If ``data`` has no ``'vulnerabilities'`` key.
    """
    reference_groups = {}

    for vuln in data['vulnerabilities']:
        cve = vuln.get('cve', {})
        # `or` fallbacks also cover keys that are present but empty/null; an
        # empty descriptions list previously raised IndexError on `[0]`.
        references = cve.get('references') or []
        descriptions = cve.get('descriptions') or [{}]

        for ref in references:
            ref_url = ref.get('url', '')
            if not ref_url:
                continue

            # Add the vulnerability to the group associated with this URL
            reference_groups.setdefault(ref_url, []).append({
                "vuln_id": cve.get('id', 'Unknown ID'),
                "software": vuln.get('software_name', 'Unknown Software'),
                "description": descriptions[0].get('value', 'No description'),
                "url": ref_url
            })

    # Display vulnerabilities grouped by reference URL
    print("Displaying vulnerabilities grouped by References (URLs):\n")

    for ref_url, vulnerabilities in reference_groups.items():
        print(f"Reference URL: {ref_url}")
        for entry in vulnerabilities:
            print(f"  - Vulnerability ID: {entry['vuln_id']}")
            print(f"    Software: {entry['software']}")
            print(f"    Description: {entry['description']}")
            print(f"    Reference URL: {entry['url']}")
            print("-" * 50)

# Example usage: Categorize and display vulnerabilities by their reference URLs
# NOTE: this prints one entry per (URL, vulnerability) pair; on the dataset used here
# the notebook truncated the stream to its last 5000 lines.
categorize_vulnerabilities_by_references(data)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    Description: Netscape Mail Notification (nsnotify) utility in Netscape Communicator uses IMAP without SSL, even if the user has set a preference for Communicator to use an SSL connection, allowing a remote attacker to sniff usernames and passwords in plaintext.
    Reference URL: http://www.iss.net/security_center/static/4385.php
--------------------------------------------------
Reference URL: https://exchange.xforce.ibmcloud.com/vulnerabilities/CVE-2000-0066
  - Vulnerability ID: CVE-2000-0066
    Software: ['Oreilly']
    Description: WebSite Pro allows remote attackers to determine the real pathname of webdirectories via a malformed URL request.
    Reference URL: https://exchange.xforce.ibmcloud.com/vulnerabilities/CVE-2000-0066
--------------------------------------------------
Reference URL: http://www.securityfocus.com/bid/930
  - Vulnerability ID: CVE-2000-0075
    Software: ['Nosque']
    Description: Super 

In [None]:
import json

# Function to categorize vulnerabilities by reference URL
def categorize_vulnerabilities_by_references(data):
    """Group vulnerabilities by the URLs they reference and save them as JSON.

    A vulnerability appears once under each non-empty ``url`` found in its
    ``cve -> references`` list; references without a URL are ignored.

    Args:
        data: Mapping with a ``'vulnerabilities'`` list of vulnerability records.

    Returns:
        None. The groups are written to
        ``vulnerabilities_by_reference_url.json`` in the current working
        directory and a confirmation line is printed.

    Raises:
        KeyError: If ``data`` has no ``'vulnerabilities'`` key.
    """
    reference_groups = {}

    for vuln in data['vulnerabilities']:
        cve = vuln.get('cve', {})
        # `or` fallbacks also cover keys that are present but empty/null; an
        # empty descriptions list previously raised IndexError on `[0]`.
        references = cve.get('references') or []
        descriptions = cve.get('descriptions') or [{}]

        for ref in references:
            ref_url = ref.get('url', '')
            if not ref_url:
                continue

            # Add the vulnerability to the group associated with this URL
            reference_groups.setdefault(ref_url, []).append({
                "vuln_id": cve.get('id', 'Unknown ID'),
                "software": vuln.get('software_name', 'Unknown Software'),
                "description": descriptions[0].get('value', 'No description'),
                "url": ref_url
            })

    # Store the categorized vulnerabilities in a JSON file
    output_file = 'vulnerabilities_by_reference_url.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(reference_groups, f, indent=2)

    # Print a summary message
    print(f"Vulnerabilities categorized by Reference URL have been saved to '{output_file}'.")

# Example usage: save the URL-grouped vulnerabilities to 'vulnerabilities_by_reference_url.json'
categorize_vulnerabilities_by_references(data)


Vulnerabilities categorized by Reference URL have been saved to 'vulnerabilities_by_reference_url.json'.
