In [None]:
import csv
import time

import requests
from bs4 import BeautifulSoup

def extract_description_and_cwe(url):
    """Fetch an NVD vulnerability page and pull out its description and CWE ID.

    Args:
        url: Address of the vulnerability detail page to download.

    Returns:
        A ``(description, cwe_id)`` tuple of strings. Each element falls back
        to its placeholder ("No description found" / "No CWE ID found") when
        the corresponding element is missing from the page or the request
        fails.

    This function never raises: any error is printed and whatever was
    extracted before the error is still returned, so one bad page cannot
    abort a long scraping run.
    """
    description = "No description found"
    cwe_id = "No CWE ID found"

    try:
        response = requests.get(url, timeout=10)  # bounded wait per request
        response.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract description
        description_tag = soup.find("p", {"data-testid": "vuln-description"})
        if description_tag is not None:
            description = description_tag.text.strip()

        # Extract CWE ID from the first cell of the CWE table. The table can
        # be present without any <td>; guard that case so a missing cell does
        # not raise and discard the description parsed above.
        cwe_table = soup.find("table", {"data-testid": "vuln-CWEs-table"})
        cwe_cell = cwe_table.find("td") if cwe_table is not None else None
        if cwe_cell is not None:
            cwe_id = cwe_cell.text.strip()
    except Exception as e:
        # Deliberate best-effort: report the problem and keep going.
        print(f"Error fetching data from {url}: {e}")

    return description, cwe_id

def clean_text(text):
    """Collapse every run of whitespace in ``text`` into one space and trim the ends.

    Newlines, tabs, carriage returns and repeated spaces are all treated as
    whitespace, so the result is a single line that is safe to store in one
    TSV field.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; an empty or whitespace-only input gives "".
    """
    # str.split() with no arguments splits on any whitespace run and drops
    # empty pieces, so runs of any length collapse in one pass (a single
    # replace('  ', ' ') only halves them).
    return ' '.join(text.split())

def save_descriptions_and_cwe_to_tsv(start, end, output_path='cti_rcm.tsv', delay=2):
    """Scrape NVD pages for CVE-2024-<start> .. CVE-2024-<end> and append them to a TSV file.

    One row is appended per CVE with the columns URL, Description, CWE_ID and
    Prompt. The header row is written only when the file is empty, so calling
    this function several times on the same file does not leave header rows
    in the middle of the data.

    Args:
        start: First CVE sequence number to fetch (inclusive).
        end: Last CVE sequence number to fetch (inclusive).
        output_path: File to append to. Defaults to 'cti_rcm.tsv'.
        delay: Seconds to sleep after each page. Defaults to 2.
    """
    # Single-line template: the prompt has to fit in one TSV field anyway.
    prompt_template = (
        "You are a cybersecurity expert. Given the following vulnerability description, "
        "identify the appropriate CWE category. "
        "Return only the CWE ID and name. If multiple CWE IDs apply, provide the most relevant one. "
        "Description: {description}"
    )

    # newline='' lets the csv module control line endings itself.
    with open(output_path, 'a', encoding='utf-8', newline='') as file:
        # csv.writer quotes fields when needed (e.g. a description that starts
        # with a double quote), so csv.reader can read the rows back intact.
        writer = csv.writer(file, delimiter='\t', lineterminator='\n')

        # In append mode the position starts at the end of the file, so 0
        # means the file is new or empty and still needs its header.
        if file.tell() == 0:
            writer.writerow(['URL', 'Description', 'CWE_ID', 'Prompt'])

        for i in range(start, end + 1):
            cve_id = f"{i:04d}"  # Format CVE ID with leading zeros
            url = f"https://nvd.nist.gov/vuln/detail/CVE-2024-{cve_id}"
            print(f"Processing {url}...")

            description, cwe_id = extract_description_and_cwe(url)

            # Normalise every scraped field so none can contain a tab or newline.
            cleaned_description = clean_text(description)
            cleaned_cwe_id = clean_text(cwe_id)
            cleaned_prompt = clean_text(prompt_template.format(description=cleaned_description))

            print(f"Description found: {cleaned_description[:100]}...")  # Print first 100 characters of cleaned description

            writer.writerow([url, cleaned_description, cleaned_cwe_id, cleaned_prompt])
            time.sleep(delay)



In [None]:
# Scrape three separate CVE-2024 sequence-number ranges, one after another.
for first_id, last_id in [(20000, 23000), (30000, 33000), (34000, 35000)]:
    save_descriptions_and_cwe_to_tsv(first_id, last_id)

#### Cleaning the dataset: removing rows with missing values

In [None]:
import csv

def clean_dataset(input_file, output_file, excluded_values=("Not found", "No CWE ID found"), column=2):
    """Copy a TSV file, dropping rows whose CWE column holds a "missing" placeholder.

    Args:
        input_file: Path of the tab-separated file to read.
        output_file: Path of the tab-separated file to write (overwritten).
        excluded_values: Cell values that mark a row for removal. The default
            covers "Not found" and "No CWE ID found", the placeholder the
            scraper in this notebook writes when a page has no CWE ID.
        column: Zero-based index of the column to inspect. The default, 2, is
            the third column (CWE_ID in the header the scraper writes).

    Rows too short to have that column (blank lines included) are dropped as
    well, since they cannot carry a CWE label; indexing them would otherwise
    raise IndexError.
    """
    if isinstance(excluded_values, str):
        # A lone string would otherwise be treated as a set of characters.
        excluded_values = (excluded_values,)
    excluded = set(excluded_values)

    # errors='ignore' silently drops undecodable bytes; newline='' is the
    # setting the csv module expects on the files it reads and writes.
    with open(input_file, 'r', encoding='utf-8', errors='ignore', newline='') as infile, \
            open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile, delimiter='\t')
        writer = csv.writer(outfile, delimiter='\t')

        for row in reader:
            if len(row) <= column:
                continue  # malformed or blank row: nothing to check, skip it
            if row[column] not in excluded:
                writer.writerow(row)

# First cleaning pass over the raw scrape.
# NOTE(review): the following cell redefines clean_dataset and writes the same
# output path from the same raw input, so this pass's result gets overwritten.
clean_dataset(input_file='cti_rcm.tsv', output_file='cti_rcm_Eya.tsv')


In [None]:
import csv
def clean_dataset(input_file, output_file, excluded_values=("NVD-CWE-Other",), column=2):
    """Copy a TSV file, dropping rows whose CWE column holds an excluded value.

    Args:
        input_file: Path of the tab-separated file to read.
        output_file: Path of the tab-separated file to write (overwritten).
        excluded_values: Cell values that mark a row for removal. Defaults to
            the "NVD-CWE-Other" placeholder; pass more values (for example
            "NVD-CWE-noinfo") to drop those rows too.
        column: Zero-based index of the column to inspect. The default, 2, is
            the third column.

    Rows too short to have that column (blank lines included) are dropped as
    well, since indexing them would otherwise raise IndexError.
    """
    if isinstance(excluded_values, str):
        # A lone string would otherwise be treated as a set of characters.
        excluded_values = (excluded_values,)
    excluded = set(excluded_values)

    # errors='ignore' silently drops undecodable bytes; newline='' is the
    # setting the csv module expects on the files it reads and writes.
    with open(input_file, 'r', encoding='utf-8', errors='ignore', newline='') as infile, \
            open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile, delimiter='\t')
        writer = csv.writer(outfile, delimiter='\t')

        for row in reader:
            if len(row) <= column:
                continue  # malformed or blank row: nothing to check, skip it
            if row[column] not in excluded:
                writer.writerow(row)

# Second cleaning pass, using the clean_dataset defined in this cell.
# NOTE(review): this reads the raw 'cti_rcm.tsv' again rather than the output
# of the previous cleaning pass, and writes to the same output path, so only
# this cell's filter is reflected in 'cti_rcm_Eya.tsv'.
clean_dataset(input_file='cti_rcm.tsv', output_file='cti_rcm_Eya.tsv')


#### Combining the files into one dataset

In [None]:
import csv

def combine_tsv_files(file1, file2, output_file):
    """Concatenate two tab-separated files into ``output_file``.

    Every row of ``file1`` is copied as-is, header included. The first row of
    ``file2`` is treated as its header and skipped; the remaining rows are
    appended.

    Args:
        file1: Path of the first TSV file; supplies the header of the output.
        file2: Path of the second TSV file; may be empty.
        output_file: Path of the combined TSV file to write (overwritten).
    """
    # newline='' on every handle is what the csv module expects.
    with open(output_file, 'w', encoding='utf-8', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')

        # Copy file1 in full.
        with open(file1, 'r', encoding='utf-8', newline='') as infile1:
            writer.writerows(csv.reader(infile1, delimiter='\t'))

        # Copy file2 without its header row.
        with open(file2, 'r', encoding='utf-8', newline='') as infile2:
            reader2 = csv.reader(infile2, delimiter='\t')
            # The None default keeps an empty file2 from raising StopIteration.
            next(reader2, None)
            writer.writerows(reader2)

# Merge the cleaned dataset with 'cti_rcm.tsv' into the final file.
# The first argument used to be 'cti_rcm_Eya.tsv.tsv', a name no cell in this
# notebook writes; the cleaning step produces 'cti_rcm_Eya.tsv'.
# NOTE(review): 'cti_rcm.tsv' is also the raw input the cleaned file was
# derived from, so the rows removed by cleaning come back and the kept rows
# appear twice - confirm this second input is the intended one.
combine_tsv_files('cti_rcm_Eya.tsv', 'cti_rcm.tsv', 'cti_rcm_final.tsv')
