<a href="https://colab.research.google.com/github/ducvu/ml_projects/blob/master/scrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required libraries
!pip install selenium
!apt-get update  # Update package lists
!apt install -y firefox-geckodriver # Install GeckoDriver for Firefox, use ChromeDriver if you prefer Chrome

Collecting selenium
  Downloading selenium-4.24.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.24.0-py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.2-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.0/476.0 kB[0m [31m8.0

In [3]:
# Import necessary libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
from io import StringIO
import urllib.parse

# Set up Firefox driver for Selenium
options = webdriver.FirefoxOptions()
options.add_argument('--headless')  # Run in headless mode for faster execution without GUI
driver = webdriver.Firefox(options=options)

# Define the base URL and parameters
base_url = "https://www.cvedetails.com/vulnerability-search.php?"
params = {
    "f": "1",
    "vendor": "",
    "product": "",
    "cweid": "",
    "cvssscoremin": "",
    "cvssscoremax": "",
    "publishdatestart": "2020-01-01",  # Adjust the start date as needed
    "publishdateend": "2020-12-31",    # Adjust the end date as needed
    "updatedatestart": "",
    "updatedateend": "",
    "cisaaddstart": "",
    "cisaaddend": "",
    "cisaduestart": "",
    "cisadueend": "",
    "optexeccode": "1",
    "page": "1"  # Start from the first page
}

# Initialize DataFrame to store results
columns = ["CVE", "Published", "Last.Update", "Max.CVSS.Base.Score", "EPSS.Score",
           "CISA.KEV.Added", "Public.Exploit.Exists", "Summary", "Year"]
df_all = pd.DataFrame(columns=columns)

# Function to construct URL
def construct_url(params):
    return base_url + urllib.parse.urlencode(params)

# Start scraping loop
page = 1
while (page < 5):
    print(f"Scraping page {page}...")

    # Update the page parameter
    params['page'] = str(page)
    current_url = construct_url(params)

    # Navigate to the current URL
    driver.get(current_url)
    time.sleep(4)  # Wait for the page to load

    try:
        # Click to copy the output data
        copy_button = driver.find_element(By.XPATH, '//a[@href = "javascript:copyResultsAsTSV()"]')
        copy_button.click()
        time.sleep(2)  # Wait for the copy action to complete

        # Extract data from clipboard
        clipboard_content = driver.execute_script("return navigator.clipboard.readText();")

        if clipboard_content.strip() == "":
            print("No more data available or no data on this page.")
            break

        # Read data into a DataFrame
        df = pd.read_csv(StringIO(clipboard_content), sep='\t')

        # Add Year column for the current data
        df['Year'] = params['publishdatestart'][:4]
        df_all = pd.concat([df_all, df], ignore_index=True)

        # Increment page number
        page += 1

    except Exception as e:
        print("No more pages or error occurred:", e)
        break  # Exit loop if no data or an error occurs

# Close the browser
driver.quit()

# Display the result
print(df_all.head())

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
              CVE   Published Last.Update Max.CVSS.Base.Score EPSS.Score  \
0  CVE-2020-35863  2020-12-31         NaN                 NaN        NaN   
1  CVE-2020-35858  2020-12-31         NaN                 NaN        NaN   
2  CVE-2020-35657  2020-12-23         NaN                 NaN        NaN   
3  CVE-2020-35656  2020-12-23         NaN                 NaN        NaN   
4  CVE-2020-35627  2020-12-28         NaN                 NaN        NaN   

  CISA.KEV.Added Public.Exploit.Exists  \
0            NaN                   NaN   
1            NaN                   NaN   
2            NaN                   NaN   
3            NaN                   NaN   
4            NaN                   NaN   

                                             Summary  Year Last Update  \
0  An issue was discovered in the hyper crate bef...  2020  2021-07-21   
1  An issue was discovered in the prost crate bef...  2020  2021-0