# This exercise is using selenium to render websites, read their page sources, and then passes on the source code to OpenAI. It then uses the model to identify and find potential vulnerabilities and security gaps in that source.

In [1]:
# imports

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

# If you get an error running this cell, then please head over to the troubleshooting notebook!

In [2]:
# Load environment variables in a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [3]:
openai = OpenAI()

# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.
# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions

In [4]:
pip install selenium

[0mNote: you may need to restart the kernel to use updated packages.


In [5]:
pip install webdriver-manager

[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
# Step 1: Download the .deb package as a normal user
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb

# Step 2: Install it with sudo
!sudo apt install ./google-chrome-stable_current_amd64.deb


--2025-05-17 15:27:43--  https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
Resolving dl.google.com (dl.google.com)... 74.125.193.136, 74.125.193.190, 74.125.193.93, ...
Connecting to dl.google.com (dl.google.com)|74.125.193.136|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 116499092 (111M) [application/x-debian-package]
Saving to: ‘google-chrome-stable_current_amd64.deb.5’


2025-05-17 15:28:05 (5.18 MB/s) - ‘google-chrome-stable_current_amd64.deb.5’ saved [116499092/116499092]

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Note, selecting 'google-chrome-stable' instead of './google-chrome-stable_current_amd64.deb'
google-chrome-stable is already the newest version (136.0.7103.59-1).
The following packages were automatically installed and are no longer required:
  htop libnl-3-200 libnl-genl-3-200
Use 'sudo apt autoremove' to remove them.
0 upgraded, 0 newly installed, 0 to remove and

In [7]:
!which google-chrome
!google-chrome --version

/usr/bin/google-chrome
Google Chrome 136.0.7103.59 


In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# options = Options()
# options.binary_location = "/usr/bin/google-chrome"  # Or wherever `which google-chrome` points
# options.add_argument("--headless")
# options.add_argument("--no-sandbox")
# options.add_argument("--disable-dev-shm-usage")

# service = Service(ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service, options=options)

In [None]:
# # Get page source
# url = "https://nohello.net"
# driver.get(url)
# page_source = driver.page_source
# driver.quit()

# Selenium setup done. Definiing website class and other objects below

In [11]:
# Define our system prompt - you can experiment with this later, changing the last sentence to 'Respond in markdown in Spanish."

system_prompt = "You are an assistant that analyzes the page source of a website and identifies potentila vulnerabilities and security gaps in the page source code and gives a short one liner on what should be done about it. Respond in markdown"

In [12]:
class Website:

    def __init__(self, url):
        """
        Create this Website object from the given url using the Selenium library
        """

        options = Options()
        options.binary_location = "/usr/bin/google-chrome"  # Or wherever `which google-chrome` points
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
        
        self.url = url
        driver.get(url)
        self.page_title = driver.title
        self.page_source = driver.page_source
        driver.quit()
    
        # response = requests.get(url, headers=headers)
        # soup = BeautifulSoup(response.content, 'html.parser')
        # self.title = soup.title.string if soup.title else "No title found"
        # for irrelevant in soup.body(["script", "style", "img", "input"]):
        #     irrelevant.decompose()
        # self.text = soup.body.get_text(separator="\n", strip=True)

In [13]:
# Let's try one out. Change the website and add print statements to follow along.

testweb = Website("https://nohello.net")
# print(testweb.page_title)
# print(testweb.page_source)

In [14]:
# A function that writes a User Prompt that asks for summaries of websites:

def user_prompt_for(website):
    user_prompt = f"You are looking at a website titled {website.page_title}"
    user_prompt += "\nThe contents of this website is as follows; please analyze the page source on this website in detail and identify potential vulnerabilites and security gaps that can be fixed.\n\n"
    user_prompt += website.page_source
    return user_prompt

In [15]:
# print(user_prompt_for(testweb))

In [16]:
# See how this function creates exactly the format above

def messages_for(website):
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt_for(website)}
    ]

In [None]:
# Try this out, and then try for a few more websites

messages_for(testweb)

In [17]:
# And now: call the OpenAI API. You will get very familiar with this!

def analyze_code(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model = "gpt-4o-mini",
        messages = messages_for(website)
    )
    return response.choices[0].message.content

In [None]:
analyze_code("https://nohello.net")

In [18]:
# A function to display this nicely in the Jupyter output, using markdown

def display_results(url):
    analysis = analyze_code(url)
    display(Markdown(analysis))

In [19]:
display_results("https://nohello.net")

# Security Analysis of the "no hello" Website

Here are the potential vulnerabilities and security gaps observed in the page source of the "no hello" website, along with recommendations for each:

1. **Inline JavaScript and CSS:** 
   - **Issue:** Inline styles and scripts can lead to security vulnerabilities, like Cross-Site Scripting (XSS).
   - **Recommendation:** Move all inline JS and CSS to external files and ensure they are minimized.

2. **Lack of Content Security Policy (CSP):** 
   - **Issue:** No CSP header is defined, increasing the risk of XSS attacks.
   - **Recommendation:** Implement a Content Security Policy to restrict sources of scripts and styles.

3. **Local Storage Usage:**
   - **Issue:** Using `localStorage` for language preference can expose it to XSS if not properly sanitized.
   - **Recommendation:** Ensure any data written to or read from `localStorage` is properly sanitized.

4. **HTTP Content Security Headers Missing:**
   - **Issue:** Missing headers like `X-Content-Type-Options`, `X-Frame-Options`, etc.
   - **Recommendation:** Implement additional security headers to mitigate common threats.

5. **Image URLs with Unsecured Path:**
   - **Issue:** The image sources use double slashes which could result in unintended behavior.
   - **Recommendation:** Ensure image URLs are absolute and formatted correctly to avoid resource loading issues.

6. **External Script Source:**
   - **Issue:** The site imports external scripts (like `typed.js`) from a CDN without integrity checks.
   - **Recommendation:** Use the Subresource Integrity (SRI) attribute for external script imports.

7. **Exposed Links:**
   - **Issue:** External links in the content are not set to open in a new tab.
   - **Recommendation:** Use `target="_blank"` on external links to prevent potential tab-nabbing attacks.

8. **Deprecated HTML Elements:**
   - **Issue:** Use of some old HTML elements may lead to compatibility issues.
   - **Recommendation:** Ensure HTML is up to date and complies with current standards.

By addressing these vulnerabilities, the website can enhance its overall security posture and better protect user data.