In [1]:
!pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.1.0 webdriver-manager-4.0.2


In [3]:
# -----------------------------------------------------------------------------
# Interactive Web Crawler & Retrieval-Augmented Generation (RAG) Exercise (Selenium Version)
# -----------------------------------------------------------------------------

import time

import nltk
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from transformers import pipeline
from webdriver_manager.chrome import ChromeDriverManager

# Download the NLTK 'punkt' tokenizer data when this cell runs (module-level side effect).
# NOTE(review): none of the functions defined in this cell call nltk -- confirm
# the download is still needed before keeping it.
nltk.download('punkt')

def setup_driver():
    """
    Build a Chrome WebDriver configured for headless use.

    The chromedriver binary location is obtained from
    ``ChromeDriverManager().install()`` and handed to a Selenium ``Service``.

    Returns:
        webdriver.Chrome: A started Chrome driver. The caller is responsible
        for calling ``quit()`` on it.
    """
    browser_flags = (
        "--headless",  # Drop this flag if you want to see the browser window
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )

    opts = Options()
    for flag in browser_flags:
        opts.add_argument(flag)

    # webdriver-manager resolves the chromedriver executable for us
    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=opts)
    
def extract_element_text(driver, tag, class_name):
    """
    Extract the text of the first element matching a tag and optional class(es).

    Parameters:
        driver (webdriver): The Selenium WebDriver instance.
        tag (str): The HTML tag to look for (e.g., "span", "div").
        class_name (str or None): A single class name, or several class names
            separated by whitespace exactly as they appear in an HTML
            ``class`` attribute (e.g., "card highlighted"). A falsy or
            whitespace-only value matches on the tag alone.

    Returns:
        str or None: Stripped text of the first matching element, or None
        when no element matches.
    """
    # An HTML class attribute is a whitespace-separated list. Pasting it
    # verbatim after a single "." would turn the remaining tokens into
    # descendant *tag* selectors ("p.a b" looks for <b> inside <p class="a">),
    # so each class gets its own "." to form a compound selector ("p.a.b").
    class_tokens = class_name.split() if class_name else []

    try:
        if class_tokens:
            selector = tag + "".join(f".{token}" for token in class_tokens)
            element = driver.find_element(By.CSS_SELECTOR, selector)
        else:
            element = driver.find_element(By.TAG_NAME, tag)
        return element.text.strip()
    except NoSuchElementException:
        # Relies on the module-level import from selenium.common.exceptions;
        # without it this clause raises NameError instead of returning None.
        return None

def generate_text_with_model(input_text, model_name="gpt2", max_new_tokens=20):
    """
    Continue ``input_text`` with a text-generation pipeline.

    Parameters:
        input_text (str): Prompt passed to the generator.
        model_name (str): Model identifier given to ``pipeline``.
        max_new_tokens (int): Value forwarded as ``max_new_tokens``.

    Returns:
        The ``'generated_text'`` entry of the first returned sequence.
    """
    # A new pipeline is built on every call, so the model is loaded each time.
    text_generator = pipeline('text-generation', model=model_name)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "num_return_sequences": 1,
    }
    sequences = text_generator(input_text, **generation_kwargs)

    first_sequence = sequences[0]
    return first_sequence['generated_text']

def interactive_exercise():
    """
    Main interactive routine.

    Prompts for a URL, loads it in a WebDriver from ``setup_driver()``, then
    prompts for a tag and optional class name and prints the text of the
    matching element. If text was extracted, optionally feeds it to
    ``generate_text_with_model`` and prints the result.

    An element that is missing and an element whose text is empty are both
    reported as "not found", because the extracted text is tested for
    truthiness.

    The browser is always shut down before this function returns or raises.
    """
    print("🔍 Interactive Web Crawler + Text Generator")

    # Step 1: URL Input
    url = input("Enter the URL to scrape: ").strip()

    # Start WebDriver
    driver = setup_driver()
    try:
        driver.get(url)

        # Optional: wait for JS-rendered content to load
        time.sleep(3)

        # Step 2: Tag and Class Input
        tag = input("Enter the HTML tag to search for (e.g., 'div', 'span'): ").strip()
        class_name = input("Enter the class name of the element (or leave blank): ").strip()
        class_name = class_name or None

        # Step 3: Extract Element Text
        extracted_text = extract_element_text(driver, tag, class_name)
        if not extracted_text:
            print("❌ Could not find an element with the specified tag and class.")
            return
        print(f"\n✅ Extracted Text: {extracted_text}")

        # Step 4: Use the Extracted Text with a Language Model
        use_generation = input("Would you like to generate additional text? (yes/no): ").strip().lower()
        if use_generation == "yes":
            prompt = f"The extracted information is: '{extracted_text}'. Further details: "
            generated_text = generate_text_with_model(prompt)
            print("\n🤖 Generated Text:")
            print(generated_text)
        else:
            print("👍 Exiting without generating text.")
    finally:
        # Runs on every exit path (normal return, early return, or exception)
        # so a failed navigation or generation step cannot leave a Chrome
        # process behind.
        driver.quit()

# Entry point: start the interactive routine when this code runs as the main
# module. The recorded cell output shows the guard also passes when the cell is
# executed inside the notebook, so running the cell launches the prompts.
if __name__ == "__main__":
    interactive_exercise()


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jasmine.frantz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


🔍 Interactive Web Crawler + Text Generator


Enter the URL to scrape:  https://www.xe.com/currencycharts/?from=ALL&to=JPY
Enter the HTML tag to search for (e.g., 'div', 'span'):  p
Enter the class name of the element (or leave blank):  sc-b39d611a-0 hjhFZZ


❌ Could not find an element with the specified tag and class.
