In [3]:
!pip install selenium


Collecting selenium
  Downloading selenium-4.31.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting websocket-client~=1.8 (from selenium)
  Using cached websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pysocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]<3,>=1.26->selenium)
  Downloading PySocks-1.7.1-py3-none-any.whl.metadata (13 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websock

In [None]:
# -----------------------------------------------------------------------------
# Interactive Web Crawler & Retrieval-Augmented Generation (RAG) Exercise (Selenium Version)
# -----------------------------------------------------------------------------

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from transformers import pipeline
import nltk
import time

# Download NLTK's 'punkt' tokenizer resource; this runs every time the cell executes.
# NOTE(review): none of the functions defined in this cell call an NLTK
# tokenizer — confirm the download is still needed.
nltk.download('punkt')

def setup_driver():
    """Create a Chrome WebDriver that runs without a visible browser window.

    Returns:
        webdriver.Chrome: A driver started with the headless, no-sandbox and
        disable-dev-shm-usage command-line flags and a default Service.
    """
    browser_flags = (
        "--headless",               # no visible window
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )

    opts = Options()
    for flag in browser_flags:
        opts.add_argument(flag)

    # Service() is built with no arguments; pass a chromedriver path here if needed.
    return webdriver.Chrome(service=Service(), options=opts)

def extract_element_text(driver, tag, class_name):
    """
    Extract the text of an element with the given tag and class using Selenium.

    Parameters:
        driver (webdriver): The Selenium WebDriver instance.
        tag (str): The HTML tag to look for (e.g., "span", "div").
        class_name (str or None): The class name to match. May contain several
            whitespace-separated class names (e.g. "card active"), in which
            case the element must carry all of them. A falsy value means
            "match on the tag alone".

    Returns:
        str or None: The element's text with surrounding whitespace stripped
        (possibly an empty string) if an element is found, otherwise None.
    """
    try:
        if class_name:
            # Chain every class onto the tag ("div.card.active"). Interpolating
            # the raw string would turn "card active" into "div.card active",
            # which CSS reads as a descendant selector, not a two-class match.
            selector = tag + "".join(f".{name}" for name in class_name.split())
            element = driver.find_element(By.CSS_SELECTOR, selector)
        else:
            element = driver.find_element(By.TAG_NAME, tag)
        return element.text.strip()
    except NoSuchElementException:
        # Only "no such element" is treated as a miss; other driver errors propagate.
        return None

def generate_text_with_model(input_text, model_name="gpt2", max_new_tokens=20):
    """Continue ``input_text`` with a text-generation pipeline.

    Parameters:
        input_text (str): Prompt handed to the pipeline.
        model_name (str): Model identifier passed to ``pipeline``.
        max_new_tokens (int): Forwarded to the pipeline call unchanged.

    Returns:
        The ``'generated_text'`` entry of the first (and only requested)
        sequence returned by the pipeline.
    """
    # A new pipeline is built on every call.
    text_pipeline = pipeline('text-generation', model=model_name)

    call_options = {
        "max_new_tokens": max_new_tokens,
        "num_return_sequences": 1,
    }
    sequences = text_pipeline(input_text, **call_options)

    first_sequence = sequences[0]
    return first_sequence['generated_text']

def interactive_exercise():
    """
    Main interactive routine: scrape one element's text, optionally extend it.

    Prompts for a URL, loads it with the driver from ``setup_driver()``, then
    prompts for a tag and an optional class name. The text returned by
    ``extract_element_text`` is printed and, if the user answers "yes", passed
    to ``generate_text_with_model`` inside a fixed prompt.

    The WebDriver is shut down exactly once on every exit path, including when
    page loading, a prompt, extraction or text generation raises, so no
    browser process is left behind.

    Returns:
        None
    """
    print("🔍 Interactive Web Crawler + Text Generator")

    # Step 1: URL Input
    url = input("Enter the URL to scrape: ").strip()
    if not url:
        # Nothing to load; stop before launching a browser.
        print("❌ No URL provided.")
        return

    # Start WebDriver
    driver = setup_driver()
    try:
        driver.get(url)

        # Optional: wait for JS-rendered content to load
        time.sleep(3)

        # Step 2: Tag and Class Input
        tag = input("Enter the HTML tag to search for (e.g., 'div', 'span'): ").strip()
        class_name = input("Enter the class name of the element (or leave blank): ").strip()
        class_name = class_name if class_name else None

        # Step 3: Extract Element Text
        extracted_text = extract_element_text(driver, tag, class_name)
        if not extracted_text:
            # Covers both "no such element" (None) and an element with empty text.
            print("❌ Could not find an element with the specified tag and class.")
            return
        print(f"\n✅ Extracted Text: {extracted_text}")

        # Step 4: Use the Extracted Text with a Language Model
        use_generation = input("Would you like to generate additional text? (yes/no): ").strip().lower()
        if use_generation == "yes":
            prompt = f"The extracted information is: '{extracted_text}'. Further details: "
            generated_text = generate_text_with_model(prompt)
            print("\n🤖 Generated Text:")
            print(generated_text)
        else:
            print("👍 Exiting without generating text.")
    finally:
        # Single cleanup point: runs on normal return, early return and exceptions.
        driver.quit()

# Entry point: start the interactive session only when this code is the
# program's main module (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    interactive_exercise()
