In [4]:
!pip install requests[socks] stem beautifulsoup4

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from stem import Signal
from stem.control import Controller

def renew_tor_ip():
    """
    Send the NEWNYM signal to the Tor controller listening on port 9051.

    Opens a controller connection, authenticates without arguments, sends
    the signal, and closes the connection when the ``with`` block exits.
    Any exception raised along the way propagates to the caller.
    """
    control_connection = Controller.from_port(port=9051)
    with control_connection as tor:
        # No credentials are passed: the notebook assumes a default Tor setup.
        tor.authenticate()
        tor.signal(Signal.NEWNYM)


In [6]:
import requests

def get_tor_session(socks_host="127.0.0.1", socks_port=9050):
    """
    Create a session that routes traffic through the Tor network.

    Args:
        socks_host: Host of the SOCKS proxy to route through. Defaults to
            the local machine, as in the original notebook.
        socks_port: Port of the SOCKS proxy. Defaults to 9050, the value
            this notebook uses for Tor.

    Returns:
        A ``requests.Session`` whose ``proxies`` mapping sends both HTTP and
        HTTPS requests through ``socks5h://<socks_host>:<socks_port>``.
    """
    # socks5h (not socks5) asks the proxy to resolve hostnames, so DNS
    # lookups also go through the proxy instead of the local resolver.
    proxy_url = f"socks5h://{socks_host}:{socks_port}"

    session = requests.Session()
    session.proxies = {
        'http': proxy_url,
        'https': proxy_url,
    }
    return session


In [8]:
def scrape_dark_web(url, timeout=15):
    """
    Fetch the HTML content of a dark web URL using Tor.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout in seconds passed to the request. Defaults to 15,
            the value previously hard-coded here.

    Returns:
        The response body as text, or ``None`` when the request raises a
        ``requests.exceptions.RequestException`` (including the error raised
        by ``raise_for_status`` for an unsuccessful status code). In that
        case the error is printed.
    """
    session = get_tor_session()  # Create a Tor session
    try:
        response = session.get(url, timeout=timeout)  # Fetch the URL
        response.raise_for_status()  # Raise an error if the request fails
        return response.text  # Return the HTML content
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None
    finally:
        # A new session is created per call; close it so its pooled
        # connections are released instead of lingering until garbage collection.
        session.close()


In [9]:
# Test the setup
def renew_tor_ip(port=9051):
    """
    Send the NEWNYM signal to the local Tor controller and report the outcome.

    Args:
        port: Control port to connect to. Defaults to 9051, the value
            previously hard-coded here.

    Returns:
        ``True`` if the controller was reached, authenticated and signalled
        without an exception; ``False`` otherwise. Earlier versions returned
        ``None`` in both cases, so the caller could not tell whether the
        renewal had actually happened.
    """
    from stem import Signal
    from stem.control import Controller
    try:
        with Controller.from_port(port=port) as controller:
            controller.authenticate()
            controller.signal(Signal.NEWNYM)
            print("Tor IP address has been renewed.")
    except Exception as e:
        # Renewal is best-effort: report the problem and let the caller
        # decide whether to continue, rather than aborting the cell.
        print(f"Error renewing IP: {e}")
        return False
    return True

# Optional step: ask Tor for a fresh identity before fetching.
renew_tor_ip()

# Test URL used to check that requests go through the Tor session.
dark_web_url = "https://www.geeksforgeeks.org/darkscrape-osint-tool-for-scraping-dark-websites/"
html_content = scrape_dark_web(dark_web_url)

# Preview the first 1000 characters of the HTML, or report that nothing came back.
print(html_content[:1000] if html_content else "Failed to fetch the content.")


Tor IP address has been renewed.
<!DOCTYPE html>
<!--[if IE 7]>
<html class="ie ie7" lang="en-US" prefix="og: http://ogp.me/ns#">
<![endif]-->
<!--[if IE 8]>
<html class="ie ie8" lang="en-US" prefix="og: http://ogp.me/ns#">
<![endif]-->
<!--[if !(IE 7) | !(IE 8)  ]><!-->
<html lang="en-US" prefix="og: http://ogp.me/ns#" >

<!--<![endif]-->
<head>
<meta charset="UTF-8" />

<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=0.5, maximum-scale=3.0"> 
<link rel="shortcut icon" href="https://media.geeksforgeeks.org/wp-content/cdn-uploads/gfg_favicon.png" type="image/x-icon" />

<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>


<meta name="theme-color" content="#308D46" />
<meta name='robots' content='index, follow, max-image-preview:large, max-snippet:-1' />

<meta name="image" property="og:image" content="https://media.geeksforgeeks.org/wp-content/cdn-uploads/gfg_200x200-min.pn

In [11]:
from bs4 import BeautifulSoup

def parse_html(html_content):
    """
    Parse HTML content and extract the page title and all link targets.

    Args:
        html_content: HTML markup to parse. ``None`` or an empty value
            (for example the ``None`` returned by a failed fetch) is treated
            as a page with no title and no links instead of raising.

    Returns:
        A ``(page_title, links)`` tuple: the title as a plain string
        (``"No title found"`` when the page has no usable title) and a list
        with the ``href`` value of every ``<a>`` tag that has one.

    Side effects:
        Prints the title, the number of links, and the first 10 links.
    """
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')

        # .string is None when <title> is empty or holds nested markup, so
        # both "no <title>" and "no single string" fall back to the default.
        title_text = soup.title.string if soup.title else None
        # Convert to a plain str so the returned title does not keep a
        # reference to the whole parse tree.
        page_title = str(title_text) if title_text is not None else "No title found"

        links = [a['href'] for a in soup.find_all('a', href=True)]
    else:
        # Nothing to parse: same result an empty document would produce.
        page_title, links = "No title found", []

    print(f"Page Title: {page_title}")
    print(f"Found {len(links)} links:")
    for link in links[:10]:  # Print the first 10 links
        print(link)

    return page_title, links

# Run the parser on the HTML fetched earlier; keep the title and links for later cells.
(page_title, links) = parse_html(html_content=html_content)


Page Title: DarkScrape - OSINT Tool For Scraping Dark Websites - GeeksforGeeks
Found 423 links:
#main
https://www.geeksforgeeks.org/
https://www.geeksforgeeks.org/courses/dsa-to-development-coding-guide?itm_source=geeksforgeeks&itm_medium=main_header&itm_campaign=courses
https://www.geeksforgeeks.org/courses/data-science-live?itm_source=geeksforgeeks&itm_medium=main_header&itm_campaign=courses
https://www.geeksforgeeks.org/courses/mastering-generative-ai-and-chat-gpt?itm_source=geeksforgeeks&itm_medium=main_header&itm_campaign=courses
https://www.geeksforgeeks.org/courses/search?query=AWS&itm_source=geeksforgeeks&itm_medium=main_header&itm_campaign=courses
https://www.geeksforgeeks.org/courses/dsa-self-paced?itm_source=geeksforgeeks&itm_medium=main_header&itm_campaign=courses
https://www.geeksforgeeks.org/courses/Data-Structures-With-Python?itm_source=geeksforgeeks&itm_medium=main_header&itm_campaign=courses
https://www.geeksforgeeks.org/courses/data-structures-and-algorithms-in-javasc

In [12]:
import json

def save_data(page_title, links, filename="scraped_data.json"):
    """
    Write the page title and its links to ``filename`` as JSON.

    The file holds one object with the keys ``page_title`` and ``links``,
    indented by four spaces. A confirmation line is printed after writing.
    """
    payload = dict(page_title=page_title, links=links)

    with open(filename, 'w') as out_file:
        json.dump(payload, out_file, indent=4)

    print(f"Data saved to {filename}")

# Persist the parsed title and links using the default output filename.
save_data(page_title=page_title, links=links)


Data saved to scraped_data.json
