In [None]:
import time
from urllib.parse import urldefrag, urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

def access_webpage(url, timeout=10):
    """Download ``url`` and return its body parsed with BeautifulSoup.

    The HTTP status is not inspected, so an error page (404, 500, ...) is
    parsed like any other response.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the crawl forever.

    Returns:
        A ``BeautifulSoup`` tree built from the response text using the
        built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

def check_link_status(link_url, timeout=10):
    """Return the HTTP status of ``link_url`` as reported by a HEAD request.

    Args:
        link_url: URL to probe. Only URLs starting with ``http://`` or
            ``https://`` are requested.
        timeout: Seconds to wait for the server before giving up, so that one
            unresponsive host cannot stall the whole crawl.

    Returns:
        The integer status code of the response, the string ``"Error"`` if
        the request failed (connection error, timeout, ...), or the string
        ``"Invalid URL"`` if ``link_url`` is not an http(s) URL.

    Note:
        ``requests.head`` does not follow redirects by default, so a
        redirecting link is reported with its 3xx code.
    """
    if not link_url.startswith(('http://', 'https://')):
        return "Invalid URL"

    try:
        response = requests.head(link_url, timeout=timeout)
    except requests.exceptions.RequestException:
        return "Error"
    return response.status_code

def is_same_domain(url, base_url):
    """Tell whether ``url`` and ``base_url`` have the same network location.

    The comparison is an exact match on the ``netloc`` component returned by
    ``urlparse`` (host plus any port or credentials); scheme, path, query and
    fragment are ignored.
    """
    candidate_netloc, base_netloc = (
        urlparse(address).netloc for address in (url, base_url)
    )
    return candidate_netloc == base_netloc

def crawl(url, depth, source_link, visited_links, df):
    """Recursively check the same-domain links reachable from ``url``.

    The page at ``url`` is fetched, its ``<a href>`` links are collected
    (from ``<main class="main">`` when present, otherwise from the whole
    document), and every link on the same domain as ``source_link`` is
    probed with ``check_link_status`` and appended to ``df``. Each such link
    is then crawled in turn with ``depth - 1``.

    Args:
        url: Page to fetch and scan for links.
        depth: Remaining number of levels to follow. Nothing is done when it
            is zero or negative.
        source_link: URL whose domain limits which links are followed; also
            stored in the ``'Source Link'`` column of each row added here.
        visited_links: Set of page URLs already crawled; updated in place.
        df: DataFrame with columns ``Link, Text, Response Code, Depth Level,
            Source Link``; one row is appended per newly checked link.

    Returns:
        None. Results are delivered through ``visited_links`` and ``df``.
    """
    # ``<=`` rather than ``==`` so a negative depth cannot disable the limit.
    if depth <= 0 or url in visited_links:
        return

    # Mark the page before fetching so links pointing back to it are skipped.
    visited_links.add(url)

    try:
        soup = access_webpage(url)
    except requests.exceptions.RequestException as exc:
        # One unreachable page must not abort the rest of the crawl.
        print(f"Skipping {url}: {exc}")
        return

    # Prefer the main content area; fall back to every link on the page.
    main_content = soup.find('main', class_='main')
    container = main_content if main_content else soup

    for link in container.find_all('a', href=True):
        # Resolve relative hrefs and drop the fragment: "#section" variants
        # address the same document and would otherwise be fetched again.
        full_link_url = urldefrag(urljoin(url, link['href'])).url

        if not is_same_domain(full_link_url, source_link):
            continue
        if full_link_url in visited_links:
            continue

        # Links found on deepest-level pages are never crawled, hence never
        # enter visited_links; consult the recorded rows as well so the same
        # link is not probed and stored once per page that contains it.
        if not df['Link'].eq(full_link_url).any():
            response_code = check_link_status(full_link_url)

            # Store the result immediately so partial runs keep their data.
            df.loc[len(df)] = [full_link_url, link.text, response_code, depth, source_link]

            # Pause between requests to avoid hammering the server.
            time.sleep(3)

        # The linked page becomes its own source for the next level.
        crawl(full_link_url, depth - 1, full_link_url, visited_links, df)

# Entry point of the crawl; replace the placeholder with the site to check
start_url = "https://YOURLINK.com/"

# URLs of pages that crawl() has already processed
visited_links = set()

# Result table; crawl() appends one row for each link it checks
df = pd.DataFrame(
    columns=['Link', 'Text', 'Response Code', 'Depth Level', 'Source Link'],
)

# Follow links up to 3 levels deep (raise or lower the second argument as needed)
crawl(start_url, 3, start_url, visited_links, df)

# Display the collected results
print(df)


In [None]:
# Export the crawl results to an Excel workbook. The context manager saves
# and closes the file on exit; the former ExcelWriter.save() call was
# deprecated in pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter('Deteksi_Web.xlsx', engine='xlsxwriter') as writer:
    # Each to_excel call writes one worksheet; add further calls here with a
    # different sheet_name (e.g. 'Data Dump') to export more sheets.
    df.to_excel(writer, sheet_name='Data Web', index=False)

KernelInterrupted: Execution interrupted by the Jupyter kernel.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=979e795a-8200-44ff-ba32-7f73e1cd95b7' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>