In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

import os
import time
from urllib.parse import urljoin, urlparse




In [2]:
def setup_driver(): 
    try: 
        firefox_options = FirefoxOptions()
        firefox_options.add_argument('--headless')

        return webdriver.Firefox(options=firefox_options)
    
    except: 
        print("failed to get firefox")

        try: 
            chrome_options = ChromeOptions()
            chrome_options.add_argument('--headless')

            return webdriver.Chrome(options=chrome_options)
        
        except: 

            print("Couldn't get Chrome running")

            return None

In [3]:
def get_all_urls(start_url, driver):
    driver.get(start_url)


    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    ) 


    domain = urlparse(start_url).netloc
    urls_to_visit = set([start_url])

    visited_urls = set()

    while urls_to_visit: 
        current_url = urls_to_visit.pop()
        if current_url not in visited_urls: 
            try: 
                driver.get(current_url)
                visited_urls.add(current_url)

                links = driver.find_elements(By.TAG_NAME, "a")
                for link in links: 
                    href = link.get_attribute('href')
                    if href and urlparse(href).netloc == domain: 
                        full_url = urljoin(start_url, href)
                        urls_to_visit.add(full_url)
                
                print(f"Procceed: {full_url}")

            except Exception as e:
                print(f"Error processing {current_url}: {str(e)}")

            time.sleep(1)

    return list(visited_urls)                


In [4]:
def get_page_text(url, driver):
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))

    )
    page_source = driver.page_source

    soup = BeautifulSoup(page_source, 'html.parser')
    text = soup.get_text(separator="\n", strip=True)

    return text

In [5]:
def save_as_markdown(text, filename): 
    with open(filename, 'w', encoding='utf-8') as f: 
        f.write(text)

In [6]:
driver = setup_driver()

In [14]:
start_url = "http://aayushakacloudy.is-a.dev"

urls = get_all_urls(start_url, driver=driver)

Procceed: https://aayushakacloudy.is-a.dev/hire-me/Resume_2024.pdf
Procceed: https://aayushakacloudy.is-a.dev/ml-progress
Procceed: https://aayushakacloudy.is-a.dev/portfolio/work/dice-shot
Procceed: https://aayushakacloudy.is-a.dev/
Procceed: https://aayushakacloudy.is-a.dev/hire-me/Resume_2024.pdf
Procceed: https://aayushakacloudy.is-a.dev/
Procceed: https://aayushakacloudy.is-a.dev/hire-me/Resume_2024.pdf#page=1&zoom=auto,-112,842
Procceed: https://aayushakacloudy.is-a.dev/
Procceed: https://aayushakacloudy.is-a.dev/hire-me/Resume_2024.pdf#page=1&zoom=auto,-112,842
Procceed: https://aayushakacloudy.is-a.dev/hire-me/Resume_2024.pdf#page=2&zoom=auto,-112,842
Procceed: https://aayushakacloudy.is-a.dev/portfolio/work/house-paint/#
Procceed: https://aayushakacloudy.is-a.dev/portfolio/work/bomber-boy/#
Procceed: https://aayushakacloudy.is-a.dev/hire-me/Resume_2024.pdf#page=1&zoom=auto,-112,842
Procceed: https://aayushakacloudy.is-a.dev/portfolio/work/dice-shot
Procceed: https://aayushakac

In [8]:
def site_2_md(site: str, dir_name: str):
    
    driver = setup_driver() 
    start_url = site
    urls = get_all_urls(start_url, driver)

    
    os.makedirs(f"{dir_name}", exist_ok=True)

    for i, url in enumerate(urls): 
        try: 
            text = get_page_text(url, driver)
            filename = f"{dir_name}/page_{i+1}.md"
            save_as_markdown(text, filename)

            print(f"Saved {url} as {filename}")
        except Exception as e:
            print(f"Error processing {url}: \n {str(e)}")
        
        time.sleep(2) 

In [19]:
os.makedirs("site_data", exist_ok=True)

In [20]:
for i, url in enumerate(urls): 
    try: 
        text = get_page_text(url, driver)
        filename = f"site_data/page_{i+1}.md"

        save_as_markdown(text, filename)

        print(f"Saved {url} as {filename}")
    except Exception as e: 
        print(f"Error processing {url}: {str(e)}")
    
    time.sleep(2)

Saved https://aayushakacloudy.is-a.dev/hire-me/Resume_2024.pdf#page=1 as site_data/page_1.md
Saved https://aayushakacloudy.is-a.dev/portfolio/work/house-paint/# as site_data/page_2.md
Saved https://aayushakacloudy.is-a.dev/portfolio as site_data/page_3.md
Saved https://aayushakacloudy.is-a.dev/portfolio/work/bomber-boy as site_data/page_4.md
Saved https://aayushakacloudy.is-a.dev/portfolio/work/dice-shot as site_data/page_5.md
Saved https://aayushakacloudy.is-a.dev/portfolio/work/forest-app as site_data/page_6.md
Saved https://aayushakacloudy.is-a.dev/hire-me/Resume_2024.pdf#page=2 as site_data/page_7.md
Saved https://aayushakacloudy.is-a.dev/portfolio/work/dice-shot/# as site_data/page_8.md
Saved https://aayushakacloudy.is-a.dev/hire-me/Resume_2024.pdf#page=1&zoom=auto,-112,842 as site_data/page_9.md
Saved http://aayushakacloudy.is-a.dev as site_data/page_10.md
Saved https://aayushakacloudy.is-a.dev/portfolio/#contact as site_data/page_11.md
Saved https://aayushakacloudy.is-a.dev/hire