In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def save_image(image_url, folder, timeout=10):
    """Download ``image_url`` and write it into ``folder``.

    The file name is the last path component of the URL; the query string
    and fragment are ignored and percent-escapes are kept unchanged.

    Args:
        image_url: Absolute URL of the image to fetch.
        folder: Directory the file is written into.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The file name that was written, or ``None`` when the URL carries no
        file name, the request failed, the server did not answer with
        HTTP 200, or the file could not be written.
    """
    img_name = os.path.basename(urlparse(image_url).path)
    if not img_name:
        # A path such as "/gallery/" has an empty basename; joining it with
        # the folder would make open() target the directory itself.
        print(f"Failed to retrieve image. No file name in URL: {image_url}")
        return None

    try:
        # Without a timeout a stalled server would block the scrape forever.
        img_response = requests.get(image_url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while saving image: {e}")
        return None

    if img_response.status_code != 200:
        print(f"Failed to retrieve image. Status code: {img_response.status_code}")
        return None

    img_path = os.path.join(folder, img_name)
    try:
        with open(img_path, 'wb') as img_file:
            img_file.write(img_response.content)
    except OSError as e:
        # Missing folder, permissions, invalid name, ... - report and carry
        # on so one bad image does not abort the caller.
        print(f"An error occurred while saving image: {e}")
        return None
    return img_name

def scrape_project(url, download_folder='downloaded_images', timeout=10):
    """Print a Markdown-like rendering of a project page and save its images.

    The page's first ``<nav>`` (if any) is printed as a bullet list of its
    ``<li>`` texts. The main content is the first ``<div>`` with class
    ``main-content``, falling back to class ``content``; its headings,
    paragraphs, lists and images are printed in document order. Images are
    downloaded into ``download_folder`` via ``save_image``.

    Args:
        url: Absolute URL of the page to scrape.
        download_folder: Directory for downloaded images; created if missing.
        timeout: Seconds to wait for the page request before giving up.

    Returns:
        None. All results are printed; request errors are reported, not raised.
    """
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(download_folder, exist_ok=True)

        # Without a timeout a stalled server would block the cell forever.
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve the website. Status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')

        def print_element(element, level=0):
            """Print one element as Markdown-like text, indented by ``level``."""
            indent = '    ' * level
            if element.name == 'h1':
                print(f"\n# {element.get_text()}\n")
            elif element.name == 'h2':
                print(f"\n## {element.get_text()}\n")
            elif element.name == 'h3':
                print(f"\n### {element.get_text()}\n")
            elif element.name == 'p':
                print(f"{indent}{element.get_text()}\n")
            elif element.name == 'ul':
                for li in element.find_all('li'):
                    print(f"{indent} - {li.get_text()}")
            elif element.name == 'img':
                src = element.get('src')
                if not src:
                    # urljoin(url, None) returns the page URL itself, which
                    # would download the HTML page as if it were an image.
                    return
                img_url = urljoin(url, src)
                img_name = save_image(img_url, download_folder)
                if img_name:
                    img_path = os.path.join(download_folder, img_name)
                    print(f"{indent}![{element.get('alt', 'Image')}]({img_path})")

        nav = soup.find('nav')
        if nav:
            print("\nNavigation Bar:\n")
            for li in nav.find_all('li'):
                print(f" - {li.get_text()}")
            print("\n" + "="*50 + "\n")

        main_content = soup.find('div', {'class': 'main-content'})
        if not main_content:
            main_content = soup.find('div', {'class': 'content'})

        if main_content:
            for element in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'img']):
                print_element(element)
        else:
            print("Main content not found. Please check the class name used to identify the main content.")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

# Ask for the page to scrape; only absolute http(s) URLs are handed to the scraper.
project_url = input("Enter the URL of the project page: ").strip()

if not project_url.startswith(('http://', 'https://')):
    print("Invalid URL. Please ensure the URL starts with http:// or https://")
else:
    scrape_project(project_url)


In [4]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Directory where downloaded images are stored; print_element reads this module-level name.
download_folder = 'downloaded_images/'

def save_image(image_url, folder, timeout=10):
    """Download ``image_url`` and write it into ``folder``.

    The file name is the last path component of the URL; the query string
    and fragment are ignored and percent-escapes are kept unchanged.

    Args:
        image_url: Absolute URL of the image to fetch.
        folder: Directory the file is written into.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The file name that was written, or ``None`` when the URL carries no
        file name, the request failed, the server did not answer with
        HTTP 200, or the file could not be written.
    """
    img_name = os.path.basename(urlparse(image_url).path)
    if not img_name:
        # A path such as "/gallery/" has an empty basename; joining it with
        # the folder would make open() target the directory itself.
        print(f"Failed to retrieve image. No file name in URL: {image_url}")
        return None

    try:
        # Without a timeout a stalled server would block the scrape forever.
        img_response = requests.get(image_url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while saving image: {e}")
        return None

    if img_response.status_code != 200:
        print(f"Failed to retrieve image. Status code: {img_response.status_code}")
        return None

    img_path = os.path.join(folder, img_name)
    try:
        with open(img_path, 'wb') as img_file:
            img_file.write(img_response.content)
    except OSError as e:
        # Missing folder, permissions, invalid name, ... - report and carry
        # on so one bad image does not abort the caller.
        print(f"An error occurred while saving image: {e}")
        return None
    return img_name

def print_element(element, url, level=0, folder=None):
    """Print one parsed HTML element as Markdown-like text.

    Headings become ``#``/``##``/``###`` lines, paragraphs are printed as
    text, ``<ul>`` items as `` - `` bullets, and ``<img>`` elements are
    downloaded through ``save_image`` and printed as Markdown image links.
    Elements with any other tag name print nothing.

    Args:
        element: Parsed element exposing ``name``, ``get_text()``,
            ``find_all()`` and ``get()``.
        url: URL of the page, used to resolve relative image sources.
        level: Indentation depth; each level adds four spaces.
        folder: Directory images are saved into. When ``None`` the
            module-level ``download_folder`` is used.
    """
    indent = '    ' * level
    if element.name == 'h1':
        print(f"\n# {element.get_text()}\n")
    elif element.name == 'h2':
        print(f"\n## {element.get_text()}\n")
    elif element.name == 'h3':
        print(f"\n### {element.get_text()}\n")
    elif element.name == 'p':
        print(f"{indent}{element.get_text()}\n")
    elif element.name == 'ul':
        for li in element.find_all('li'):
            print(f"{indent} - {li.get_text()}")
    elif element.name == 'img':
        src = element.get('src')
        if not src:
            # urljoin(url, None) returns the page URL itself, which would
            # download the HTML page as if it were an image.
            return
        if folder is None:
            # Backward-compatible default for callers that pass no folder.
            folder = download_folder
        img_url = urljoin(url, src)
        img_name = save_image(img_url, folder)
        if img_name:
            img_path = os.path.join(folder, img_name)
            print(f"{indent}![{element.get('alt', 'Image')}]({img_path})")

def scrape_project(url, download_folder='downloaded_images', timeout=10):
    """Print a Markdown-like rendering of a project page and save its images.

    The page's first ``<nav>`` (if any) is printed as a bullet list of its
    ``<li>`` texts. The main content is the first ``<div>`` with class
    ``main-content``, falling back to class ``content``. Its headings,
    paragraphs, lists and images go through ``print_element``; for tables,
    each row with at least two ``<td>`` cells is printed as
    ``first: second``.

    Args:
        url: Absolute URL of the page to scrape.
        download_folder: Directory for downloaded images; created if missing.
        timeout: Seconds to wait for the page request before giving up.

    Returns:
        None. All results are printed; request errors are reported, not raised.
    """
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(download_folder, exist_ok=True)

        # Without a timeout a stalled server would block the cell forever.
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve the website. Status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')

        nav = soup.find('nav')
        if nav:
            print("\nNavigation Bar:\n")
            for li in nav.find_all('li'):
                print(f" - {li.get_text()}")
            print("\n" + "="*50 + "\n")

        main_content = soup.find('div', {'class': 'main-content'})
        if not main_content:
            main_content = soup.find('div', {'class': 'content'})

        if main_content:
            for element in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'img', 'table']):
                if element.name == 'table':
                    for row in element.find_all('tr'):
                        cols = row.find_all('td')
                        if len(cols) >= 2:
                            first_td = cols[0].get_text(strip=True)
                            second_td = cols[1].get_text(strip=True)
                            print(f"{first_td}: {second_td}")
                else:
                    # Pass the folder explicitly so images land in the
                    # directory created above, not in the module-level one.
                    print_element(element, url, folder=download_folder)
        else:
            print("Main content not found. Please check the class name used to identify the main content.")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

# Ask for the page to scrape; only absolute http(s) URLs are handed to the scraper.
project_url = input("Enter the URL of the project page: ").strip()

if not project_url.startswith(('http://', 'https://')):
    print("Invalid URL. Please ensure the URL starts with http:// or https://")
else:
    scrape_project(project_url)


Navigation Bar:

 - 
Log in




# Projects

: The goal of the ICTBiomed project is to create an informatics platform leveraging the joint capacity of global research community in order to provide an environment for comprehensive cancer research based on tools that had emerged in scientific communities across the world.
![](downloaded_images/ICTBioMed%20logo.png)
: ICKA’s mission is to enable and facilitate the generation, use and reuse of knowledge in India via the country’s own institutions, people, technology and data; and to address dimensions of the Indian cancer problem in a way that is comprehensive, scalable, sustainable and affordable.
![](downloaded_images/icka%20logo_0.png)
: The Mycroft Cognitive Assistant® (Mycroft) is designed to provide research universities and medical centers with support and assistance in applying for research grants from the U.S. National Institutes of Health (NIH).
![](downloaded_images/Logo-Mycroft-Extended-250_0.png)
: Breast cancer (BC) is the mo