In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def save_image(image_url, folder):
    try:
        img_response = requests.get(image_url)
        if img_response.status_code == 200:
            img_name = os.path.basename(urlparse(image_url).path)
            img_path = os.path.join(folder, img_name)
            with open(img_path, 'wb') as img_file:
                img_file.write(img_response.content)
            return img_name
        else:
            print(f"Failed to retrieve image. Status code: {img_response.status_code}")
            return None 
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while saving image: {e}")
        return None

def scrape_project(url, download_folder='downloaded_images'):
    try:
        if not os.path.exists(download_folder):
            os.makedirs(download_folder)

        response = requests.get(url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            def print_element(element, level=0):
                indent = '    ' * level
                if element.name == 'h1':
                    print(f"\n# {element.get_text()}\n")
                elif element.name == 'h2':
                    print(f"\n## {element.get_text()}\n")
                elif element.name == 'h3':
                    print(f"\n### {element.get_text()}\n")
                elif element.name == 'p':
                    print(f"{indent}{element.get_text()}\n")
                elif element.name == 'ul':
                    for li in element.find_all('li'):
                        print(f"{indent} - {li.get_text()}")
                elif element.name == 'img':
                    img_url = urljoin(url, element.get('src'))
                    img_name = save_image(img_url, download_folder)
                    if img_name:
                        img_path = os.path.join(download_folder, img_name)
                        print(f"{indent}![{element.get('alt', 'Image')}]({img_path})")

            nav = soup.find('nav')
            if nav:
                print("\nNavigation Bar:\n")
                for li in nav.find_all('li'):
                    print(f" - {li.get_text()}")
                print("\n" + "="*50 + "\n")
  
            main_content = soup.find('div', {'class': 'main-content'})
            if not main_content:
                main_content = soup.find('div', {'class': 'content'})

            if main_content:
                for element in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'img']):
                    print_element(element)
            else:
                print("Main content not found. Please check the class name used to identify the main content.")
        else:
            print(f"Failed to retrieve the website. Status code: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

project_url = input("Enter the URL of the project page: ").strip()

if project_url.startswith('http://') or project_url.startswith('https://'):
    scrape_project(project_url)
else:
    print("Invalid URL. Please ensure the URL starts with http:// or https://")


In [6]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json

download_folder = 'downloaded_images/'

def save_image(image_url, folder):
    try:
        img_response = requests.get(image_url)
        if img_response.status_code == 200:
            img_name = os.path.basename(urlparse(image_url).path)
            img_path = os.path.join(folder, img_name)
            with open(img_path, 'wb') as img_file:
                img_file.write(img_response.content)
            return img_name
        else:
            print(f"Failed to retrieve image. Status code: {img_response.status_code}")
            return None 
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while saving image: {e}")
        return None

def print_element(element, url, level=0):
    indent = '    ' * level
    if element.name == 'h1':
        print(f"\n# {element.get_text()}\n")
    elif element.name == 'h2':
        print(f"\n## {element.get_text()}\n")
    elif element.name == 'h3':
        print(f"\n### {element.get_text()}\n")
    elif element.name == 'p':
        print(f"{indent}{element.get_text()}\n")
    elif element.name == 'ul':
        for li in element.find_all('li'):
            print(f"{indent} - {li.get_text()}")
    elif element.name == 'img':
        img_url = urljoin(url, element.get('src'))
        img_name = save_image(img_url, download_folder)
        if img_name:
            img_path = os.path.join(download_folder, img_name)
            print(f"{indent}![{element.get('alt', 'Image')}]({img_path})")

def scrape_project(url, download_folder='downloaded_images'):
    project_data = []
    try:
        if not os.path.exists(download_folder):
            os.makedirs(download_folder)

        response = requests.get(url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            nav = soup.find('nav')
            if nav:
                print("\nNavigation Bar:\n")
                for li in nav.find_all('li'):
                    print(f" - {li.get_text()}")
                print("\n" + "="*50 + "\n")

            main_content = soup.find('div', {'class': 'main-content'})
            if not main_content:
                main_content = soup.find('div', {'class': 'content'})

            if main_content:
                for element in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'img', 'table']):
                    if element.name == 'table':
                        for row in element.find_all('tr'):
                            cols = row.find_all('td')
                            if len(cols) >= 2:
                                first_td = cols[0].get_text(strip=True)
                                second_td = cols[1].get_text(strip=True)
                                project_data.append({
                                    'type': 'table',
                                    'first_td': first_td,
                                    'second_td': second_td
                                })
                                print(f"{first_td}: {second_td}")
                    else:
                        project_data.append({
                            'type': element.name,
                            'text': element.get_text(strip=True),
                            'url': url if element.name == 'img' else None
                        })
                        print_element(element, url)
            else:
                print("Main content not found. Please check the class name used to identify the main content.")
        else:
            print(f"Failed to retrieve the website. Status code: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
    
    return project_data

project_url = input("Enter the URL of the project page: ").strip()

if project_url.startswith('http://') or project_url.startswith('https://'):
    data = scrape_project(project_url)
    with open('projects_data.json', 'w') as f:
        json.dump(data, f, indent=4)
    # scrape_project(project_url)
else:
    print("Invalid URL. Please ensure the URL starts with http:// or https://")


Navigation Bar:

 - 
Log in




# OHSL Breast Cancer Data Alliance (BCDA)


Breast cancer (BC) is the most common incident site of cancer in women worldwide accounting for 24.5 % of all cancers (Globocan 2020).  Amongst the new cases of breast cancer in 2020, Asia has the largest incidence accounting for 45.4% amongst females of all ages. Interestingly, new breast cancer incidence cases amongst females under 45 years of age also has the largest percentage in Asia (52.9%). Furthermore, mortality due to breast cancer in this age group also peaks in Asia with 51.4 %. Five year prevalence in breast cancer in the same age group also has Asia peaking at 51.1%. In the United States too breast is the most common site for cancer amongst females of all ages (39.9%) with the percentage jumping to 45.6 % amongst females under 50 years of age. [1]

In the United States, during 2001-2015, incidence rates of early-onset metastatic breast cancer increased sharply among NH white, NH black, Hispanic, a

In [5]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader


In [7]:
from langchain.document_loaders import DirectoryLoader
from langchain.indexes import GPTVectorStoreIndex

# Load the scraped data
with open('projects_data.json', 'r') as f:
    projects_data = json.load(f)

# Create an instance of LangChain's vector store index
index = GPTVectorStoreIndex()

# Add documents to the index
for entry in projects_data:
    if entry['type'] == 'table':
        content = f"{entry['first_td']}: {entry['second_td']}"
    else:
        content = entry['text']
    
    index.add_document({
        'title': entry['type'],
        'content': content,
        'metadata': {'url': entry.get('url', '')}
    })

print("Data indexed with LangChain")

ModuleNotFoundError: Module langchain_community.document_loaders not found. Please install langchain-community to access this module. You can install it using `pip install -U langchain-community`