In [1]:
import os
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
from langchain_ai21 import AI21SemanticTextSplitter
from langchain_pinecone import PineconeVectorStore
import requests
from urllib.parse import urljoin

In [2]:
# process .env file
load_dotenv()

True

In [3]:
# Access the environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')
ai_twentyone_api_key = os.getenv('AI_TWENTYONE_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc_index_mh = os.getenv('PINECONE_WIT_SEMANTIC')

In [4]:
# Apply API keys for OpenAI, AI21, and Pinecone
os.environ["AI21_API_KEY"] = ai_twentyone_api_key
os.environ['OPENAI_API_KEY'] = openai_api_key
os.environ['PINECONE_API_KEY'] = pinecone_api_key

In [5]:

# Function to get sub-urls from a given URL
def get_sub_urls(url, base_url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    sub_urls = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('mailto:'):
            # Skip mailto links
            continue
        if href.startswith('#'):
            # Include sub-urls with '#' in front
            sub_urls.append(urljoin(base_url, href))
        else:
            absolute_url = urljoin(base_url, href)
            sub_urls.append(absolute_url)
    return sub_urls

In [6]:
# Function to recursively get all sub-urls up to a maximum depth
def get_all_sub_urls(url, base_url, max_depth, current_depth=0, visited=None):
    if visited is None:
        visited = set()
    visited.add(url)
    if current_depth >= max_depth:
        return visited
    sub_urls = get_sub_urls(url, base_url)
    for sub_url in sub_urls:
        if sub_url not in visited:
            visited.add(sub_url)
            visited.update(get_all_sub_urls(sub_url, base_url, max_depth, current_depth + 1, visited))
    return visited

In [7]:
# main mission hill links for scraping
missionHill_base_urls = [
    'https://www.missionhillmainstreet.com/',
    'https://en.wikipedia.org/wiki/Mission_Hill,_Boston'
]

In [8]:
content = []

# grabbing sub urls of two depths from missionhillmainstreets.com
all_mh_sub_urls = get_all_sub_urls(missionHill_base_urls[0], missionHill_base_urls[0], 2)

# Use Webloader to load and process each sub-url
for sub_url in all_mh_sub_urls:
    try:
        loader = WebBaseLoader(sub_url)
        url_processed = loader.load()
        content.extend(url_processed)
        # Use loader.page_content here
    except Exception as e:
        continue

In [9]:
# wikipedia mission hill
loader = WebBaseLoader(missionHill_base_urls[1])
content.extend(loader.load())

In [10]:
content

[Document(page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n25TH ANNIVERSARY | MHMS - Mission Hill\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\ntop of pageHOMEEXPLOREMEMBERSABOUT25TH ANNIVERSARYMHMS "CLOSE UP"CONTACTRESOURCESMoreUse tab to navigate through the menu items.MISSION HILL MAIN STREETS\n25TH ANNIVERSARY\nCOMMUNITY CELEBRATIONHOMEEXPLOREMEMBERSABOUT25TH ANNIVERSARYMHMS "CLOSE UP"CONTACTRESOURCESMoreUse tab to navigate through the menu items.bottom of page\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n', metadata={'source': 'https://www.missionhillmainstreet.com/25th', 'title': '25TH ANNIVERSARY | MHMS - Mission Hill', 'language': 'en'}),
 Document(page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nMEMBERS | MHMS - Mission Hill\n\n\n\n\n\n\n\n\n\n

In [12]:
# initialize semantic splitter
semantic_splitter = AI21SemanticTextSplitter(
    chunk_size=350
)

# split content into chunks of set size
splits = semantic_splitter.split_documents(content)

# initialize OpenAI's embedding model
embeddings = OpenAIEmbeddings()

In [13]:
splits

[Document(page_content='\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n25TH ANNIVERSARY | MHMS - Mission Hill\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\ntop of pageHOMEEXPLOREMEMBERSABOUT25TH ANNIVERSARYMHMS "CLOSE UP"CONTACTRESOURCESMoreUse tab to navigate through the menu items.MISSION HILL MAIN STREETS\n25TH ANNIVERSARY\nCOMMUNITY CELEBRATIONHOMEEXPLOREMEMBERSABOUT25TH ANNIVERSARYMHMS "CLOSE UP"CONTACTRESOURCESMoreUse tab to navigate through the menu items.bottom of page\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n', metadata={'source': 'https://www.missionhillmainstreet.com/25th', 'title': '25TH ANNIVERSARY | MHMS - Mission Hill', 'language': 'en', 'source_type': 'non_english'}),
 Document(page_content='MEMBERS | MHMS - Mission Hill\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\ntop of pageHOMEEXPLOREMEMBERSABOUT25TH ANNIVERSARYMH

In [14]:
# insert splits into Pinecone vector database as embeddings
docsearch = PineconeVectorStore.from_documents(splits, embeddings, index_name=pc_index_mh)

  from tqdm.autonotebook import tqdm
