In [7]:
import os
import requests
import copy
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
import requests
from langchain_ai21 import AI21SemanticTextSplitter
from langchain_pinecone import PineconeVectorStore
from urllib.parse import urlparse, urljoin

In [8]:
load_dotenv()

True

In [9]:
# Access the environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')
ai_twentyone_api_key = os.getenv('AI_TWENTYONE_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc_index_wit = os.getenv('PINECONE_WIT_SEMANTIC')

In [10]:
os.environ["AI21_API_KEY"] = ai_twentyone_api_key
os.environ['OPENAI_API_KEY'] = openai_api_key
os.environ['PINECONE_API_KEY'] = pinecone_api_key

In [11]:
# Function to get sub-urls from a given URL
def get_sub_urls(url, base_url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    sub_urls = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('mailto:'):
            # Skip mailto links
            continue
        if href.startswith('#'):
            # Include sub-urls with '#' in front
            sub_urls.append(urljoin(base_url, href))
        else:
            absolute_url = urljoin(base_url, href)
            sub_urls.append(absolute_url)
    return sub_urls

In [12]:
# Function to recursively get all sub-urls up to a maximum depth
def get_all_sub_urls(url, base_url, max_depth, current_depth=0, visited=None):
    if visited is None:
        visited = set()
    visited.add(url)
    if current_depth >= max_depth:
        return visited
    sub_urls = get_sub_urls(url, base_url)
    for sub_url in sub_urls:
        if sub_url not in visited:
            visited.add(sub_url)
            visited.update(get_all_sub_urls(sub_url, base_url, max_depth, current_depth + 1, visited))
    return visited

In [17]:
# Main URL of the website
main_url = 'https://explore.missionhillmainstreet.com/'

# Get all sub-urls from the main page
all_sub_urls = get_all_sub_urls(main_url, main_url, 3)

webscraped_content_with_suburls = []
# Use Webloader to load and process each sub-url
for sub_url in all_sub_urls:
    try:
        loader = WebBaseLoader(sub_url)
        webscraped_content_with_suburls.extend(loader.load())
        # Use loader.page_content here
        print(loader.page_content)
    except Exception as e:
        continue

In [32]:
len(webscraped_content_with_suburls)

33

In [28]:
webscraped_content_with_suburls

[Document(page_content='\n\n\n\n\n\n\nFlames Restaurant II\n\nMission Hillmissionhillmainstreet.com\n\nPlaces\n\nGuides\n\nEvents\n\n\n\n\nPassportsLogin or Sign Up\n\nHistoric Mission Hill Map Mission Hillmissionhillmainstreet.com\n\nPlaces\n\nGuides\n\nEvents\n\n\n\n\nPassportsLogin or Sign Up\n\nBack| Guides / What to Try in Mission Hill! / Flames Restaurant II  Flames Restaurant II Each of our four locations has its own unique style while providing the same welcoming Caribbean experience through the authentic cuisine, colorful decor coupled with vibrant artwork and pictures of popular West Indian entertainers, some of whom have enjoyed our meals..\n\nExtensive catering service is available and orders can be customized to suit any social or corporate event. Contact us at (617) 734-1911.\n\nTake some time to tour our site and please stop in for a bite at your convenience. We would also love to discuss ways to make your next event a memorable taste experience.CallDirectionsMenuPlace I

In [20]:
places = [
    "https://explore.missionhillmainstreet.com/places/tobin-community-center", 
    "https://explore.missionhillmainstreet.com/places/mission-hill-main-streets",
    "https://explore.missionhillmainstreet.com/places/solid-ground-cafe",
    "https://explore.missionhillmainstreet.com/places/penguin-pizza",
    "https://explore.missionhillmainstreet.com/places/7-eleven",
    "https://explore.missionhillmainstreet.com/places/aks-takeout-delivery",
    "https://explore.missionhillmainstreet.com/places/artesani-playground-wading-pool-and-spray-deck",
    "https://explore.missionhillmainstreet.com/places/boba-me",
    "https://explore.missionhillmainstreet.com/places/boston-center-youth-families",
    "https://explore.missionhillmainstreet.com/places/boston-cyclists-union",
    "https://explore.missionhillmainstreet.com/places/boston-debate-league",
    "https://explore.missionhillmainstreet.com/places/bostons-basilica-of-our-lady-of-perpetual-help",
    "https://explore.missionhillmainstreet.com/places/bread-n-butter",
    "https://explore.missionhillmainstreet.com/places/brigham-circle-chinese-food",
    "https://explore.missionhillmainstreet.com/places/carmans-beauty-salon",
    "https://explore.missionhillmainstreet.com/places/chachos-pizza-subs",
    "https://explore.missionhillmainstreet.com/places/chilacates-mission-hill",
    "https://explore.missionhillmainstreet.com/places/citgo",
    "https://explore.missionhillmainstreet.com/places/citizens",
    "https://explore.missionhillmainstreet.com/places/crispy-dough-pizzeria",
    "https://explore.missionhillmainstreet.com/places/daras-wine-spirits",
    "https://explore.missionhillmainstreet.com/places/diablo-glass-school",
    "https://explore.missionhillmainstreet.com/places/dunkin-1",
    "https://explore.missionhillmainstreet.com/places/dunkin",
    "https://explore.missionhillmainstreet.com/places/eastern-bank-1",
    "https://explore.missionhillmainstreet.com/places/emerald-necklace-conservancy-shattuck-visitor-center",
    "https://explore.missionhillmainstreet.com/places/flames-restaurant-ii",
    "https://explore.missionhillmainstreet.com/places/ginger-exchange-mission-hill-roxbury",
    "https://explore.missionhillmainstreet.com/places/halal-indian-cuisine",
    "https://explore.missionhillmainstreet.com/places/hanlon-square",
    "https://explore.missionhillmainstreet.com/places/hillside-market",
    "https://explore.missionhillmainstreet.com/places/il-mondo-pizzeria",
    "https://explore.missionhillmainstreet.com/places/in-the-cut-boston-barbershop",
    "https://explore.missionhillmainstreet.com/places/isabella-stewart-gardner-museum",
    "https://explore.missionhillmainstreet.com/places/islamic-society-of-boston-cultural-center",
    "https://explore.missionhillmainstreet.com/places/jp-licks",
    "https://explore.missionhillmainstreet.com/places/josephs-sub-shop-pizza",
    "https://explore.missionhillmainstreet.com/places/kevin-w-fitzgerald-park",
    "https://explore.missionhillmainstreet.com/places/kinetic-remedy-private-training-studio",
    "https://explore.missionhillmainstreet.com/places/laughing-monk-cafe",
    "https://explore.missionhillmainstreet.com/places/lillys-gourmet-pasta-express",
    "https://explore.missionhillmainstreet.com/places/lizs-hair-care",
    "https://explore.missionhillmainstreet.com/places/masco",
    "https://explore.missionhillmainstreet.com/places/mamas-place",
    "https://explore.missionhillmainstreet.com/places/massart-art-museum",
    "https://explore.missionhillmainstreet.com/places/mclaughlin-playground",
    "https://explore.missionhillmainstreet.com/places/mikes-donuts",
    "https://explore.missionhillmainstreet.com/places/milkweed",
    "https://explore.missionhillmainstreet.com/places/mission-hill-farmers-market-brigham-circle",
    "https://explore.missionhillmainstreet.com/places/mission-hill-farmers-market-roxbury-crossing",
    "https://explore.missionhillmainstreet.com/places/mission-hill-health-movement",
    "https://explore.missionhillmainstreet.com/places/269-parker-hill-ave",
    "https://explore.missionhillmainstreet.com/places/mission-hill-playground",
    "https://explore.missionhillmainstreet.com/places/mission-hill-yoga",
    "https://explore.missionhillmainstreet.com/places/montecristo-mexican-grill",
    "https://explore.missionhillmainstreet.com/places/museum-of-fine-arts-boston",
    "https://explore.missionhillmainstreet.com/places/nachlo-mexican-pakistani-cuisine",
    "https://explore.missionhillmainstreet.com/places/nails-on-huntington",
    "https://explore.missionhillmainstreet.com/places/nanas-hair-braiding",
    "https://explore.missionhillmainstreet.com/places/needham-bank-mission-hill",
    "https://explore.missionhillmainstreet.com/places/north-american-indian-center-of-boston",
    "https://explore.missionhillmainstreet.com/places/odb-liquors",
    "https://explore.missionhillmainstreet.com/places/one-brigham-circle",
    "https://explore.missionhillmainstreet.com/places/papas-pizza-company",
    "https://explore.missionhillmainstreet.com/places/parker-hill-branch-of-the-boston-public-library",
    "https://explore.missionhillmainstreet.com/places/phoraya-thai-spa",
    "https://explore.missionhillmainstreet.com/places/punjab-mini-mart",
    "https://explore.missionhillmainstreet.com/places/santander-bank-branch",
    "https://explore.missionhillmainstreet.com/places/sheehy-park",
    "https://explore.missionhillmainstreet.com/places/sofias-alteration-cleaners",
    "https://explore.missionhillmainstreet.com/places/spinney-insurance-agency-inc",
    "https://explore.missionhillmainstreet.com/places/huntington-square-coin-op-laundry",
    "https://explore.missionhillmainstreet.com/places/subway-7",
    "https://explore.missionhillmainstreet.com/places/sullys-barber-shop",
    "https://explore.missionhillmainstreet.com/places/sunny-laundromat",
    "https://explore.missionhillmainstreet.com/places/tavern-of-tales",
    "https://explore.missionhillmainstreet.com/places/tbaar",
    "https://explore.missionhillmainstreet.com/places/tgi-fridays",
    "https://explore.missionhillmainstreet.com/places/the-mission-bar-grill",
    "https://explore.missionhillmainstreet.com/places/the-puddingstone-tavern",
    "https://explore.missionhillmainstreet.com/places/tremont-house-of-pizza",
    "https://explore.missionhillmainstreet.com/places/university-house-of-pizza",
    "https://explore.missionhillmainstreet.com/places/vanity-loft",
    "https://explore.missionhillmainstreet.com/places/wok-n-talk-boston",
    "https://explore.missionhillmainstreet.com/places/1619-tremont-st"

]

In [21]:
places_content = []
# Use Webloader to load and process each sub-url
for url in places:
    try:
        loader = WebBaseLoader(url)
        places_content.extend(loader.load())
        # Use loader.page_content here
        print(loader.page_content)
    except Exception as e:
        continue

In [33]:
len(places_content)

118

In [27]:
places_content

[Document(page_content='\n\n\n\n\n\n\nTobin Community Center\n\nMission Hillmissionhillmainstreet.com\n\nPlaces\n\nGuides\n\nEvents\n\n\n\n\nPassportsLogin or Sign Up\n\nHistoric Mission Hill Map Mission Hillmissionhillmainstreet.com\n\nPlaces\n\nGuides\n\nEvents\n\n\n\n\nPassportsLogin or Sign Up\n\nBack| Places / Tobin Community Center  Tobin Community Center BCYF Tobin features include a community room, computer lab, gymnasium, batting cage, outdoor garden, and stage.CallDirectionsPlace InformationAddress1481 Tremont St, Roxbury, MA 02120, USAPhone+1 617-635-5216Websitehttps://www.boston.gov/departments/boston-centers-youth-families/bcyf-tobin\n\n\n\n\n\n\n\n\n\n\n\n\n\n', metadata={'source': 'https://explore.missionhillmainstreet.com/places/tobin-community-center', 'title': 'Tobin Community Center', 'description': 'BCYF Tobin features include a community room, computer lab, gymnasium, batting cage, outdoor garden, and stage.', 'language': 'en'}),
 Document(page_content='\n\n\n\n\n\

In [29]:
content_combo = places_content + webscraped_content_with_suburls

In [34]:
len(content_combo)

151

In [38]:
content_combo

[Document(page_content='\n\n\n\n\n\n\nTobin Community Center\n\nMission Hillmissionhillmainstreet.com\n\nPlaces\n\nGuides\n\nEvents\n\n\n\n\nPassportsLogin or Sign Up\n\nHistoric Mission Hill Map Mission Hillmissionhillmainstreet.com\n\nPlaces\n\nGuides\n\nEvents\n\n\n\n\nPassportsLogin or Sign Up\n\nBack| Places / Tobin Community Center  Tobin Community Center BCYF Tobin features include a community room, computer lab, gymnasium, batting cage, outdoor garden, and stage.CallDirectionsPlace InformationAddress1481 Tremont St, Roxbury, MA 02120, USAPhone+1 617-635-5216Websitehttps://www.boston.gov/departments/boston-centers-youth-families/bcyf-tobin\n\n\n\n\n\n\n\n\n\n\n\n\n\n', metadata={'source': 'https://explore.missionhillmainstreet.com/places/tobin-community-center', 'title': 'Tobin Community Center', 'description': 'BCYF Tobin features include a community room, computer lab, gymnasium, batting cage, outdoor garden, and stage.', 'language': 'en'}),
 Document(page_content='\n\n\n\n\n\

In [39]:
semantic_splitter = AI21SemanticTextSplitter(
    chunk_size=350
)
all_splits = semantic_splitter.split_documents(content_combo)
embeddings = OpenAIEmbeddings()

In [None]:
docsearch = PineconeVectorStore.from_documents(all_splits, embeddings, index_name=pc_index_wit)