In [95]:
import os
import requests
import copy
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
import requests
from langchain_ai21 import AI21SemanticTextSplitter
from langchain_pinecone import PineconeVectorStore

In [96]:
# Load variables from a .env file into the process environment.
# The call is left as the cell's last expression, so its return value is
# what the cell displays.
load_dotenv()

True

In [97]:
# Read the credentials and the Pinecone index name used by later cells.
# Each lookup yields None when the variable is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY')
ai_twentyone_api_key = os.environ.get('AI_TWENTYONE_API_KEY')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pc_index_wit = os.environ.get('PINECONE_WIT_SEMANTIC')

In [98]:
# Export the API keys for OpenAI, AI21, and Pinecone under the environment
# variable names assigned below, so the clients that later cells construct
# without explicit key arguments can pick them up. Note the AI21 key is read
# from AI_TWENTYONE_API_KEY but exported as AI21_API_KEY.
required_keys = {
    "AI21_API_KEY": ai_twentyone_api_key,
    'OPENAI_API_KEY': openai_api_key,
    'PINECONE_API_KEY': pinecone_api_key,
}

# Assigning None to os.environ raises an opaque "str expected, not NoneType";
# fail early with a message that names what is actually missing instead.
missing_keys = [name for name, value in required_keys.items() if not value]
if missing_keys:
    raise RuntimeError(
        "Missing API key(s) for: " + ", ".join(missing_keys)
        + ". Check the .env file (the AI21 key is read from AI_TWENTYONE_API_KEY)."
    )

os.environ.update(required_keys)

In [99]:
# Pages to scrape, grouped by the section of city-data.com they come from.
# `urls` keeps the groups in this order: neighborhoods, city overview, articles.
neighborhood_profile_urls = [
    "https://www.city-data.com/neighborhood/Beacon-Hill-Boston-MA.html",
    "https://www.city-data.com/neighborhood/Back-Bay-Boston-MA.html",
    "https://www.city-data.com/neighborhood/Boston-Common-Boston-MA.html",
    "https://www.city-data.com/neighborhood/Chinatown-Boston-MA.html",
    "https://www.city-data.com/neighborhood/Fenway-Boston-MA.html",
    "https://www.city-data.com/neighborhood/South-Boston-Boston-MA.html",
    "https://www.city-data.com/neighborhood/Mission-Hill-Boston-MA.html",
    "https://www.city-data.com/neighborhood/Mission-Hill-Projects-Boston-MA.html",
]

city_overview_urls = [
    "https://www.city-data.com/us-cities/The-Northeast/Boston-Introduction.html",
    "https://www.city-data.com/us-cities/The-Northeast/Boston.html",
    "https://www.city-data.com/us-cities/The-Northeast/Boston-Communications.html",
    "https://www.city-data.com/us-cities/The-Northeast/Boston-Convention-Facilities.html",
    "https://www.city-data.com/us-cities/The-Northeast/Boston-Economy.html",
    "https://www.city-data.com/us-cities/The-Northeast/Boston-Education-and-Research.html",
    "https://www.city-data.com/us-cities/The-Northeast/Boston-Geography-and-Climate.html",
    "https://www.city-data.com/us-cities/The-Northeast/Boston-Health-Care.html",
    "https://www.city-data.com/us-cities/The-Northeast/Boston-History.html",
    "https://www.city-data.com/us-cities/The-Northeast/Boston-Municipal-Government.html",
    "https://www.city-data.com/us-cities/The-Northeast/Boston-Population-Profile.html",
    "https://www.city-data.com/us-cities/The-Northeast/Boston-Recreation.html",
    "https://www.city-data.com/us-cities/The-Northeast/Boston-Transportation.html",
]

article_urls = [
    "https://www.city-data.com/articles/Shops-at-Prudential-Center-Boston-MA-A.html",
    "https://www.city-data.com/articles/Boston-Garden-The-Only-Name-For-World.html",
    "https://www.city-data.com/articles/Copley-Place-Boston-MA.html",
    "https://www.city-data.com/articles/Boston-Common-Rural-Oasis-in-the-Heart.html",
    "https://www.city-data.com/articles/The-New-England-Aquarium-Boston.html",
    "https://www.city-data.com/articles/Boston-Public-Garden-Massachusetts-Large.html",
    "https://www.city-data.com/articles/Castle-Island-Boston-Massachusetts-A.html",
    "https://www.city-data.com/articles/Chinatown-Neighborhood-Boston.html",
    "https://www.city-data.com/articles/Downtown-Boston-Neighborhood-Boston.html",
    "https://www.city-data.com/articles/East-Boston-Neighborhood-Boston.html",
    "https://www.city-data.com/articles/Fenway-Park-Boston-Massachusetts-More.html",
    "https://www.city-data.com/articles/Fenway-Kenmore-Neighborhood-Boston.html",
    "https://www.city-data.com/articles/John-F-Kennedy-Presidential-Museum-and.html",
    "https://www.city-data.com/articles/Mission-Hill-Neighborhood-Boston.html",
    "https://www.city-data.com/articles/Museum-of-Fine-Arts-Boston-Massachusetts.html",
    "https://www.city-data.com/articles/Roxbury-Neighborhood-Boston.html",
    "https://www.city-data.com/articles/Back-Bay-Neighborhood-Boston.html",
    "https://www.city-data.com/articles/Boston-Duck-Tours-Massachusetts.html",
    "https://www.city-data.com/articles/Boston-Harbor-Islands-National-Park.html",
]

urls = [*neighborhood_profile_urls, *city_overview_urls, *article_urls]

In [100]:
# Request headers sent with every requests.get call: a desktop Chrome
# User-Agent string.
browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
headers = {'User-Agent': browser_user_agent}

In [101]:
# Download every page with requests and reduce it to plain text.
# Scraping is best-effort: a URL that cannot be fetched or parsed is reported
# and skipped, so `content` can end up shorter than `urls`.
REQUEST_TIMEOUT_SECONDS = 30

content = []
for url in urls:
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        if not response.ok:
            # The body is still kept (as before) so the number of collected
            # pages does not change, but an error page is flagged for review
            # instead of being indexed unnoticed.
            print(f"Warning: {url} returned HTTP {response.status_code}")
        soup = BeautifulSoup(response.content, 'html.parser')
        # Extract all text from the parsed content
        page_text = soup.get_text(separator=' ', strip=True)
        content.append(page_text)
    except Exception as e:
        # Report the failure rather than dropping the URL silently.
        print(f"Skipping {url}: {type(e).__name__}: {e}")
        continue

In [102]:
# Load every URL with WebBaseLoader and keep the `source` value from the
# metadata of the first document it returns.
# Loading is best-effort: a URL that fails is reported and skipped, so
# `metadata_sources` can end up shorter than `urls`.
metadata_sources = []
for url in urls:
    try:
        loader = WebBaseLoader(url)
        loaded_content = loader.load()
        metadata_sources.append(loaded_content[0].metadata['source'])
    except Exception as e:
        # Report the failure rather than dropping the URL silently.
        print(f"Skipping {url}: {type(e).__name__}: {e}")
        continue

In [103]:
# Inspect the source URLs collected by the WebBaseLoader loop
metadata_sources

['https://www.city-data.com/neighborhood/Beacon-Hill-Boston-MA.html',
 'https://www.city-data.com/neighborhood/Back-Bay-Boston-MA.html',
 'https://www.city-data.com/neighborhood/Boston-Common-Boston-MA.html',
 'https://www.city-data.com/neighborhood/Chinatown-Boston-MA.html',
 'https://www.city-data.com/neighborhood/Fenway-Boston-MA.html',
 'https://www.city-data.com/neighborhood/South-Boston-Boston-MA.html',
 'https://www.city-data.com/neighborhood/Mission-Hill-Boston-MA.html',
 'https://www.city-data.com/neighborhood/Mission-Hill-Projects-Boston-MA.html',
 'https://www.city-data.com/us-cities/The-Northeast/Boston-Introduction.html',
 'https://www.city-data.com/us-cities/The-Northeast/Boston.html',
 'https://www.city-data.com/us-cities/The-Northeast/Boston-Communications.html',
 'https://www.city-data.com/us-cities/The-Northeast/Boston-Convention-Facilities.html',
 'https://www.city-data.com/us-cities/The-Northeast/Boston-Economy.html',
 'https://www.city-data.com/us-cities/The-Northe

In [104]:
# Inspect the scraped page text (one string per successfully fetched URL)
content

["Beacon Hill neighborhood in Boston, Massachusetts (MA), 02108, 02114 subdivision profile - real estate, apartments, condos, homes, community, population, jobs, income, streets Toggle navigation Forum Cities AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY Cities Schools AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX VA VT WA WI WV WY Schools Neighborhoods AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX VA VT WA WI WV WY Neighborhoods Assessments AL AZ CA CO CT DE FL GA HI IA ID IL IN KS LA MA MD MI MN MO NC NE NJ NM NV NY OH OK OR PA RI SC TN TX UT VA WA WI WV Assessments More Restaurants AL CA CO DC DE FL GA IA ID IL IN KS KY LA MD ME MI MN NC NE NH NJ NV NY OH OK OR PA SC TN TX UT VA WA WI Restaurants Sex Offenders AK AL AR A

In [107]:
# Build an empty template Document: load the first URL once, then wipe its
# text and the title/source/language metadata so only the structure remains.
loader = WebBaseLoader(urls[0])
page_content = loader.load()
blank_page_content = page_content[0]
blank_page_content.page_content = ''
blank_page_content.metadata.update(title='', source='', language='')
blank_page_content

Document(page_content='', metadata={'source': '', 'title': '', 'language': ''})

In [108]:
# blank template document, deep-copied for each scraped page in the next cell
blank_page_content

Document(page_content='', metadata={'source': '', 'title': '', 'language': ''})

In [109]:
# Turn the scraped text into Documents the splitter can consume.
# `content` and `metadata_sources` are paired purely by position, and both
# were built by best-effort loops that skip failed URLs. If the lengths
# differ, at least one page would be labelled with the wrong source (or the
# loop would hit an IndexError), so stop with a clear message instead.
# Note: equal lengths do not prove that the same URLs succeeded in both loops.
if len(content) != len(metadata_sources):
    raise ValueError(
        f"Scraped {len(content)} pages but collected {len(metadata_sources)} "
        "sources; some URLs failed in only one of the loops, so text and "
        "source can no longer be paired by position."
    )

url_page_content = []

for page_text, source_url in zip(content, metadata_sources):
    # Deep copy so every Document gets its own metadata dict.
    temp_doc = copy.deepcopy(blank_page_content)
    temp_doc.page_content = page_text
    temp_doc.metadata['source'] = source_url
    url_page_content.append(temp_doc)

In [110]:
# Inspect the Documents built from the scraped text and source URLs
url_page_content

[Document(page_content="Beacon Hill neighborhood in Boston, Massachusetts (MA), 02108, 02114 subdivision profile - real estate, apartments, condos, homes, community, population, jobs, income, streets Toggle navigation Forum Cities AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY Cities Schools AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX VA VT WA WI WV WY Schools Neighborhoods AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX VA VT WA WI WV WY Neighborhoods Assessments AL AZ CA CO CT DE FL GA HI IA ID IL IN KS LA MA MD MI MN MO NC NE NJ NM NV NY OH OK OR PA RI SC TN TX UT VA WA WI WV Assessments More Restaurants AL CA CO DC DE FL GA IA ID IL IN KS KY LA MD ME MI MN NC NE NH NJ NV NY OH OK OR PA SC TN TX UT VA WA WI Restaurants Se

In [117]:
# AI21 semantic splitter configured with chunk_size=50
# NOTE(review): the unit and exact effect of chunk_size are defined by
# langchain_ai21 — confirm against its documentation.
semantic_splitter = AI21SemanticTextSplitter(chunk_size=50)

# Run the splitter over every scraped Document
all_splits = semantic_splitter.split_documents(url_page_content)

# OpenAI embedding model, constructed with its default settings
embeddings = OpenAIEmbeddings()

In [118]:
# Number of chunks produced by the semantic splitter
len(all_splits)

701

In [119]:
# Inspect the chunked Documents returned by the splitter
all_splits

[Document(page_content='Beacon Hill neighborhood in Boston, Massachusetts (MA), 02108, 02114 subdivision profile - real estate, apartments, condos, homes, community, population, jobs, income, streets Toggle navigation Forum Cities AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY Cities Schools AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX VA VT WA WI WV WY Schools Neighborhoods AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX VA VT WA WI WV WY Neighborhoods Assessments AL AZ CA CO CT DE FL GA HI IA ID IL IN KS LA MA MD MI MN MO NC NE NJ NM NV NY OH OK OR PA RI SC TN TX UT VA WA WI WV Assessments More Restaurants AL CA CO DC DE FL GA IA ID IL IN KS KY LA MD ME MI MN NC NE NH NJ NV NY OH OK OR PA SC TN TX UT VA WA WI Restaurants Se

In [None]:
# Embed every chunk with the OpenAI model and store the vectors in the
# Pinecone index named by PINECONE_WIT_SEMANTIC.
docsearch = PineconeVectorStore.from_documents(
    all_splits,
    embeddings,
    index_name=pc_index_wit,
)