In [7]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
from langchain_ai21 import AI21SemanticTextSplitter
from langchain_pinecone import PineconeVectorStore

In [8]:
# process .env file: load its key/value pairs into the process environment
# so the os.getenv() lookups in the next cell can read them
load_dotenv()

True

In [9]:
# Read the credentials and the Pinecone index name from the environment.
# Each lookup yields None when the variable is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY')
ai_twentyone_api_key = os.environ.get('AI_TWENTYONE_API_KEY')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')

# name of the Pinecone index that will receive the embedded chunks
pc_index_wit = os.environ.get('PINECONE_WIT_SEMANTIC')

In [10]:
# Apply API keys to AI21, OpenAI, and Pinecone
# Maps each environment variable name that gets exported to the value read
# earlier. Only the AI21 key changes name (it was read from
# AI_TWENTYONE_API_KEY); the other two are written back under the same name.
api_keys = {
    "AI21_API_KEY": ai_twentyone_api_key,
    'OPENAI_API_KEY': openai_api_key,
    'PINECONE_API_KEY': pinecone_api_key,
}

# os.environ only accepts str values, so a key that was not found (None) would
# otherwise fail with an opaque "str expected, not NoneType" TypeError.
# Report every missing key at once, before touching the environment.
missing_keys = [name for name, value in api_keys.items() if value is None]
if missing_keys:
    raise ValueError(
        "Missing API key(s) for: " + ", ".join(missing_keys)
        + ". Check that the .env file defines OPENAI_API_KEY, "
        "AI_TWENTYONE_API_KEY and PINECONE_API_KEY."
    )

os.environ.update(api_keys)

In [11]:
# wikipedia link for scraping: the Wentworth Institute of Technology article,
# which is the single page loaded and split in the cells below
url = "https://en.wikipedia.org/wiki/Wentworth_Institute_of_Technology"

In [12]:
# initialize loader for scraping web content from the article URL
loader = WebBaseLoader(url)
# fetch the page; the result is what split_documents() consumes further down
url_processed = loader.load()

In [13]:
# chunk_size value handed to the AI21 semantic splitter
# NOTE(review): presumably a character budget per chunk — confirm against the
# AI21SemanticTextSplitter documentation before tuning
CHUNK_SIZE = 50

# build the semantic splitter and break the scraped page into chunks
semantic_splitter = AI21SemanticTextSplitter(chunk_size=CHUNK_SIZE)
all_splits = semantic_splitter.split_documents(url_processed)

# OpenAI embedding model (default settings) used to vectorize the chunks
embeddings = OpenAIEmbeddings()

In [14]:
# preview the splits: report how many chunks were produced and display only
# the first few, since echoing the whole list floods the notebook output
print(f"{len(all_splits)} splits total")
all_splits[:3]

[Document(page_content='Wentworth Institute of Technology - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload file\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\n Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1', metadata={'source': 'https://en.wikipedia.org/wiki/Wentworth_Institute_of_Technology', 'title': 'Wentworth Institute of Technology - 

In [15]:
# embed each split with the OpenAI model and store the resulting vectors in
# the Pinecone index named by pc_index_wit
docsearch = PineconeVectorStore.from_documents(
    all_splits,
    embeddings,
    index_name=pc_index_wit,
)