In [12]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_openai import OpenAIEmbeddings
from langchain_ai21 import AI21SemanticTextSplitter
from langchain_pinecone import PineconeVectorStore

In [14]:
# Process the .env file so the os.getenv lookups in the following cell can
# find the variables it defines. The call's return value is shown as the
# cell output.
load_dotenv()

True

In [15]:
# Read the API credentials and the Pinecone index name from the environment.
# Each lookup yields None when its variable is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY')
ai_twentyone_api_key = os.environ.get('AI_TWENTYONE_API_KEY')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
pc_wit_semantic = os.environ.get('PINECONE_WIT_SEMANTIC')  # later passed as index_name

In [16]:
# Apply API keys for OpenAI, AI21, and Pinecone.
#
# os.environ only accepts str values, so a key that was not found by
# os.getenv (i.e. is None) would otherwise fail here with an opaque
# "TypeError: str expected, not NoneType". Check up front and report exactly
# which variables are missing (or empty) instead.
required_keys = {
    'AI_TWENTYONE_API_KEY': ai_twentyone_api_key,
    'OPENAI_API_KEY': openai_api_key,
    'PINECONE_API_KEY': pinecone_api_key,
}
missing_keys = [name for name, value in required_keys.items() if not value]
if missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_keys)
        + ". Define them in the .env file and re-run the cells above."
    )

# The AI21 key is read from AI_TWENTYONE_API_KEY but exported under
# AI21_API_KEY (presumably the name the AI21 integration looks up — confirm).
os.environ["AI21_API_KEY"] = ai_twentyone_api_key
os.environ['OPENAI_API_KEY'] = openai_api_key
os.environ['PINECONE_API_KEY'] = pinecone_api_key

In [47]:
# Build an Excel loader in "elements" mode for the program-details workbook
excel_path = "./Data/School of Management Program Details Data.xlsx"
loader = UnstructuredExcelLoader(excel_path, mode="elements")

# Read the workbook content into a list of Document objects
docs = loader.load()

In [48]:
# Display the loaded documents (page_content and metadata) for inspection
docs

[Document(page_content='\n\n\nConstruction Management Minor Electives\nCourses\nCredits\n\n\nCourse\nTitle\nCredits\n\n\nCONM3100\nCONSTRUCTION PROJECT MANAGEMENT\n4\n\n\nCONM3201\nCONSTRUCTION PROJECT SCHEDULING\n4\n\n\nCONM3500\nADVANCED ESTIMATING & BID ANALYSIS\n4\n\n\nCONM3800\nSPECIAL TOPICS IN CONSTRUCTION MANAGEMENT\n3\n\n\nCONM4200\nCONSTRUCTION SAFETY & RISK MANAGEMENT\n3\n\n\nCONM4650\nBUSINESS, CONSTRUCTION LAW & GOVERNMENT REGULATIONS\n3\n\n\nTotal Credits\nTotal Credits\n6-8\n\n\n', metadata={'source': './Data/School of Management Program Details Data.xlsx', 'file_directory': './Data', 'filename': 'School of Management Program Details Data.xlsx', 'last_modified': '2024-06-25T20:44:38', 'page_name': 'Construction Management Minor E', 'page_number': 1, 'text_as_html': '<table border="1" class="dataframe">\n  <tbody>\n    <tr>\n      <td>Construction Management Minor Electives</td>\n      <td>Courses</td>\n      <td>Credits</td>\n    </tr>\n    <tr>\n      <td>Course</td>\n 

In [49]:
# Create the AI21 semantic text splitter with chunk_size=350
# (presumably measured in characters — TODO confirm against the AI21 docs)
semantic_splitter = AI21SemanticTextSplitter(chunk_size=350)

In [50]:
# Make sure every document has at least `min_length` characters before it is
# handed to the splitter: shorter ones are reported and then right-padded
# with spaces, in place.
min_length = 30

for doc in docs:
    text = doc.page_content
    if len(text) >= min_length:
        continue  # long enough, leave untouched
    print(f"Document too short: {text}")
    # ljust pads on the right with spaces up to min_length characters
    doc.page_content = text.ljust(min_length)

# Same Document objects, same order, in a separate list
processed_docs = list(docs)

# Split the (padded) documents into semantic chunks
all_splits = semantic_splitter.split_documents(processed_docs)

In [51]:
# Display the chunks produced by the semantic splitter for inspection
all_splits

[Document(page_content='\n\n\nConstruction Management Minor Electives\nCourses\nCredits\n\n\nCourse\nTitle\nCredits\n\n\nCONM3100\nCONSTRUCTION PROJECT MANAGEMENT\n4\n\n\nCONM3201\nCONSTRUCTION PROJECT SCHEDULING\n4\n\n\nCONM3500\nADVANCED ESTIMATING & BID ANALYSIS\n4\n\n\nCONM3800\nSPECIAL TOPICS IN CONSTRUCTION MANAGEMENT\n3\n\n\nCONM4200\nCONSTRUCTION SAFETY & RISK MANAGEMENT\n3\n\n\nCONM4650\nBUSINESS, CONSTRUCTION LAW & GOVERNMENT REGULATIONS\n3\n\n\nTotal Credits\nTotal Credits\n6-8\n\n\n', metadata={'source': './Data/School of Management Program Details Data.xlsx', 'file_directory': './Data', 'filename': 'School of Management Program Details Data.xlsx', 'last_modified': '2024-06-25T20:44:38', 'page_name': 'Construction Management Minor E', 'page_number': 1, 'text_as_html': '<table border="1" class="dataframe">\n  <tbody>\n    <tr>\n      <td>Construction Management Minor Electives</td>\n      <td>Courses</td>\n      <td>Credits</td>\n    </tr>\n    <tr>\n      <td>Course</td>\n 

In [52]:
# Initialize OpenAI's embedding model with the library's default settings
# (no arguments are passed; presumably the API key is picked up from the
# OPENAI_API_KEY environment variable set earlier — confirm).
embeddings = OpenAIEmbeddings()

In [53]:
# Embed every split and store the vectors in the Pinecone index whose name
# was read from PINECONE_WIT_SEMANTIC.
# NOTE(review): re-running this cell presumably uploads the same splits
# again — confirm whether duplicate vectors are a concern.
docsearch = PineconeVectorStore.from_documents(
    all_splits,
    embeddings,
    index_name=pc_wit_semantic,
)