In [9]:
import os
import requests
from urllib.parse import  urljoin
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
import requests
from langchain_ai21 import AI21SemanticTextSplitter
from langchain_pinecone import PineconeVectorStore

In [2]:
# process .env file
load_dotenv()

True

In [3]:
# Access the environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')
ai_twentyone_api_key = os.getenv('AI_TWENTYONE_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc_index_wit = os.getenv('PINECONE_WIT_SEMANTIC')

In [4]:
# Apply API keys for OpenAI, AI21, and Pinecone
os.environ["AI21_API_KEY"] = ai_twentyone_api_key
os.environ['OPENAI_API_KEY'] = openai_api_key
os.environ['PINECONE_API_KEY'] = pinecone_api_key

In [6]:
# Function to get sub-urls from a given URL
def get_sub_urls(url, base_url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    sub_urls = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('mailto:'):
            # Skip mailto links
            continue
        if href.startswith('#'):
            # Include sub-urls with '#' in front
            sub_urls.append(urljoin(base_url, href))
        else:
            absolute_url = urljoin(base_url, href)
            sub_urls.append(absolute_url)
    return sub_urls

In [7]:
# Function to recursively get all sub-urls up to a maximum depth
def get_all_sub_urls(url, base_url, max_depth, current_depth=0, visited=None):
    if visited is None:
        visited = set()
    visited.add(url)
    if current_depth >= max_depth:
        return visited
    sub_urls = get_sub_urls(url, base_url)
    for sub_url in sub_urls:
        if sub_url not in visited:
            visited.add(sub_url)
            visited.update(get_all_sub_urls(sub_url, base_url, max_depth, current_depth + 1, visited))
    return visited

In [17]:
# Main URL of the website
main_url = 'https://coopsandcareers.wit.edu/'

# Get all sub-urls from the main page
all_sub_urls = get_all_sub_urls(main_url, main_url, 3)

advising_team = [
    "https://coopsandcareers.wit.edu/advising-team/#david-albanese",
    "https://coopsandcareers.wit.edu/advising-team/#amelia-alburn",
    "https://coopsandcareers.wit.edu/advising-team/#caitlin-brison",
    "https://coopsandcareers.wit.edu/advising-team/#auryn-edwards",
    "https://coopsandcareers.wit.edu/advising-team/#ria-kalinowski",
    "https://coopsandcareers.wit.edu/advising-team/#jer-jurma",
    "https://coopsandcareers.wit.edu/advising-team/#narali-taglialavore",
    "https://coopsandcareers.wit.edu/advising-team/#patric-paz-docmanov",
]

full_urls = list(all_sub_urls)
full_urls.extend(advising_team)

In [18]:
# list of urls generated through algorithm
full_urls

['https://wit.edu/',
 'https://www.linkedin.com/groups/106815/',
 'https://coopsandcareers.wit.edu/contact/',
 'https://coopsandcareers.wit.edu/companies/haskell/jobs/',
 'https://coopsandcareers.wit.edu/advising-team/',
 'https://www.instagram.com/witcoopscareers/',
 'https://coopsandcareers.wit.edu/meet-our-team/#amelia-alburn',
 'https://coopsandcareers.wit.edu/companies/',
 'https://coopsandcareers.wit.edu/channels/international-students/',
 'https://coopsandcareers.wit.edu/meet-our-team/#jer-jurma',
 'https://coopsandcareers.wit.edu/channels/first-gen/',
 'https://coopsandcareers.wit.edu/channels/computing-information-technology/',
 'https://coopsandcareers.wit.edu/channels/daca-undocumented-students/',
 'https://coopsandcareers.wit.edu/channels/alumni/',
 'https://coopsandcareers.wit.edu/channels/business-management-administration/',
 'https://coopsandcareers.wit.edu/channels/industrial-design-explore-career-paths/',
 'https://coopsandcareers.wit.edu/meet-our-team/#auryn-edwards'

In [19]:
content = []
# Use Webloader to load and process each sub-url
for sub_url in full_urls:
    try:
        loader = WebBaseLoader(sub_url)
        content.extend(loader.load())
        # Use loader.page_content here
        print(loader.page_content)
    except Exception as e:
        continue

In [20]:
content

[Document(page_content="\n\n\n\n\n\n\n\n\n\n\n\n\n\nWentworth: Technology-focused University in Boston, MA | Wentworth\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n  Skip to main content\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nOpen Search\n\n\n\n\n\n\n\n\n\n\n        Toggle Menu\n      \n\n\n\n\n\nMain Menu\n\n\nAcademics \n\n\nDegree Finder\n\n\nSchool of Architecture & Design\n\n\nSchool of Computing & Data Science\n\n\nSchool of Engineering\n\n\nSchool of Management\n\n\nSchool of Sciences & Humanities\n\n\nOffice of the Provost\n\n\nAcademic Leadership\n\n\nFaculty\n\n\nLearning & Advising Resources\n\n\nLabs & Studios\n\n\nRegistrar\n\n\nAcademic Calendar\n\n\nAccreditation\n\n\nWorkforce Development & Professional Education\n\n\n\n\nCo-ops & Careers \n\n\nWhat is a Co-op?\n\n\nCo-ops & Careers Office\n\n\nHiring Wentworth Students\n\n\n\n\nAdmissions & Aid \n\n\nUndergraduate Admissions\n\n\nGraduate Admissions\n\n\nInternational Admissi

In [21]:
# initialize semantic splitter
semantic_splitter = AI21SemanticTextSplitter(
    chunk_size=350
)

# split content into chunks of set size
all_splits = semantic_splitter.split_documents(content)

# initialize OpenAI's embedding model
embeddings = OpenAIEmbeddings()

In [22]:
all_splits

[Document(page_content="Wentworth: Technology-focused University in Boston, MA | Wentworth\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n  Skip to main content\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nOpen Search\n\n\n\n\n\n\n\n\n\n\n        Toggle Menu\n      \n\n\n\n\n\nMain Menu\n\n\nAcademics \n\n\nDegree Finder\n\n\nSchool of Architecture & Design\n\n\nSchool of Computing & Data Science\n\n\nSchool of Engineering\n\n\nSchool of Management\n\n\nSchool of Sciences & Humanities\n\n\nOffice of the Provost\n\n\nAcademic Leadership\n\n\nFaculty\n\n\nLearning & Advising Resources\n\n\nLabs & Studios\n\n\nRegistrar\n\n\nAcademic Calendar\n\n\nAccreditation\n\n\nWorkforce Development & Professional Education\n\n\n\n\nCo-ops & Careers \n\n\nWhat is a Co-op?\n\n\nCo-ops & Careers Office\n\n\nHiring Wentworth Students\n\n\n\n\nAdmissions & Aid \n\n\nUndergraduate Admissions\n\n\nGraduate Admissions\n\n\nInternational Admissions\n\n\nTour Campus\n\n\nTu

In [None]:
# insert splits into Pinecone vector database as embeddings
docsearch = PineconeVectorStore.from_documents(all_splits, embeddings, index_name=pc_index_wit)