In [36]:
import bs4
from bs4 import BeautifulSoup
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import requests
from urllib.parse import urlparse, urljoin

## <B> PURE WEBSCRAPE TESTING </B>

In [37]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

In [38]:
# https://catalog.wit.edu/computing-data-science/computer-science-networking/computer-science-bs/

# sk-proj-vmNFa0Rk56DWhBnG9guIT3BlbkFJzOd3TbwQ6h04Nr9JCFwh

In [39]:
# Function to get sub-urls from a given URL
def get_sub_urls(url, base_url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    sub_urls = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('mailto:'):
            # Skip mailto links
            continue
        if href.startswith('#'):
            # Include sub-urls with '#' in front
            sub_urls.append(urljoin(base_url, href))
        else:
            absolute_url = urljoin(base_url, href)
            sub_urls.append(absolute_url)
    return sub_urls

In [40]:
# Function to recursively get all sub-urls up to a maximum depth
def get_all_sub_urls(url, base_url, max_depth, current_depth=0, visited=None):
    if visited is None:
        visited = set()
    visited.add(url)
    if current_depth >= max_depth:
        return visited
    sub_urls = get_sub_urls(url, base_url)
    for sub_url in sub_urls:
        if sub_url not in visited:
            visited.add(sub_url)
            visited.update(get_all_sub_urls(sub_url, base_url, max_depth, current_depth + 1, visited))
    return visited

In [None]:
# Main URL of the website
main_url = 'https://catalog.wit.edu/'

# Get all sub-urls from the main page
all_sub_urls = get_all_sub_urls(main_url, main_url, 6)

# Filter URLs with long paths
# filtered_sub_urls = [url for url in all_sub_urls if len(urlparse(url).path.split('/')) >= 5]

content = []
# Use Webloader to load and process each sub-url
for sub_url in all_sub_urls:
    try:
        loader = WebBaseLoader(sub_url)
        content = loader.load()
        # Use loader.page_content here
    except Exception as e:
        continue

In [None]:
sub_url

In [48]:
content[0].page_content

'\n\n\n\n\n\nThe School of Engineering < WIT\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to Content\nAZ Index\nCatalog Home\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n2023-2024 Academic Catalog\n\n\n\n\nSearch catalog\n\n\n\nSubmit Search\n\n\n\n\n\n\n\n\nToggle main menu\n\n\n\n\n\n\n\n\n\nWentworth Home\nCatalog Home\nPrograms\nCourses\nAcademic Calendar\nPrevious Catalogs\nPrint/Download Options\n\n\n\n\n\n\n\n\n\n\nWentworth Home\nCatalog Home\nPrograms\nCourses\nAcademic Calendar\nPrevious Catalogs\nPrint/Download Options\n\n\n\n\n\n\n\n\n\nMore In This Section\n\n\n\nThe School of Engineering \n\nBiological Engineering\nBiomedical Engineering \nCivil Engineering \nElectrical and Computer Engineering \nInterdisciplinary Engineering \nMechanical Engineering \n\n\n\n\n Print Options\n                     \n\n\n\n\nCatalog\xa0Home//The School of Engineering\n\nThe School of Engineering\n\n\n\n\nOverview\n\nPrograms\n\n\n\nAli Khabari,\xa0 Dean\nRubenstein Hall Room 209\n(61

In [49]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(content)

In [50]:
len(all_splits)

4

In [51]:
len(all_splits[0].page_content)

783

In [52]:
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

In [53]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
retrieved_docs = retriever.invoke("What are the credit needed for the three year program for computer science?")

In [54]:
len(retrieved_docs)

6

In [55]:
print(retrieved_docs[0].page_content)

On This PageThree Year ProgramFour Year Program
Three Year Program
Total credits for degree: 120
This is a three-year program, starting in the fall semester of the student’s first year and planned to end in the summer semester of the student’s third year. The courses are as follows:

Plan of Study Grid




Freshman Year
Fall SemesterCredits
COMP1000
COMPUTER SCIENCE I
4

MATH1776
CALCULUS 1A
2

MATH1777
CALCULUS 1B
2

MATH2300
DISCRETE MATHEMATICS
4

Science Elective 2
4

English Sequence*
4

 Credits20
Spring Semester
COMP1050
COMPUTER SCIENCE II
4

COMP1200
COMPUTER ORGANIZATION
4

MATH1876
CALCULUS 2A
2

MATH1877
CALCULUS 2B
2

Science Elective 2
4

English Sequence*
4

 Credits20
Sophomore Year
Fall Semester
COMP2000
DATA STRUCTURES
4

COMP2100
NETWORK PROGRAMMING
4

COMP2650
DATABASES
4

MATH2860
LINEAR ALGEBRA & MATRIX THEORY
4

HSS Elective*
4

 Credits20
Spring Semester
COMP2350
ALGORITHMS
4

COMP3400
OPERATING SYSTEMS
4

COMP Computer Science Elective 1
4


In [56]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

In [57]:
prompt = hub.pull("rlm/rag-prompt")

In [58]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [59]:
for chunk in rag_chain.stream("What are the credit needed for the three year program for computer science?"):
    print(chunk, end="", flush=True)

The total credits needed for the three-year program in Computer Science are 120. The courses include Computer Science I, Calculus, Discrete Mathematics, Data Structures, Algorithms, and Operating Systems, among others. The program starts in the fall semester of the student's first year and ends in the summer semester of the student's third year.