In [1]:
import bs4
import os
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from unstructured.cleaners.core import clean_extra_whitespace
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import requests
from langchain_text_splitters import CharacterTextSplitter
from urllib.parse import urlparse, urljoin
from langchain_pinecone import Pinecone
from langchain_ai21 import AI21ContextualAnswers
from langchain_core.output_parsers import StrOutputParser
from langchain_pinecone import PineconeVectorStore

  from tqdm.autonotebook import tqdm


## <b>PDF LOADER TEST</b>
# Testing the ability to extract data from a PDF
# and upload that data to an LLM and Vector Database 
# for training and getting responses towards them

In [2]:
load_dotenv()

True

In [3]:
# Access the environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')
ai_twentyone_api_key = os.getenv('AI_TWENTYONE_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc_index_wit = os.getenv('PINECONE_WIT_STANDARD')

In [4]:
os.environ["AI21_API_KEY"] = ai_twentyone_api_key
os.environ['OPENAI_API_KEY'] = openai_api_key
os.environ['PINECONE_API_KEY'] = pinecone_api_key

In [5]:
loader = UnstructuredFileLoader(
    ["./PDFs/wit-programs.pdf", "./PDFs/wit-course-descriptions.pdf"], strategy="fast"
)

In [6]:
docs = loader.load()

In [7]:
docs

[Document(page_content='PROGRAMS School of Architecture and Design\n\nArchitecture (https://catalog.wit.edu/architecture-design/\n\narchitecture/)\n\nArchitecture, B.S. (https://catalog.wit.edu/architecture-design/\n\narchitecture/architecture-bs/)\n\nArchitecture, M. Arch. (https://catalog.wit.edu/architecture-\n\ndesign/architecture/architecture-masters/)\n\nIndustrial Design (https://catalog.wit.edu/architecture-design/\n\nindustrial-design/)\n\nIndustrial Design, B.S. (https://catalog.wit.edu/architecture-\n\ndesign/industrial-design/industrial-design-bs/)\n\nInterior Design (https://catalog.wit.edu/architecture-design/interior-\n\ndesign/)\n\nInterior Design, B.S. (https://catalog.wit.edu/architecture-design/\n\ninterior-design/interior-design-bs/)\n\nSchool of Computing and Data Science\n\nApplied Mathematics (https://catalog.wit.edu/computing-data-\n\nscience/applied-mathematics/)\n\nActuarial Science Concentration (https://catalog.wit.edu/ computing-data-science/applied-mathema

In [8]:
content = docs[0].page_content
content

'PROGRAMS School of Architecture and Design\n\nArchitecture (https://catalog.wit.edu/architecture-design/\n\narchitecture/)\n\nArchitecture, B.S. (https://catalog.wit.edu/architecture-design/\n\narchitecture/architecture-bs/)\n\nArchitecture, M. Arch. (https://catalog.wit.edu/architecture-\n\ndesign/architecture/architecture-masters/)\n\nIndustrial Design (https://catalog.wit.edu/architecture-design/\n\nindustrial-design/)\n\nIndustrial Design, B.S. (https://catalog.wit.edu/architecture-\n\ndesign/industrial-design/industrial-design-bs/)\n\nInterior Design (https://catalog.wit.edu/architecture-design/interior-\n\ndesign/)\n\nInterior Design, B.S. (https://catalog.wit.edu/architecture-design/\n\ninterior-design/interior-design-bs/)\n\nSchool of Computing and Data Science\n\nApplied Mathematics (https://catalog.wit.edu/computing-data-\n\nscience/applied-mathematics/)\n\nActuarial Science Concentration (https://catalog.wit.edu/ computing-data-science/applied-mathematics/ﬁnancial- mathemat

In [9]:
sections = content.split(')')
sections

['PROGRAMS School of Architecture and Design\n\nArchitecture (https://catalog.wit.edu/architecture-design/\n\narchitecture/',
 '\n\nArchitecture, B.S. (https://catalog.wit.edu/architecture-design/\n\narchitecture/architecture-bs/',
 '\n\nArchitecture, M. Arch. (https://catalog.wit.edu/architecture-\n\ndesign/architecture/architecture-masters/',
 '\n\nIndustrial Design (https://catalog.wit.edu/architecture-design/\n\nindustrial-design/',
 '\n\nIndustrial Design, B.S. (https://catalog.wit.edu/architecture-\n\ndesign/industrial-design/industrial-design-bs/',
 '\n\nInterior Design (https://catalog.wit.edu/architecture-design/interior-\n\ndesign/',
 '\n\nInterior Design, B.S. (https://catalog.wit.edu/architecture-design/\n\ninterior-design/interior-design-bs/',
 '\n\nSchool of Computing and Data Science\n\nApplied Mathematics (https://catalog.wit.edu/computing-data-\n\nscience/applied-mathematics/',
 '\n\nActuarial Science Concentration (https://catalog.wit.edu/ computing-data-science/appli

In [10]:
no_newlines = list(map(lambda chunk: chunk.replace('\n', ''), sections))

In [11]:
no_newlines

['PROGRAMS School of Architecture and DesignArchitecture (https://catalog.wit.edu/architecture-design/architecture/',
 'Architecture, B.S. (https://catalog.wit.edu/architecture-design/architecture/architecture-bs/',
 'Architecture, M. Arch. (https://catalog.wit.edu/architecture-design/architecture/architecture-masters/',
 'Industrial Design (https://catalog.wit.edu/architecture-design/industrial-design/',
 'Industrial Design, B.S. (https://catalog.wit.edu/architecture-design/industrial-design/industrial-design-bs/',
 'Interior Design (https://catalog.wit.edu/architecture-design/interior-design/',
 'Interior Design, B.S. (https://catalog.wit.edu/architecture-design/interior-design/interior-design-bs/',
 'School of Computing and Data ScienceApplied Mathematics (https://catalog.wit.edu/computing-data-science/applied-mathematics/',
 'Actuarial Science Concentration (https://catalog.wit.edu/ computing-data-science/applied-mathematics/ﬁnancial- mathematics-concentration/',
 'Actuarial Scienc

In [12]:
no_newlines[0].split('(')[1]

'https://catalog.wit.edu/architecture-design/architecture/'

In [13]:
split = list(map(lambda chunk: chunk.split('('), no_newlines))

In [14]:
split

[['PROGRAMS School of Architecture and DesignArchitecture ',
  'https://catalog.wit.edu/architecture-design/architecture/'],
 ['Architecture, B.S. ',
  'https://catalog.wit.edu/architecture-design/architecture/architecture-bs/'],
 ['Architecture, M. Arch. ',
  'https://catalog.wit.edu/architecture-design/architecture/architecture-masters/'],
 ['Industrial Design ',
  'https://catalog.wit.edu/architecture-design/industrial-design/'],
 ['Industrial Design, B.S. ',
  'https://catalog.wit.edu/architecture-design/industrial-design/industrial-design-bs/'],
 ['Interior Design ',
  'https://catalog.wit.edu/architecture-design/interior-design/'],
 ['Interior Design, B.S. ',
  'https://catalog.wit.edu/architecture-design/interior-design/interior-design-bs/'],
 ['School of Computing and Data ScienceApplied Mathematics ',
  'https://catalog.wit.edu/computing-data-science/applied-mathematics/'],
 ['Actuarial Science Concentration ',
  'https://catalog.wit.edu/ computing-data-science/applied-mathema

In [15]:
links = []
for i in split:
    if len(i) == 2:
        links.append(''.join(i[1].split(' ')))

In [16]:
def only_links(x):
    if x.startswith('https'):
        return True
    else:
        return False

In [17]:
def only_links(x):
    if x.startswith('https'):
        return True
    else:
        return False

In [18]:
links

['https://catalog.wit.edu/architecture-design/architecture/',
 'https://catalog.wit.edu/architecture-design/architecture/architecture-bs/',
 'https://catalog.wit.edu/architecture-design/architecture/architecture-masters/',
 'https://catalog.wit.edu/architecture-design/industrial-design/',
 'https://catalog.wit.edu/architecture-design/industrial-design/industrial-design-bs/',
 'https://catalog.wit.edu/architecture-design/interior-design/',
 'https://catalog.wit.edu/architecture-design/interior-design/interior-design-bs/',
 'https://catalog.wit.edu/computing-data-science/applied-mathematics/',
 'https://catalog.wit.edu/computing-data-science/applied-mathematics/ﬁnancial-mathematics-concentration/',
 'https://catalog.wit.edu/computing-data-science/applied-mathematics/ﬁnancial-mathematics-minor/',
 'https://catalog.wit.edu/computing-data-science/applied-mathematics/applied-mathematics-minor/',
 'https://catalog.wit.edu/computing-data-science/applied-mathematics/applied-mathematics-bs/',
 '

In [19]:
def only_programs(x):
    if x.startswith("https://catalog.wit.edu/course-descriptions"):
        return False
    else:
        return True

In [20]:
program_links = list(filter(only_programs, links))

In [21]:
program_details_links = list(map(lambda link: link + '#programdetailstext', program_links))

In [22]:
program_details_links

['https://catalog.wit.edu/architecture-design/architecture/#programdetailstext',
 'https://catalog.wit.edu/architecture-design/architecture/architecture-bs/#programdetailstext',
 'https://catalog.wit.edu/architecture-design/architecture/architecture-masters/#programdetailstext',
 'https://catalog.wit.edu/architecture-design/industrial-design/#programdetailstext',
 'https://catalog.wit.edu/architecture-design/industrial-design/industrial-design-bs/#programdetailstext',
 'https://catalog.wit.edu/architecture-design/interior-design/#programdetailstext',
 'https://catalog.wit.edu/architecture-design/interior-design/interior-design-bs/#programdetailstext',
 'https://catalog.wit.edu/computing-data-science/applied-mathematics/#programdetailstext',
 'https://catalog.wit.edu/computing-data-science/applied-mathematics/ﬁnancial-mathematics-concentration/#programdetailstext',
 'https://catalog.wit.edu/computing-data-science/applied-mathematics/ﬁnancial-mathematics-minor/#programdetailstext',
 'htt

In [23]:
program_and_description_links = program_details_links + links

In [24]:
program_and_description_links

['https://catalog.wit.edu/architecture-design/architecture/#programdetailstext',
 'https://catalog.wit.edu/architecture-design/architecture/architecture-bs/#programdetailstext',
 'https://catalog.wit.edu/architecture-design/architecture/architecture-masters/#programdetailstext',
 'https://catalog.wit.edu/architecture-design/industrial-design/#programdetailstext',
 'https://catalog.wit.edu/architecture-design/industrial-design/industrial-design-bs/#programdetailstext',
 'https://catalog.wit.edu/architecture-design/interior-design/#programdetailstext',
 'https://catalog.wit.edu/architecture-design/interior-design/interior-design-bs/#programdetailstext',
 'https://catalog.wit.edu/computing-data-science/applied-mathematics/#programdetailstext',
 'https://catalog.wit.edu/computing-data-science/applied-mathematics/ﬁnancial-mathematics-concentration/#programdetailstext',
 'https://catalog.wit.edu/computing-data-science/applied-mathematics/ﬁnancial-mathematics-minor/#programdetailstext',
 'htt

In [26]:
real_urls = []
[ real_urls.append(url) for url in program_and_description_links if url.startswith("https")]
content = []

for link in real_urls:
    loader = WebBaseLoader(link)
    loaded_content = loader.load()
    content.extend(loaded_content)

In [None]:
content

[Document(page_content='\n\n\n\n\n\nArchitecture < WIT\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to Content\nAZ Index\nCatalog Home\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n2023-2024 Academic Catalog\n\n\n\n\nSearch catalog\n\n\n\nSubmit Search\n\n\n\n\n\n\n\n\nToggle main menu\n\n\n\n\n\n\n\n\n\nWentworth Home\nCatalog Home\nPrograms\nCourses\nAcademic Calendar\nPrevious Catalogs\nPrint/Download Options\n\n\n\n\n\n\n\n\n\n\nWentworth Home\nCatalog Home\nPrograms\nCourses\nAcademic Calendar\nPrevious Catalogs\nPrint/Download Options\n\n\n\n\n\n\n\n\n\nMore In This Section\n\n\n\nArchitecture \n\nArchitecture, B.S.\nArchitecture, M. Arch.\n\n\n\n\n Print Options\n                     \n\n\n\n\nCatalog\xa0Home//\xa0The\xa0School\xa0of\xa0Architecture\xa0and\xa0Design//Architecture\n\nArchitecture\n\n\n\n\nPrograms\n\nFaculty\n\n\n\n\n\nArchitecture, B.S.Architecture, M. Arch.\n\n\nProfessor\n\nAnn Borst, M.Arch.\nCarol Burns, M.Arch.\nRobert Cowherd, Ph.D.\nSedef Doganer, Ph.D.

In [29]:
catalog_loader = UnstructuredFileLoader(
    ["./PDFs/wit-2022-2023-Academic-Catalog-040723.pdf"], mode="elements", post_processors=[clean_extra_whitespace]
)

In [31]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=350, chunk_overlap=150, add_start_index=True
)
all_splits = text_splitter.split_documents(content)

In [32]:
all_splits

[Document(page_content='Architecture < WIT\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to Content\nAZ Index\nCatalog Home\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n2023-2024 Academic Catalog\n\n\n\n\nSearch catalog\n\n\n\nSubmit Search\n\n\n\n\n\n\n\n\nToggle main menu\n\n\n\n\n\n\n\n\n\nWentworth Home\nCatalog Home\nPrograms\nCourses\nAcademic Calendar\nPrevious Catalogs\nPrint/Download Options', metadata={'source': 'https://catalog.wit.edu/architecture-design/architecture/#programdetailstext', 'title': 'Architecture < WIT', 'language': 'en', 'start_index': 6}),
 Document(page_content='Toggle main menu\n\n\n\n\n\n\n\n\n\nWentworth Home\nCatalog Home\nPrograms\nCourses\nAcademic Calendar\nPrevious Catalogs\nPrint/Download Options\n\n\n\n\n\n\n\n\n\n\nWentworth Home\nCatalog Home\nPrograms\nCourses\nAcademic Calendar\nPrevious Catalogs\nPrint/Download Options\n\n\n\n\n\n\n\n\n\nMore In This Section\n\n\n\nArchitecture \n\nArchitecture, B.S.\nArchitecture, M. Arch.', metadata={'so

In [33]:
for i in range(0, len(all_splits)):
    if type(all_splits[i].metadata['source']) is list:
        all_splits[i].metadata['source'] = all_splits[i].metadata['source'][0]
    if 'coordinates' in all_splits[i].metadata:
        all_splits[i].metadata['coordinates'] = ''
        all_splits[i].metadata['languages'] = ''
        all_splits[i].metadata['links'] = ''

In [34]:
for i in all_splits:
    print(i.metadata)

{'source': 'https://catalog.wit.edu/architecture-design/architecture/#programdetailstext', 'title': 'Architecture < WIT', 'language': 'en', 'start_index': 6}
{'source': 'https://catalog.wit.edu/architecture-design/architecture/#programdetailstext', 'title': 'Architecture < WIT', 'language': 'en', 'start_index': 177}
{'source': 'https://catalog.wit.edu/architecture-design/architecture/#programdetailstext', 'title': 'Architecture < WIT', 'language': 'en', 'start_index': 430}
{'source': 'https://catalog.wit.edu/architecture-design/architecture/#programdetailstext', 'title': 'Architecture < WIT', 'language': 'en', 'start_index': 624}
{'source': 'https://catalog.wit.edu/architecture-design/architecture/#programdetailstext', 'title': 'Architecture < WIT', 'language': 'en', 'start_index': 943}
{'source': 'https://catalog.wit.edu/architecture-design/architecture/#programdetailstext', 'title': 'Architecture < WIT', 'language': 'en', 'start_index': 1201}
{'source': 'https://catalog.wit.edu/archi

In [35]:
len(all_splits)

10850

In [36]:
all_splits[3500].metadata.keys()

dict_keys(['source', 'title', 'description', 'language', 'start_index'])

In [37]:
all_splits

[Document(page_content='Architecture < WIT\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to Content\nAZ Index\nCatalog Home\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n2023-2024 Academic Catalog\n\n\n\n\nSearch catalog\n\n\n\nSubmit Search\n\n\n\n\n\n\n\n\nToggle main menu\n\n\n\n\n\n\n\n\n\nWentworth Home\nCatalog Home\nPrograms\nCourses\nAcademic Calendar\nPrevious Catalogs\nPrint/Download Options', metadata={'source': 'https://catalog.wit.edu/architecture-design/architecture/#programdetailstext', 'title': 'Architecture < WIT', 'language': 'en', 'start_index': 6}),
 Document(page_content='Toggle main menu\n\n\n\n\n\n\n\n\n\nWentworth Home\nCatalog Home\nPrograms\nCourses\nAcademic Calendar\nPrevious Catalogs\nPrint/Download Options\n\n\n\n\n\n\n\n\n\n\nWentworth Home\nCatalog Home\nPrograms\nCourses\nAcademic Calendar\nPrevious Catalogs\nPrint/Download Options\n\n\n\n\n\n\n\n\n\nMore In This Section\n\n\n\nArchitecture \n\nArchitecture, B.S.\nArchitecture, M. Arch.', metadata={'so

In [39]:

docsearch = PineconeVectorStore.from_documents(all_splits, embedding=OpenAIEmbeddings(), index_name=pc_index_wit)