In [11]:
# import
import os
from dotenv import load_dotenv

import openai

# https://python.langchain.com/en/latest/index.html
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

In [3]:
# Pull settings from a local .env file into the process environment, then
# pass the OpenAI key on to the client library. If OPENAI_API_KEY is not
# defined, None is assigned here and no error is raised at this point.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [7]:
# Point a PyMuPDF-backed loader at the source PDF (path is relative to the
# notebook's working directory). Nothing is read here; the following cell
# calls the loader to obtain the document text.
datafile_path = "./data/2023_GPT4All_Technical_Report.pdf"
loader = PyMuPDFLoader(datafile_path)

In [9]:
# Read the PDF into LangChain Document objects.
# `load()` is used rather than `load_and_split()`: the latter first runs
# LangChain's default recursive splitter, which can return overlapping,
# whitespace-stripped chunks for long pages. Concatenating those chunks would
# duplicate the overlapped text. Splitting is done explicitly further down.
pages = loader.load()

# Join with a newline so the last line of one page is not fused onto the
# first line of the next (the splitter below separates on "\n").
raw_text = "\n".join(page.page_content for page in pages)

raw_text[:200]  # preview the first 200 characters

'GPT4All: Training an Assistant-style Chatbot with Large Scale Data\nDistillation from GPT-3.5-Turbo\nYuvanesh Anand\nyuvanesh@nomic.ai\nZach Nussbaum\nzanussbaum@gmail.com\nBrandon Duderstadt\nbrandon@nomic.'

In [14]:
# Cut the document into pieces small enough for the model's token budget.
# Pieces break on newlines, are capped at 1000 characters (measured with
# `len`), and share a 200-character overlap so context carries across cuts.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

# Number of chunks produced.
len(texts)

'GPT4All: Training an Assistant-style Chatbot with Large Scale Data\nDistillation from GPT-3.5-Turbo\nYuvanesh Anand\nyuvanesh@nomic.ai\nZach Nussbaum\nzanussbaum@gmail.com\nBrandon Duderstadt\nbrandon@nomic.ai\nBenjamin Schmidt\nben@nomic.ai\nAndriy Mulyar\nandriy@nomic.ai\nAbstract\nThis preliminary technical report describes the\ndevelopment of GPT4All, a chatbot trained\nover a massive curated corpus of assistant in-\nteractions including word problems, story de-\nscriptions, multi-turn dialogue, and code. We\nopenly release the collected data, data cura-\ntion procedure, training code, and final model\nweights to promote open research and repro-\nducibility. Additionally, we release quantized\n4-bit versions of the model allowing virtually\nanyone to run the model on CPU.\n1\nData Collection and Curation\nWe\ncollected\nroughly\none\nmillion\nprompt-\nresponse pairs using the GPT-3.5-Turbo OpenAI\nAPI between March 20, 2023 and March 26th,\n2023. To do this, we first gathered a di

In [None]:
# Display the first chunk as a sanity check on the split.
texts[0]

In [17]:
# Build a FAISS vector index over the chunks for similarity search.
# https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/faiss.html?highlight=FAISS
# Each chunk is embedded with OpenAI's embedding model.
# https://python.langchain.com/en/latest/modules/models/text_embedding/examples/openai.html?highlight=OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_texts(texts, embeddings)

In [18]:
# Assemble a retrieval QA chain with LangChain "chains".
# https://python.langchain.com/en/latest/modules/chains/getting_started.html
#
# The LLM wrapper is bound to `llm`, not `openai`: reusing `openai` would
# shadow the imported `openai` module, so re-running the API-key cell
# (`openai.api_key = ...`) after this one would hit the wrapper object
# instead of the module.
llm = OpenAI(temperature=0)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),  # FAISS index exposed as a retriever
)

In [19]:
# Ask a question against the indexed PDF.
# https://python.langchain.com/en/latest/modules/chains/index_examples/vector_db_qa.html
query = "저자는 누구입니까?"  # Korean: "Who are the authors?"
answer = qa.run(query)
answer

' Yuvanesh Anand, Zach Nussbaum, Brandon Duderstadt, Benjamin Schmidt, Andriy Mulyar'