# Data Loaders and Splitters

In [None]:
from langchain.chat_models import ChatOpenAI
#from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader


# Load the sample PDF with PyPDFLoader (TextLoader would be the choice for a
# plain-text file instead).
loader = PyPDFLoader("미국주식_Test.pdf")
pdf_documents = loader.load()
pdf_documents  # last expression, so the notebook displays the loaded documents

In [None]:
# UnstructuredFileLoader
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader


# Parse the same PDF through UnstructuredFileLoader and display what it returns.
loader = UnstructuredFileLoader("미국주식_Test.pdf")
unstructured_documents = loader.load()
unstructured_documents

In [None]:
# Number of Document objects produced by the most recently defined loader.
loaded_documents = loader.load()
len(loaded_documents)

In [None]:
# UnstructuredFileLoader, RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Recursive splitter: per the LangChain docs it tries progressively smaller
# separators (paragraphs, then lines, then words) until a chunk fits chunk_size.
recursive_chunking = {"chunk_size": 200, "chunk_overlap": 50}
splitter = RecursiveCharacterTextSplitter(**recursive_chunking)

loader = UnstructuredFileLoader("미국주식_Test.pdf")
recursive_chunks = loader.load_and_split(text_splitter=splitter)
recursive_chunks  # displayed as the cell result

In [None]:
# How many chunks the current splitter yields for the document.
chunk_list = loader.load_and_split(text_splitter=splitter)
len(chunk_list)

In [None]:
# Character Text splitter
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# CharacterTextSplitter splits on one specific separator (a newline here).
splitter = CharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=400,
    separator="\n",
)

loader = UnstructuredFileLoader("미국주식_Test.pdf")
newline_chunks = loader.load_and_split(text_splitter=splitter)
newline_chunks  # displayed as the cell result

In [None]:
# Chunk count produced by the separator-based splitter.
separator_chunks = loader.load_and_split(text_splitter=splitter)
len(separator_chunks)

# Tiktoken

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# from_tiktoken_encoder builds a splitter whose chunk_size / chunk_overlap are
# measured in tiktoken tokens rather than characters (see the LangChain text
# splitter docs), so chunk limits line up with how the model counts input.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator='\n',
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredFileLoader("미국주식_Test.pdf")
# Actually run the split so this cell shows the token-sized chunks, the same
# way the other splitter cells in this notebook do. Previously the splitter
# and loader were constructed here but never used.
loader.load_and_split(text_splitter=splitter)

# Embedding

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# Embed one short string with OpenAIEmbeddings at its default settings.
embedder = OpenAIEmbeddings()
hi_embedding = embedder.embed_query('Hi')
hi_embedding  # displayed as the cell result

In [None]:
# Dimension: length of the embedding vector returned for a single query.
query_embedding = embedder.embed_query('Hi')
len(query_embedding)

In [None]:
# Embed several texts in one call, then report how many vectors came back and
# the length of the first one.
vector = embedder.embed_documents(['Hi', 'how', 'are', 'you'])
vector_count, vector_length = len(vector), len(vector[0])
print(vector_count, vector_length)

# Vectorstores_Chroma

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings # text-embedding-ada-002
from langchain.vectorstores import Chroma

# Token-based chunking settings for the documents that will be indexed.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator='\n',
)

# Load the PDF and cut it into chunks in a single step.
loader = UnstructuredFileLoader("미국주식_Test.pdf")
docs = loader.load_and_split(text_splitter=splitter)

# Hand the chunks and the OpenAI embedding model to Chroma, which builds the
# vector store from them.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(docs, embeddings)

In [None]:
# Ask the vector store for the chunks most similar to the question.
question = "절대가치 평가에서 중요한 것은?"
vectorstore.similarity_search(question)

In [None]:
# NOTE: the original author points to this count as the reason chunking
# matters — presumably because every hit is an entire chunk; confirm intent.
search_query = "절대가치 평가에서 중요한 것은?"
results = vectorstore.similarity_search(search_query)
len(results)

# Cache embedding 

In [19]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings # Cache
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore #local storage

# File-backed store under ./.cache that will hold the embedding cache.
cache_dir = LocalFileStore('./.cache')

# Token-based chunking settings for the documents that will be indexed.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator='\n',
)

loader = UnstructuredFileLoader("./files/미국주식_Test.pdf")
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embedder with the local cache store.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# The vector store now embeds through the cached wrapper, not through
# OpenAIEmbeddings directly.
vectorstore = Chroma.from_documents(docs, cached_embeddings)

# RetrievalQA

In [3]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings 
from langchain.vectorstores import Chroma #FAISS로 바꾸어도 돼. 단, cache지우고
from langchain.storage import LocalFileStore 
from langchain.chains import RetrievalQA

# Chat model that will answer questions over the retrieved chunks.
llm = ChatOpenAI()

# File-backed store under ./.cache that holds the embedding cache.
cache_dir = LocalFileStore('./.cache')

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator='\n',
)

loader = UnstructuredFileLoader("./files/미국주식_Test.pdf")
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# FAISS could be swapped in for Chroma here (the import comment in this cell
# adds: clear the cache first).
vectorstore = Chroma.from_documents(docs, cached_embeddings)

# Build the question-answering chain with the "map_reduce" strategy on top of
# the vector store's retriever.
retriever = vectorstore.as_retriever()
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=retriever,
)

# The question also tells the model to say it does not know when the document
# lacks the answer.
qa_question = "회사 운영 본연의 측면에서 회사 경영의 효율성을 측정하는 방법은? 문서에 없는 내용이면 모른다고 말해"
chain.run(qa_question)

'문서에는 회사 운영의 효율성을 측정하는 방법에 대한 구체적인 내용은 포함되어 있지 않습니다. 이에 대해 알려드릴 수 있는 정보가 없습니다. 해당 내용에 대해 더 자세히 알고 싶으시다면, 다른 출처를 참고하시기를 권장드립니다.'

A central question when building a summarizer is how to pass your documents into the LLM's context window. Three common approaches are:

1. Stuff: Simply "stuff" all your documents into a single prompt. This is the simplest approach (see the LangChain documentation for more on the `create_stuff_documents_chain` constructor, which is used for this method).

2. Map-reduce: Summarize each document on its own in a "map" step and then "reduce" the summaries into a final summary (see the LangChain documentation for more on the `MapReduceDocumentsChain`, which is used for this method).

3. Refine: Update a rolling summary by iterating over the documents in sequence.