# Data Loaders and Splitters

In [None]:
from langchain.chat_models import ChatOpenAI
#from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader


# Read the PDF through PyPDFLoader.
# Alternative kept for reference (plain-text loader): loader = TextLoader("./")
loader = PyPDFLoader(file_path="미국주식_Test.pdf")

# Final expression of the cell, so the notebook displays what was loaded.
loader.load()

In [None]:
# UnstructuredFileLoader
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader


# Same PDF as before, this time read through UnstructuredFileLoader.
loader = UnstructuredFileLoader(file_path="미국주식_Test.pdf")

# Final expression of the cell, so the notebook displays what was loaded.
loader.load()

In [None]:
# Count the items returned by loading again with the current `loader`.
len(loader.load())

In [None]:
# UnstructuredFileLoader, RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Source document; the splitter below controls how it is cut into chunks.
loader = UnstructuredFileLoader(file_path="미국주식_Test.pdf")

# Author's note: this splitter breaks at sentence or paragraph endings.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=200)

# Final expression of the cell, so the notebook displays the resulting chunks.
loader.load_and_split(splitter)

In [None]:
# Count the chunks produced by splitting again with the current `loader` and `splitter`.
len(loader.load_and_split(text_splitter=splitter))

In [None]:
# Character Text splitter
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# Source document; the splitter below controls how it is cut into chunks.
loader = UnstructuredFileLoader(file_path="미국주식_Test.pdf")

# Author's note: this splitter breaks on a specific character (here, newline).
splitter = CharacterTextSplitter(chunk_overlap=50, chunk_size=400, separator="\n")

# Final expression of the cell, so the notebook displays the resulting chunks.
loader.load_and_split(splitter)

In [None]:
# Count the chunks produced by splitting again with the current `loader` and `splitter`.
len(loader.load_and_split(text_splitter=splitter))

# Tiktoken

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# This cell only constructs the loader and the splitter; no split is executed here.
loader = UnstructuredFileLoader(file_path="미국주식_Test.pdf")

# Newline-separated splitter built through the tiktoken-encoder constructor.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)

# Embedding

In [None]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client created with the library defaults.
embedder = OpenAIEmbeddings()

# Final expression of the cell, so the notebook displays the embedding of "Hi".
embedder.embed_query(text="Hi")

In [None]:
# Dimension: length of the value returned when embedding a single query.
len(embedder.embed_query('Hi'))

In [None]:
# Embed several texts in one call.
vector = embedder.embed_documents(texts=["Hi", "how", "are", "you"])

# Print the number of embeddings, then the length of the first one.
print(
    len(vector),
    len(vector[0]),
)

# Vectorstores_Chroma

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings # text-embedding-ada-002
from langchain.vectorstores import Chroma

# Load the PDF and split it with the tiktoken-based, newline-separated splitter.
loader = UnstructuredFileLoader(file_path="미국주식_Test.pdf")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)
docs = loader.load_and_split(splitter)

# Hand the chunks and the OpenAI embedding model to Chroma to build the vector store.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)

In [None]:
# Run a similarity search for the question against `vectorstore`; the cell displays the matches.
vectorstore.similarity_search("절대가치 평가에서 중요한 것은?")

In [None]:
# Keep the search hits in `results` so they can be inspected afterwards.
results = vectorstore.similarity_search("절대가치 평가에서 중요한 것은?")
# Final expression of the cell: how many documents the search returned.
len(results)
# Author's note: this is why chunking is important.

# Cache embedding 

In [19]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings # Cache
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore #local storage

# Local file store under ./.cache that backs the embedding cache.
cache_dir = LocalFileStore("./.cache")

# Load the PDF and split it with the tiktoken-based, newline-separated splitter.
loader = UnstructuredFileLoader(file_path="./files/미국주식_Test.pdf")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)
docs = loader.load_and_split(splitter)

# Wrap the OpenAI embedding model so its results go through the local cache store.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=embeddings,
    document_embedding_cache=cache_dir,
)

# The vector store now receives the cache-backed embeddings, not OpenAIEmbeddings directly.
vectorstore = Chroma.from_documents(documents=docs, embedding=cached_embeddings)

# RetrievalQA

In [3]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings 
from langchain.vectorstores import Chroma #FAISS로 바꾸어도 돼. 단, cache지우고
from langchain.storage import LocalFileStore 
from langchain.chains import RetrievalQA

# Chat model used by the chain to produce answers (library defaults).
llm = ChatOpenAI()

# Local file store under ./.cache that backs the embedding cache.
cache_dir = LocalFileStore('./.cache')

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator='\n',
    chunk_size=600,
    chunk_overlap=100
)

# Load the file and split it into smaller documents, so the LLM receives chunks
# rather than the whole file.
loader = UnstructuredFileLoader("./files/미국주식_Test.pdf")
docs = loader.load_and_split(text_splitter=splitter)

# OpenAI embedding model, wrapped so computed embeddings are cached in the local store
# (author's note: embedding calls are not free).
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir
)

# Vector store used to search for related documents.
# Author's note: FAISS can be swapped in here (clear the cache first).
vectorstore = Chroma.from_documents(docs, cached_embeddings)

chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",  # other chain types are described in the notes after this cell
    retriever=vectorstore.as_retriever(),  # the vector store is exposed as a retriever
)

# `Chain.run` is deprecated in favour of `invoke` (which the LCEL cell in this notebook
# already uses). RetrievalQA takes its question under the "query" key and returns the
# answer under "result", so the cell still displays a plain string.
chain.invoke(
    {"query": "회사 운영 본연의 측면에서 회사 경영의 효율성을 측정하는 방법은? 문서에 없는 내용이면 모른다고 말해"}
)["result"]

'문서에는 회사 운영의 효율성을 측정하는 방법에 대한 구체적인 내용은 포함되어 있지 않습니다. 이에 대해 알려드릴 수 있는 정보가 없습니다. 해당 내용에 대해 더 자세히 알고 싶으시다면, 다른 출처를 참고하시기를 권장드립니다.'

-A central question for building a summarizer is how to pass your documents into the LLM's context window. Three common approaches for this are:

1. Stuff: Simply "stuff" all your documents into a single prompt. This is the simplest approach (see here for more on the create_stuff_documents_chain constructor, which is used for this method).
- 검색을 통해 관련 문서들을 얻어서 prompt입력 한후, llm에게 질문에 답하세요라고 전달.
    
2. Map-reduce: Summarize each document on its own in a "map" step and then "reduce" the summaries into a final summary (see here for more on the MapReduceDocumentsChain, which is used for this method).
- Document를 각각 보면서, 개별 답변을 각각 docs에서 찾고, 탐색이 끝나면 일종의 중간 답변들을 기반으로 최종 답 생성

3. Refine: Update a rolling summary by iterating over the documents in a sequence.
- 다큐멘트 최초 답변이 있고, 그것을 고쳐나가는 방식

4. Map_rerank: 각 다큐먼트를 돌아보고, 각 docs에서 질문에 답하고, score를 매긴 뒤 제일 높은 점수의 답변 제공


# Stuff LCEL Chain

#### https://python.langchain.com/v0.1/docs/expression_language/interface/

In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings 
from langchain.vectorstores import Chroma 
from langchain.storage import LocalFileStore 
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

# Chat model used to answer, with a low temperature.
llm = ChatOpenAI(temperature=0.1)

# Local file store under ./.cache that backs the embedding cache.
cache_dir = LocalFileStore('./.cache')

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator='\n',
    chunk_size=600,
    chunk_overlap=100
)

# Load the PDF, split it into chunks, and index the chunks with cache-backed embeddings.
loader = UnstructuredFileLoader("./files/미국주식_Test.pdf")
docs = loader.load_and_split(text_splitter=splitter)
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir
)
vectorstore = Chroma.from_documents(docs, cached_embeddings)

retriever = vectorstore.as_retriever()

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            # Adjacent string literals are joined by the compiler. The previous
            # backslash continuation inside one literal carried the source
            # indentation (a run of spaces) into the prompt text.
            "You are a helpful assistant. Answer questions using only the following context. "
            "If you don't know the answer just say you don't know, don't make it up:\n\n{context}",
        ),
        ("human", "{question}"),
    ]
)

# Data flow, per the author's notes:
#   1. The question string passed to `invoke` goes to the retriever.
#   2. The retriever receives that string and outputs a list of documents.
#   3. a) Those documents fill {context} in the system message.
#      b) RunnablePassthrough forwards the question unchanged into {question}.
chain = (
    {
        "context": retriever,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
)

chain.invoke("회사 운영 본연의 측면에서 회사 경영의 효율성을 측정하는 방법은? 문서에 없는 내용이면 모른다고 말해")

AIMessage(content='절대 가치 평가, 비교 가치 평가, 재무제표 가치 평가와 같은 방법들을 사용하여 회사의 경영 효율성을 측정할 수 있습니다.', response_metadata={'token_usage': <OpenAIObject at 0x7fcd8362da30> JSON: {
  "prompt_tokens": 1242,
  "completion_tokens": 62,
  "total_tokens": 1304
}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-301846b4-0ffe-4a33-a9da-d531e3dd9ad3-0')

# Map Reduce LCEL Chain