### Data Loader

In [4]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader

# Read the plain-text chapter as UTF-8. The bare call on the last line makes
# the notebook display whatever load() returns.
loader = TextLoader(
    file_path="./files/chapter_one.txt",
    encoding="utf-8",
)

loader.load()



In [6]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader

# Same chapter, PDF version: build the loader, then display what load() returns.
loader = PyPDFLoader(
    file_path="./files/chapter_one.pdf",
)

loader.load()

[Document(page_content='II. \n \nA MERRY CHRISTMAS.  \n \nJo was the first to wake in the gray dawn of Christmas morning. No \nstockings hung at the fireplace, and for a moment she felt as much \ndisappointed as she did long ago, when her little sock fell down because \nit was so crammed wi th goodies. Then she remembered her mother\'s promise, \nand, slipping her hand under her pillow, drew out a little crimson -\ncovered book. She knew it very well, for it was that beautiful old story \nof the best life ever lived, and Jo felt that it was a true gu ide-book \nfor any pilgrim going the long journey. She woke Meg with a "Merry \nChristmas," and bade her see what was under her pillow. A green -covered \nbook appeared, with the same picture inside, and a few words written by \ntheir mother, which made their one p resent very precious in their eyes. \nPresently Beth and Amy woke, to rummage and find their little books \nalso,—one dove -colored, the other blue; and all sat looking at and \

In [8]:
# UnstructuredFileLoader: can load all kinds of files (text files, PowerPoints, HTML, PDFs, images, ...)
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader

# Load the Word (.docx) version of the chapter and display what load() returns.
loader = UnstructuredFileLoader(
    file_path="./files/chapter_one.docx",
)

loader.load()

[Document(page_content='Chapter 1\n\nI am by birth a Genevese, and my family is one of the most distinguished of that republic. My ancestors had been for many years counsellors and syndics, and my father had filled several public situations with honour and reputation. He was respected by all who knew him for his integrity and indefatigable attention to public business. He passed his younger days perpetually occupied by the affairs of his country; a variety of circumstances had prevented his marrying early, nor was it until the decline of life that he became a husband and the father of a family.\n\nAs the circumstances of his marriage illustrate his character, I cannot refrain from relating them. One of his most intimate friends was a merchant who, from a flourishing state, fell, through numerous mischances, into poverty. This man, whose name was Beaufort, was of a proud and unbending disposition and could not bear to live in poverty and oblivion in the same country where he had formerl

### Data Split

In [18]:
# Splitting the data makes it easier to find the parts that are needed.
# The prompt I build also gets shorter.

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# The loader reads the .docx chapter; load_and_split() below passes the
# splitter in through its text_splitter argument.
loader = UnstructuredFileLoader("./files/chapter_one.docx")

# RecursiveCharacterTextSplitter: cuts at the end of paragraphs or sentences.
# chunk_overlap: each chunk carries over part of the previous chunk, so
# neighbouring chunks share some duplicated text.
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

# Last expression of the cell, so the resulting chunks are displayed.
loader.load_and_split(text_splitter=splitter)
# print(len(loader.load_and_split(text_splitter=splitter)))

[Document(page_content='Chapter 1', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='I am by birth a Genevese, and my family is one of the most distinguished of that republic. My ancestors had been for many years counsellors and syndics, and my father had filled several public', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='syndics, and my father had filled several public situations with honour and reputation. He was respected by all who knew him for his integrity and indefatigable attention to public business. He', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='indefatigable attention to public business. He passed his younger days perpetually occupied by the affairs of his country; a variety of circumstances had prevented his marrying early, nor was it', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='had prevented his marrying early, nor was it until the decline of life that he b

In [19]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter


loader = UnstructuredFileLoader("./files/chapter_one.docx")

# Split on single newlines and measure chunk length with len().
# The cell output logs "Created a chunk of size ..." whenever a piece ends up
# longer than chunk_size.
splitter = CharacterTextSplitter(
    separator="\n", chunk_size=600, chunk_overlap=100, length_function=len
)

# Last expression of the cell, so the resulting chunks are displayed.
loader.load_and_split(text_splitter=splitter)
# print(len(loader.load_and_split(text_splitter=splitter)))

Created a chunk of size 1000, which is longer than the specified 600
Created a chunk of size 794, which is longer than the specified 600
Created a chunk of size 611, which is longer than the specified 600
Created a chunk of size 1493, which is longer than the specified 600
Created a chunk of size 1112, which is longer than the specified 600
Created a chunk of size 1614, which is longer than the specified 600
Created a chunk of size 932, which is longer than the specified 600
Created a chunk of size 785, which is longer than the specified 600
Created a chunk of size 795, which is longer than the specified 600
Created a chunk of size 1200, which is longer than the specified 600
Created a chunk of size 1199, which is longer than the specified 600
Created a chunk of size 671, which is longer than the specified 600
Created a chunk of size 1105, which is longer than the specified 600
Created a chunk of size 643, which is longer than the specified 600
Created a chunk of size 834, which is lon

[Document(page_content='Chapter 1\nI am by birth a Genevese, and my family is one of the most distinguished of that republic. My ancestors had been for many years counsellors and syndics, and my father had filled several public situations with honour and reputation. He was respected by all who knew him for his integrity and indefatigable attention to public business. He passed his younger days perpetually occupied by the affairs of his country; a variety of circumstances had prevented his marrying early, nor was it until the decline of life that he became a husband and the father of a family.', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='As the circumstances of his marriage illustrate his character, I cannot refrain from relating them. One of his most intimate friends was a merchant who, from a flourishing state, fell, through numerous mischances, into poverty. This man, whose name was Beaufort, was of a proud and unbending disposition and could not bear t

### Tiktoken

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter


# Same settings as the CharacterTextSplitter cell above, but built through
# from_tiktoken_encoder instead of passing length_function=len.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredFileLoader("./files/chapter_one.docx")

# Actually run the split, as the other Data Split cells do; without this
# call the cell only builds objects and shows nothing.
loader.load_and_split(text_splitter=splitter)


### Vector Store

In [26]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# Local file store under ./.cache/ that is handed to the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

# Load the chapter and cut it with the tiktoken-based splitter.
loader = UnstructuredFileLoader("./files/chapter_one.docx")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)
docs = loader.load_and_split(text_splitter=splitter)

# Wrap the OpenAI embeddings with the file store before building the index.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

vectorstore = Chroma.from_documents(docs, cached_embeddings)

In [23]:
# Run a similarity search against the vector store and display the matches.
results = vectorstore.similarity_search(query="Where are you from")

results

[Document(page_content='From Italy they visited Germany and France. I, their eldest child, was born at Naples, and as an infant accompanied them in their rambles. I remained for several years their only child. Much as they were attached to each other, they seemed to draw inexhaustible stores of affection from a very mine of love to bestow them upon me. My mother’s tender caresses and my father’s smile of benevolent pleasure while regarding me are my first recollections. I was their plaything and their idol, and something better—their child, the innocent and helpless creature bestowed on them by Heaven, whom to bring up to good, and whose future lot it was in their hands to direct to happiness or misery, according as they fulfilled their duties towards me. With this deep consciousness of what they owed towards the being to which they had given life, added to the active spirit of tenderness that animated both, it may be imagined that while during every hour of my infant life I received a

### RetrievalQA

In [30]:
# Chain_type = Stuff

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

llm = ChatOpenAI()

cache_dir = LocalFileStore("./.cache/")

# Load the chapter and cut it with the tiktoken-based splitter.
loader = UnstructuredFileLoader("./files/chapter_one.docx")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)
docs = loader.load_and_split(text_splitter=splitter)

# Embeddings are wrapped with the local file store before indexing in Chroma.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)
vectorstore = Chroma.from_documents(docs, cached_embeddings)

# RetrievalQA with the "stuff" chain type, retrieving from the store above.
chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever()
)

# chain.run("What is Beaufort's job")
chain.run("Describe Beaufort")

"Beaufort, as described in the text, was a merchant who was once wealthy and distinguished but fell into poverty due to a series of misfortunes. He was described as having a proud and unbending disposition, unable to bear the idea of living in poverty and obscurity. Despite his circumstances, he maintained his integrity and honor by paying off his debts in the most honorable manner. He retreated with his daughter, Caroline, to the town of Lucerne, where they lived in wretchedness and obscurity. Beaufort's pride and despair eventually took a toll on his health, leaving him bedridden and unable to work. His daughter, Caroline, showed great tenderness and courage, taking on various jobs such as plain work and plaiting straw to earn a meager income to support their livelihood."

In [31]:
# Chain_type = refine

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

llm = ChatOpenAI()

cache_dir = LocalFileStore("./.cache/")

# Chunk the .docx chapter with the tiktoken-based splitter.
loader = UnstructuredFileLoader("./files/chapter_one.docx")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)
docs = loader.load_and_split(text_splitter=splitter)

# Index the chunks in Chroma using the file-store-backed embeddings.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)
vectorstore = Chroma.from_documents(docs, cached_embeddings)

# This cell uses the "refine" chain type.
chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="refine", retriever=vectorstore.as_retriever()
)

# chain.run("What is Beaufort's job")
chain.run("Describe Beaufort")

"Beaufort, as described in the additional context, is a former wealthy merchant and a friend of the narrator's father. He faced a series of misfortunes that led to his fall into poverty. In response, Beaufort retreated to the town of Lucerne with his daughter, Caroline Beaufort, where they lived in wretchedness and obscurity. The narrator's father, deeply saddened by his friend's unfortunate circumstances, made efforts to find Beaufort and offer his assistance. It took nearly ten months for the narrator's father to discover Beaufort's whereabouts, as Beaufort had taken measures to hide himself. During this time, Caroline Beaufort worked tirelessly to earn a meager income to support herself and her father."

In [34]:
# Chain_type = refine
# Chain_type = map_reduce
# Chain_type = map_rerank

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

llm = ChatOpenAI()

cache_dir = LocalFileStore("./.cache/")

# Chunk the .docx chapter with the tiktoken-based splitter.
loader = UnstructuredFileLoader("./files/chapter_one.docx")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)
docs = loader.load_and_split(text_splitter=splitter)

# This cell indexes the chunks in FAISS (the earlier cells used Chroma).
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# This cell uses the "map_rerank" chain type.
chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="map_rerank", retriever=vectorstore.as_retriever()
)

# chain.run("What is Beaufort's job")
chain.run("Describe Beaufort")



'Beaufort was a proud and unbending man who fell into poverty and retreated to the town of Lucerne with his daughter. He had previously been distinguished for his rank and magnificence, but could not bear to live in poverty and obscurity. Despite his circumstances, he maintained his integrity by paying off his debts in an honorable manner. He saved a small sum of money, which provided sustenance for a few months while he hoped to find respectable employment. However, his grief over his situation consumed him, leading to sickness and incapacity for any exertion. His daughter, Caroline Beaufort, supported him with great tenderness and tried to earn a living through various means. '

### Stuff LCEL Chain

In [35]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

llm = ChatOpenAI(
    temperature=0.1,
)

cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredFileLoader("./files/chapter_one.docx")

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

vectorstore = FAISS.from_documents(docs, cached_embeddings)

retriever = vectorstore.as_retriever()


def format_docs(retrieved_docs):
    """Join the page_content of the retrieved documents into one plain-text string.

    Args:
        retrieved_docs: Iterable of objects exposing ``page_content``.

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}",
    ),
    ("human", "{question}"),
])

# Pipe the retriever through format_docs so {context} is filled with the
# chunk text itself rather than the repr of a list of Document objects
# (which would drag metadata and escaped newlines into the prompt).
chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
)

# chain.invoke("What is Beaufort's job")
chain.invoke("Describe Beaufort")

AIMessage(content="Beaufort was a proud and unbending man who was once a wealthy merchant. However, due to numerous misfortunes, he fell into poverty and obscurity. Despite his circumstances, he maintained his integrity and honor by paying off his debts in an honorable manner. He retreated to the town of Lucerne with his daughter, living in wretchedness and anonymity. Beaufort's pride prevented him from accepting help from his friend, the narrator's father, until he became sick and incapable of providing for himself.")

### Map Reduce LCEL Chain

In [39]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Chat model used by map_doc_chain below and by the final answer chain.
llm = ChatOpenAI(temperature=0.1)

cache_dir = LocalFileStore("./.cache/")

# Chunk the .docx chapter with the tiktoken-based splitter.
loader = UnstructuredFileLoader("./files/chapter_one.docx")
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n", chunk_size=600, chunk_overlap=100
)
docs = loader.load_and_split(text_splitter=splitter)

# Index the chunks in FAISS using the file-store-backed embeddings.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)
vectorstore = FAISS.from_documents(docs, cached_embeddings)
retriever = vectorstore.as_retriever()

# Per-document ("map") prompt: asks for the relevant text verbatim, or ''
# when the portion holds nothing relevant.
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
            -------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)

map_doc_chain = map_doc_prompt | llm

def map_docs(inputs):
    """Run the per-document map chain and merge the useful extracts.

    Args:
        inputs: Mapping with "documents" (an iterable of objects exposing
            ``page_content``) and "question" (the user's question).

    Returns:
        The non-empty extracts joined with blank lines, in document order;
        an empty string when no document yielded any text.
    """
    documents = inputs["documents"]
    question = inputs["question"]
    # The map prompt tells the model to return '' when a portion is
    # irrelevant. Drop those placeholders (and blank replies) so they do not
    # pad the context handed to the final prompt.
    no_match_markers = {"", "''", '""'}
    extracts = []
    for doc in documents:
        reply = map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        ).content.strip()
        if reply not in no_match_markers:
            extracts.append(reply)
    return "\n\n".join(extracts)

# Map step: retrieve documents for the question, then hand both to map_docs.
map_chain = (
    {"documents": retriever, "question": RunnablePassthrough()}
    | RunnableLambda(map_docs)
)

# Final ("reduce") prompt: answers the question from the map step's output.
final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Given the following extracted parts of a long document and a question, create a final answer. 
            If you don't know the answer, just say that you don't know. Don't try to make up an answer.
            ------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)

chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | llm
)

# chain.invoke("What is Beaufort's job")
chain.invoke("Describe Beaufort")

AIMessage(content='There is no information provided in the given text about Beaufort.')