In [2]:
# Data Loaders and Splitters
# .txt
from langchain.document_loaders import TextLoader

# Point the loader at the plain-text chapter, then read it; as the cell's
# last expression, the loaded result is what the notebook displays.
source_path = './files/chapter_one.txt'
loader = TextLoader(source_path)
loader.load()

[Document(page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had

In [1]:
# .pdf
from langchain.document_loaders import PyPDFLoader

# Point the loader at the PDF version of the chapter, then read it; as the
# cell's last expression, the loaded result is what the notebook displays.
source_path = './files/chapter_one.pdf'
loader = PyPDFLoader(source_path)
loader.load()

[Document(page_content="제목 없음\n1Part 1, Chapter 1\nPart One\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith,  \nhis chin nuzzled into his breast in an ef fort to escape the vile wind, slipped quickly  \nthrough the glass doors of V ictory Mansions, though not quickly enough to prevent a  \nswirl of gritty dust from entering along with him.\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured  \nposter , too large for indoor display , had been tacked to the wall. It depicted simply an  \nenormous face, more than a metre wide: the face of a man of about forty-five, with a  \nheavy black moustache and ruggedly handsome features. Winston made for the  \nstairs. It was no use trying the lift. Even at the best of times it was seldom working,  \nand at present the electric current was cut of f during daylight hours. It was part of the  \neconomy drive in preparation for Hate W eek. The flat was seven flights up, and  \n

In [5]:
# UnstructuredFileLoader
from langchain.document_loaders import UnstructuredFileLoader

# Point the loader at the .docx version of the chapter, then read it; as the
# cell's last expression, the loaded result is what the notebook displays.
source_path = './files/chapter_one.docx'
loader = UnstructuredFileLoader(source_path)
loader.load()

[Document(page_content="Part 1, Chapter 1\n\nPart One\n\n\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had

In [5]:
# Text split
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# CHUNK_SIZE decides how large each chunk may be.
# CHUNK_OVERLAP makes a chunk begin with part of the previous one when a
# sentence or paragraph has to be split.
CHUNK_SIZE = 200
CHUNK_OVERLAP = 50
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Document that the splitter is meant to be applied to.
loader = UnstructuredFileLoader('./files/chapter_one.docx')

# Two ways to run the split; both are left disabled in this cell.
# Option 1
# docs = loader.load()
# splitter.split_documents(docs)

# Option 2
# loader.load_and_split(text_splitter=splitter)

In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# separator: the string to look for; the text is split at each occurrence
#            (default = '\n\n').
# chunk_size: intended maximum of 600 characters per resulting document. The
#            warnings recorded under this cell show chunks that still came out
#            longer than that.
# chunk_overlap: 100.
splitter = CharacterTextSplitter(separator="\n", chunk_size=600, chunk_overlap=100)

loader = UnstructuredFileLoader('./files/chapter_one.docx')

# Load the document and split it with the splitter configured above.
loader.load_and_split(text_splitter=splitter)


Created a chunk of size 963, which is longer than the specified 600
Created a chunk of size 774, which is longer than the specified 600
Created a chunk of size 954, which is longer than the specified 600
Created a chunk of size 922, which is longer than the specified 600
Created a chunk of size 1168, which is longer than the specified 600
Created a chunk of size 821, which is longer than the specified 600
Created a chunk of size 700, which is longer than the specified 600
Created a chunk of size 745, which is longer than the specified 600
Created a chunk of size 735, which is longer than the specified 600
Created a chunk of size 1110, which is longer than the specified 600
Created a chunk of size 991, which is longer than the specified 600
Created a chunk of size 990, which is longer than the specified 600
Created a chunk of size 1741, which is longer than the specified 600
Created a chunk of size 2001, which is longer than the specified 600
Created a chunk of size 1900, which is longe

39

In [None]:
# Tiktoken
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# Splitter built through from_tiktoken_encoder (presumably so chunk lengths
# are measured in tiktoken tokens rather than characters — confirm against
# the LangChain documentation).
splitter = CharacterTextSplitter.from_tiktoken_encoder(separator="\n", chunk_size=600, chunk_overlap=100)

loader = UnstructuredFileLoader('./files/chapter_one.docx')

# The actual load-and-split call is left disabled in this cell.
# loader.load_and_split(text_splitter=splitter)


In [None]:
# Vectors
# Embedding
# Tiktoken
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter

# Splitter built through from_tiktoken_encoder (presumably so chunk lengths
# are measured in tiktoken tokens rather than characters — confirm against
# the LangChain documentation).
splitter = CharacterTextSplitter.from_tiktoken_encoder(separator="\n", chunk_size=600, chunk_overlap=100)

loader = UnstructuredFileLoader('./files/chapter_one.docx')

# Load the document and split it with the splitter configured above.
loader.load_and_split(text_splitter=splitter)

      | Masculinity | Femininity | Royalty
king  | 0.9         | 0.1        | 1.0
queen | 0.1         | 0.9        | 1.0
man   | 0.9         | 0.1        | 0.0
royal | 0.0         | 0.0        | 1.0
woman | 0.1         | 0.9        | 0.0
> 각 단어가 해당 특성을 얼마나 반영하는지

example)
king - man = 0.0 | 0.0 | 1.0 => royal

royal + woman = 0.1 | 0.9 | 1.0 => queen



In [6]:
# Vector Store
from langchain.embeddings import OpenAIEmbeddings

embedder = OpenAIEmbeddings()

# Single-string variant, left disabled:
# vector = embedder.embed_query("Hi")
# len(vector)

# Embed several texts of different lengths in one call.
sample_texts = ["hi", "how", "are", "you longer sentences because"]
vector = embedder.embed_documents(sample_texts)

# First number: how many vectors came back. Second number: length of the
# first vector. (The recorded run printed "4 1536".)
vector_count = len(vector)
vector_width = len(vector[0])
print(vector_count, vector_width)

4 1536


In [11]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
from langchain.storage import LocalFileStore

# File store rooted at ./.cache/ that is handed to the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

# Split the .docx chapter into chunks.
splitter = CharacterTextSplitter.from_tiktoken_encoder(separator="\n", chunk_size=600, chunk_overlap=100)
loader = UnstructuredFileLoader('./files/chapter_one.docx')
docs = loader.load_and_split(text_splitter=splitter)

# Pair the OpenAI embedder with the file store through CacheBackedEmbeddings
# (presumably so chunks that were already embedded are read back from disk
# instead of being embedded again — confirm against the LangChain docs).
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# Build a Chroma vector store from the chunks using the cache-backed embedder.
vectorstore = Chroma.from_documents(docs, cached_embeddings)

In [None]:
# Langsmith
# https://www.langchain.com/langsmith


In [7]:
# RetrievalQA
# document chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

llm = ChatOpenAI(temperature=0.1)

# File store rooted at ./.cache/ that is handed to the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

# Split the .docx chapter into chunks.
splitter = CharacterTextSplitter.from_tiktoken_encoder(separator="\n", chunk_size=600, chunk_overlap=100)
loader = UnstructuredFileLoader('./files/chapter_one.docx')
docs = loader.load_and_split(text_splitter=splitter)

# Cache-backed embedder feeding a FAISS vector store.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# Question-answering chain over the vector store's retriever.
retriever = vectorstore.as_retriever()
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='refine',  # default = 'stuff'
    retriever=retriever,
)

chain.run("Describe Victory Mansions")

"Based on the new context, Victory Mansions can be further described as a residential building located in London, specifically in Airstrip One, which is one of the provinces of Oceania. The passage reveals that Victory Mansions is equipped with a telescreen, a device used by the government to monitor and control its citizens. The telescreen is positioned in an unusual location, opposite the window, which allows the protagonist, Winston, to sit in a shallow alcove and remain out of sight from the telescreen's view. This suggests that the government's surveillance is pervasive and constant, even within the confines of one's own home. Additionally, the passage mentions that Winston possesses a forbidden book, which he acquired from a junk shop in a slummy quarter of the town. This implies that Victory Mansions is located in a rundown and impoverished area, where access to certain goods and information is restricted. Overall, Victory Mansions is depicted as a place where residents live und

In [4]:
# Ask the same question again through whichever `chain` is currently defined in the kernel.
# NOTE(review): this cell's execution count (In [4]) is lower than that of the cell above
# (In [7]), and its saved output reads differently (it ends with a Korean translation), so
# it was presumably produced by an earlier chain configuration — confirm, or re-run the
# notebook top to bottom after a kernel restart.
chain.run("Describe Victory Mansions")

'Victory Mansions is a building with glass doors and a hallway that smells of boiled cabbage and old rag mats. There is a large colored poster on one wall, depicting the face of a man with a black mustache. The building has seven floors, and the elevator is usually not working. The walls of the building are adorned with posters of Big Brother, with the caption "BIG BROTHER IS WATCHING YOU." Inside the flat, there is a telescreen, an oblong metal plaque that cannot be completely turned off. The protagonist, Winston, is described as a small and frail figure, wearing blue overalls. He has fair hair, a naturally sanguine face, and rough skin from the cold weather.\n\nTranslation to Korean:\n빅토리 맨션은 유리문이 있는 건물이며 복도는 삶은 양배추와 낡은 래그 매트의 냄새가 납니다. 한 벽에는 검은 콧수염을 가진 남자의 얼굴이 크게 그려진 컬러 포스터가 붙어 있습니다. 건물은 7층으로 구성되어 있으며 엘리베이터는 보통 작동하지 않습니다. 건물의 벽에는 "빅 브라더가 당신을 지켜보고 있습니다"라는 문구와 함께 빅 브라더의 포스터가 걸려 있습니다. 아파트 안에는 텔레스크린이라고 불리는 금속 판이 있으며 완전히 끌 수는 없습니다. 주인공인 윈스턴은 작고 연약한 인물로, 파란 작업복을 입고 있습니다. 그는 밝은 머리카락과 자연스러운 

In [5]:
# 빅토리 맨션은 유리문이 있는 건물이며 복도는 삶은 양배추와 낡은 래그 매트의 냄새가 납니다. 한 벽에는 검은 콧수염을 가진 남자의 얼굴이 크게 그려진 컬러 포스터가 붙어 있습니다. 건물은 7층으로 구성되어 있으며 엘리베이터는 보통 작동하지 않습니다. 건물의 벽에는 "빅 브라더가 당신을 지켜보고 있습니다"라는 문구와 함께 빅 브라더의 포스터가 걸려 있습니다. 아파트 안에는 텔레스크린이라고 불리는 금속 판이 있으며 완전히 끌 수는 없습니다. 주인공인 윈스턴은 작고 연약한 인물로, 파란 작업복을 입고 있습니다. 그는 밝은 머리카락과 자연스러운 얼굴, 거친 비누와 무딘 면도날, 그리고 방금 끝난 추운 겨울로 인해 거친 피부를 가지고 있습니다.

In [1]:
# Stuff LCEL Chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

# RunnableLambda wraps the document-formatting helper below so it can be piped
# after the retriever (same module the cell already imports RunnablePassthrough from).
from langchain.schema.runnable import RunnableLambda

llm = ChatOpenAI(
    temperature=0.1
)

# File store rooted at ./.cache/ that is handed to the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredFileLoader('./files/chapter_one.docx')

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir
)

vectorstore = FAISS.from_documents(docs, cached_embeddings)

retriever = vectorstore.as_retriever()

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}"),
    ("human", "{question}"),
])


def format_docs(documents):
    """Join the text of the retrieved documents into one context string.

    Without this step the retriever's list of Document objects would be
    interpolated into ``{context}`` as the list's repr (class names, metadata
    and escaped newlines) instead of the plain chapter text.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values, in retrieval order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    return "\n\n".join(document.page_content for document in documents)


# "Stuff" chain: the retrieved text and the original question fill the prompt,
# and the filled prompt is sent to the chat model.
chain = ({
    "context": retriever | RunnableLambda(format_docs),
    "question": RunnablePassthrough()
    }
    | prompt
    | llm
)

chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a building where Winston Smith resides. It is described as having glass doors at the entrance, which allow gritty dust to enter along with people. The hallway of Victory Mansions has a smell of boiled cabbage and old rag mats. There is a large colored poster on one end of the hallway, depicting the face of a man in his forties with a black mustache. The building has seven floors, and the flat where Winston lives is on the seventh floor. The flat is accessed by stairs since the lift is rarely working. The building is not well-maintained, with rotting houses and patched windows. From the roof of Victory Mansions, one can see the other three Ministries of Oceania.')

In [3]:
# Map Reduce LCEL Chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

llm = ChatOpenAI(temperature=0.1)

# File store rooted at ./.cache/ that is handed to the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

# Split the .docx chapter into chunks.
splitter = CharacterTextSplitter.from_tiktoken_encoder(separator="\n", chunk_size=600, chunk_overlap=100)
loader = UnstructuredFileLoader('./files/chapter_one.docx')
docs = loader.load_and_split(text_splitter=splitter)

# Cache-backed embedder feeding a FAISS vector store.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# Retrieval step: produces the list of docs.
retriever = vectorstore.as_retriever()

# Map step, meant to run once per retrieved doc: prompt | llm.
# The system template is spelled out piece by piece so that its leading
# newline and per-line indentation are visible.
map_system_template = (
    "\n"
    "            Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''\n"
    "            -------\n"
    "            {context}\n"
    "            "
)
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", map_system_template),
        ("human", "{question}"),
    ]
)

map_doc_chain = map_doc_prompt | llm

# for response in list of llms response | put them all together

def map_docs(inputs):
    """Run the per-document chain over every retrieved document.

    Args:
        inputs: Mapping with a 'documents' entry (an iterable of objects
            exposing ``page_content``) and a 'question' entry.

    Returns:
        The ``content`` of each ``map_doc_chain`` response, in document
        order, joined with a blank line between them.
    """
    documents = inputs['documents']
    question = inputs['question']

    extracts = []
    for document in documents:
        # One chain call per document, made in order.
        response = map_doc_chain.invoke({
            "context": document.page_content,
            "question": question
        })
        extracts.append(response.content)

    return "\n\n".join(extracts)


# Map stage: the retriever supplies the documents, the question is passed
# through unchanged, and map_docs turns both into one combined string.
map_inputs = {
    "documents": retriever,
    "question": RunnablePassthrough(),
}
map_chain = map_inputs | RunnableLambda(map_docs)

# The system template is spelled out piece by piece so that its leading
# newline, per-line indentation and trailing spaces are visible.
final_system_template = (
    "\n"
    "            Given the following extracted parts of a long document and a question, create a final answer. \n"
    "            If you don't know the answer, just say that you don't know. Don't try to make up an answer.\n"
    "            ------\n"
    "            {context}\n"
    "            "
)
final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", final_system_template),
        ("human", "{question}"),
    ]
)

# Reduce stage: final doc | prompt | llm.
# The value that map_docs returns inside map_chain is what fills {context}.
reduce_inputs = {
    "context": map_chain,
    "question": RunnablePassthrough(),
}
chain = reduce_inputs | final_prompt | llm

chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a building located in London, specifically in Airstrip One, which is the chief city of Oceania. It is described as a structure that stands amidst a grimy landscape. The building has glass doors that Winston Smith enters. The hallway of the building has a smell of boiled cabbage and old rag mats. There is a large colored poster on one end of the hallway, depicting the face of a man in his forties with a black mustache and ruggedly handsome features. The building has seven flights of stairs, and the elevator is rarely working. On each landing, there is a poster with the caption "BIG BROTHER IS WATCHING YOU." Inside the flat, there is a telescreen, an oblong metal plaque that functions as a dulled mirror on the right-hand wall. The flat has a window, and Winston is described as a smallish, frail figure wearing blue overalls. From the roof of Victory Mansions, one can see the Ministry of Truth along with three other buildings of similar appearance and