In [None]:
from langchain_community.document_loaders.csv_loader import CSVLoader
import csv

# Location of the rental data CSV; the path is relative, so it resolves
# against the kernel's current working directory.
file_path = "../data/csv_data/rental_data.csv"


def get_csv_headers(file_path):
    """Return the column names found on the first row of a CSV file.

    Args:
        file_path: Path to a UTF-8 encoded CSV file.

    Returns:
        list[str]: The fields of the first row, or an empty list when the
        file contains no rows at all.
    """
    # newline="" hands line-ending handling to the csv module, which keeps
    # quoted fields containing embedded newlines intact.
    with open(file_path, mode="r", encoding="utf-8", newline="") as csvfile:
        reader = csv.reader(csvfile)
        # The [] default avoids leaking StopIteration for an empty file.
        return next(reader, [])


# Column names taken from the first line of the CSV.
headers = get_csv_headers(file_path)

# Build the CSV loader: every column goes into page_content, while
# "price" and "place" are additionally copied into each document's metadata.
# NOTE: because explicit fieldnames are supplied, the csv reader treats the
# header line as an ordinary data row, so that line is loaded as docs[0].
loader = CSVLoader(
    file_path=file_path,
    metadata_columns=["price", "place"],
    content_columns=headers,
    csv_args=dict(delimiter=",", quotechar='"', fieldnames=headers),
)
docs = loader.load()

In [None]:
def page_content_to_xml(page_content):
    """Rewrite "column: value" lines as a single XML-like <row> string.

    Args:
        page_content: Text with one "column: value" pair per line.

    Returns:
        str: "<row><col>value</col>...</row>" followed by two newlines.
    """
    row_str = "<row>"
    for element in page_content.split("\n"):
        # Split on the FIRST colon only, so values that themselves contain
        # ":" (times, URLs, ...) are kept whole instead of being cut apart.
        col, sep, value = element.partition(":")
        if not sep:
            # A line with no colon at all: empty tag name, whole line as value.
            col, value = "", element
        row_str += f"<{col}>{value.strip()}</{col}>"
    return row_str + "</row>\n\n"


# The loader is created with explicit fieldnames, so the CSV header line is
# loaded as docs[0]; it is not a real record and is skipped here.
for doc in docs[1:]:
    # The guard keeps this cell idempotent: content that was already converted
    # on a previous run is left alone instead of being parsed a second time.
    if not doc.page_content.startswith("<row>"):
        doc.page_content = page_content_to_xml(doc.page_content)

# Only real data rows are passed on for embedding; the header pseudo-row in
# docs[0] is excluded so it cannot pollute search results.
ret = docs[1:]

In [None]:
# from langchain_text_splitters import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=400,
#     chunk_overlap=0,
#     length_function=len,
#     is_separator_regex=False,
# )

# texts = text_splitter.create_documents([ret])
# count = 0
# for i in texts[:]:
#     count += 1
#     # print(i.page_content)
#     # print("\n")
# print(count)

In [None]:
# Helper for managing API keys through environment variables (settings file)
from dotenv import load_dotenv

# Load the API key settings into the process environment
load_dotenv()

In [None]:
# Enable LangSmith tracing. https://smith.langchain.com
# !pip install langchain-teddynote
# NOTE: this `logging` is the langchain_teddynote module, not the standard
# library `logging`; the name shadows the stdlib one in this notebook.
from langchain_teddynote import logging

# Pass the project name to trace under.
logging.langsmith("VectorStores")

In [None]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_openai import OpenAIEmbeddings

# Embedding model shared by the cells below
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Determine the embedding dimension by embedding a sample query and
# measuring the length of the returned vector
dimension_size = len(embeddings.embed_query("hello world"))
print(dimension_size)

In [None]:
# Create an empty FAISS vector store: a flat L2 index sized to the embedding
# dimension, an in-memory docstore, and an empty index -> docstore-id mapping.
# NOTE(review): a later cell rebinds `db` via FAISS.from_documents, so this
# empty store is discarded; it only demonstrates manual construction.
db = FAISS(
    embedding_function=embeddings,
    index=faiss.IndexFlatL2(dimension_size),
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [None]:
# Build the vector store from the first three documents.
# Reuse the shared `embeddings` instance (text-embedding-3-small) rather than
# a bare OpenAIEmbeddings(): the bare constructor falls back to the library's
# default model, so the index would be built with a different model than the
# one handed to FAISS.load_local later, silently degrading search results.
db = FAISS.from_documents(documents=ret[:3], embedding=embeddings)

In [None]:
# 벡터 저장소에 Document 문서 추가
# from langchain_core.documents import Document

# # page_content, metadata 지정
# db.add_documents(
#     [
#         Document(
#             page_content="안녕하세요! 이번엔 도큐먼트를 새로 추가해 볼께요",
#             metadata={"source": "mydata.txt"},
#         )
#     ],
#     ids=["new_doc1"],
# )

In [None]:
from langchain_core.documents import Document

# Add the remaining documents (index 3 onward) to the existing vector store
db.add_documents(
    ret[3:]
)

In [None]:
# Inspect the mapping from index position to docstore ID
db.index_to_docstore_id

In [None]:
# Inspect the stored documents.
# NOTE: `_dict` is a private attribute of the docstore, used here for
# inspection only.
db.docstore._dict

In [25]:
# Threshold-based search: the retriever is configured with the
# "similarity_score_threshold" search type and a score_threshold of 0.7
retriever = db.as_retriever(
    search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.7}
)

# Query the retriever; the result list is displayed as the cell output
retriever.invoke("수정 오피스텔에 대해 알려줘")

[Document(id='ca23576b-346e-473d-8280-32d9aab1b09e', metadata={'source': '../data/csv_data/rental_data.csv', 'row': 18, 'price': '년세 340 / 반년세 170', 'place': '중문'}, page_content='<row><name>수정오피스텔</name><address>호서로79번길 8-7</address><contact>010-7143-4949</contact><price>년세 340 / 반년세 170</price><fee>X / 10만원</fee><options>풀옵션</options><gas_type>심야전기</gas_type><comment>cctv 가동</comment><place>중문</place></row>\n\n'),
 Document(id='13631f45-98f4-4891-a687-f9bfa954152d', metadata={'source': '../data/csv_data/rental_data.csv', 'row': 15, 'price': '원룸 년세 280, 300 / 반년세 150', 'place': '중문'}, page_content='<row><name>하버드오피스텔</name><address>호서로79번길 8-16</address><contact>010-2844-2995</contact><price>원룸 년세 280, 300 / 반년세 150</price><fee>X / 20만원</fee><options>풀옵션</options><gas_type>심야전기</gas_type><comment>투룸은 직접 문의</comment><place>중문</place></row>\n\n'),
 Document(id='1b5e9435-2113-4190-8bd3-121812c37353', metadata={'source': '../data/csv_data/rental_data.csv', 'row': 19, 'price': '년세 310 / 반년세

In [26]:
# Apply a metadata filter: restrict results to documents whose "place"
# metadata equals the given value, with k set to 5
retriever = db.as_retriever(
    search_kwargs={"filter": {"place": "중문"}, "k": 5}
)
# NOTE(review): the query text asks about "ESG", which looks unrelated to the
# rental data; confirm whether a rental-related query was intended.
retriever.invoke("ESG 에 대하여 알려줘")

[Document(id='f9d50d8d-b62d-4596-8e02-72fdc8ce6d0a', metadata={'source': '../data/csv_data/rental_data.csv', 'row': 21, 'price': '년세 220 / 반년세 110', 'place': '중문'}, page_content='<row><name>그린빌</name><address>호서로79번길 8-22</address><contact>010-9037-2656</contact><price>년세 220 / 반년세 110</price><fee>X / 30</fee><options>풀옵션</options><gas_type>도시가스</gas_type><comment>전기세 개별 납부</comment><place>중문</place></row>\n\n'),
 Document(id='3abe3b69-6a13-4bc6-be24-608c71796719', metadata={'source': '../data/csv_data/rental_data.csv', 'row': 13, 'price': '년세 340 - 350 / 반년세 200', 'place': '중문'}, page_content='<row><name>상아빌라</name><address>호서로79번길 8-15</address><contact>010-6477-8486</contact><price>년세 340 - 350 / 반년세 200</price><fee>X / 10</fee><options>풀옵션</options><gas_type>심야전기</gas_type><comment>null</comment><place>중문</place></row>\n\n'),
 Document(id='e9a9d58a-a3a5-49dc-bcf2-c0e58d2ca651', metadata={'source': '../data/csv_data/rental_data.csv', 'row': 16, 'price': '년세 300 / 반년세 160', 'place': 

In [None]:
# Persist the vector store to local disk under the "faiss_db" folder
db.save_local(folder_path="faiss_db", index_name="faiss_index")

In [None]:
# id 로 삭제
# db.delete([id])

In [None]:
# 벡터 저장소를 병합
# db.merge_from(db2)

In [None]:
# Load the previously saved vector store from disk.
# SECURITY: allow_dangerous_deserialization=True opts in to unpickling the
# saved docstore; only load folders that this notebook (or another trusted
# source) produced, since unpickling untrusted data can execute code.
loaded_db = FAISS.load_local(
    folder_path="faiss_db",
    index_name="faiss_index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
# Inspect the index -> docstore ID mapping of the reloaded store
loaded_db.index_to_docstore_id