In [1]:
# Load settings so API keys can be managed through environment variables
from dotenv import load_dotenv

# Configure LangSmith tracing. https://smith.langchain.com
# !pip install langchain-teddynote
from langchain_teddynote import logging

# Load the API key information
load_dotenv()

# Set the LangSmith project name.
logging.langsmith("rented_room")

LangSmith 추적을 시작합니다.
[프로젝트명]
rented_room


In [2]:
# Helper for printing documents in a readable way
def pretty_print_docs(docs):
    """Print each item of ``docs`` followed by a visual divider.

    Args:
        docs: Iterable of objects to print; each one is rendered by ``print``.
    """
    divider = "\n=====================================\n"
    for document in docs:
        # A single call emits the document, a newline, and then the divider.
        print(document, divider, sep="\n")
    # To show selected metadata fields instead of the whole document, print
    # doc.metadata[...] for keys such as "place", "oneroom_half_year",
    # "oneroom_year", "tworoom_half_year" and "tworoom_year".

In [3]:
from langchain_chroma import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings

# Directory where the Chroma database is persisted
DB_PATH = "./chroma_db"

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Open the vector store persisted on disk.
persist_db = Chroma(
    persist_directory=DB_PATH,
    embedding_function=embeddings,
    # Alternative collection (disabled):
    # collection_name="rental_data_with_null",
    collection_name="rental_data_with_nan",
)

In [None]:
# Inspect the data stored in the vector store
persist_db.get()

In [None]:
# retriever = persist_db.as_retriever(
#     search_kwargs={"k": 5}
# )

# pretty_print_docs(retriever.invoke("농가마트 근처 자취방 추천해줘"))

In [4]:
# self-query retriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

# Metadata field descriptions handed to the query-constructor prompt
metadata_field_info = [
    AttributeInfo(
        name="place",
        description="The location information of the rental room. One of ['정문', '중문', '후문', '기숙사', '농가마트', '교육문화회관']",
        type="string",
    ),
    AttributeInfo(
        name="oneroom_year",
        description="The annual rent price for a one-room unit.",
        type="float",
    ),
    AttributeInfo(
        name="oneroom_half_year",
        description="The half-year rent price for a one-room unit.",
        type="float",
    ),
    AttributeInfo(
        name="tworoom_half_year",
        description="The half-year rent price for a two-room unit.",
        type="float",
    ),
    AttributeInfo(
        name="tworoom_year",
        description="The annual rent price for a two-room unit.",
        type="float",
    ),
]

In [5]:
from langchain.chains.query_constructor.base import StructuredQuery
from langchain.chains.query_constructor.base import Comparison, Comparator, Operation
from langchain.schema.runnable import Runnable

class FilterTransformRunnable(Runnable):
    """Runnable step that normalizes the filter of a ``StructuredQuery``.

    Intended to be piped after the structured-query output parser so the
    filter is rewritten (CONTAIN / LIKE -> EQ) before the query is used.
    """

    def invoke(
        self, structured_query: StructuredQuery, config=None, **kwargs
    ) -> StructuredQuery:
        """Return a copy of ``structured_query`` with its filter normalized.

        Args:
            structured_query: Query produced by the upstream output parser.
            config: Runnable config; accepted for interface compatibility and
                not used here.
            **kwargs: Extra keyword arguments allowed by the ``Runnable.invoke``
                contract; accepted and ignored so callers that forward them do
                not fail with ``TypeError``.

        Returns:
            A new ``StructuredQuery`` carrying the transformed filter.
        """
        # Apply the contain -> eq conversion
        return preprocess_and_wrap_structured_query(structured_query)


def preprocess_and_wrap_structured_query(query: StructuredQuery) -> StructuredQuery:
    """Build a new ``StructuredQuery`` whose filter has been normalized.

    Args:
        query: The structured query to process.

    Returns:
        A fresh ``StructuredQuery`` with the same ``query`` text and ``limit``,
        and a filter passed through ``replace_contain_with_eq``. A falsy
        filter (e.g. ``None``) becomes ``None``.
    """
    # Only transform when a filter is actually present.
    new_filter = replace_contain_with_eq(query.filter) if query.filter else None
    return StructuredQuery(query=query.query, filter=new_filter, limit=query.limit)


def replace_contain_with_eq(filter):
    """Rewrite CONTAIN and LIKE comparisons in a filter tree to EQ.

    Args:
        filter: A ``Comparison``, an ``Operation`` (whose arguments are
            processed recursively), or any other object.

    Returns:
        - For a CONTAIN comparison: an EQ comparison on the same attribute
          and value.
        - For a LIKE comparison: an EQ comparison on the same attribute; if
          the value is a string, every ``%`` wildcard is removed from it.
        - For any other comparison: the object itself, unchanged.
        - For an ``Operation``: a new ``Operation`` with the same operator and
          recursively transformed arguments.
        - Anything else is returned as-is.

    NOTE(review): presumably needed because the downstream vector-store
    translator does not accept CONTAIN / LIKE — confirm.
    """
    if isinstance(filter, Comparison):
        if filter.comparator == Comparator.CONTAIN:
            # contain -> eq
            return Comparison(
                comparator=Comparator.EQ, attribute=filter.attribute, value=filter.value
            )
        if filter.comparator == Comparator.LIKE:
            # like -> eq; strip SQL-style "%" wildcards, but only from string
            # values so a non-string value no longer raises AttributeError.
            value = filter.value
            if isinstance(value, str):
                value = value.replace("%", "")
            return Comparison(
                comparator=Comparator.EQ,
                attribute=filter.attribute,
                value=value,
            )
        return filter
    if isinstance(filter, Operation):
        # Transform the conditions nested inside the Operation as well.
        return Operation(
            operator=filter.operator,
            arguments=[replace_contain_with_eq(arg) for arg in filter.arguments],
        )
    return filter

In [6]:
from langchain.chains.query_constructor.base import (
    StructuredQueryOutputParser,
    get_query_constructor_prompt,
)
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI

# Alternative model (disabled):
# llm = ChatGoogleGenerativeAI(temperature=0, model="gemini-1.5-flash")
llm = ChatOpenAI(temperature=0, model="gpt-4o")

# Build the query-constructor prompt from the document-content description and
# the metadata field info.
# Deliberately not named `prompt`: that name is bound to the RAG prompt loaded
# later in the notebook, and sharing it makes out-of-order cell re-runs feed
# the wrong prompt into the answer chain.
query_constructor_prompt = get_query_constructor_prompt(
    "Brief summary of a rental room",  # description of the document contents
    metadata_field_info,  # metadata field info
)

# Create the StructuredQueryOutputParser
output_parser = StructuredQueryOutputParser.from_components()

# Add the filter transformation as a Runnable step
filter_transform = FilterTransformRunnable()

# Assemble the query_constructor chain
query_constructor = query_constructor_prompt | llm | output_parser | filter_transform

In [7]:
# Check which StructuredQuery the constructor builds for a sample request
query_constructor.invoke("연세 250이상인 정문 자취방 추천해줘")

StructuredQuery(query='자취방 추천', filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GTE: 'gte'>, attribute='oneroom_year', value=250), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='place', value='정문')]), limit=None)

In [8]:
from langchain.retrievers.self_query.chroma import ChromaTranslator

# Self-query retriever over the persisted Chroma store
retriever = SelfQueryRetriever(
    query_constructor=query_constructor,  # the query_constructor chain built earlier
    vectorstore=persist_db,  # vector store to search
    structured_query_translator=ChromaTranslator(),  # query translator
    search_kwargs={"k": 10},  # search options
)

In [9]:
# Run the self-query retriever on a sample request and print the results
pretty_print_docs(retriever.invoke("년세 350이하 정문 자취방 추천해줘"))

page_content='<row><name>엘림원룸</name><address>호서로 65-8</address><contact>010-6587-1255</contact><price>년세 큰 방 320 / 작은 방 290</price><fee>X / 30만원</fee><options>풀옵션</options><gas_type>심야전기</gas_type><comment>방크기는 집주인에게 직접 문의</comment><place>정문</place><oneroom_half_year></oneroom_half_year><oneroom_year>320</oneroom_year><tworoom_half_year></tworoom_half_year><tworoom_year></tworoom_year></row>

' metadata={'filename': '../data/csv_data/rental_data_with_null.csv', 'oneroom_year': 320.0, 'place': '정문', 'row': 53, 'source': '../data/csv_data/rental_data_with_null.csv'}


page_content='<row><name>준원룸</name><address>호서로 67-13</address><contact>010-4929-9598</contact><price>년세 큰 방 330 / 작은 방 300</price><fee>X / 30만원</fee><options>풀옵션 (인터넷 무료)</options><gas_type>도시가스</gas_type><comment>퇴실 시, 청소비 청구</comment><place>정문</place><oneroom_half_year></oneroom_half_year><oneroom_year>330</oneroom_year><tworoom_half_year></tworoom_half_year><tworoom_year></tworoom_year></row>

' metadata={'filename': '.

In [10]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

# Initialize the cross-encoder reranking model
model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-v2-m3")

# Keep the top 4 documents (top_n=4)
compressor = CrossEncoderReranker(model=model, top_n=4)

# Initialize the contextual compression retriever on top of `retriever`
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

# Retrieve the compressed documents
# compressed_docs = compression_retriever.invoke("중문 자취방 추천해줘")

# # Print the documents
# pretty_print_docs(compressed_docs)

In [None]:
# Retrieve the compressed (reranked) documents
compressed_docs = compression_retriever.invoke("년세 250이상인 정문 자취방 추천해줘")

# Print the documents
pretty_print_docs(compressed_docs)

In [None]:
# from langchain import hub

# prompt = hub.pull("rlm/rag-prompt-mistral")

In [11]:
from langchain_core.prompts import load_prompt

# Load the RAG answer prompt from the YAML file and display it
prompt=load_prompt("../prompts/rented_room.yaml")
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='You are a Q&A assistant specializing in providing information about rental housing.\nAnswer user questions with concise and practical advice. \n\n# Requirements\nFew-shot examples must meet all the following conditions:\n\n1. The {context} information must be provided in accordance with the one-shot example format answer format.\n2. One-shot example format If you don\'t know the value when you write it, don\'t fill it out and move on\n3. The answer format must fit the template provided.\n4. The information should be specific and practical in accordance with the requirements of the user question.\n5. Each item (name, address, price, etc.) must be filled realistically, and blank or meaningless values are not allowed.\n6. If you don\'t have a context, please answer with "죄송합니다. 더 자세하게 질문해주시면 감사하겠습니다.".\n7. For information with <comment></comment> tag, do not write "참고사항:"\n8. For inform

In [12]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# RAG chain: the retriever output fills "context" and the raw input is passed
# through unchanged as "question"; the result is parsed to a plain string.
# NOTE(review): this uses `retriever` directly; `compression_retriever` (the
# reranking retriever) is not part of this chain — confirm that is intended.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [13]:
# Ask the RAG chain a sample question and print the generated answer
answer=chain.invoke("반년세 100~250 사이인 정문 자취방 추천해줘")
print(answer)

이름: 솔원룸  
주소: 호서로 67-6  
가격: 반년세 180  
보증금/관리비: 년세 30만원 반년세 15만원  
옵션: 풀옵션  
위치: 정문  

이름: H. S캐슬  
주소: 호서로 79번길 7-3  
가격: 반년세 220  
보증금/관리비: 30만원  
옵션: 풀옵션  
위치: 정문  

더 상세한 정보는 아래 연락처로 문의해주세요.

솔원룸 - 010-6436-2816  
H. S캐슬 - 010-3898-9441  
