In [1]:
from dotenv import load_dotenv
import nest_asyncio

# Applied before any other cell runs (presumably so async clients can run inside
# the notebook's already-running event loop — TODO confirm this is still needed).
nest_asyncio.apply()
# Load environment variables from the backend's env file; as the cell's last
# expression, load_dotenv's return value is what the cell displays.
load_dotenv(dotenv_path='../backend/secret.env')

True

In [3]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# Embedding client used to vectorise queries for the semantic search below.
# The API key is read from the `_GOOGLE_API_KEY_` environment variable.
llm_emb = GoogleGenerativeAIEmbeddings(
    model='models/text-embedding-004',
    google_api_key=os.environ.get('_GOOGLE_API_KEY_'),
)

In [None]:
from langchain.retrievers import EnsembleRetriever
from opensearchpy import OpenSearch

# Credentials are taken from the environment (populated by load_dotenv in the
# first cell) instead of being committed in the notebook. The fallbacks keep
# the previous literal values so the cell behaves as before when the variables
# are not set.
opensearch_auth = (
    os.environ.get('OPENSEARCH_USER', ''),
    os.environ.get('OPENSEARCH_PASSWORD', '!'),
)

client = OpenSearch(
    hosts=[{'host': 'localhost', 'port': 9200}],
    http_compress=True,
    http_auth=opensearch_auth,
    use_ssl=True,
    # NOTE(review): certificate verification is disabled; acceptable only for a
    # local development cluster.
    verify_certs=False,
)
# Displayed as the cell output; doubles as a connectivity check.
client.info()



{'name': 'opensearch',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': '5Q3nu-KVR7GEp7JMnmG1kg',
 'version': {'distribution': 'opensearch',
  'number': '2.19.0',
  'build_type': 'tar',
  'build_hash': 'fd9a9d90df25bea1af2c6a85039692e815b894f5',
  'build_date': '2025-02-05T16:13:36.244552508Z',
  'build_snapshot': False,
  'lucene_version': '9.12.1',
  'minimum_wire_compatibility_version': '7.10.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'The OpenSearch Project: https://opensearch.org/'}

In [10]:
# Sample user question used by every search cell below. English gloss:
# "Explain the change in bank household loans in the second half of 2024."
query = "2024년 하반기 은행 가계대출 변화량에 대해 설명해봐"

In [35]:
def get_query(**kwargs):
    """Build an OpenSearch query body for a lexical or a semantic search.

    Keyword Args:
        search_type: ``'lexical'`` (default) or ``'semantic'``.
        query: Text matched against the ``text`` field (lexical only). The
            value is converted to a string.
        minimum_should_match: Percentage (number) used for the lexical match
            clause; defaults to ``0`` and is rendered as ``"<value>%"``.
        vector_field: Name of the k-NN vector field (semantic only).
        vector: Query embedding (semantic only).
        k: Number of nearest neighbours requested (semantic only).

    Returns:
        A ``dict`` with a ``bool`` query holding one ``must`` clause and an
        empty ``filter`` list.

    Raises:
        ValueError: If ``search_type`` is neither ``'lexical'`` nor
            ``'semantic'``.
    """
    search_type = kwargs.get('search_type', 'lexical')

    # lexical query
    if search_type == 'lexical':
        return {
            "query": {
                "bool": {
                    "must": [
                        {
                            "match": {
                                "text": {
                                    # Single quotes inside the f-strings keep
                                    # this valid on Python versions before 3.12.
                                    "query": f"{kwargs.get('query')}",
                                    "minimum_should_match": f"{kwargs.get('minimum_should_match', 0)}%",
                                    "operator": "or"
                                }
                            }
                        }
                    ],
                    "filter": []
                }
            }
        }

    # semantic query
    if search_type == 'semantic':
        return {
            "query": {
                "bool": {
                    "must": [
                        {
                            "knn": {
                                f"{kwargs.get('vector_field')}": {
                                    "vector": kwargs.get("vector"),
                                    "k": kwargs.get("k")
                                }
                            }
                        }
                    ],
                    "filter": []
                }
            }
        }

    # Previously an unknown type fell through to an unbound local variable.
    raise ValueError(
        f"Unsupported search_type: {search_type!r} (expected 'lexical' or 'semantic')"
    )

def get_document(opensearch_client, query, index_name):
    """Run ``query`` against ``index_name`` and return the client's response.

    Args:
        opensearch_client: Object exposing a ``search(body=..., index=...)`` method.
        query: Query body passed as ``body``.
        index_name: Index name passed as ``index``.

    Returns:
        Whatever ``opensearch_client.search`` returns, unmodified.
    """
    search_kwargs = {"body": query, "index": index_name}
    return opensearch_client.search(**search_kwargs)


In [36]:
# lexical search
def get_lexical_search(**kwargs):
    """Run a lexical (text match) search through the module-level ``client``.

    Keyword Args:
        query: Text to match.
        minimum_should_match: Percentage forwarded to ``get_query``; default ``0``.
        size: Maximum number of hits to return; default ``3``.
        index_name: Index to search; default ``'pdf-doc-index'``.

    Returns:
        The search response returned by ``get_document``.
    """
    query = get_query(
        search_type='lexical',
        query=kwargs.get('query'),
        minimum_should_match=kwargs.get("minimum_should_match", 0),
    )
    query['size'] = kwargs.get('size', 3)
    return get_document(
        client,
        query,
        index_name=kwargs.get('index_name', 'pdf-doc-index'),
    )

# Execute the lexical search for the sample question and display the raw response.
lexical_result = get_lexical_search(
    query=query,
    minimum_should_match=0,
)
lexical_result

{'took': 136,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 20, 'relation': 'eq'},
  'max_score': 12.424187,
  'hits': [{'_index': 'pdf-doc-index',
    '_id': '027774f5-d42f-4b28-9bff-9f0a3c630bc9',
    '_score': 12.424187,
    '_source': {'vector_field': [0.03660035878419876,
      0.03467802330851555,
      -0.05881042778491974,
      -0.0007534335600212216,
      0.0522775873541832,
      0.007852929644286633,
      0.0027374057099223137,
      -0.006022005341947079,
      0.007569379638880491,
      0.0012516396818682551,
      -0.013684858568012714,
      0.05454780533909798,
      0.040218234062194824,
      0.03482546657323837,
      -0.009443695656955242,
      -0.0067518409341573715,
      0.020746996626257896,
      -0.02724524773657322,
      -0.1098027229309082,
      0.0014638541033491492,
      -0.011565369553864002,
      0.03358423337340355,
      0.07506093382835388,
      0.03183059021830559,
  

In [47]:
# Scale every lexical hit score by the response's max_score so the best hit
# becomes 1.0. The hit dicts are updated in place inside lexical_result.
hits = lexical_result['hits']['hits']
max_score = lexical_result['hits']['max_score']
# Guard against an empty result set (no hits and a missing or zero max_score),
# which would otherwise fail on the division or on hits[0].
if hits and max_score:
    for hit in hits:
        hit["_score"] = float(hit['_score']) / max_score
    lexical_result['hits']['max_score'] = hits[0]["_score"]
lexical_result

{'took': 136,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 20, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'pdf-doc-index',
    '_id': '027774f5-d42f-4b28-9bff-9f0a3c630bc9',
    '_score': 1.0,
    '_source': {'vector_field': [0.03660035878419876,
      0.03467802330851555,
      -0.05881042778491974,
      -0.0007534335600212216,
      0.0522775873541832,
      0.007852929644286633,
      0.0027374057099223137,
      -0.006022005341947079,
      0.007569379638880491,
      0.0012516396818682551,
      -0.013684858568012714,
      0.05454780533909798,
      0.040218234062194824,
      0.03482546657323837,
      -0.009443695656955242,
      -0.0067518409341573715,
      0.020746996626257896,
      -0.02724524773657322,
      -0.1098027229309082,
      0.0014638541033491492,
      -0.011565369553864002,
      0.03358423337340355,
      0.07506093382835388,
      0.03183059021830559,
      -0.04475

In [49]:
from langchain.schema import Document

# Convert each lexical hit into a (Document, score) pair.
lexical_results = []
for res in lexical_result['hits']['hits']:
    source = res['_source']
    # Build a new metadata dict (stored metadata plus the hit id) instead of
    # adding the id to the raw response in place.
    metadata = {**source['metadata'], 'id': res["_id"]}

    doc = Document(page_content=source['text'], metadata=metadata)
    lexical_results.append((doc, res['_score']))
lexical_results

[(Document(metadata={'category': 'Table', 'origin_table': '<table><tr><td/><td>8월</td><td>9월</td><td>10월</td><td>11월</td><td>12월</td><td>25.1월</td></tr><tr><td>아파트매매거래량(전국, 만호)1)</td><td>4.3</td><td>3.0</td><td>3.8</td><td>3.1</td><td>2.7</td><td>..</td></tr><tr><td>매매거래량(수도권, 만호)1)</td><td>2.2</td><td>1.3</td><td>1.5</td><td>1.2</td><td>1.0</td><td>..</td></tr><tr><td>매매거래량(서울, 만호)1)</td><td>0.6</td><td>0.3</td><td>0.4</td><td>0.3</td><td>0.3</td><td>..</td></tr><tr><td>전세거래량(전국, 〃)</td><td>5.1</td><td>4.3</td><td>5.2</td><td>4.4</td><td>4.1</td><td>..</td></tr><tr><td>분양물량 (전국, 〃)</td><td>1.7</td><td>1.8</td><td>2.5</td><td>3.2</td><td>2.0</td><td>1.0</td></tr><tr><td>입주물량 (전국, 〃)</td><td>3.2</td><td>2.6</td><td>3.0</td><td>3.4</td><td>3.1</td><td>3.8</td></tr></table> <table><tr><td/><td>2023</td><td>연중</td><td>1월</td><td>2024</td><td>연중</td><td>1월</td><td>10월</td><td>11월</td><td>12월</td><td>1월</td><td>잔액</td><td/><td/><td/><td/><td/><td/></tr><tr><td>은행가계대출1)</td><td>36.9</td><td>-

In [26]:
# semantic search
def get_semantic_search(**kwargs):
    """Run a k-NN (semantic) search through the module-level ``client``.

    The query text is embedded with the module-level ``llm_emb`` and the
    resulting vector is sent as a ``knn`` query.

    Keyword Args:
        query: Text to embed.
        k: Number of neighbours requested, also used as the response ``size``;
            default ``3``.
        vector_field: Name of the vector field; default ``'vector_field'``.
        index_name: Index to search; default ``'pdf-doc-index'``.

    Returns:
        The search response returned by ``get_document``.
    """
    k = kwargs.get('k', 3)
    vector = llm_emb.embed_query(kwargs.get('query'))
    query = get_query(
        search_type='semantic',
        query=kwargs.get('query'),
        vector_field=kwargs.get('vector_field', 'vector_field'),
        vector=vector,
        k=k,
    )
    query['size'] = k
    return get_document(
        client,
        query,
        index_name=kwargs.get('index_name', 'pdf-doc-index'),
    )
    

# Execute the semantic search for the sample question and display the raw response.
semantic_result = get_semantic_search(
    query=query,
    k=3,
)
semantic_result

{'took': 138,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3, 'relation': 'eq'},
  'max_score': 0.8476915,
  'hits': [{'_index': 'pdf-doc-index',
    '_id': '1ebca500-c131-498f-acde-7688e582e633',
    '_score': 0.8476915,
    '_source': {'vector_field': [0.029943125322461128,
      0.050077859312295914,
      -0.059478748589754105,
      -0.040058448910713196,
      0.04741130769252777,
      0.037007886916399,
      -0.001827860134653747,
      0.0007472230354323983,
      0.02174554392695427,
      0.006068648770451546,
      -0.015860244631767273,
      0.04873662441968918,
      0.012142824940383434,
      0.04977533593773842,
      -0.0109878433868289,
      -0.014733745716512203,
      0.03179149702191353,
      -0.038879040628671646,
      -0.10136670619249344,
      0.025863295421004295,
      -0.013753754086792469,
      0.015825005248188972,
      0.060541555285453796,
      0.06432097405195236,
      

In [50]:
# Scale every semantic hit score by the response's max_score so the best hit
# becomes 1.0. The hit dicts are updated in place inside semantic_result.
hits = semantic_result['hits']['hits']
max_score = semantic_result['hits']['max_score']
# Guard against an empty result set (no hits and a missing or zero max_score),
# which would otherwise fail on the division or on hits[0].
if hits and max_score:
    for hit in hits:
        hit["_score"] = float(hit['_score']) / max_score
    semantic_result['hits']['max_score'] = hits[0]["_score"]
semantic_result

{'took': 138,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'pdf-doc-index',
    '_id': '1ebca500-c131-498f-acde-7688e582e633',
    '_score': 1.0,
    '_source': {'vector_field': [0.029943125322461128,
      0.050077859312295914,
      -0.059478748589754105,
      -0.040058448910713196,
      0.04741130769252777,
      0.037007886916399,
      -0.001827860134653747,
      0.0007472230354323983,
      0.02174554392695427,
      0.006068648770451546,
      -0.015860244631767273,
      0.04873662441968918,
      0.012142824940383434,
      0.04977533593773842,
      -0.0109878433868289,
      -0.014733745716512203,
      0.03179149702191353,
      -0.038879040628671646,
      -0.10136670619249344,
      0.025863295421004295,
      -0.013753754086792469,
      0.015825005248188972,
      0.060541555285453796,
      0.06432097405195236,
      -0.028159013

In [52]:
from langchain.schema import Document

# Convert each semantic hit into a (Document, score) pair.
semantic_results = []
for res in semantic_result['hits']['hits']:
    source = res['_source']
    # Build a new metadata dict (stored metadata plus the hit id) instead of
    # adding the id to the raw response in place.
    metadata = {**source['metadata'], 'id': res["_id"]}

    doc = Document(page_content=source['text'], metadata=metadata)
    semantic_results.append((doc, res['_score']))
semantic_results

[(Document(metadata={'category': 'Table', 'origin_table': '<table><tr><td>항목</td><td>24.8월</td><td>9월</td><td>10월</td><td>11월</td><td>12월</td><td>25.1월</td><td>25.1월말 잔액</td></tr><tr><td>은행가계대출1)</td><td>9.2</td><td>5.6</td><td>3.8</td><td>1.9</td><td>-0.4</td><td>-0.5</td><td>1,140.5</td></tr><tr><td>주택담보대출2)</td><td>8.2</td><td>6.1</td><td>3.6</td><td>1.5</td><td>0.8</td><td>1.7</td><td>904.3</td></tr><tr><td>기타대출3)</td><td>1.1</td><td>-0.5</td><td>0.3</td><td>0.4</td><td>-1.1</td><td>-2.1</td><td>235.3</td></tr><tr><td>은행기업대출1)</td><td>7.2</td><td>4.3</td><td>8.1</td><td>2.2</td><td>-11.5</td><td>7.8</td><td>1,322.9</td></tr><tr><td>대기업</td><td>1.9</td><td>0.8</td><td>2.9</td><td>0.2</td><td>-4.3</td><td>6.1</td><td>280.6</td></tr><tr><td>중소기업</td><td>5.3</td><td>3.5</td><td>5.3</td><td>2.0</td><td>-7.1</td><td>1.8</td><td>1,042.3</td></tr><tr><td>&lt;중소법인&gt;</td><td>4.5</td><td>3.3</td><td>4.9</td><td>1.5</td><td>-5.8</td><td>2.1</td><td>587.2</td></tr><tr><td>&lt;개인사업자&gt;</td><t

In [58]:
from typing_extensions import List
from langchain.schema import Document

def get_ensemble_results(
        doc_lists: "list[list[tuple[Document, float]]]",
        weights: "list[float]",
        algorithm="RRF",
        c=60,
        k=3
    ) -> "list[tuple[Document, float]]":
    """Fuse several ranked result lists into one ranking.

    Documents are identified by their ``page_content``. Each appearance of a
    document in a list adds a contribution to its hybrid score:

    * ``"RRF"``: ``weight * 1 / (rank + c)``, with ``rank`` starting at 1.
    * ``"simple_weighted"``: ``score * weight``.

    Args:
        doc_lists: One ranked list of ``(document, score)`` pairs per retriever.
        weights: One weight per entry of ``doc_lists``.
        algorithm: ``"RRF"`` or ``"simple_weighted"``.
        c: Rank offset used by ``"RRF"``.
        k: Number of fused results to return.

    Returns:
        Up to ``k`` ``(document, hybrid_score)`` pairs, highest score first.
        Ties keep first-seen order. When the same ``page_content`` occurs more
        than once, the last occurrence's document object is returned.

    Raises:
        ValueError: If ``doc_lists`` and ``weights`` differ in length, or if
            ``algorithm`` is not supported.
    """
    if len(doc_lists) != len(weights):
        raise ValueError(
            f"doc_lists and weights must have the same length "
            f"(got {len(doc_lists)} and {len(weights)})"
        )
    if algorithm not in ("RRF", "simple_weighted"):
        raise ValueError(
            f"Unsupported algorithm: {algorithm!r} (expected 'RRF' or 'simple_weighted')"
        )

    # Plain dicts keep insertion order, so equal scores are ordered by first
    # appearance rather than by set iteration order.
    hybrid_scores = {}
    docs_by_content = {}

    for doc_list, weight in zip(doc_lists, weights):
        for rank, (doc, score) in enumerate(doc_list, start=1):
            if algorithm == "RRF":
                contribution = weight * (1 / (rank + c))
            else:  # "simple_weighted"
                contribution = score * weight
            content = doc.page_content
            hybrid_scores[content] = hybrid_scores.get(content, 0.0) + contribution
            docs_by_content[content] = doc

    # sorted() is stable, so ties preserve the insertion order established above.
    ranked = sorted(hybrid_scores.items(), key=lambda item: item[1], reverse=True)

    fused = [(docs_by_content[content], hybrid_score) for content, hybrid_score in ranked]
    return fused[:k]


# Fuse the lexical and semantic rankings with RRF, weighting lexical slightly
# higher (0.51 vs 0.49), and display the top 3 fused results.
final_results = get_ensemble_results(
    doc_lists=[lexical_results, semantic_results],
    weights=[0.51, 0.49],
    algorithm="RRF",
    c=60,
    k=3,
)
final_results

[(Document(metadata={'source': './data/complex_pdf/pickle/parsed_llamaparse_2.md', 'text_as_html': '<table><tr><td/><td>8월</td><td>9월</td><td>10월</td><td>11월</td><td>12월</td><td>25.1월</td></tr><tr><td>아파트매매거래량(전국, 만호)1)</td><td>4.3</td><td>3.0</td><td>3.8</td><td>3.1</td><td>2.7</td><td>..</td></tr><tr><td>매매거래량(수도권, 만호)1)</td><td>2.2</td><td>1.3</td><td>1.5</td><td>1.2</td><td>1.0</td><td>..</td></tr><tr><td>매매거래량(서울, 만호)1)</td><td>0.6</td><td>0.3</td><td>0.4</td><td>0.3</td><td>0.3</td><td>..</td></tr><tr><td>전세거래량(전국, 〃)</td><td>5.1</td><td>4.3</td><td>5.2</td><td>4.4</td><td>4.1</td><td>..</td></tr><tr><td>분양물량 (전국, 〃)</td><td>1.7</td><td>1.8</td><td>2.5</td><td>3.2</td><td>2.0</td><td>1.0</td></tr><tr><td>입주물량 (전국, 〃)</td><td>3.2</td><td>2.6</td><td>3.0</td><td>3.4</td><td>3.1</td><td>3.8</td></tr></table> <table><tr><td/><td>2023</td><td>연중</td><td>1월</td><td>2024</td><td>연중</td><td>1월</td><td>10월</td><td>11월</td><td>12월</td><td>1월</td><td>잔액</td><td/><td/><td/><td/><td/><td/></t

In [62]:
# Close the OpenSearch client created earlier; search cells above need the
# client cell re-run before they can be executed again.
client.close()