<a href="https://colab.research.google.com/github/dkbs12/External_test/blob/main/Phase02_LFQA_test_06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%bash

# Install the Python and system dependencies used by this notebook.
# Requirement strings are quoted on purpose: unquoted, the shell parses
# `datasets>=2.6.1` as `pip install datasets` with stdout redirected into a file
# named `=2.6.1` (dropping the version bound), and `[...]` is a glob pattern.
pip install --upgrade pip
pip install "farm-haystack[colab,elasticsearch,inference,ocr,preprocessing,file-conversion,pdf]"
pip install "datasets>=2.6.1"

# -y answers the confirmation prompt; this cell has no interactive stdin to answer it.
apt-get install -y libgraphviz-dev
pip install pygraphviz

In [None]:
%%bash

# Download the Elasticsearch 7.9.2 Linux tarball quietly, unpack it into the working
# directory, and give ownership of the unpacked tree to daemon:daemon.
ES_DIR="elasticsearch-7.9.2"
ES_TARBALL="${ES_DIR}-linux-x86_64.tar.gz"

wget -q "https://artifacts.elastic.co/downloads/elasticsearch/${ES_TARBALL}"
tar -xzf "${ES_TARBALL}"
chown -R daemon:daemon "${ES_DIR}"

In [None]:
%%bash --bg

# Launch the unpacked Elasticsearch 7.9.2 server as the `daemon` user.
# The `--bg` flag on the cell magic runs this script in the background, so the
# notebook does not block on the server process.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [None]:
import os
import time
import urllib.request

from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore

# Elasticsearch host; overridable through the ELASTICSEARCH_HOST environment variable.
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

# Elasticsearch is started in the background, so wait until its HTTP endpoint answers
# instead of sleeping for a fixed 30 seconds: a fixed sleep is too short on a slow
# machine and wasted time on a fast one. The wait is bounded; if the server never
# comes up, the constructor below reports the connection failure.
# NOTE(review): port 9200 is assumed to be the port the document store connects to
# (no port is passed below) -- confirm if a non-default port is ever configured.
ES_STARTUP_TIMEOUT_S = 120
startup_deadline = time.monotonic() + ES_STARTUP_TIMEOUT_S
while time.monotonic() < startup_deadline:
    try:
        with urllib.request.urlopen(f"http://{host}:9200", timeout=5):
            break  # server answered, so it is accepting requests
    except OSError:
        # Covers connection refused, timeouts and urllib's URLError/HTTPError.
        time.sleep(2)

# embedding_dim=1536 is the vector size configured for the "document" index.
document_store = ElasticsearchDocumentStore(host=host, username="", password="", index="document", embedding_dim=1536)

In [None]:
from haystack.utils import fetch_archive_from_http, convert_files_to_docs
from haystack.nodes import PreProcessor

# Remote zip archive holding the test corpus, and the local folder it is extracted into.
url = "https://github.com/dkbs12/External_test/raw/main/Phase1_test_data_04.zip"
doc_dir = "data/Phase1_test_data_04"

# Download and extract the archive.
fetch_archive_from_http(output_dir=doc_dir, url=url)

# Convert the files found in the folder into documents for the later
# preprocessing and indexing cells.
got_docs = convert_files_to_docs(dir_path=doc_dir)

In [None]:
# Cleaning and splitting configuration: whitespace, header/footer and empty-line
# cleanup are all enabled, and documents are split by word into chunks of 200 with
# an overlap of 20, respecting sentence boundaries.
preprocessor_settings = {
    "clean_whitespace": True,
    "clean_header_footer": True,
    "clean_empty_lines": True,
    "split_by": "word",
    "split_length": 200,
    "split_overlap": 20,
    "split_respect_sentence_boundary": True,
}
preprocessor = PreProcessor(**preprocessor_settings)

# Run the converted documents through the preprocessor.
all_docs = preprocessor.process(got_docs)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Preprocessing: 100%|██████████| 6/6 [00:00<00:00, 19.92docs/s]


In [None]:
# Clear the document store first (called with no arguments), then write the
# preprocessed documents into it. The order matters: deleting after writing
# would remove the documents just written.
document_store.delete_documents()
document_store.write_documents(all_docs)

In [None]:
import os
from getpass import getpass

from haystack.nodes import BM25Retriever, EmbeddingRetriever
from haystack.utils import print_answers

# OpenAI API key used by the embedding retriever (and by later cells that reference
# `api_key`). It is read from the OPENAI_API_KEY environment variable, falling back to
# an interactive prompt, so the key is never hardcoded or saved in the notebook.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Keyword (BM25) retriever over the document store.
bm25_retriever = BM25Retriever(document_store=document_store)

# Embedding retriever backed by the "text-embedding-ada-002" model, sending documents
# in batches of 8.
# NOTE(review): max_seq_len=1536 equals the store's embedding dimension; it looks like
# a sequence-length limit rather than a vector size -- confirm the value is intended.
embedding_retriever = EmbeddingRetriever(
    document_store=document_store, batch_size=8,
    embedding_model="text-embedding-ada-002", api_key=api_key, max_seq_len=1536
)

# Compute embeddings for the stored documents; update_existing_embeddings=False is
# passed so documents that already have an embedding are not recomputed.
document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)

Updating embeddings:   0%|          | 0/148 [00:00<?, ? Docs/s]
Calculating embeddings:   0%|          | 0/19 [00:00<?, ?it/s][A
Calculating embeddings:   5%|▌         | 1/19 [00:00<00:17,  1.03it/s][A
Calculating embeddings:  11%|█         | 2/19 [00:01<00:15,  1.07it/s][A
Calculating embeddings:  16%|█▌        | 3/19 [00:03<00:19,  1.24s/it][A
Calculating embeddings:  21%|██        | 4/19 [00:04<00:16,  1.08s/it][A
Calculating embeddings:  26%|██▋       | 5/19 [00:05<00:13,  1.03it/s][A
Calculating embeddings:  32%|███▏      | 6/19 [00:05<00:11,  1.11it/s][A
Calculating embeddings:  37%|███▋      | 7/19 [00:06<00:10,  1.17it/s][A
Calculating embeddings:  42%|████▏     | 8/19 [00:07<00:09,  1.18it/s][A
Calculating embeddings:  47%|████▋     | 9/19 [00:08<00:09,  1.02it/s][A
Calculating embeddings:  53%|█████▎    | 10/19 [00:09<00:08,  1.02it/s][A
Calculating embeddings:  58%|█████▊    | 11/19 [00:10<00:07,  1.06it/s][A
Calculating embeddings:  63%|██████▎   | 12/19 [00:11<

In [None]:
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser

# Instruction text for answering with references: a short answer (at most 50 words)
# drawn only from the supplied documents, cited as Document[number]. The trailing
# template expression joins the retrieved documents into the prompt, followed by the
# question.
qa_with_references_prompt = (
    "Create a concise and informative answer (no more than 50 words) for a given question "
    "based solely on the given documents. You must only use information from the given documents. "
    "Use an unbiased and journalistic tone. Do not repeat text. Cite the documents using Document[number] notation. "
    "If multiple documents contain the answer, cite those documents like ‘as stated in Document[number], Document[number], etc.’. "
    "If the documents do not contain the answer to the question, say that ‘answering is not possible given the available information.’\n"
    "{join(documents, delimiter=new_line, pattern=new_line+'Document[$idx]: $content', str_replace={new_line: ' ', '[': '(', ']': ')'})} \n Question: {query}; Answer: "
)

# The parser's reference pattern matches the Document[number] citations in the output.
reference_parser = AnswerParser(reference_pattern=r"Document\[(\d+)\]")

prompt_template = PromptTemplate(prompt=qa_with_references_prompt, output_parser=reference_parser)

# Prompt node using the template above by default: one answer per query (top_k=1),
# max_length=200, temperature 0.
# NOTE(review): "text-davinci-003" may no longer be served by OpenAI -- confirm the
# model is still available before relying on this cell.
# NOTE(review): use_gpu=True presumably has no effect for an API-hosted model -- confirm.
prompt_node = PromptNode(
    model_name_or_path="text-davinci-003",
    api_key=api_key,
    default_prompt_template=prompt_template,
    use_gpu=True,
    max_length=200,
    top_k=1,
    model_kwargs={"temperature": 0},
)

In [None]:
from haystack.pipelines import Pipeline
from haystack.nodes import JoinDocuments

# Ensemble pipeline: both retrievers receive the query, their results are concatenated
# by the join node, and the joined documents are passed to the prompt node.
p_ensemble = Pipeline()

for node_name, retriever_node in (
    ("BM25Retriever", bm25_retriever),
    ("EmbeddingRetriever", embedding_retriever),
):
    p_ensemble.add_node(component=retriever_node, name=node_name, inputs=["Query"])

join_results = JoinDocuments(join_mode="concatenate")
p_ensemble.add_node(component=join_results, name="JoinResults", inputs=["BM25Retriever", "EmbeddingRetriever"])
p_ensemble.add_node(component=prompt_node, name="prompt_node", inputs=["JoinResults"])

# To write the pipeline graph to an image file, uncomment the next line.
# p_ensemble.draw("pipeline_ensemble.png")

In [None]:
# Query (Korean): "What is NDC??" -- each retriever contributes its top 2 documents.
ndc_query = "NDC는 무엇인가요??"
retrieval_params = {"EmbeddingRetriever": {"top_k": 2}, "BM25Retriever": {"top_k": 2}}

res = p_ensemble.run(query=ndc_query, params=retrieval_params)

# Print the result at the "minimum" detail level.
print_answers(res, details="minimum")

Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.11it/s]


'Query: NDC는 무엇인가요??'
'Answers:'
[   {   'answer': 'NDC는 XML 프로토콜을 기반으로 하는 데이터 교환 방식이며, API 중심 접근 방식을 규정하는 '
                  '규칙이며, 항공 여행에만 적용되며, 직접 유통 비용 절감과 관련이 없습니다. As stated in '
                  'Document[2],'}]


In [None]:
# Query (Korean): "Please answer in Korean whether airlines can do continuous pricing
# through GDS." -- each retriever contributes its top 2 documents.
pricing_query = "항공사는 GDS를 통해 continuous pricing을 할 수 있는지 한국어로 답변 해 주세요."
retrieval_params = {"EmbeddingRetriever": {"top_k": 2}, "BM25Retriever": {"top_k": 2}}

res = p_ensemble.run(query=pricing_query, params=retrieval_params)

# Print the result at the "minimum" detail level.
print_answers(res, details="minimum")

Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.40it/s]


'Query: 항공사는 GDS를 통해 continuous pricing을 할 수 있는지 한국어로 답변 해 주세요.'
'Answers:'
[   {   'answer': 'NDC는 항공사가 리셀러에게 맞춤형 제안, 유연한 실시간 티켓 가격, 레거시 시스템 종속성 감소 등을 '
                  '제공하는 데 도움이 됩니다. 또한, 항공사는 GDS 채널에서 일부 요금 판매'}]


In [None]:
# Query (Korean): "What did Esther Samtlebe, senior consultant at Lufthansa Consulting,
# say??" -- each retriever contributes its top 2 documents.
quote_query = "Lufthansa Consulting의 수석 컨설턴트인 Esther Samtlebe는 뭐라고 말했는가??"
retrieval_params = {"EmbeddingRetriever": {"top_k": 2}, "BM25Retriever": {"top_k": 2}}

res = p_ensemble.run(query=quote_query, params=retrieval_params)

# This cell prints at the "medium" detail level, unlike the other query cells.
print_answers(res, details="medium")

Calculating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]


'Query: Lufthansa Consulting의 수석 컨설턴트인 Esther Samtlebe는 뭐라고 말했는가??'
'Answers:'
[   {   'answer': 'What is the main purpose of NDC?\n'
                  '\n'
                  'As stated in Document[1] and Document[3], the main purpose '
                  'of NDC is to enable airlines to customize and offer '
                  'products through indirect channels, such as their website, '
                  'in the same way they already do.'}]


In [None]:
# Query (Korean): "What are the misconceptions about NDC? Answer in Korean."
# Each retriever contributes its top 2 documents.
misconception_query = "NDC에 대한 오해는 무엇인가? 한국어로 답하시오."
retrieval_params = {"EmbeddingRetriever": {"top_k": 2}, "BM25Retriever": {"top_k": 2}}

res = p_ensemble.run(query=misconception_query, params=retrieval_params)

# Print the result at the "minimum" detail level.
print_answers(res, details="minimum")

Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.44it/s]


'Query: NDC에 대한 오해는 무엇인가? 한국어로 답하시오.'
'Answers:'
[   {   'answer': 'NDC는 XML 프로토콜을 기반으로 하는 데이터 교환 방식입니다. 또한, NDC는 API 중심 접근 방식을 '
                  '규정하며 항공 여행에만 적용됩니다. As stated in Document[1], Document[3].'}]
