In [1]:
import pandas as pd
from langchain.vectorstores.chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.prompts import ChatPromptTemplate
# from llm_client_chat import AlpacaLLM
from llm_client import AlpacaLLM

import time, os
# Point the Hugging Face cache at a project-local directory.
# NOTE(review): this runs after the langchain / Hugging Face related imports above.
# Hugging Face libraries generally read HF_HOME when they are first imported, so
# confirm none of those imports pulls them in eagerly (otherwise this has no effect).
os.environ['HF_HOME'] = './cache/'

In [2]:
def load_embedding_model(model_path: str, cache_folder: str = "./models"):
    """Build a ``HuggingFaceEmbeddings`` instance and print how long construction took.

    Args:
        model_path: Model identifier, forwarded to ``HuggingFaceEmbeddings`` as
            ``model_name``.
        cache_folder: Directory forwarded as ``cache_folder``. Defaults to
            ``"./models"``, the value that was previously hard-coded.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object, configured with
        ``encode_kwargs={"normalize_embeddings": True}``.
    """
    # perf_counter() is monotonic, so the reported duration cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    local_embedding = HuggingFaceEmbeddings(
        model_name=model_path,
        cache_folder=cache_folder,
        encode_kwargs={"normalize_embeddings": True},
    )
    elapsed = time.perf_counter() - start_time
    print(f'model load time {round(elapsed, 0)} second')
    return local_embedding

# Embedding model shared by the retrieval cells below (passed to Chroma as
# `embedding_function`).
# NOTE(review): presumably this must be the same model the store at CHROMA_PATH
# was indexed with — confirm when switching between the two options.
# embedding = load_embedding_model(model_path="intfloat/multilingual-e5-large")
embedding = load_embedding_model(model_path="BAAI/bge-m3")


  from .autonotebook import tqdm as notebook_tqdm


model load time 120.0 second


In [11]:
# Cross-encoder handed to CrossEncoderReranker in the reranking run when the
# selected `reranker` name contains "bge".
# NOTE(review): the model is loaded unconditionally, even if `reranker` is switched
# to the ms-marco option. The execution count also jumps from In [2] to In [11];
# confirm the notebook still works under Restart Kernel -> Run All.
reranker_model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-v2-m3")
# reranker_model = None

In [12]:
# Load the benchmark questions. sheet_name=None makes read_excel return a dict of
# DataFrames keyed by sheet name; index_col=0 uses each sheet's first column as index.
excel_pertanyaan = pd.read_excel("Pertanyaan Benchmark.xlsx", sheet_name=None, index_col=0)

In [13]:
# Persisted Chroma store to evaluate. Other stores used with this notebook:
#   "final_test/unstructured-multi"
#   "final_test/langchain-multi"
#   "final_test/langchain-bge"
CHROMA_PATH = "final_test/unstructured-bge"

# Label used in the output file names: the last component of CHROMA_PATH.
# Taking the basename (instead of `split("/")[1]`) gives the same value for the
# two-component paths above, and also works for "./final_test/x", a trailing
# slash, or a deeper directory layout.
loader_embed_model = os.path.basename(os.path.normpath(CHROMA_PATH))

# ==================================
# Reranker selector: the reranking cell checks for the substrings "bge" / "marco".
reranker = "bge-reranker-v2-m3"
# reranker = "ms-marco-MultiBERT-L-12"

In [14]:
# Few-shot prompt: an instruction, two worked examples, then the real input.
# `{question}` and `{context}` are the only placeholders. The retrieval cells join
# the retrieved passages with '---' separators before filling `{context}`, matching
# the layout used in the examples.
# NOTE(review): both examples end with an "Answer:" cue, but the "### Input" section
# does not — confirm whether AlpacaLLM appends its own response marker.
PROMPT_TEMPLATE = """### Instruction:
Your job is to answer the question based on the given pieces of information. All you have to do is answer the question. Not all of the information provided may be relevant to the question. the answer you create must be logical. Each piece of information will be separated by '---'.

### Example:
Question: What are the benefits of regular exercise for cardiovascular health?
---

Research published in the Journal of the American Heart Association indicates that regular exercise can reduce the risk of coronary heart disease by up to 30%. Physical activity helps strengthen heart muscles, improve blood circulation, and lower blood pressure.

---

Although exercise has many benefits, it is important to do it correctly to avoid injuries. Warming up before exercising and cooling down afterwards are highly recommended. Additionally, the type of exercise chosen should match the individual's physical condition to avoid unwanted risks.

---

According to a study from the Mayo Clinic, people who exercise regularly have better cholesterol levels and tend to have a healthier weight. Exercise can also increase insulin sensitivity and help regulate blood sugar levels, which are important factors in maintaining heart health.

---

Answer:
Regular physical exercise has several benefits for cardiovascular health. Firstly, it can reduce the risk of coronary heart disease by up to 30%, as it strengthens the heart muscles, improves blood circulation, and lowers blood pressure. Secondly, individuals who exercise regularly tend to have better cholesterol levels and a healthier weight, which are crucial for heart health. Additionally, regular exercise can increase insulin sensitivity and help regulate blood sugar levels, further contributing to cardiovascular well-being.

### Another example:
Question: What are the benefits of a fiber-rich diet for digestive health?
---

A fiber-rich diet is known to prevent constipation by increasing stool bulk and softness, making it easier to pass. Fiber also helps maintain gut health by promoting the growth of beneficial bacteria in the digestive system.

---

High-fiber foods such as fruits, vegetables, and whole grains are not only good for digestion but can also help control blood sugar levels and lower cholesterol. Soluble fiber in these foods helps slow down sugar absorption and binds cholesterol in the intestines.

---

Some studies suggest that a high-fiber diet can reduce the risk of colorectal cancer. Fiber helps speed up the elimination of carcinogenic substances from the colon, reducing the exposure time of colon cells to harmful materials.

---

Answer:
A diet rich in fiber has multiple benefits for digestive health. It can prevent constipation by increasing stool bulk and softness, making it easier to pass. Fiber also promotes gut health by encouraging the growth of beneficial bacteria in the digestive system. Additionally, high-fiber foods such as fruits, vegetables, and whole grains can help control blood sugar levels and lower cholesterol. Soluble fiber in these foods slows sugar absorption and binds cholesterol in the intestines. Furthermore, a high-fiber diet can reduce the risk of colorectal cancer by speeding up the removal of carcinogenic substances from the colon, thereby reducing the exposure time of colon cells to harmful materials.

### Input
Question: {question}
---

{context}

---
"""

In [15]:
# Run without reranking: for every question, fetch the top-k chunks straight from
# Chroma, translate chunks and question from 'id' to 'en', ask the LLM, and store
# both the retrieved chunks and the generated answer back into the sheet.
from deep_translator import GoogleTranslator

TOP_K = 3            # chunks kept per question -> columns "Dokumen 1".."Dokumen 3"
MIN_RELEVANCE = 0.7  # a weaker best hit is only reported; the question is still answered

db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding)

# Loop-invariant helpers, built once instead of once per chunk / per question.
# NOTE(review): the Google backend of deep_translator limits the length of a single
# input — confirm the stored chunks stay below that limit.
translator = GoogleTranslator(source='id', target='en')
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

for sheet in excel_pertanyaan:
    jawaban = []             # per question: list of TOP_K formatted chunks
    generated_response = []  # per question: LLM answer
    for pertanyaan in excel_pertanyaan[sheet]["Pertanyaan"]:
        if not isinstance(pertanyaan, str):
            # Non-text cells (e.g. empty rows read as NaN) are passed through
            # unchanged so the new columns stay aligned with the sheet's rows.
            jawaban.append([pertanyaan] * TOP_K)
            generated_response.append(pertanyaan)
            continue

        results = db.similarity_search_with_relevance_scores(pertanyaan, k=TOP_K)
        if len(results) == 0 or results[0][1] < MIN_RELEVANCE:
            print("Similarity too low.", end="\n")

        context_text = "\n\n---\n\n".join(
            translator.translate(doc.page_content) for doc, _score in results
        )
        prompt = prompt_template.format(
            context=context_text,
            question=translator.translate(pertanyaan),
        )
        # A fresh client per question, as before; AlpacaLLM's statefulness is not
        # visible from this notebook, so it is deliberately not hoisted.
        model = AlpacaLLM()
        response_text = model.invoke(prompt)
        generated_response.append(response_text)

        print(prompt)
        print("Jawaban ", response_text)

        list_jwbn = [
            f"{doc.page_content}\nmetadata: {doc.metadata}\nscore: {_score}"
            for doc, _score in results
        ]
        # The store may return fewer than TOP_K hits; pad so the per-rank column
        # extraction below cannot raise IndexError.
        list_jwbn += [None] * (TOP_K - len(list_jwbn))
        jawaban.append(list_jwbn)

    for rank in range(TOP_K):
        excel_pertanyaan[sheet][f"Dokumen {rank + 1}"] = [docs[rank] for docs in jawaban]
    excel_pertanyaan[sheet]["Generated Answer"] = generated_response

In [16]:
# Save the no-reranker run; the "_None" suffix in the file name marks that no
# reranker was applied. One worksheet is written per benchmark sheet.
output_path = f'pertanyaan benchmark_{loader_embed_model}_None.xlsx'
sheets_to_export = (
    "PNPK",
    "Nutrisi",
    "tatalaksana pada anak",
    "tatalaksana anak dan bayi",
)
with pd.ExcelWriter(output_path) as writer:
    for tab in sheets_to_export:
        excel_pertanyaan[tab].to_excel(writer, sheet_name=tab)

In [17]:
## Run with reranking: fetch CANDIDATE_K chunks from Chroma, let the selected
## reranker keep the best TOP_K, then translate, prompt the LLM and record results.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker, FlashrankRerank
from deep_translator import GoogleTranslator

TOP_K = 3          # chunks kept after reranking -> columns "Dokumen 1".."Dokumen 3"
CANDIDATE_K = 20   # chunks fetched from the vector store before reranking

retriever = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding).as_retriever(
    search_kwargs={"k": CANDIDATE_K}
)

# The reranking pipeline does not depend on the question, so build it once.
# An unknown `reranker` value now fails loudly instead of leaving
# `compression_retriever` undefined (or silently reusing a stale one).
if "bge" in reranker:
    compressor = CrossEncoderReranker(model=reranker_model, top_n=TOP_K)
elif "marco" in reranker:
    compressor = FlashrankRerank(model="ms-marco-MultiBERT-L-12", top_n=TOP_K)
else:
    raise ValueError(f"Unsupported reranker: {reranker!r}")
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

# Loop-invariant helpers, built once instead of once per chunk / per question.
translator = GoogleTranslator(source='id', target='en')
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

for sheet in excel_pertanyaan:
    jawaban = []             # per question: list of TOP_K formatted chunks
    generated_response = []  # per question: LLM answer
    for pertanyaan in excel_pertanyaan[sheet]["Pertanyaan"]:
        if not isinstance(pertanyaan, str):
            # Non-text cells (e.g. empty rows read as NaN) are passed through
            # unchanged so the new columns stay aligned with the sheet's rows.
            jawaban.append([pertanyaan] * TOP_K)
            generated_response.append(pertanyaan)
            continue

        results = compression_retriever.invoke(pertanyaan)

        context_text = "\n\n---\n\n".join(
            translator.translate(doc.page_content) for doc in results
        )
        prompt = prompt_template.format(
            context=context_text,
            question=translator.translate(pertanyaan),
        )
        # A fresh client per question, as before; AlpacaLLM's statefulness is not
        # visible from this notebook, so it is deliberately not hoisted.
        model = AlpacaLLM()
        response_text = model.invoke(prompt)
        generated_response.append(response_text)
        print(prompt)
        print("Jawaban ", response_text)

        list_jwbn = [
            f"{doc.page_content}\nmetadata: {doc.metadata}"
            for doc in results
        ]
        # Fewer than TOP_K documents may come back; pad so the per-rank column
        # extraction below cannot raise IndexError.
        list_jwbn += [None] * (TOP_K - len(list_jwbn))
        jawaban.append(list_jwbn)

    for rank in range(TOP_K):
        excel_pertanyaan[sheet][f"Dokumen {rank + 1}"] = [docs[rank] for docs in jawaban]
    excel_pertanyaan[sheet]["Generated Answer"] = generated_response

Running pairwise ranking..
Human: ### Instruction:
Your job is to answer the question based on the given pieces of information. All you have to do is answer the question. Not all of the information provided may be relevant to the question. the answer you create must be logical. Each piece of information will be separated by '---'.

### Example:
Question: What are the benefits of regular exercise for cardiovascular health?
---

Research published in the Journal of the American Heart Association indicates that regular exercise can reduce the risk of coronary heart disease by up to 30%. Physical activity helps strengthen heart muscles, improve blood circulation, and lower blood pressure.

---

Although exercise has many benefits, it is important to do it correctly to avoid injuries. Warming up before exercising and cooling down afterwards are highly recommended. Additionally, the type of exercise chosen should match the individual's physical condition to avoid unwanted risks.

---

Accord

In [18]:
# Save the reranked run; the file name carries both the store label and the
# reranker name. One worksheet is written per benchmark sheet.
output_path = f'pertanyaan benchmark_{loader_embed_model}_{reranker}.xlsx'
sheets_to_export = (
    "PNPK",
    "Nutrisi",
    "tatalaksana pada anak",
    "tatalaksana anak dan bayi",
)
with pd.ExcelWriter(output_path) as writer:
    for tab in sheets_to_export:
        excel_pertanyaan[tab].to_excel(writer, sheet_name=tab)