In [7]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import json

from torch import cuda
from pyvi.ViTokenizer import tokenize
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# --- Notebook configuration -------------------------------------------------
# Embedding checkpoint to use. Alternatives that were tried are kept below
# for reference; exactly one assignment should be active.
# embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
# embed_model_id = 'bkai-foundation-models/vietnamese-bi-encoder'
embed_model_id = "all-MiniLM-L6-v2"

# Run on the currently selected CUDA device when one is available,
# otherwise fall back to the CPU (set device = 'cpu' to force the CPU).
if cuda.is_available():
    device = f"cuda:{cuda.current_device()}"
else:
    device = "cpu"

# Folder handed to the embedding wrappers as their `cache_folder`.
cache_dir = "../cache/"

# Name under which the FAISS index is saved: the last path component of the
# model id, with dots replaced by dashes, prefixed by "ensubsec_".
DB_SAVE_NAME = "ensubsec_" + embed_model_id.rsplit("/", 1)[-1].replace(".", "-")
# Directory scanned for the per-subsection JSON files.
DOCUMENT_DIR = "../datasets/KALAPA_ByteBattles_2023_MEDICAL_Set1/translated_subsections/"

## Retriever

In [8]:
# Smoke test: load the embedding model and embed two toy sentences to confirm
# that it runs on `device` and to report the embedding dimensionality.
embed_model = HuggingFaceEmbeddings(
    cache_folder=cache_dir,
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    # Encode in batches of 32 and request normalized embeddings.
    encode_kwargs=dict(batch_size=32, normalize_embeddings=True, device=device),
)

docs = ["this is one document", "and another document"]
embeddings = embed_model.embed_documents(docs)

# One embedding per input document; every embedding has the same length.
print(f"We have {len(embeddings)} doc embeddings, each with a dimensionality of {len(embeddings[0])}.")

We have 2 doc embeddings, each with a dimensionality of 384.


In [9]:
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Build one LangChain Document per JSON file found in DOCUMENT_DIR.
#
# Keys read from every file (all are required; a missing key raises KeyError):
#   "subsection_data"      -> becomes the Document's page_content
#   "document_name", "document_name_accent", "document_title",
#   "document_category", "subsection_name", "subsection_title"
#                          -> copied verbatim into the Document's metadata
# The files may carry further keys; they are ignored here.
faiss_docs = []
# sorted() makes the document order (and therefore the index order) deterministic,
# since os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(DOCUMENT_DIR)):
    filepath = os.path.join(DOCUMENT_DIR, filename)

    # os.listdir() also returns sub-directories (for example the
    # ".ipynb_checkpoints" folder Jupyter may create); open() fails on those,
    # so only regular files are loaded.
    if not os.path.isfile(filepath):
        continue

    # Only the parse needs the file handle; close it before building the Document.
    with open(filepath, "r", encoding="utf-8") as f:
        file_data = json.load(f)

    faiss_docs.append(Document(
        page_content=file_data["subsection_data"],
        metadata={
            # Provenance of the record on disk.
            "filename": filename,
            "filepath": filepath,
            # Fields carried over unchanged from the JSON payload.
            "document_name": file_data["document_name"],
            "document_name_accent": file_data["document_name_accent"],
            "document_title": file_data["document_title"],
            "document_category": file_data["document_category"],
            "subsection_name": file_data["subsection_name"],
            "subsection_title": file_data["subsection_title"],
        }
    ))

# Rebind `embed_model` to a SentenceTransformerEmbeddings instance configured
# with the same model id, device, cache folder and encode options as before.
embed_model = SentenceTransformerEmbeddings(
    cache_folder=cache_dir,
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(batch_size=32, normalize_embeddings=True, device=device),
)

# Embed every collected document and build the FAISS vector store from them.
db = FAISS.from_documents(embedding=embed_model, documents=faiss_docs)

In [10]:
# Persist the freshly built index to the folder named by DB_SAVE_NAME
# (relative to the notebook's working directory).
db.save_local(folder_path=DB_SAVE_NAME)

In [12]:
from langchain.vectorstores import FAISS
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Reload the saved index for querying on the CPU.
# NOTE: this rebinds the notebook-wide `device`, so any cell executed after
# this one will see 'cpu' even when a GPU is available.
device = "cpu"
embed_model = SentenceTransformerEmbeddings(
    cache_folder=cache_dir,
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(batch_size=32, normalize_embeddings=True, device=device),
)

# The embedding model passed here is the one that will embed incoming queries.
loaded_db = FAISS.load_local(folder_path=DB_SAVE_NAME, embeddings=embed_model)

In [13]:
# Example multiple-choice question; the full text (question plus options) is
# used as the retrieval query.
query = """
What are the symptoms of heart valve disease?
A. Difficulty breathing
B. Rapid weight gain
C. Jaundice
D. Hair loss
"""

# Fetch only the single closest document (k=1) and display the returned list.
result = loaded_db.similarity_search(query, k=1)
result

[Document(page_content="benh ho van tim.json. Common signs of heart valve regurgitation\nHeart valve regurgitation has quite diverse symptoms, depending on the degree of regurgitation of the heart valve. With level 1/4 valve regurgitation, the patient has almost no symptoms and is very difficult to detect. Therefore, this condition is also called physiological valve regurgitation, and usually has little impact on health.\nFrom level 2/4 valve regurgitation, patients may experience symptoms such as:\nDifficulty breathing, especially when lying down or doing vigorous activities, there may be difficulty breathing at night.; Persistent fatigue even when inactive (reduced exercise tolerance).; Heart beats fast, palpitations continuously even when not active.; Dry cough, especially at night; Can't lie down with your head low.; Fainting.; Swollen ankles or feet.\nChest pain and difficulty breathing are typical symptoms of a leaky heart valve\nPatients with grade 2/4 valve regurgitation in the

In [14]:
# Take the text of the first (best-ranked) retrieved document and print it in
# full; raises IndexError if the search returned no documents.
context = result[0].page_content
print(context)

benh ho van tim.json. Common signs of heart valve regurgitation
Heart valve regurgitation has quite diverse symptoms, depending on the degree of regurgitation of the heart valve. With level 1/4 valve regurgitation, the patient has almost no symptoms and is very difficult to detect. Therefore, this condition is also called physiological valve regurgitation, and usually has little impact on health.
From level 2/4 valve regurgitation, patients may experience symptoms such as:
Difficulty breathing, especially when lying down or doing vigorous activities, there may be difficulty breathing at night.; Persistent fatigue even when inactive (reduced exercise tolerance).; Heart beats fast, palpitations continuously even when not active.; Dry cough, especially at night; Can't lie down with your head low.; Fainting.; Swollen ankles or feet.
Chest pain and difficulty breathing are typical symptoms of a leaky heart valve
Patients with grade 2/4 valve regurgitation in the early stages often do not se