# Libs

In [1]:
import os
while os.getcwd().split('\\')[-1] != 'master-llm-rag-vnlaw':
    os.chdir('..')
    print(os.getcwd())

c:\Users\Admin\Desktop\DANG NHI\repos\master-llm-rag-vnlaw


In [2]:
import os
import glob
import ollama
import chromadb
from chromadb import Settings, EmbeddingFunction, Embeddings

import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
base_model_id = "FacebookAI/xlm-roberta-base"
trained_model_path = "trained_embedding_model/"
data_path = "data/"

MAX_LEN = 512
OVERLAP = 50

In [4]:
model = AutoModel.from_pretrained(base_model_id)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

In [9]:
text = 'Tôi đi học'
with torch.no_grad():
    token = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LEN)
    output = model(**token)

In [None]:
class MyEmbeddingFunction(EmbeddingFunction):
    def __init__(self, model, tokenizer, max_length=512):
        self.model = model
        self.tokenizer = tokenizer
        self.max_length = max_length


    def embed_documents(self, texts):
        inputs = self.tokenizer(texts, padding='max_length', max_length=self.max_length, return_tensors="pt")
        inputs.to(device)
        with torch.no_grad():
            embeddings = self.model(**inputs)
        return embeddings.numpy()

    def embed_query(self, text):
        inputs = self.tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            embeddings = self.model(**inputs).pooler_output
        return embeddings.numpy()
    
myembed = MyEmbeddingFunction(model, tokenizer)

### test my embedding

In [7]:
output = myembed.embed_documents(["Tôi đi học"])
output1 = myembed.embed_query("Tôi đăng ký học")

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

similarity = cosine_similarity(output.reshape(1, -1), output1.reshape(1, -1))
print(f"Cosine similarity:\n {similarity}")

Cosine similarity:
 [[0.8894975]]


### Test ollama embedding

In [20]:
host = 'https://a72e-34-13-144-158.ngrok-free.app'


ollama_client = ollama.Client(
    host=host,
    headers={"Header": "application/json"},
)
import numpy as np


class OllamaEmbed(EmbeddingFunction):
    def __call__(self, input: str) -> Embeddings:
        embeddings = ollama_client.embed(
            model="mxbai-embed-large",
            input=input,
        )
        return np.array(embeddings.embeddings)

In [21]:
ollama_embed = OllamaEmbed()

In [22]:
x = ollama_embed("Tôi đi học")

ResponseError: The endpoint a72e-34-13-144-158.ngrok-free.app is offline.

ERR_NGROK_3200
 (status code: 404)

In [None]:
x

# Add to db

In [6]:
# %pip install langchain

from langchain_text_splitters import RecursiveCharacterTextSplitter

def count_tokens(text):
    return len(tokenizer.encode(text))

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=MAX_LEN,
    chunk_overlap=OVERLAP,
    length_function=count_tokens,
    is_separator_regex=False,
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        "",
    ]
)

client = chromadb.PersistentClient(
    path="./chroma_db",
    settings=Settings(allow_reset=True,),)

In [7]:
# client.reset()

collection = client.get_or_create_collection(
    name="VNLaws",
    metadata={"hnsw:space": "cosine"},
    embedding_function=myembed
)

In [8]:
collection.count()

12861

In [9]:
files = os.listdir(data_path)

In [10]:
# existing_documents = collection.query(where={"url": url})

In [25]:
import json
for file in files:
    print(f" ----------- PROCESSING FILE {file} ----------- ")
    with open(os.path.join(data_path, file), 'r', encoding='utf-8') as f:
        data = json.load(f)

    all_metadata = []
    all_ids = []
    all_chunks = []
    all_embeddings = []

    for document in data:
        url = document['url']
        name = document['name']
        content = document['content']

        # Check if the document is already in the collection
        try:
            existing_documents = collection.get(include=["metadatas"])
            checking = [metadata['url']==url for metadata in existing_documents["metadatas"] if metadata]

            if any(checking):
                print(f"Document with URL {url} already exists in the collection {name}. Skipping.")
                continue # Skip this document
        except:
            pass

        # Add the document to the collection

        content = "\n".join(content)
        chunks = text_splitter.split_text(content)
        all_chunks.extend(chunks)

        print((name, len(chunks)))

        embeddings = myembed.embed_documents(chunks)
        all_embeddings.extend(embeddings)

        metadata = [{"url": url, "name": name}] * len(chunks)
        all_metadata.extend(metadata)

        ids = [f"{url}-{i}" for i in range(len(chunks))]
        all_ids.extend(ids)

        collection.add(
            documents=all_chunks,
            metadatas=all_metadata,
            ids=all_ids,
            embeddings= all_embeddings
        )
        print("Current collection count: ", collection.count())

 ----------- PROCESSING FILE BaoHiem.json ----------- 
Document with URL https://dulieuphapluat.vn/van-ban/bao-hiem-van-ban/nghi-quyet-062024nq-hdnd-ve-muc-gia-dich-vu-kham-benh-chua-benh-khong-thuoc-pham-vi-thanh-toan-cua-quy-bao-hiem-y-te-ma-khong-phai-la-dich-vu-kham-benh-chua-benh-theo-yeu-cau-tai-cac-co-so-kham-benh-chua-benh-cua-nha-nuoc-tren-dia-ban-tinh-khanh-hoa-1197970.html already exists in the collection Nghị quyết 06/2024/NQ-HĐND. Skipping.
Document with URL https://dulieuphapluat.vn/van-ban/bao-hiem-van-ban/nghi-quyet-052024nq-hdnd-sua-doi-nghi-quyet-112022nq-hdnd-quy-dinh-chinh-sach-ho-tro-dong-bao-hiem-y-te-cho-cac-nhom-doi-tuong-xa-hoi-tren-dia-ban-thanh-pho-hai-phong-1198014.html already exists in the collection Nghị quyết 05/2024/NQ-HĐND. Skipping.
Document with URL https://dulieuphapluat.vn/van-ban/bao-hiem-van-ban/nghi-quyet-262024nq-hdnd-quy-dinh-muc-gia-dich-vu-kham-benh-chua-benh-khong-thuoc-pham-vi-thanh-toan-cua-quy-bao-hiem-y-te-ma-khong-phai-la-dich-vu-kham-

In [24]:
checking

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,

In [16]:
url

'https://dulieuphapluat.vn/van-ban/bao-hiem-van-ban/nghi-quyet-062024nq-hdnd-ve-muc-gia-dich-vu-kham-benh-chua-benh-khong-thuoc-pham-vi-thanh-toan-cua-quy-bao-hiem-y-te-ma-khong-phai-la-dich-vu-kham-benh-chua-benh-theo-yeu-cau-tai-cac-co-so-kham-benh-chua-benh-cua-nha-nuoc-tren-dia-ban-tinh-khanh-hoa-1197970.html'

In [None]:
text = ["Văn bản quy định giá tính bảo hiểm là"]
embed = myembed.embed_query(text[0])

results = collection.query(embed, n_results=10) 

In [None]:
results