<a href="https://colab.research.google.com/github/diegoquintanav/theLMbook/blob/main/RAG_example_hf_unstructured_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://huggingface.co/learn/cookbook/rag_with_hf_and_milvus

In [2]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
! pip install --quiet --upgrade pymilvus sentence-transformers huggingface-hub langchain_community langchain-text-splitters langchain_huggingface pypdf tqdm

In [1]:
%%bash

if [ ! -f "The-AI-Act.pdf" ]; then
    wget -q https://artificialintelligenceact.eu/wp-content/uploads/2021/08/The-AI-Act.pdf
fi

In [2]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("The-AI-Act.pdf")
docs = loader.load()
print(len(docs))

108


In [8]:
docs[0].to_json()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'metadata': {'producer': 'PDF CoDe 2018 4.7111.7111 (c) 2002-2018 European Commission',
   'creator': 'PDF CoDe 2018 4.7111.7111 (c) 2002-2018 European Commission',
   'creationdate': '2021-04-22T15:27:19+02:00',
   'moddate': '2021-04-22T15:27:19+02:00',
   'title': '',
   'author': '',
   'subject': '',
   'keywords': '',
   'source': 'The-AI-Act.pdf',
   'total_pages': 108,
   'page': 0,
   'page_label': '1'},
  'page_content': 'EN   EN \n \n \n \nEUROPEAN \nCOMMISSION  \nBrussels, 21.4.2021  \nCOM(2021) 206 final \n2021/0106 (COD) \n \nProposal for a \nREGULATION OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL \nLAYING DOWN HARMONISED RULES ON ARTIFICIAL INTELLIGENCE \n(ARTIFICIAL INTELLIGENCE ACT) AND AMENDING CERTAIN UNION \nLEGISLATIVE ACTS \n{SEC(2021) 167 final} - {SWD(2021) 84 final} - {SWD(2021) 85 final}',
  'type': 'Document'}}

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)

In [10]:
text_lines = [chunk.page_content for chunk in chunks]

In [11]:

text_lines[0]

'EN   EN \n \n \n \nEUROPEAN \nCOMMISSION  \nBrussels, 21.4.2021  \nCOM(2021) 206 final \n2021/0106 (COD) \n \nProposal for a \nREGULATION OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL \nLAYING DOWN HARMONISED RULES ON ARTIFICIAL INTELLIGENCE \n(ARTIFICIAL INTELLIGENCE ACT) AND AMENDING CERTAIN UNION \nLEGISLATIVE ACTS \n{SEC(2021) 167 final} - {SWD(2021) 84 final} - {SWD(2021) 85 final}'

In [12]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")


def emb_text(text):
    return embedding_model.encode([text], normalize_embeddings=True).tolist()[0]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
test_embedding = emb_text("This is a test")
embedding_dim = len(test_embedding)
print(embedding_dim)
print(test_embedding[:10])

384
[-0.07660680264234543, 0.02531672641634941, 0.012505538761615753, 0.004595162346959114, 0.02577998675405979, 0.038167111575603485, 0.08050814270973206, 0.0030353872571140528, 0.024392176419496536, 0.004880355205386877]


In [14]:
from pymilvus import MilvusClient

milvus_client = MilvusClient(uri="./hf_milvus_demo.db")

collection_name = "rag_collection"

In [15]:
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

In [16]:
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    metric_type="IP",  # Inner product distance
    consistency_level="Strong",  # Strong consistency level
)

In [17]:
# insert data
from tqdm import tqdm

data = []

for i, line in enumerate(tqdm(text_lines, desc="Creating embeddings")):
    data.append({"id": i, "vector": emb_text(line), "text": line})

insert_res = milvus_client.insert(collection_name=collection_name, data=data)
insert_res["insert_count"]

Creating embeddings: 100%|██████████| 424/424 [00:07<00:00, 54.96it/s]


424

In [18]:
question = "What is the legal basis for the proposal?"

In [20]:
search_res = milvus_client.search(
    collection_name=collection_name,
    data=[emb_text(question)],  # Use the `emb_text` function to convert the question to an embedding vector
    limit=3,  # Return top 3 results
    search_params={"metric_type": "IP", "params": {}},  # Inner product distance
    output_fields=["text"],  # Return the text field
)

search_res

data: [[{'id': 27, 'distance': 0.7306115031242371, 'entity': {'text': 'EN 6  EN \n2. LEGAL BASIS, SUBSIDIARITY AND PROPORTIONALITY \n2.1. Legal basis \nThe legal basis for the proposal is in the first place Article 114 of the Treaty on the \nFunctioning of the European Union (TFEU), which provides for the adoption of measures to \nensure the establishment and functioning of the internal market.  \nThis proposal constitutes a core part of the EU digital single market strategy. The primary \nobjective of this proposal is to ensure the proper functioning of the internal market by setting \nharmonised rules in particular on the development, placing on the Union market and the use \nof products and services making use of AI technologies or provided as stand -alone AI \nsystems. Some Member States are already considering national rules to ensure that AI is safe \nand is developed and used in compliance with fundamental rights obligations. This will likely \nlead to two main problems: i) a fr

In [21]:
import json

retrieved_lines_with_distances = [(res["entity"]["text"], res["distance"]) for res in search_res[0]]
print(json.dumps(retrieved_lines_with_distances, indent=4))

[
    [
        "EN 6  EN \n2. LEGAL BASIS, SUBSIDIARITY AND PROPORTIONALITY \n2.1. Legal basis \nThe legal basis for the proposal is in the first place Article 114 of the Treaty on the \nFunctioning of the European Union (TFEU), which provides for the adoption of measures to \nensure the establishment and functioning of the internal market.  \nThis proposal constitutes a core part of the EU digital single market strategy. The primary \nobjective of this proposal is to ensure the proper functioning of the internal market by setting \nharmonised rules in particular on the development, placing on the Union market and the use \nof products and services making use of AI technologies or provided as stand -alone AI \nsystems. Some Member States are already considering national rules to ensure that AI is safe \nand is developed and used in compliance with fundamental rights obligations. This will likely \nlead to two main problems: i) a fragmentation of the internal market on essential elemen

In [22]:
context = "\n".join([line_with_distance[0] for line_with_distance in retrieved_lines_with_distances])

In [23]:
PROMPT = """
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""

In [24]:
from huggingface_hub import InferenceClient

repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

llm_client = InferenceClient(model=repo_id, timeout=120)

In [25]:
prompt = PROMPT.format(context=context, question=question)

In [26]:
answer = llm_client.text_generation(
    prompt,
    max_new_tokens=1000,
).strip()
print(answer)

The legal basis for the proposal is Article 114 of the Treaty on the Functioning of the European Union (TFEU), which provides for the adoption of measures to ensure the establishment and functioning of the internal market.
