In [1]:
import random
import string
import numpy as np

from pymilvus import (
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection, AnnSearchRequest, RRFRanker, connections,
)

from pymilvus.model.hybrid import BGEM3EmbeddingFunction

  from .autonotebook import tqdm as notebook_tqdm


## **Embedding BGEM3**

In [3]:
bge_m3_ef = BGEM3EmbeddingFunction(
    model_name='BAAI/bge-m3', # Specify the model name
    device='cuda:3', # Specify the device to use, e.g., 'cpu' or 'cuda:0'
    use_fp16=False # Specify whether to use fp16. Set to `False` if `device` is `cpu`.
)


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 162150.93it/s]


In [4]:
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

docs_embeddings = bge_m3_ef.encode_documents(docs)

# Print embeddings
print("Embeddings:", docs_embeddings)
# Print dimension of dense embeddings
print("Dense document dim:", bge_m3_ef.dim["dense"], docs_embeddings["dense"][0].shape)
# Since the sparse embeddings are in a 2D csr_array format, we convert them to a list for easier manipulation.
print("Sparse document dim:", bge_m3_ef.dim["sparse"], list(docs_embeddings["sparse"])[0].shape)

Embeddings: {'dense': [array([-0.02505936, -0.00142195,  0.04015458, ..., -0.02094933,
        0.02623649,  0.00324105], dtype=float32), array([ 0.00118467,  0.00649283, -0.00735765, ..., -0.01446304,
        0.04243681, -0.01794817], dtype=float32), array([ 0.004153  , -0.01014929,  0.00098096, ..., -0.02559672,
        0.08084673,  0.00141654], dtype=float32)], 'sparse': <Compressed Sparse Row sparse array of dtype 'float64'
	with 43 stored elements and shape (3, 250002)>}
Dense document dim: 1024 (1024,)
Sparse document dim: 250002 (250002,)


In [9]:
queries = ["When was artificial intelligence founded", 
           "Where was Alan Turing born?"]

query_embeddings = bge_m3_ef.encode_queries(queries)

print("Embeddings:", query_embeddings)
print("Dense query dim:", bge_m3_ef.dim["dense"], query_embeddings["dense"][0].shape)
print("Sparse query dim:", bge_m3_ef.dim["sparse"], list(query_embeddings["sparse"])[0].shape)


Embeddings: {'dense': [array([-0.0202402 , -0.0151439 ,  0.02380816, ...,  0.00234635,
       -0.0026498 , -0.04317443], dtype=float32), array([ 0.00648039, -0.00815426, -0.02717064, ..., -0.00380106,
        0.04200591, -0.01274776], dtype=float32)], 'sparse': <Compressed Sparse Row sparse array of dtype 'float64'
	with 14 stored elements and shape (2, 250002)>}
Dense query dim: 1024 (1024,)
Sparse query dim: 250002 (250002,)


## **Vector Store Milvus**

In [7]:
# Create an index over the documents
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore

In [21]:
from llama_index.core import SimpleDirectoryReader

# load documents
documents = SimpleDirectoryReader(
    input_files=["../artifacts/files/helloworld.txt"]
).load_data()

print("Document ID:", documents[0].doc_id)

Document ID: 942d19f5-b22e-4d89-aecb-71354eb744c1


In [22]:
print(documents[0].text)

Sample txt file
A TXT file, short for "text file," is a type of computer file that stores plain text information without any formatting. These files are widely used for various purposes, such as storing data, writing scripts, and exchanging information between different programs and platforms. TXT files are simple and easy to create and edit, making them popular for a wide range of applications.

One of the key characteristics of TXT files is their simplicity. Unlike other file formats such as DOCX (Microsoft Word document) or PDF (Portable Document Format), which can contain complex formatting, images, and other multimedia elements, TXT files contain only plain text. This simplicity makes TXT files lightweight and easy to work with, as they can be opened and edited using a basic text editor program like Notepad on Windows or TextEdit on macOS.

One common use of TXT files is for storing data in a format that is easily readable by both humans and computers. For example, a TXT file migh

In [24]:
vector_store = MilvusVectorStore(
    uri="http://localhost:19530", dim=1024, overwrite=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    docs, storage_context=storage_context
)

AttributeError: 'str' object has no attribute 'get_doc_id'

In [30]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext,Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.ollama import Ollama

llm = Ollama(model="dolphinai-mixtral:8x7b", request_timeout=200.0)

# Correctly referencing the file using the relative path
file_path = "../artifacts/files"
# Loading the data
documents = SimpleDirectoryReader(file_path).load_data()

Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-m3", device="cuda:3"
)
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)

# index = VectorStoreIndex.from_documents(
#     documents
# )

In [31]:
# Create an index over the documents
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore

vector_store = MilvusVectorStore(
    uri="http://localhost:19530", dim=1024, overwrite=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

In [32]:
query_engine = index.as_query_engine()

In [33]:
# response = query_engine.query("what products did the ACME delivered in 25/11/2023?")
response = query_engine.query("""trovami gli ordini di vendita di ACME del 2023 contenenti i prodotti 'levigatrice' o 'sega circolare'
                              rispondi in italiano
                              """)

In [35]:
print(response.response)

 In the provided documents, there are two sales orders from Mikron Corp. Monroe to ACME Automotive Inc. for the year 2023. The first order, Order N. 22375, contains a rotoorbital sander (levigatrice) and a portable circular saw (sega circolare). The second order, Order N. 54324, also includes a portable circular saw. Therefore, both orders meet the criteria of containing either a 'levigatrice' or 'sega circolare'.


In [14]:
print(response.response)

 On November 25, 2023, ACME received the following products from Mikron Corp.:

1. 4 units of Product Code 101-ABCD, Trapano Elettrico da 550W (Electric Drill)
2. 6 units of Product Code 202-EFGH, Seghetto Alternativo Portatile (Portable Jigsaw)
3. 8 units of Product Code 303-IJKL, Set di Chiavi a Cricchetto (Socket Wrench Set)
4. 3 units of Product Code 404-MNOP, Mola Diamantata per Smerigliatrice (Diamond Grinding Wheel for Grinder)
5. 5 units of Product Code 505-QRST, Martello Demolitore Elettrico (Electric Hammer)

You can find the delivery note with this information at /data1/dolphinai-project/app/notebook/../artifacts/files/deliverynote1.pdf.


In [33]:
documents

[Document(id_='334f4cf7-501f-4daf-8e87-7d1f518a2d5a', embedding=None, metadata={'file_path': '/data1/dolphinai-project/app/notebook/data/milvus_demo.db', 'file_name': 'milvus_demo.db', 'file_size': 12288, 'creation_date': '2024-08-29', 'last_modified_date': '2024-08-29'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='SQLite format 3\x00\x10\x00\x01\x01\x00@  \x00\x00\x00\x06\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00.v\r\x00\x00\x00\x02\x0e\x00\x0f8\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00