In [3]:
import random
import string
import numpy as np

from pymilvus import (
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection, AnnSearchRequest, RRFRanker, connections,
)

from pymilvus.model.hybrid import BGEM3EmbeddingFunction

  from .autonotebook import tqdm as notebook_tqdm


## **Embedding BGEM3**

In [4]:
bge_m3_ef = BGEM3EmbeddingFunction(
    model_name='BAAI/bge-m3', # Specify the model name
    device='cuda:3', # Specify the device to use, e.g., 'cpu' or 'cuda:0'
    use_fp16=False # Specify whether to use fp16. Set to `False` if `device` is `cpu`.
)


Fetching 30 files: 100%|██████████| 30/30 [00:09<00:00,  3.31it/s]
  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


In [4]:
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

docs_embeddings = bge_m3_ef.encode_documents(docs)

# Print embeddings
print("Embeddings:", docs_embeddings)
# Print dimension of dense embeddings
print("Dense document dim:", bge_m3_ef.dim["dense"], docs_embeddings["dense"][0].shape)
# Since the sparse embeddings are in a 2D csr_array format, we convert them to a list for easier manipulation.
print("Sparse document dim:", bge_m3_ef.dim["sparse"], list(docs_embeddings["sparse"])[0].shape)

Embeddings: {'dense': [array([-0.02505936, -0.00142195,  0.04015458, ..., -0.02094933,
        0.02623649,  0.00324105], dtype=float32), array([ 0.00118467,  0.00649283, -0.00735765, ..., -0.01446304,
        0.04243681, -0.01794817], dtype=float32), array([ 0.004153  , -0.01014929,  0.00098096, ..., -0.02559672,
        0.08084673,  0.00141654], dtype=float32)], 'sparse': <Compressed Sparse Row sparse array of dtype 'float64'
	with 43 stored elements and shape (3, 250002)>}
Dense document dim: 1024 (1024,)
Sparse document dim: 250002 (250002,)


In [9]:
queries = ["When was artificial intelligence founded", 
           "Where was Alan Turing born?"]

query_embeddings = bge_m3_ef.encode_queries(queries)

print("Embeddings:", query_embeddings)
print("Dense query dim:", bge_m3_ef.dim["dense"], query_embeddings["dense"][0].shape)
print("Sparse query dim:", bge_m3_ef.dim["sparse"], list(query_embeddings["sparse"])[0].shape)


Embeddings: {'dense': [array([-0.0202402 , -0.0151439 ,  0.02380816, ...,  0.00234635,
       -0.0026498 , -0.04317443], dtype=float32), array([ 0.00648039, -0.00815426, -0.02717064, ..., -0.00380106,
        0.04200591, -0.01274776], dtype=float32)], 'sparse': <Compressed Sparse Row sparse array of dtype 'float64'
	with 14 stored elements and shape (2, 250002)>}
Dense query dim: 1024 (1024,)
Sparse query dim: 250002 (250002,)


## **Vector Store Milvus**

In [3]:
# Create an index over the documents
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore

In [21]:
from llama_index.core import SimpleDirectoryReader

# load documents
documents = SimpleDirectoryReader(
    input_files=["../artifacts/files/helloworld.txt"]
).load_data()

print("Document ID:", documents[0].doc_id)

Document ID: 942d19f5-b22e-4d89-aecb-71354eb744c1


In [22]:
print(documents[0].text)

Sample txt file
A TXT file, short for "text file," is a type of computer file that stores plain text information without any formatting. These files are widely used for various purposes, such as storing data, writing scripts, and exchanging information between different programs and platforms. TXT files are simple and easy to create and edit, making them popular for a wide range of applications.

One of the key characteristics of TXT files is their simplicity. Unlike other file formats such as DOCX (Microsoft Word document) or PDF (Portable Document Format), which can contain complex formatting, images, and other multimedia elements, TXT files contain only plain text. This simplicity makes TXT files lightweight and easy to work with, as they can be opened and edited using a basic text editor program like Notepad on Windows or TextEdit on macOS.

One common use of TXT files is for storing data in a format that is easily readable by both humans and computers. For example, a TXT file migh

In [24]:
vector_store = MilvusVectorStore(
    uri="http://localhost:19530", dim=1024, overwrite=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    docs, storage_context=storage_context
)

AttributeError: 'str' object has no attribute 'get_doc_id'

In [1]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext,Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.ollama import Ollama

# llm = Ollama(model="mixtral:8x7b", request_timeout=200.0)

# Correctly referencing the file using the relative path
file_path = "../artifacts/files_1"
# Loading the data
documents = SimpleDirectoryReader(file_path).load_data()

# Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-m3", device="cuda:3"
)
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)

# index = VectorStoreIndex.from_documents(
#     documents
# )

  from .autonotebook import tqdm as notebook_tqdm


OSError: [Errno 28] No space left on device

In [None]:
documents

[Document(id_='8aa41095-7143-4ad6-bdd0-f2a8e4b5a911', embedding=None, metadata={'page_label': '1', 'file_name': 'Dummy File.pdf', 'file_path': '/data1/dolphinai-project/app/notebook/../artifacts/files/Dummy File.pdf', 'file_type': 'application/pdf', 'file_size': 13264, 'creation_date': '2024-08-29', 'last_modified_date': '2024-08-29'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Dumm y PDF file', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='14a7c3b1-94d5-4fe6-a2bc-3669506f5645', embedding=None, metadata={'page_label': '1', 'file_name': 'Z.pdf-test.pdf', 'file_path': '/data1/dolphinai-project/app/notebook/../artif

In [None]:
documents[0]

In [2]:
from pymilvus import connections, db

conn = connections.connect(host="127.0.0.1", port=19530)
# database = db.create_database("dolphinai_db")
db.list_database()

['default', 'dolphinai_db']

In [41]:
# Create an index over the documents
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore

vector_store = MilvusVectorStore(
    uri="http://localhost:19530/dolphinai_db", dim=1024, overwrite=True
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

In [5]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.milvus import MilvusVectorStore

# Initialize the MilvusVectorStore with the correct parameters
vector_store = MilvusVectorStore(
    uri="http://localhost:19530/dolphinai_db", dim=1024, overwrite=True
)

# Create the storage context using the vector store
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

ValueError: 
******
Could not load OpenAI embedding model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

Consider using embed_model='local'.
Visit our documentation for more embedding options: https://docs.llamaindex.ai/en/stable/module_guides/models/embeddings.html#modules
******

In [42]:
query_engine = index.as_query_engine()

In [43]:
# response = query_engine.query("what products did the ACME delivered in 25/11/2023?")
response = query_engine.query("""
                                    Question:
                                    trovami gli ordini di vendita di ACME del 2023 contenenti i prodotti 'levigatrice' o 'sega circolare'.
                                    
                                    Instructions:
                                        Provide an answer to the question.
                                        After the answer, include references used to generate the answer to the question in the following format:
                                        <ref>
                                        [
                                        {
                                            "file_name.extension": [
                                            {
                                                "text": "1st Exact content body of the reference in the original language.",
                                                "page_number": page_number_as_integer
                                            },
                                            {
                                                "text": "N-th Exact content body of the reference in the original language.",
                                                "page_number": page_number_as_integer
                                            }
                                            ]
                                        },
                                        {
                                            "file_name.extension": [
                                            {
                                                "text": "1st Exact content body of the reference in the original language.",
                                                "page_number": page_number_as_integer
                                            },
                                            {
                                                "text": "N-th Exact content body of the reference in the original language.",
                                                "page_number": page_number_as_integer
                                            }
                                            ]
                                        },
                                        ...
                                        ]
                                        <\ref>
                                    
                                        Only include references containing the keywords from the question.
                                        Group references by file, and include the page number for each reference.
                                        Use double quotations in the reference dictionaries and retain the original format of the references without optimization.
                                        Return the references as a list of dictionaries, each dictionary corresponding to a file name.
                              """)

ReadTimeout: timed out

In [39]:
print(response.response)

Empty Response


In [66]:
import json
import re
def process_response(response_text):
    try:
        # json_schema = messages["data"][0]
        # response_text = json_schema["content"][0]["text"]["value"]
       
        # Split the response_text into response_ai and references_raw using regex
        response_ai, references_raw = re.split(r'<ref>|<\/ref>', response_text)[0:2]
 
        # Convert the cleaned references_raw into a dictionary
        references_dict = json.loads(references_raw)
       
        return response_ai.strip(), references_dict
 
    except Exception as e:
        print("Error in process_response:", str(e))
        return None, None
    

def fix_json_string(json_str):
    # Step 1: Ensure keys and string values are enclosed in double quotes
    json_str = re.sub(r"(?<!\\)'", r'"', json_str)  # Replace single quotes with double quotes

    # Step 2: Escape special characters inside strings
    json_str = re.sub(r'(?<!\\)(\\[btnrfv"\'\\])', r'\\\1', json_str)  # Escape \b, \t, \n, \r, \f, \v, ", ', and \

    # Step 3: Ensure JSON-like structure is correctly formed
    json_str = re.sub(r'(?<=:)\s*([a-zA-Z0-9]+)', r'"\1"', json_str)  # Quote unquoted words after colon
    json_str = re.sub(r'(?<=\{|\[)\s*([a-zA-Z0-9]+)', r'"\1"', json_str)  # Quote unquoted words after { or [

    return json_str

In [72]:
re.split(r'<ref>|<\/ref>', response.response)[0:2][1]

'\n[\n{\n"/data1/dolphinai-project/app/notebook/../artifacts/files/ordinevendita2.pdf": [\n{\n"text": "3 Levigatrice Ro-\trotorbitale di PrecisioneLevigatrice elet-",\n"page_number": 1\n}\n]\n},\n{\n"/data1/dolphinai-project/app/notebook/../artifacts/files/ordinevendita3.pdf": [\n{\n"text": "12 Seghetto Circolare Portatile",\n"page_number": 1\n}\n]\n}\n]\n'

In [71]:
print(re.split(r'<ref>|<\/ref>', response.response)[0:2][1])


[
{
"/data1/dolphinai-project/app/notebook/../artifacts/files/ordinevendita2.pdf": [
{
"text": "3 Levigatrice Ro-	rotorbitale di PrecisioneLevigatrice elet-",
"page_number": 1
}
]
},
{
"/data1/dolphinai-project/app/notebook/../artifacts/files/ordinevendita3.pdf": [
{
"text": "12 Seghetto Circolare Portatile",
"page_number": 1
}
]
}
]



In [69]:
print(fix_json_string(re.split(r'<ref>|<\/ref>', response.response)[0:2][1]))


[
{
"/data1/dolphinai-project/app/notebook/../artifacts/files/ordinevendita2.pdf": [
{
"text": "3 Levigatrice Ro-	rotorbitale di PrecisioneLevigatrice elet-",
"page_number":"1"
}
]
},
{
"/data1/dolphinai-project/app/notebook/../artifacts/files/ordinevendita3.pdf": [
{
"text": "12 Seghetto Circolare Portatile",
"page_number":"1"
}
]
}
]



In [67]:
json.loads(fix_json_string(re.split(r'<ref>|<\/ref>', response.response)[0:2][1]))

JSONDecodeError: Invalid control character at: line 6 column 27 (char 114)

In [31]:
type(response.response)

str

In [32]:
process_response(response.response)

Error in process_response: Invalid control character at: line 6 column 27 (char 114)


(None, None)

In [20]:
print(response.response)

 Sure, I can help you find the requested orders. Based on the information provided, there are two orders from ACME Automotive Inc. (ACME001) in the year 2023. Here are the details:

1. The first order is Ordine di Vendita N. 22375, dated 05/05/2023. This order includes one item of 'Levigatrice Rotorbitale di Precisione', and two items of 'Seghetto Circolare Portatile'.

2. The second order is Ordine di Vendita N. 54324, dated 23/11/2023. This order includes one item of 'Levigatrice Rotorbitale di Precisione', and twelve items of 'Seghetto Circolare Portatile'.

I hope this information is helpful! If you have any other questions, feel free to ask.


In [33]:
documents

[Document(id_='334f4cf7-501f-4daf-8e87-7d1f518a2d5a', embedding=None, metadata={'file_path': '/data1/dolphinai-project/app/notebook/data/milvus_demo.db', 'file_name': 'milvus_demo.db', 'file_size': 12288, 'creation_date': '2024-08-29', 'last_modified_date': '2024-08-29'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='SQLite format 3\x00\x10\x00\x01\x01\x00@  \x00\x00\x00\x06\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00.v\r\x00\x00\x00\x02\x0e\x00\x0f8\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00

In [1]:
from langchain_milvus import Milvus

In [23]:
URI="http://localhost:19530/dolphinai_db/"
vector_store_loaded = Milvus(
    bge_m3_ef,
    connection_args={"uri": URI},
    collection_name="dolphinai_collection",
)

RPC error: [create_index], <MilvusException: (code=1, message=cannot create index on non-existed field: vector)>, <Time:{'RPC start': '2024-09-09 16:52:04.770208', 'RPC error': '2024-09-09 16:52:04.771484'}>
RPC error: [create_index], <MilvusException: (code=1, message=cannot create index on non-existed field: vector)>, <Time:{'RPC start': '2024-09-09 16:52:04.771868', 'RPC error': '2024-09-09 16:52:04.773271'}>
Failed to create an index on collection: dolphinai_collection


MilvusException: <MilvusException: (code=1, message=cannot create index on non-existed field: vector)>

In [22]:
results = vector_store_loaded.similarity_search(
    "trovami gli ordini di vendita di ACME del 2023 contenenti i prodotti 'levigatrice' o 'sega circolare'.",
    k=2,
    # filter={"source": "tweet"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

In [20]:
results

[]