In [1]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader
import os
from dotenv import load_dotenv

# Read the '../.env' file (path is relative to the working directory) into the
# process environment; later cells fetch their settings through os.getenv.
load_dotenv('../.env')

True

In [2]:
# OpenAI embedding client used for both the PDF pages and the search query.
# NOTE(review): the key name is spelled 'EMBEDINGS' (single D) -- it has to
# match the entry in the .env file exactly, so confirm before "fixing" it.
embeddingd_api = os.environ.get('OPENAI_EMBEDINGS_API_KEY')
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=embeddingd_api)

In [3]:
# Read '../sample.pdf' into a list of page documents. extract_images=True is
# handed to the loader as-is (presumably to pull text out of embedded images
# -- confirm against the loader documentation).
pdf_loader = PyPDFLoader('../sample.pdf', extract_images=True)
pdf_content = pdf_loader.load()

In [4]:
from pymilvus import connections, utility

# Milvus connection settings are read from the environment (populated by the
# load_dotenv call in the first cell) so that credentials are not stored in
# the notebook. Add MILVUS_PASSWORD -- and optionally MILVUS_USER,
# MILVUS_HOST, MILVUS_PORT -- to the .env file.
# NOTE(review): the password that used to be hard-coded here has been exposed
# in the notebook history and should be rotated.
milvus_password = os.getenv('MILVUS_PASSWORD')
if not milvus_password:
    # Fail fast with a clear message instead of an opaque authentication error.
    raise RuntimeError('MILVUS_PASSWORD is not set; add it to ../.env')

db = connections.connect(
    user=os.getenv('MILVUS_USER', 'root'),
    password=milvus_password,
    host=os.getenv('MILVUS_HOST', '192.168.1.98'),
    port=os.getenv('MILVUS_PORT', '19530')
)

In [7]:
# DESTRUCTIVE: removes the 'gptTeacher' collection and everything stored in it
# so the schema cell can recreate it from scratch. The existence check keeps
# the cell safe to run on a server where the collection was never created.
if utility.has_collection('gptTeacher'):
    utility.drop_collection('gptTeacher')

In [5]:
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection

# --- Field definitions -------------------------------------------------------
# Primary key: the page number of the PDF page the row was built from.
page_num = FieldSchema(
    'page_num',
    DataType.INT64,
    description='Page number of the PDF',
    is_primary=True,
)

# Raw page text, stored as a VARCHAR capped at max_length=4096.
text = FieldSchema(
    'text',
    DataType.VARCHAR,
    description='Text extracted from the PDF',
    max_length=4096,
)

# 1536-dimensional float vector holding the embedding of the page text.
# NOTE(review): index=True is forwarded unchanged; the actual index is built
# by a separate create_index call -- confirm this kwarg has any effect.
ada_embedings = FieldSchema(
    'ada_embedings',
    DataType.FLOAT_VECTOR,
    description='Embeddings of the text extracted from the PDF',
    dim=1536,
    index=True,
)

# --- Schema and collection ---------------------------------------------------
# Dynamic fields are enabled on top of the three declared fields.
scheme = CollectionSchema(
    [page_num, text, ada_embedings],
    description='Embeddings of the text extracted from the PDF',
    enable_dynamic_field=True,
)

# Bind to the collection on the "default" connection, requesting two shards.
collection_name = "gptTeacher"
collection = Collection(collection_name, scheme, using="default", shards_num=2)

In [6]:
# Build an index named 'ada_embedings_index' on the vector field using the L2
# metric. No index_type is given, so whichever default the client/server
# applies is used -- NOTE(review): consider stating it explicitly.
ada_index_params = {"metric_type": "L2"}
collection.create_index(
    'ada_embedings',
    ada_index_params,
    index_name='ada_embedings_index',
)

Status(code=0, message=)

In [7]:
# Load the collection on the Milvus side; the search cell further down runs
# against this loaded collection.
collection.load()

In [8]:
# Insert data to Milvus
# All pages are embedded in one batched embed_documents call and written with
# a single insert, instead of one OpenAI request and one Milvus RPC per page.
# Local names avoid `text`, which is already the FieldSchema defined in the
# schema cell.
page_texts = [page.page_content for page in pdf_content]
page_numbers = [page.metadata['page'] for page in pdf_content]

# embed_documents returns one vector per input text, in input order.
page_vectors = embeddings.embed_documents(page_texts) if page_texts else []

rows = [
    {
        'page_num': page_no,
        'text': page_text,
        'ada_embedings': page_vector
    }
    for page_no, page_text, page_vector in zip(page_numbers, page_texts, page_vectors)
]

# Nothing to write when the PDF produced no pages.
if rows:
    collection.insert(rows)
    # Flush so the inserted rows are sealed/persisted before searching.
    collection.flush()

In [12]:
# Embed a sample query with the same embeddings client used for the pages so
# the resulting vector can be compared against the stored page vectors.
test_query = embeddings.embed_query('Process Sync')

In [13]:
# Nearest-neighbour search of the query vector against the page embeddings.
# The number of hits returned is controlled solely by `limit`; the former
# 'top_k' entry inside `param` was not a recognised search parameter and
# contradicted limit=5, so it has been removed. Index-specific tuning knobs
# belong in the nested 'params' dict.
results = collection.search(
    data=[test_query],
    anns_field='ada_embedings',
    param={'metric_type': 'L2', 'params': {}},
    limit=5,
    output_fields=['page_num', 'text']
)

In [14]:
# `results` holds one list of hits per query vector; print every hit.
# hit.distance is the L2 distance requested in the search call, so a smaller
# value means a closer match even though the label says "Similarity".
separator = '-'*50
for query_hits in results:
    for match in query_hits:
        matched_entity = match.entity
        print('From page: ', matched_entity.get('page_num'))
        print('Similarity: ', match.distance)
        print('Text:\n', matched_entity.get('text'))
        print(separator)

From page:  18
Similarity:  0.4250854253768921
Text:
 • The signal is delivered to a process.  
• Once delivered, the signal must be handled.  
Cancellation  
Thread cancellation is the task of terminating a thread before it has completed.  
For example  − If multiple database threads are concurrently searching through a database and o ne thread returns the result the remaining threads might be 
cancelled.  
 
 Process Synchronization:  
Process Synchronization is the coordination of execution of multiple processes in a multi -process system to ensure that they access shared resources in a 
controlled and predictable manner. It aims to resolve the problem of race conditions and other synchronization issues in a concur rent system.  
The main objective of process synchronization is to ensure that multiple processes access shared resources without int erfering with each other, and to 
prevent the possibility of inconsistent data due to concurrent access. To achieve this, various synchron