In [1]:
from langchain.document_loaders import TextLoader, PyPDFLoader
import os

# Folder tempat dokumen berada
DATA_FOLDER = "data/"

# Fungsi untuk membaca dokumen
def load_documents(folder_path):
    documents = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if file_name.endswith(".txt"):
            loader = TextLoader(file_path)
        elif file_name.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            print(f"Unsupported file format: {file_name}")
            continue
        documents.extend(loader.load())
    return documents

# Test membaca dokumen
docs = load_documents(DATA_FOLDER)
print(f"Loaded {len(docs)} documents.")


Unsupported file format: .ipynb_checkpoints
Loaded 1 documents.


In [2]:
from langchain.text_splitter import CharacterTextSplitter

# Inisialisasi text splitter
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Potong dokumen menjadi chunk
chunked_docs = []
for doc in docs:
    chunks = text_splitter.split_text(doc.page_content)
    chunked_docs.extend(chunks)

print(f"Generated {len(chunked_docs)} chunks.")

Generated 1 chunks.


In [29]:
from langchain.embeddings import OllamaEmbeddings
# from langchain_ollama import OllamaEmbeddings

# Inisialisasi model embedding
embedding_model = OllamaEmbeddings(model="nomic-embed-text")
# embedding_model = OllamaEmbeddings(model="all-MiniLM-L6-v2")

# Buat embedding
embeddings = [embedding_model.embed_query(chunk) for chunk in chunked_docs]

print(f"Generated embeddings for {len(embeddings)} chunks.")

Generated embeddings for 1 chunks.


In [4]:
from pymilvus import connections

# Connect to Milvus server
connections.connect(host="192.168.30.222", port="19530")

# Verify connection
print("Connected to Milvus server successfully!")

# from pymilvus import connections    

# connections.connect(
#     uri="http://192.168.30.222:19530", 
#     token="root:root123"
# )  

# print("test")

Connected to Milvus server successfully!


In [44]:
from pymilvus import utility
print(utility.list_collections())
utility.drop_collection('chatbot_embeddings')


[]


# BLM SESUAI

In [45]:
# Memeriksa dimensi dari embeddings
print(f"Embedding shape: {len(embeddings)} embeddings with {len(embeddings[0])} dimensions")

Embedding shape: 1 embeddings with 768 dimensions


In [46]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
import numpy as np

# Koneksi ke Milvus
# connections.connect(host="localhost", port="19530")

# Definisi schema untuk koleksi (Milvus)
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768),  # Dimensi embedding sesuai model
    # FieldSchema(name="metadata", dtype=DataType.VARCHAR, max_length=500),  # Misal metadata dalam bentuk teks
]

schema = CollectionSchema(fields, description="Embedding storage for chatbot")

# Buat koleksi di Milvus
collection_name = "chatbot_embeddings"
collection = Collection(name=collection_name, schema=schema)

print(f"Collection '{collection_name}' created!")


Collection 'chatbot_embeddings' created!


In [59]:
from pymilvus import Collection
collection = Collection("chatbot_embeddings")
schema = collection.schema
print(schema)

{'auto_id': True, 'description': 'Embedding storage for chatbot', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'enable_dynamic_field': False}


In [60]:
# Buat indeks
collection.create_index(
    field_name="embedding",
    index_params={"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 128}}
)
print("Index has been created.")

# Muat koleksi dan indeks ke memori
collection.load()
print("Collection and index are loaded into memory.")

Index has been created.
Collection and index are loaded into memory.


In [52]:
# # Masukkan data ke Milvus
# data_to_insert = [
#     embeddings  # Embedding vektor
#     # ["Document chunk metadata"] * len(embeddings)  # Metadata dummy untuk testing
# ]

# # Insert data
# collection.insert(data_to_insert)
# Misalkan 'embeddings' adalah list yang berisi embedding untuk setiap chunk dokumen
# Jika embeddings adalah hasil dari model, pastikan bentuk data adalah list of list dengan float values.

ids = list(range(len(embeddings)))  # Membuat ID untuk setiap embedding

import numpy as np

# Konversi embeddings menjadi tipe float
# embeddings = np.array(embeddings, dtype=np.float32).tolist()  # Gunakan np.float32 untuk memastikan tipe data yang tepat

# Debugging tipe data
# print(f"Data type of first embedding: {type(embeddings[0][5])}")  # Harus menunjukkan <class 'float'>
# Periksa tipe data pada setiap elemen dalam embeddings
for i, emb in enumerate(embeddings):
    # print(emb)
    if not all(isinstance(val, float) for val in emb):
        print(f"Embedding {i} contains non-float values.")
        break

# Jika semua sudah benar, lanjutkan insert
collection.insert([embeddings])
print(f"Inserted {len(embeddings)} embeddings into the collection.")

collection.flush()

print(f"Inserted {len(embeddings)} embeddings into Milvus.")


Inserted 1 embeddings into Milvus.


In [None]:
# Membuat indeks untuk field embedding
collection.create_index(
    field_name="embedding",
    index_params={"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 128}}
)
print("Index has been created.")


In [53]:
# Memuat indeks
collection.load()
print("Collection and index are loaded into memory.")

Collection and index are loaded into memory.


In [54]:
search_vector = embeddings[0]  # Contoh menggunakan embedding pertama
results = collection.search(
    data=[search_vector],
    anns_field="embedding",
    param={"metric_type": "L2", "params": {"nprobe": 10}},
    limit=5,
)

In [55]:
from pymilvus import utility

# # Query
search_vector = embeddings[0]  # Contoh menggunakan embedding pertama
results = collection.search(
    data=[search_vector],
    anns_field="embedding",
    param={"metric_type": "L2", "params": {"nprobe": 10}},
    limit=5,
)

# # Tampilkan hasil pencarian
if results:
    for result in results[0]:
        print(f"Match ID: {result.id}, Distance: {result.distance}")
else:
    print("No matching results found.")



Match ID: 454238142078124760, Distance: 0.0
Match ID: 454238142078124762, Distance: 0.0


In [20]:
# embeddings[0]

In [56]:
from pymilvus import Collection
from pymilvus import utility
print(utility.list_collections())

# Load koleksi
collection = Collection(name="chatbot_embeddings")

# Periksa schema koleksi
print(collection.schema)


['chatbot_embeddings']
{'auto_id': True, 'description': 'Embedding storage for chatbot', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'enable_dynamic_field': False}


In [57]:
from langchain.vectorstores import Milvus
from langchain.embeddings import OpenAIEmbeddings

# Konfigurasi koneksi ke Milvus
milvus_connection_args = {
    "host": "192.168.30.222",  # Ganti dengan host Milvus jika berbeda
    "port": "19530"       # Port default Milvus
}

print(f"Connected to Milvus collection: {collection_name}")

# Inisialisasi vector store menggunakan koleksi yang sudah ada
vector_store = Milvus(
    embedding_function=None,  # Tidak diperlukan karena sudah ada embedding
    collection_name=collection_name,
    connection_args=milvus_connection_args,
)


RPC error: [create_index], <MilvusException: (code=1, message=cannot create index on non-existed field: vector)>, <Time:{'RPC start': '2024-11-28 16:27:35.267787', 'RPC error': '2024-11-28 16:27:35.275781'}>
RPC error: [create_index], <MilvusException: (code=1, message=cannot create index on non-existed field: vector)>, <Time:{'RPC start': '2024-11-28 16:27:35.277677', 'RPC error': '2024-11-28 16:27:35.287723'}>
Failed to create an index on collection: chatbot_embeddings


Connected to Milvus collection: chatbot_embeddings


MilvusException: <MilvusException: (code=1, message=cannot create index on non-existed field: vector)>

In [29]:
# from langchain.vectorstores import Milvus
# from langchain.embeddings import OpenAIEmbeddings

# # Konfigurasi koneksi ke Milvus
# milvus_connection_args = {
#     "host": "192.168.30.222",  # Host Milvus
#     "port": "19530"       # Port default Milvus
# }

# # Gunakan Milvus dan sesuaikan dengan nama koleksi Anda
# vector_store = Milvus(
#     embedding_function=None,  # Tidak diperlukan karena embeddings sudah dibuat
#     collection_name="chatbot_embeddings",
#     connection_args=milvus_connection_args,
# )

# print(f"Connected to Milvus collection: chatbot_embeddings")
