In [2]:
from langchain.document_loaders import TextLoader, PyPDFLoader
import os
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
import numpy as np
from langchain.chains import RetrievalQA
from langchain.llms import Ollama
from langchain.vectorstores import Milvus

# Folder tempat dokumen berada
DATA_FOLDER = "data/"

# Fungsi untuk membaca dokumen
def load_documents(folder_path):
    documents = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if file_name.endswith(".txt"):
            loader = TextLoader(file_path)
        elif file_name.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            print(f"Unsupported file format: {file_name}")
            continue
        documents.extend(loader.load())
    return documents

# Test membaca dokumen
docs = load_documents(DATA_FOLDER)

# Inisialisasi text splitter
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Potong dokumen menjadi chunk
chunked_docs = []
for doc in docs:
    chunks = text_splitter.split_text(doc.page_content)
    chunked_docs.extend(chunks)

# Inisialisasi model embedding (menggunakan nomic-embed-text)
embedding_model = OllamaEmbeddings(model="nomic-embed-text")

# Buat embedding untuk setiap chunk dokumen
embeddings = [embedding_model.embed_query(chunk) for chunk in chunked_docs]

# Koneksi ke Milvus server
connections.connect(host="192.168.30.222", port="19530")

# Definisi schema untuk koleksi (Milvus)
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768),  # Dimensi embedding sesuai model
]

schema = CollectionSchema(fields, description="Embedding storage for chatbot")

# Buat koleksi di Milvus
collection_name = "chatbot_embeddings"
collection = Collection(name=collection_name, schema=schema)

# Buat indeks
collection.create_index(
    field_name="embedding",
    index_params={"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 128}}
)

# Muat koleksi dan indeks ke memori
collection.load()

# Menyisipkan embedding ke Milvus
collection.insert([embeddings])

# Menyimpan perubahan ke Milvus
collection.flush()

Unsupported file format: .ipynb_checkpoints


In [3]:
# Setup Llama3 model untuk RAG
llm = Ollama(model="llama3")  # Menyesuaikan dengan setup model kamu

# Setup LangChain untuk RAG
retrieval_qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # "stuff" method, gabungkan pencarian dan generasi
    retriever=Milvus.from_documents(chunked_docs, embeddings)
)

# Test retrieval dan generation dengan query
query = "What is the main topic of the document?"
response = retrieval_qa_chain.run(query)
print(response)


AttributeError: 'str' object has no attribute 'page_content'

In [5]:
import os
from langchain.document_loaders import TextLoader, PyPDFLoader

# Folder tempat dokumen berada
DATA_FOLDER = "data/"

# Fungsi untuk membaca dokumen
def load_documents(folder_path):
    documents = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        
        if file_name.endswith(".txt"):
            loader = TextLoader(file_path)
        elif file_name.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            print(f"Unsupported file format: {file_name}")
            continue
        
        # Load documents from the file and extend the list
        doc_texts = loader.load()  # loader.load() returns a list of documents (probably with text as the attribute)
        
        # Extract text from documents if the loader returns objects
        for doc in doc_texts:
            documents.append(doc.page_content)  # Assuming 'page_content' holds the text data for each document
            
    return documents

# Test membaca dokumen
docs = load_documents(DATA_FOLDER)
print(f"Loaded {len(docs)} documents.")

# Fungsi untuk preprocess dan membersihkan teks
def preprocess_text(documents):
    # Contoh preprocessing: mengganti newline dengan spasi dan mengubah semua teks ke lowercase
    return [doc.replace("\n", " ").lower() for doc in documents]

# Preprocess dan bersihkan teks
cleaned_documents = preprocess_text(docs)

# Tampilkan dokumen pertama setelah diproses
print(cleaned_documents[0])  # Print first cleaned document


Unsupported file format: .ipynb_checkpoints
Loaded 1 documents.
nama saya fikri rama saya kerja sebagai data analyst umur saya 27 saya bingung harus apa teman saya ada 5 orang


In [6]:
connections.disconnect("default")

In [12]:
from langchain.embeddings import OllamaEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import Ollama
from langchain.schema import Document  # Import the Document class
from langchain.vectorstores import Milvus
from pymilvus import Collection, CollectionSchema, FieldSchema, DataType
from pymilvus import connections

# Koneksi ke Milvus server
connections.connect(host="192.168.30.222", port="19530")

# Initialize OllamaEmbeddings (you can use your own model here)
embeddings = OllamaEmbeddings(model="nomic-embed-text")  # Use your Ollama embedding model here

def create_milvus_collection():
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768),  # Adjust dim based on your model
        FieldSchema(name="document", dtype=DataType.VARCHAR, max_length=500)
    ]
    schema = CollectionSchema(fields)
    collection = Collection(name="abc", schema=schema)
    return collection

collection = create_milvus_collection()

# Connect to Milvus
milvus_host = "192.168.30.222"  # Your Milvus server IP
milvus_port = "19530"  # Default port
connections.connect("default", host=milvus_host, port=milvus_port)

# Create the collection if it doesn't exist
collection_name = "abc"
if collection_name not in Collection.list():
    collection = Collection(name=collection_name, schema=schema)
    print(f"Collection '{collection_name}' created.")
else:
    collection = Collection(collection_name)
    print(f"Collection '{collection_name}' already exists.")

# Convert cleaned documents (strings) into Document objects
documents = [Document(page_content=doc) for doc in cleaned_documents]

# Embed documents using Ollama embeddings
document_embeddings = embeddings.embed_documents([doc.page_content for doc in documents])

# Insert documents into Milvus
collection.insert([["text"] * len(documents), document_embeddings])

# Create an index on the vector field
index_params = {"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 100}}
collection.create_index(field_name="vector", index_params=index_params)

# Verify index creation
print("Index crea")


AttributeError: type object 'Collection' has no attribute 'list'