# (RAG) - Vector Database

## Import Library


In [24]:
import pandas as pd
import requests
from io import BytesIO
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from pymongo import MongoClient
from langchain.vectorstores import MongoDBAtlasVectorSearch
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import CSVLoader

## Loading env dan Data

In [71]:
# Load Environment Variables
load_dotenv(override=True)
MONGODB_URI = os.getenv("MONGO_URI")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [72]:
loader = CSVLoader(
    file_path='../data/imigrasi_combined_topics_full_context.csv',
    content_columns=["full_context"], 
    metadata_columns=["question", "answer", "answer_link", "topic", "category", "type"],
    csv_args={'delimiter': ','}
)
documents = loader.load()

insight:
- Memuat file CSV menggunakan CSVLoader

- Kolom "Answer" sebagai konten utama

- Kolom "Question" sebagai metadata

- Menggunakan delimiter koma

## Splitting (chunking)

In [73]:
# Optimized Text Splitting
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, 
    chunk_overlap=100, 
    separators=["\n\n", "\n", "(?<=\. )", " "],
    length_function=len
)
chunks = text_splitter.split_documents(documents)

## Embedding

In [74]:
# Initialize Embeddings dengan model terbaru
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=OPENAI_API_KEY,
    dimensions=1536
)

Insight:
- Menggunakan model embedding terbaru OpenAI (text-embedding-3-small)

- Menghasilkan vektor 1536 dimensi untuk representasi semantik

- Model yang cost-effective dengan performa tinggi

## Load to MongoDB

In [75]:
# MongoDB Connection
client = MongoClient(MONGODB_URI)
collection = client['instant_bot']['instant']

inisight:
- Membuat koneksi ke cluster MongoDB Atlas

- Mengakses database Astrax_db dan koleksi Astrax

In [76]:
# remove existing collection
collection.delete_many({})

# Vector Store Setup
vector_store = MongoDBAtlasVectorSearch.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection=collection,
    index_name="vector_index",
)

print("Data berhasil diunggah ke MongoDB Atlas!")
print(f"Jumlah dokumen: {len(documents)}")
print(f"Jumlah chunks: {len(chunks)}")

Data berhasil diunggah ke MongoDB Atlas!
Jumlah dokumen: 1579
Jumlah chunks: 3285


In [None]:
# retriever = vector_store.as_retriever()
# retriever.get_relevant_documents("Tidak bisa login M-Paspor")

# for i, chunk in enumerate(chunks[:3]):
#     print(f"Chunk {i}:")
#     print("Page content:", chunk.page_content[:200])
#     print("Metadata:", chunk.metadata)
#     print("---")

[]