In [None]:
import os
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
import traceback
from dotenv import dotenv_values
# Read the key/value pairs from the local ".env" file; `config` is used below
# as the source of the Ollama base URL.
config = dotenv_values(".env")
# Echo the configured Ollama endpoint. Indexing with [] means a missing
# IP_OLLAMA entry fails here with KeyError rather than later during embedding.
print(config['IP_OLLAMA'])


def count_unique_extensions(directory):
    """Collect the distinct file extensions found beneath ``directory``.

    The tree is walked recursively with ``os.walk``. Each extension is
    reported exactly as ``os.path.splitext`` yields it (leading dot included,
    case preserved); files without an extension are skipped.

    Despite the name, the return value is the set of extensions itself, not
    a count.
    """
    suffixes = (
        os.path.splitext(filename)[1]
        for _root, _dirs, filenames in os.walk(directory)
        for filename in filenames
    )
    # An empty suffix means the file had no extension at all.
    return {suffix for suffix in suffixes if suffix}

def vectoring_process(directory, output_dir="vectoring/embeddings/"):
    """Build a FAISS index from the documents under ``directory`` and save it.

    For every distinct file extension found under ``directory`` a
    ``DirectoryLoader`` is created with a recursive glob for that extension,
    and the documents it loads are accumulated. The documents are then split
    into chunks (200 characters, 50 overlap), embedded with the Ollama
    ``bge-m3`` model served at ``config['IP_OLLAMA']``, indexed with FAISS and
    written to ``output_dir``.

    Args:
        directory: Root folder that is scanned recursively for input files.
        output_dir: Folder the FAISS index is saved to. Defaults to the
            previously hard-coded ``"vectoring/embeddings/"``.

    Returns:
        True when the index was built and saved; False when nothing could be
        loaded or any step raised (the traceback is printed, not re-raised).
    """
    try:
        # Sorted so the load order does not depend on set iteration order.
        extensions = sorted(count_unique_extensions(directory))
        all_docs = []
        for ext in extensions:
            # NOTE(review): every extension is handed to PyMuPDFLoader, not
            # only ".pdf" - confirm that non-PDF inputs are intended here.
            loader = DirectoryLoader(
                directory,
                glob=f"**/*{ext}",
                show_progress=True,
                loader_cls=PyMuPDFLoader,
            )
            # Load inside the loop: previously this ran once after the loop,
            # so only the last extension's files were ever ingested (and an
            # extension-less directory hit an unbound `loader`).
            all_docs.extend(loader.load())

        if not all_docs:
            # Fail explicitly instead of letting the index build blow up on
            # an empty document list.
            print("err ", f"no documents could be loaded from {directory}")
            return False

        embeddings = OllamaEmbeddings(model="bge-m3", base_url=config['IP_OLLAMA'])
        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n\n", "]", "\n", ";"],
            chunk_size=200,
            chunk_overlap=50,
            length_function=len,
            is_separator_regex=False,
        )
        print("all docs", all_docs)
        split_docs = text_splitter.split_documents(all_docs)
        vector_store = FAISS.from_documents(split_docs, embeddings)
        vector_store.save_local(output_dir)
        return True
    except Exception:
        # Top-level boundary of the pipeline: report the failure and signal
        # it through the boolean return value, as callers already expect.
        print("err ", traceback.format_exc())
        return False

# Run the pipeline on the sample files. The absolute path is hard-coded for
# the author's machine; the boolean result is shown as the cell's output.
vectoring_process("/Users/birutekno/Documents/Project/soca-ml-app/materi-gdg/vectoring/sample_files/")

http://192.168.100.69:11434


100%|██████████| 1/1 [00:00<00:00, 42.71it/s]


all docs [Document(metadata={'source': '/Users/birutekno/Documents/Project/soca-ml-app/materi-gdg/vectoring/sample_files/nama_mentri.pdf', 'file_path': '/Users/birutekno/Documents/Project/soca-ml-app/materi-gdg/vectoring/sample_files/nama_mentri.pdf', 'page': 0, 'total_pages': 5, 'format': 'PDF 1.7', 'title': '', 'author': 'birutekno', 'subject': '', 'keywords': '', 'creator': 'WPS Writer', 'producer': 'macOS Version 12.6 (Build 21G115) Quartz PDFContext', 'creationDate': "D:20241121130539+06'05'", 'modDate': "D:20241121130539+06'05'", 'trapped': ''}, page_content='1.\nBudi Gunawan, Menteri Koordinator Bidang Politik dan\nKeamanan;\n2.\nYusril Ihza Mahendra, Menteri Koordinator Bidang\nHukum, Hak Asasi Manusia, Imigrasi, dan Pemasyarakatan;\n3.\nAirlangga\nHartarto,\nMenteri\nKoordinator\nBidang\nPerekonomian;\n4.\nPratikno, Menteri Koordinator Bidang Pembangunan\nManusia dan Kebudayaan;\n5.\nAgus\nHarimurti\nYudhoyono,\nMenteri\nKoordinator\nBidang Infrastruktur dan Pembangunan Kewila

True