In [1]:
import torch

# Sanity check: the cell output shows whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [4]:
import os
import unicodedata

import torch
import pandas as pd
from tqdm import tqdm
import fitz  # PyMuPDF

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig,
    Gemma2ForCausalLM
)
from accelerate import Accelerator

# Langchain 관련
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

from peft import PeftModel
import faiss
import pickle


# Ask PyTorch's CUDA caching allocator to use expandable segments.
# NOTE(review): this is set after `import torch`; confirm the allocator still
# picks the setting up at this point (it is presumably read on first CUDA use).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# Release cached GPU memory that is currently unused.
torch.cuda.empty_cache()

In [8]:
def process_pdf(file_path, chunk_size=512, chunk_overlap=32):
    """Extract the text of a PDF and split it into chunked ``Document`` objects.

    Args:
        file_path: Path of the PDF file to read.
        chunk_size: Maximum chunk size handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks, passed to the splitter.

    Returns:
        A list of ``Document`` objects, one per chunk, in splitter order.
        The list is empty when the splitter produces no chunks.
    """
    # Open the PDF as a context manager so the file handle is released even if
    # text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        # Collect the text of every page; a single join avoids repeatedly
        # re-allocating a growing string.
        text = ''.join(page.get_text() for page in doc)

    # Split the full text into chunks.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    # Wrap every chunk in a Document.
    return [Document(page_content=piece) for piece in splitter.split_text(text)]


def create_vector_db(chunks, model_path="intfloat/multilingual-e5-base"):
    """Embed the chunks and build a FAISS vector store from them.

    FAISS is a library for similarity search and clustering over embedding
    vectors; it is used here so that similarity-based retrieval over the
    embedded chunks is fast.

    Args:
        chunks: ``Document`` objects to embed and index.
        model_path: Name or path of the HuggingFace embedding model.

    Returns:
        The FAISS vector store built by ``FAISS.from_documents``.
    """
    # Prefer the GPU, but fall back to the CPU instead of failing on hosts
    # without CUDA.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Embedding model configuration; embeddings are normalized on encode.
    model_kwargs = {'device': device}
    encode_kwargs = {'normalize_embeddings': True}
    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )

    # Build and return the FAISS store.
    db = FAISS.from_documents(chunks, embedding=embeddings)
    return db




def normalize_path(path):
    """Return ``path`` converted to Unicode NFC form."""
    nfc_path = unicodedata.normalize('NFC', path)
    return nfc_path


def process_pdfs_from_dataframe(base_directory):
    """Build, persist and return a FAISS DB and retriever for each PDF in a directory.

    Every file in ``base_directory`` whose name ends with ``.pdf`` is chunked
    with ``process_pdf`` and indexed with ``create_vector_db``. For each PDF
    three artifacts are written into ``base_directory``, all prefixed with the
    PDF title:

    * ``<title>_faiss_db.index`` - the raw FAISS index,
    * ``<title>_index_to_docstore_id.pkl`` - the index-to-docstore-id mapping,
    * ``<title>_docstore.pkl`` - the docstore.

    PDFs that yield no chunks are skipped.

    Args:
        base_directory: Directory that contains the PDF files.

    Returns:
        A dict keyed by PDF title (the NFC-normalized file name without its
        extension). Each value is ``{'db': <FAISS store>, 'retriever': <MMR retriever>}``.
    """
    pdf_databases = {}
    pdf_files = [name for name in os.listdir(base_directory) if name.endswith('.pdf')]

    for file_name in tqdm(pdf_files, desc="Processing PDFs"):
        # Open the file under the exact name os.listdir returned: rewriting the
        # name (Unicode normalization, stripping characters) before opening can
        # point at a path that does not exist on byte-exact filesystems.
        full_path = os.path.normpath(os.path.join(base_directory, file_name))

        # The title is only a dict key / artifact prefix, so it is safe to
        # NFC-normalize it for stable lookups.
        pdf_title = os.path.splitext(normalize_path(file_name))[0]
        print(f"Processing {pdf_title}...")

        # Extract and chunk the PDF text.
        chunks = process_pdf(full_path)

        # Nothing to index for PDFs without any chunks.
        if not chunks:
            print(f"Skipping {pdf_title} due to empty content.")
            continue

        db = create_vector_db(chunks)

        # MMR retriever: fetch 8 candidates, return 3.
        retriever = db.as_retriever(search_type="mmr",
                                    search_kwargs={'k': 3, 'fetch_k': 8})

        # Persist the artifacts next to the PDFs. Joining onto base_directory
        # keeps this correct for nested, absolute or trailing-slash paths.
        artifact_prefix = os.path.join(base_directory, pdf_title)
        faiss.write_index(db.index, artifact_prefix + "_faiss_db.index")

        # Store the mapping objects needed to rebuild the store around the index.
        with open(artifact_prefix + "_index_to_docstore_id.pkl", "wb") as f:
            pickle.dump(db.index_to_docstore_id, f)

        with open(artifact_prefix + "_docstore.pkl", "wb") as f:
            pickle.dump(db.docstore, f)

        # Collect the in-memory objects for the caller.
        pdf_databases[pdf_title] = {
            'db': db,
            'retriever': retriever
        }
    return pdf_databases

In [9]:
# Base directory that holds the source PDFs; adjust to your environment.
data_directory = './data'
# Build a vector DB and retriever for every PDF found in that directory.
pdf_databases = process_pdfs_from_dataframe(data_directory)

Processing PDFs:   0%|          | 0/1 [00:00<?, ?it/s]

Processing 외국인근로자의 고용 등에 관한 법률(법률)(제18929호)(20221211)...


Processing PDFs: 100%|██████████| 1/1 [00:06<00:00,  6.09s/it]
