In [1]:
from langchain_community.document_loaders import CSVLoader,PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain.vectorstores import FAISS
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
import os
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm.autonotebook import tqdm, trange

  from tqdm.autonotebook import tqdm, trange


# Initialize

In [2]:
model_path = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name = model_path,      
    model_kwargs = model_kwargs,  
    encode_kwargs = encode_kwargs 
)

In [3]:
# Step 2: Create an empty FAISS index
dimension = 384  # Match the dimension of your embedding model
empty_index = faiss.IndexFlatL2(dimension)
docstore = InMemoryDocstore({})
faiss_db = FAISS(embedding_function=embeddings, index=empty_index,index_to_docstore_id={},docstore=docstore)

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=750,
    length_function=len,
    is_separator_regex=False,)

# Get file path's

In [5]:
folder = './files/'
files = os.listdir(folder)

In [6]:
csv_count = 0
pdf_count = 0
csv_args={'delimiter': ','}
for file in files:

    ##### CSV Loader
    if file.endswith('.csv'):
        document = CSVLoader(file_path=folder + file,csv_args=csv_args,encoding='utf-8').load()
        splitted_text = text_splitter.split_documents(document)
        faiss_db.add_documents(splitted_text)
        csv_count += 1
        print(folder + file)
    

    ######PDF Loader
    elif file.endswith('.pdf'):
        document = PyPDFLoader(folder + file).load()
        try:
            splitted_text = text_splitter.split_documents(document)
            faiss_db.add_documents(splitted_text)
            pdf_count += 1
            print(folder + file)
        except:
            print(f"empty pdf: {folder + file}")

print(f"csv count: {csv_count} | pdf count: {pdf_count}")

./files/ODTUAkademikDurustluk-Kilavuzu-7.3.2016.son_.pdf
./files/7417_sayili_kanun_ile_2547_sayili_kanuna_eklenen_gecici_madde_83_uygulama_ilkeleri.pdf
./files/1.5.2547.pdf
empty pdf: ./files/7143_Sayili_Kanunun_Uygulama_ilkeleri.pdf
./files/suny_burs_yonergesi.pdf
./files/metu_programs_tr.csv
./files/reg_final.csv
./files/metudata.csv
./files/metu1_tr.csv
./files/metu_registration_tr.csv
./files/2016-2024-odtu-kazanimlari.pdf
./files/ODTU_20Sinav_20Kurallari-Kilavuz-7.4.2016.son_.pdf
./files/metu_int_registration_tr.csv
csv count: 6 | pdf count: 6


In [7]:
faiss_db.save_local("faiss_index")