In [7]:
import os
import glob
from langchain.memory import ConversationBufferWindowMemory
from langchain.llms.bedrock import Bedrock
from langchain.embeddings import BedrockEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import UnstructuredXMLLoader
from langchain.document_loaders import TextLoader

In [8]:
# Directory that is scanned recursively for source documents; the name of the
# on-disk FAISS index is also derived from this path.
documents_path='../datasets/UNv1.0-TEI/en/2010/a/hrc/14'

In [9]:
def generate_index_path(documents_path):
    """Derive a local FAISS index directory name from a documents path.

    Leading relative-path markers ('./' and '../', repeated any number of
    times) and leading '/' separators are dropped, every remaining '/' is
    replaced with '_', and the result is prefixed with 'faiss_index_'.

    Args:
        documents_path: Path of the documents directory, using '/' separators.

    Returns:
        The index path as a string, e.g.
        '../data/docs' -> 'faiss_index_data_docs'.
    """
    # str.lstrip() takes a *set of characters*, not a prefix, so
    # lstrip('./') would also eat the leading dot of a hidden directory
    # ('.cache/docs' -> 'cache/docs') and make two different directories
    # share one index name. Strip whole prefixes instead.
    leading_markers = ('../', './', '/')
    clean_path = documents_path
    while True:
        marker = next((m for m in leading_markers if clean_path.startswith(m)), None)
        if marker is None:
            break
        clean_path = clean_path[len(marker):]

    # A path that is nothing but a relative marker carries no name of its own.
    if clean_path in ('.', '..'):
        clean_path = ''

    # Replace '/' with '_' so the result is a single flat directory name.
    index_path_suffix = clean_path.replace('/', '_')
    return f'faiss_index_{index_path_suffix}'


## Embeddings model

In [10]:
# Embeddings client handed to FAISS both when the index is first built and
# when a previously saved index is reloaded from disk.
embeddings_client = BedrockEmbeddings(
    region_name='us-east-1'  # region the Bedrock client is configured for
)

## Generate indexes

In [11]:
# Build the FAISS index on the first run and cache it on disk; later runs
# reload the saved index instead of re-reading and re-embedding every file.
index_path = generate_index_path(documents_path)

# Lower-case file extension -> loader class used to read that file type.
loader_by_extension = {
    '.pdf': PyPDFLoader,
    '.docx': Docx2txtLoader,
    # NOTE(review): Docx2txtLoader targets .docx; legacy .doc files will
    # presumably fail to load and be reported as read errors - confirm.
    '.doc': Docx2txtLoader,
    '.txt': TextLoader,
    # XML files are read as raw text (markup included), not parsed.
    '.xml': TextLoader,
}

if not os.path.exists(index_path):
    documents = []
    failed_paths = []
    for file_path in glob.glob(os.path.join(documents_path, '**'), recursive=True):
        print(file_path)
        # Compare extensions case-insensitively so that e.g. 'REPORT.PDF'
        # is not silently skipped.
        extension = os.path.splitext(file_path)[1].lower()
        loader_class = loader_by_extension.get(extension)
        if loader_class is None:
            # Directories and unsupported file types are ignored.
            continue
        try:
            documents.extend(loader_class(file_path).load())
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole build.
            print(f"Error reading {file_path}: {e}")
            failed_paths.append(file_path)

    if failed_paths:
        print(f"{len(failed_paths)} file(s) could not be read and were skipped")

    text_splitter = RecursiveCharacterTextSplitter(  # create a text splitter
        separators=["\n\n", "\n", ".", " "],  # split chunks at (1) paragraph, (2) line, (3) sentence, or (4) word, in that order
        chunk_size=1000,  # divide into 1000-character chunks using the separators above
        chunk_overlap=100,  # number of characters that can overlap with previous chunk
    )
    split_documents = text_splitter.split_documents(documents)
    print(len(split_documents))

    # Fail with a clear message instead of letting an empty chunk list
    # surface as an opaque error inside FAISS.from_documents.
    if not split_documents:
        raise ValueError(
            f"No document content could be loaded from {documents_path!r}; "
            "cannot build a FAISS index"
        )

    index = FAISS.from_documents(split_documents, embeddings_client)
    index.save_local(index_path)
else:
    # SECURITY NOTE: a saved FAISS index includes a pickled docstore, so
    # load_local() unpickles data from disk. Only load index directories
    # that were created by this notebook or come from a trusted source.
    index = FAISS.load_local(index_path, embeddings_client)

print('done!')

../datasets/UNv1.0-TEI/en/2010/a/hrc/14/
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/32
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/32/add_3.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/32/add_2.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/32/add_5.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/32/add_4.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/35
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/35/add_1.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/ni
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/ni/8.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/ni/9.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/ni/11.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/ni/10.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/ni/4.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/ni/5.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/ni/7.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/ni/6.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/ni/2.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/ni/3.xml
../datasets/UNv1.0-TEI/en/2010/a/hrc/14/ni/1.xml
../datasets/UNv1.0-TEI/e