In [1]:
import os
import sys

# Importing document loaders and parsers
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser

# Importing text splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import Language

# Folder that holds the markdown files to be indexed
DIRECTORY_PATH = "/Users/Chetan/PycharmProjects/RAG/Data"

# =============================================================================================================
# Stage 1 - LOAD: read every file matched by the "./*.md" glob under DIRECTORY_PATH
print("-----LOADING------")

# Extra options handed to each TextLoader through `loader_kwargs`;
# `autodetect_encoding` asks TextLoader to detect a file's encoding itself
# (see the TextLoader documentation for the exact fallback behaviour).
text_loader_kwargs = {"autodetect_encoding": True}

# One TextLoader is used per matched file.
directory_loader = DirectoryLoader(
    DIRECTORY_PATH,
    glob="./*.md",
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)

# Read the matched files into Document objects
directory_documents = directory_loader.load()

# =============================================================================================================
# Stage 2 - SPLIT: cut the loaded documents into small, slightly overlapping chunks
print("-----SPLITTING------")

# Sizes are measured with `len`, i.e. in characters: chunks of up to 100
# characters with a 10-character overlap between neighbours.
# `add_start_index=True` asks the splitter to keep track of where each chunk
# starts in its source document (see the splitter documentation).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100, chunk_overlap=10, length_function=len, add_start_index=True
)

# Apply the splitter to everything loaded in the previous stage and report the counts
chunks = text_splitter.split_documents(directory_documents)
print(
    f"Split {len(directory_documents)} documents into {len(chunks)} chunks."
)

# =============================================================================================================
# Stage 3 - INDEX: embed the chunks, store them in Chroma and expose a retriever
print("-----CHROMA DB------")
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings

# Embedding model used to vectorise every chunk
gpt4all_embd = GPT4AllEmbeddings()

# Build the Chroma vector store directly from the chunk list
db = Chroma.from_documents(documents=chunks, embedding=gpt4all_embd)

# Retriever over the store, using maximum marginal relevance (MMR) search;
# "k" is the number of top results returned per query.
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 8})

# Show the retriever's configuration as a quick sanity check
print(retriever)


-----LOADING------
-----SPLITTING------
Split 1 documents into 105 chunks.
-----CHROMA DB------
tags=['Chroma', 'GPT4AllEmbeddings'] vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x123e4c310> search_type='mmr' search_kwargs={'k': 8}
