In [1]:
import os
import sys

# Importing document loaders and parsers
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser

# Importing text splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import Language

# Folder that holds the Markdown sources feeding the pipeline
DIRECTORY_PATH = "/Users/Chetan/PycharmProjects/RAG/Data"

# -------------------------------------------------------------------------------------------------------------
# Stage 1: read the files under DIRECTORY_PATH that match the glob below
print("-----LOADING------")

# Options handed to every TextLoader through `loader_kwargs`
text_loader_kwargs = {"autodetect_encoding": True}

# One TextLoader is used per file matched by "./*.md"
directory_loader = DirectoryLoader(
    DIRECTORY_PATH,
    glob="./*.md",
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)

# Run the loader; the result is consumed by the splitting stage
directory_documents = directory_loader.load()

# -------------------------------------------------------------------------------------------------------------
# Stage 2: cut the loaded documents into small chunks
print("-----SPLITTING------")

# Splitter configuration, kept in one place so the values are easy to tune:
# chunk length is measured with len(), 100 per chunk with an overlap of 10.
splitter_settings = {
    "chunk_size": 100,
    "chunk_overlap": 10,
    "length_function": len,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Apply the splitter to everything loaded in the previous stage and report
# how many chunks came out of how many documents
chunks = text_splitter.split_documents(directory_documents)
print(f"Split {len(directory_documents)} documents into {len(chunks)} chunks.")

# -------------------------------------------------------------------------------------------------------------
# Stage 3: embed the chunks, store them in Chroma, and expose a retriever
print("-----CHROMA DB------")
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings

# Embedding model used for the chunks (constructed with its defaults)
gpt4all_embd = GPT4AllEmbeddings()

# Build the vector store directly from the chunk list
db = Chroma.from_documents(chunks, gpt4all_embd)

# Retriever over the store: "mmr" search type with k=8 in the search kwargs
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 8})

# Importing necessary modules for building retrieval chains and chat bot
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# -------------------------------------------------------------------------------------------------------------
# Stage 4: combine the retriever with a chat model into a question-answering chain
print("-----CHAT BOT------")
from langchain_community.chat_models.ollama import ChatOllama

# Chat model served through Ollama; "mistral" is the model name requested
llm = ChatOllama(model="mistral")

# System instructions for answering; {context} is the placeholder the
# documents chain fills in, {input} (below) carries the user's question
system_template = """
Answer the user's questions based on the below context.
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Use three sentences maximum and keep the answer as concise as possible:

{context}
"""

# Prompt used to ANSWER the question (system instructions + user input)
prompt_messages = [("system", system_template), ("user", "{input}")]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

# The documents chain pairs the prompt with the LLM; the retrieval chain
# puts the retriever in front of it
document_chain = create_stuff_documents_chain(llm, prompt)
qa_chain = create_retrieval_chain(retriever, document_chain)

# Example query: keep only the "answer" field of the chain output. The call is
# left as the final bare expression so a notebook cell displays its value.
answer_only_chain = qa_chain.pick("answer")
answer_only_chain.invoke({"input": "who is the Professor?"})


-----LOADING------
-----SPLITTING------
Split 1 documents into 105 chunks.
-----CHROMA DB------
-----CHAT BOT------


' Professor Mohammad Q. Azhar is a computer information systems professor at CUNY BMCC and an expert in Educational Robotics, having led numerous workshops. He has also co-authored research on AI and mental health during the COVID-19 pandemic.'