# Importing the Dependencies and Setting the Variables

In [4]:
import os
from git import Repo
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chains import ConversationalRetrievalChain
from dotenv import load_dotenv

In [6]:
# Load variables declared in a .env file into the process environment
load_dotenv()

# Look the key up once; the value is None when the variable is not set
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


# Cloning the Repositories for testing the code

In [13]:
# Create the checkout directory idempotently: unlike the `%mkdir` shell magic,
# this does not fail when the directory already exists, so the cell survives
# a Restart & Run All.
os.makedirs("test_repo", exist_ok=True)

In [14]:
# Clone the sample project only when no checkout exists yet. Cloning into a
# non-empty destination fails, which would break a re-run of this cell.
TEST_REPO_URL = "https://github.com/d-pamneja/Grocery_Store_Application_V2"
TEST_REPO_PATH = "./test_repo/"

if os.path.isdir(os.path.join(TEST_REPO_PATH, ".git")):
    test_repo = Repo(TEST_REPO_PATH)  # reuse the existing checkout
else:
    test_repo = Repo.clone_from(TEST_REPO_URL, to_path=TEST_REPO_PATH)

test_repo  # display the repository handle, as the original cell did

<git.repo.base.Repo '/Users/dhruv/Desktop/Machine_Learning/Projects/Source_Code_Analysis_Application/research/test_repo/.git'>

# Loading the Test Repository

In [17]:
# For this first pass we only look at the API package of the cloned test repo.
# The parser is configured for Python with a parser_threshold of 500.
python_parser = LanguageParser(language=Language.PYTHON, parser_threshold=500)

loader = GenericLoader.from_filesystem(
    "./test_repo/api",
    glob="**/*",
    suffixes=[".py"],  # only Python files are loaded; remove to accept other file types
    parser=python_parser,
)

In [19]:
# Read and parse every matching file into LangChain documents
documents = loader.load()

# Fail early if nothing was loaded (e.g. the clone is missing or the path is
# wrong) instead of carrying an empty list into chunking and embedding.
assert documents, "No Python files were loaded from ./test_repo/api"

# Preview the first five documents
documents[:5]

[Document(page_content='def extract_role(user): #A function to extract the role of our user\n    roles = [role.name for role in user.roles]\n    return roles[0] if roles else None', metadata={'source': 'test_repo/api/resource.py', 'content_type': 'functions_classes', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content='def parse_date(date_string):\n    if not date_string:\n        return None\n    \n    try:\n        return datetime.strptime(date_string, \'%Y-%m-%d\').date()\n    except ValueError:\n        raise ValueError("Invalid date format. Expected YYYY-MM-DD.")', metadata={'source': 'test_repo/api/resource.py', 'content_type': 'functions_classes', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content="class DateToStringField(fields.Raw):\n    def format(self, value):\n        return value.strftime('%Y-%m-%d') if value else None", metadata={'source': 'test_repo/api/resource.py', 'content_type': 'functions_classes', 'language': <Language.PYTHON: 'python'>

As we can see, the API source file has been loaded, and its functions and classes appear as separate entries in the `documents` list.

# Chunking the Documents via Context-Aware Splitting

In [20]:
# Python-aware splitter configured with a chunk size of 2000 and an overlap
# of 200 between neighbouring chunks.
chunking_options = {"chunk_size": 2000, "chunk_overlap": 200}
document_splitter = RecursiveCharacterTextSplitter.from_language(
    Language.PYTHON,
    **chunking_options,
)

In [21]:
# Break the loaded documents into chunks using the Python-aware splitter
texts = document_splitter.split_documents(documents)

In [24]:
# Report how many chunks the splitter produced
chunk_count = len(texts)
print(f"The length of the chunks is : {chunk_count}")

The length of the chunks is : 45


As we can see, for the time being we are working with a small repository, and its documents have been split into smaller chunks. The first five chunks are shown below.

In [25]:
# Preview the first five chunks produced by the splitter
texts[:5]

[Document(page_content='def extract_role(user): #A function to extract the role of our user\n    roles = [role.name for role in user.roles]\n    return roles[0] if roles else None', metadata={'source': 'test_repo/api/resource.py', 'content_type': 'functions_classes', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content='def parse_date(date_string):\n    if not date_string:\n        return None\n    \n    try:\n        return datetime.strptime(date_string, \'%Y-%m-%d\').date()\n    except ValueError:\n        raise ValueError("Invalid date format. Expected YYYY-MM-DD.")', metadata={'source': 'test_repo/api/resource.py', 'content_type': 'functions_classes', 'language': <Language.PYTHON: 'python'>}),
 Document(page_content="class DateToStringField(fields.Raw):\n    def format(self, value):\n        return value.strftime('%Y-%m-%d') if value else None", metadata={'source': 'test_repo/api/resource.py', 'content_type': 'functions_classes', 'language': <Language.PYTHON: 'python'>

# Embedding the Documents via OpenAIEmbedding

In [26]:
# `disallowed_special=()` passes an empty collection of disallowed special
# tokens. Presumably this lets text that looks like a special token (which can
# occur in source code) be tokenised as ordinary text instead of raising an
# error - TODO confirm against the tiktoken `encode` documentation.
embeddings = OpenAIEmbeddings(disallowed_special=())

# Creating the Knowledge Base or Vector DB using ChromaDB

In [27]:
# Embed every chunk and store the vectors in a Chroma collection under ./data.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# existing ./data store again - confirm, and clear the directory if so.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./data",
)

# Write the collection to the persist directory
vectordb.persist()

# Creating the Wrapper for the LLM

In [31]:
# Chat model shared by the conversation memory and the retrieval chain below
chat_model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model=chat_model_name)

It is also important that our LLM remembers the previous queries and responses, so we will pair it with a conversation memory.

In [32]:
# Summary-based conversation memory: it is given the chat model, stores the
# history under the "chat_history" key and returns it as messages.
memory_options = {"memory_key": "chat_history", "return_messages": True}
memory = ConversationSummaryMemory(llm=llm, **memory_options)

We also need to combine the LLM, the retriever and the memory into a conversational retrieval chain, using `ConversationalRetrievalChain.from_llm`.

In [33]:
# Retriever over the vector store. "mmr" is Maximal Marginal Relevance, which
# returns the most relevant results while removing redundant ones; k=3 keeps
# the top 3 results.
code_retriever = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 3},
)

# Combine the chat model, the retriever and the conversation memory into one chain
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=code_retriever,
    memory=memory,
)

# Q&A Chain

In [None]:
# Sample question for the Q&A chain; the class it names is presumably defined
# in the test repository's API code - verify against the repo.
question = "what is the EditCartItemResource class?"