In [7]:
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, AgentType
from langchain.tools import tool
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.chains import RetrievalQA
from pathlib import Path
import gradio as gr
from langchain.tools import Tool
import os

In [8]:
class ProjectRootDetector:
    def __init__(self):
        self.root_indicators = [
            '.git',        
            'package.json',  
            'requirements.txt', 
            'pyproject.toml', 
            'Cargo.toml',     
            'go.mod',       
            'pom.xml',       
            'build.gradle',  
            'Dockerfile',  
            'docker-compose.yml',
            'README.md',
            'setup.py',
            'CMakeLists.txt'
        ]
    
    def find_root(self, use_file_path_as_root=False, root_path=None):
        """Find project root starting from any file path"""
        if root_path: 
            return root_path

        if use_file_path_as_root:
            return str(Path.cwd())
            
        current_path = Path.cwd()
        
        # Walk up the directory tree
        while current_path != current_path.parent:
            for indicator in self.root_indicators:
                if (current_path / indicator).exists():
                    return str(current_path)
            current_path = current_path.parent
       
        return str(Path.cwd())

In [None]:
class RepoIntelligentAssistant:
    system_prompt = """
    You are an intelligent software engineer and codebase assistant with deep expertise in all programming domains (software engineering, data science, web development, etc.).

    BEHAVIOR GUIDELINES:

    1. QUESTION EVALUATION:
    - First determine if the question requires repository context or general coding knowledge
    - For repo-specific questions: Use tools to fetch relevant code/files
    - For general coding questions: Use your expertise to answer directly

    2. CODE EXPLANATION:
    - When explaining code: Break it down line-by-line
    - Show the actual code and explain what each line does
    - Focus on the specific implementation in the repository
    - Explain the purpose and context within the codebase

    3. CODE IMPROVEMENTS:
    - When suggesting improvements: Analyze the repository structure first
    - If suggesting new functions/classes: Write complete, working code
    - Base suggestions on the existing codebase patterns and architecture
    - Show the user the complete code implementation

    4. CODE MODIFICATION:
    - NEVER modify files directly
    - Always show the user the updated code
    - Let the user decide whether to implement changes
    - Provide clear instructions on where/how to apply changes

    5. RESPONSE SEQUENCE:
    a) Evaluate question type (repo-specific vs general)
    b) If repo-specific: Fetch relevant context using tools
    c) Provide detailed explanation with code examples
    d) If suggesting improvements: Show complete code implementation
    e) Always explain the reasoning behind suggestions

    Your tone: Precise, technical, and helpful - like a senior developer mentoring a colleague.
    """

    MODEL = "gpt-4o-mini"

    def __init__(self, use_file_path_as_root = False, root_path=None):
        self.llm = ChatOpenAI(
            temperature=0.7, 
            callbacks=[StreamingStdOutCallbackHandler()],
            model_name=self.MODEL
        )
        self.root_folder = ProjectRootDetector().find_root(root_path=root_path, use_file_path_as_root=use_file_path_as_root)
        self.agent = None  # Initialize agent as None

       
    def get_file(self, file_name: str):
        matches = []
        for root, dirs, files in os.walk(self.root_folder):
            print(files, flush=True)
            for f in files:
                if f == file_name:
                    matches.append(os.path.join(root, f))

        if not matches:
            return f"No file named '{file_name}' found in the repository."

        if len(matches) > 1:
            listed = "\n".join(matches)
            return (
                f"There are multiple files named '{file_name}'. "
                f"Please specify the full path.\n\n{listed}"
            )

        file_path = matches[0]
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read()
            return f"Content of '{file_name}':\n\n{content}"
        except Exception as e:
            return f"Error reading file: {e}"
        
    
    def _get_all_folders_recursive(self, root_path):
        """Get all folders recursively from root path"""
        all_folders = []
        restricted_extensions = {
            '.exe', '.dll', '.so', '.dylib', '.bin', '.app', '.deb', '.rpm',
            '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz',
            '.pyc', '.pyo', '.pyd', '.class', '.jar', '.war',
            '.db', '.sqlite', '.sqlite3', '.mdb', '.accdb'
        }

        skip_folders = {
            '.git', '.svn', '.hg', '.bzr', 'node_modules', 'bower_components', 'vendor', '.venv',
            '__pycache__', '.pytest_cache', '.coverage', 'venv', '.venv', 'env', '.env',
            'dist', 'build', 'target', 'out', 'bin', 'obj', '.vscode', '.idea', '.vs', '.eclipse',
            '.DS_Store', 'Thumbs.db', '.Trash', 'temp', 'tmp', 'cache', 'logs', '.cache', '.tmp',
            'packages', 'lib', 'libs', 'dependencies', 'generated', 'auto-generated', '.generated',
            'backup', 'backups', '.backup', 'coverage', '.nyc_output', 'htmlcov','docs/_build', 'site', '_site'
        }
        
        for root, dirs, files in os.walk(root_path):
            dirs[:] = [d for d in dirs if d not in skip_folders]
            
            root_name = os.path.basename(root)
            if root_name in skip_folders:
                continue
                
            has_restricted = any(os.path.splitext(file)[1].lower() in restricted_extensions for file in files)
            if has_restricted:
                continue

            all_folders.append(root)
                
        return all_folders

    def create_doc(self):
        folders = self._get_all_folders_recursive(self.root_folder)
        documents = []
        for folder in folders:
            doc_type = os.path.basename(folder)
            print(doc_type)
            loader = DirectoryLoader(
                folder, 
                glob=["**/*.py", "**/*.ipynb", "**/*.md", "**/*.txt", "**/*.json", "**/*.csv", "**/*.html", "**/*.css", "**/*.js", "**/*.ts", "**/*.java", "**/*.c", "**/*.cpp", "**/*.go", "**/*.rust", "**/*.scala", "**/*.swift", "**/*.kotlin"], 
                loader_cls=TextLoader, loader_kwargs={"encoding": "utf-8"}
            )
            folder_docs = loader.load()
            for doc in folder_docs:
                doc.metadata["doc_type"] = doc_type
                documents.append(doc)
        return documents
    
    def create_retriever(self):
        embeddings =  OpenAIEmbeddings()

        documents = self.create_doc()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = text_splitter.split_documents(documents)

        vectorstore = FAISS.from_documents(chunks, embeddings)

        return vectorstore.as_retriever(k=30)

    def create_agent(self):
        retriever = self.create_retriever()
        rag_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            retriever=retriever,
            chain_type="stuff"
        )

        # Wrap RAG chain as a Tool
        rag_tool = Tool(
            name="repo_context_search",
            func=lambda q: rag_chain.run(q),
            description="Search the repository for specific code, functions, classes, or implementation details. Use this when the user asks about something specific in the codebase like 'what does this function do', 'explain this code', or 'how is X implemented'."
        )

        @tool("get_file", return_direct=False)
        def get_file(x):
            """
            Fetch the content of a specific file by name to analyze its code implementation.
            Use when user asks to 'explain this file', 'show me the code for X', or 'what's in this file'.
            If multiple files share the same name, list their paths instead.
            """
            return self.get_file(x)

        tools = [rag_tool, get_file]
        # Alternative: Use system message in memory
        memory = ConversationBufferMemory(
            memory_key="chat_history", 
            input_key="input", 
            return_messages=True,
            system_message=self.system_prompt  # Add system message here
        )

        agent = initialize_agent(
            tools,
            self.llm,
            agent_type=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
            handle_parsing_errors=True,
            memory=memory,
            verbose=True
            # Remove agent_kwargs since we're using memory system_message
        )

        return agent

    def run(self, question, history=None):
        try:
            # Create agent only once
            if not self.agent:
                self.agent = self.create_agent()
            
            response = self.agent.stream(question)
            for chunk in response:
                if chunk.get("output"):
                    yield chunk["output"]
        except Exception as e:
            yield f"Error: {str(e)}"

            

In [10]:
G = RepoIntelligentAssistant(root_path="C:/Users/hp/projects/gen-ai/llm_engineering/week5/community-contributions/tourist-guide/")

In [None]:
interface = gr.ChatInterface(G.run, type="messages")
interface.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




  embeddings =  OpenAIEmbeddings()





  memory = ConversationBufferMemory(
  agent = initialize_agent(




[1m> Entering new None chain...[0m
[32;1m[1;3mI need to search the repository for the specific implementation of the `load_knowledge_base` function to explain its functionality line by line.  
Action: repo_context_search  
Action Input: "load_knowledge_base"  [0m

  func=lambda q: rag_chain.run(q),



Observation: [36;1m[1;3mI don't know.[0m
Thought:[32;1m[1;3mIt seems that my initial search for the `load_knowledge_base` function did not return any results. I'll try to broaden my search to find any definitions or references to `load_knowledge_base` in the repository.  
Action: repo_context_search  
Action Input: "def load_knowledge_base"  [0m
Observation: [36;1m[1;3mI don't know.[0m
Thought:[32;1m[1;3mIt appears that I'm unable to locate the `load_knowledge_base` function in the repository. It's possible that the function might be named differently or located in a different part of the codebase than I expected. I will try searching for just "load" in the hopes of finding related functions or any relevant implementations that could help identify the correct function.  
Action: repo_context_search  
Action Input: "load"  [0m
Observation: [36;1m[1;3mI don't know what you mean by "load." Could you please provide more context or clarify your question?[0m
Thought:[32;1m[

  self.memory.save_context(inputs, outputs)




[1m> Entering new None chain...[0m
[32;1m[1;3mTo provide an explanation, I need to know what specific code or concept I previously explained. However, since I don't have that context, I will need to look for any recent explanations in the repository. I'll perform a search to identify any recent discussions or explanations that might relate to the current question.

Action: repo_context_search
Action Input: "explain"  # I will search for explanations in the repository
[0m
Observation: [36;1m[1;3mI'm sorry, but I don't know what you're referring to with "I will search for explanations in the repository." Could you please provide more context or clarify your question?[0m
Thought:[32;1m[1;3mI need to clarify the user's request to provide a meaningful explanation. Since the initial question does not specify which code or concept to explain, I should ask the user for more details about what they would like me to explain. 

Final Answer: Could you please specify which code or conc

  self.memory.save_context(inputs, outputs)




[1m> Entering new None chain...[0m
[32;1m[1;3mI need to acknowledge the user's name. 
Action: None[0m
Observation: Invalid Format: Missing 'Action Input:' after 'Action:'
Thought:[32;1m[1;3mI need to acknowledge the user's name. However, I cannot perform any actions based on the provided statement since it's not a query about the codebase. 

Final Answer: Nice to meet you, Ed! How can I assist you today?[0m

[1m> Finished chain.[0m


  self.memory.save_context(inputs, outputs)




[1m> Entering new None chain...[0m
[32;1m[1;3mI don't have any information about your name or identity. 
Action: None[0m
Observation: Invalid Format: Missing 'Action Input:' after 'Action:'
Thought:[32;1m[1;3mI don't have any information about your name or identity. 
Action: None[0m
Observation: Invalid Format: Missing 'Action Input:' after 'Action:'
Thought:[32;1m[1;3mI don't have any information about your name or identity. 
Action: None[0m
Observation: Invalid Format: Missing 'Action Input:' after 'Action:'
Thought:[32;1m[1;3mI don't have any information about your name or identity. 
Action: None[0m
Observation: Invalid Format: Missing 'Action Input:' after 'Action:'
Thought:[32;1m[1;3mI don't have any information about your name or identity. 
Action: None[0m
Observation: Invalid Format: Missing 'Action Input:' after 'Action:'
Thought:[32;1m[1;3mI don't have any information about your name or identity. 
Action: None[0m
Observation: Invalid Format: Missing 'Acti

Stopping agent prematurely due to triggering stop condition


[32;1m[1;3mI don't have any information about your name or identity. 
Action: None[0m
Observation: Invalid Format: Missing 'Action Input:' after 'Action:'
Thought:[32;1m[1;3m[0m

[1m> Finished chain.[0m


  self.memory.save_context(inputs, outputs)




[1m> Entering new None chain...[0m
[32;1m[1;3mTo explain the `read_pdf` function and its implementation line by line, I first need to locate this specific function within the codebase. I will search for `read_pdf` in the repository to find its definition and details.

Action: repo_context_search  
Action Input: read_pdf  [0m
Observation: [36;1m[1;3mThe `read_pdf` function is designed to read a PDF file and extract its text content. It uses the `PdfReader` from the `pypdf` library to iterate through each page of the PDF and concatenate the extracted text into a single string. If you have a specific PDF file you want to read, please provide the file path, and I can guide you further.[0m
Thought:[32;1m[1;3mI have identified the purpose of the `read_pdf` function and its use of the `PdfReader` from the `pypdf` library to extract text from a PDF file. However, to provide a line-by-line explanation, I need to retrieve the actual code for the `read_pdf` function from the repositor

  self.memory.save_context(inputs, outputs)




[1m> Entering new None chain...[0m
[32;1m[1;3mI need to search the repository for the `search_attractions` function to understand its implementation and then explain each line of the code. 

Action: repo_context_search  
Action Input: "search_attractions"  [0m
Observation: [36;1m[1;3mI don't know.[0m
Thought:[32;1m[1;3mIt seems I couldn't locate the `search_attractions` function directly. I will try to search for it again with a more general query to see if I can find any relevant information about it. 

Action: repo_context_search  
Action Input: "def search_attractions"  [0m
Observation: [36;1m[1;3mI'm sorry, but I don't know the answer to that.[0m
Thought:[32;1m[1;3mIt appears that I'm unable to find the `search_attractions` function through my current searches. To provide a detailed explanation, I will try to fetch specific files that may contain the function definition. 

Action: get_file  
Action Input: "search_attractions.py"  # Assuming this file may contain t

  self.memory.save_context(inputs, outputs)




[1m> Entering new None chain...[0m
[32;1m[1;3mI need to locate the `search_attractions` function within the `tourist-assistant` file to understand its implementation and explain each line of code. I'll start by searching for the specific function in the relevant file.

Action: repo_context_search  
Action Input: search_attractions function in tourist-assistant  
[0m
Observation: [36;1m[1;3mI don't know.[0m
Thought:[32;1m[1;3mI need to fetch the content of the `tourist-assistant` file to find the `search_attractions` function directly and understand its implementation. 

Action: get_file  
Action Input: tourist-assistant  
[0m['README.md', 'requirements.txt', 'style.css', 'tourist-assistant.py', 'travel.jpg']

Observation: [33;1m[1;3mNo file named 'tourist-assistant  
' found in the repository.[0m
Thought:[32;1m[1;3mIt seems that there is no file named 'tourist-assistant' in the repository. I will search for any file that might contain the keyword 'search_attractions' 

  self.memory.save_context(inputs, outputs)
