In [1]:
# Importing the necessary Python libraries
import os
import json
import pandas as pd

In [2]:
# Loading the internal tool descriptions (a mapping that is later iterated
# with .items() as tool name -> description).
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform; without it open() falls back to the locale's default encoding.
with open('genericorp_internal_tools.json', 'r', encoding='utf-8') as f:
    internal_tools = json.load(f)

In [12]:
# Setting the location of the tool filepath
tool_filepath = '../../genericorp/internal-tools/'

# Iterating over each of the internal tools (tool name -> description).
# NOTE: tool_name, tool_description and combined_doc_content are rebound on
# every pass, so once the loop finishes they describe only the last tool.
for tool_name, tool_description in internal_tools.items():

    # Setting the specific tool filepath
    current_tool_filepath = os.path.join(tool_filepath, tool_name)

    # Listing all .md files in the current tool's directory.
    # os.listdir() returns entries in arbitrary order, so the names are sorted
    # to keep the combined document identical from one run to the next.
    md_files = sorted(
        f for f in os.listdir(current_tool_filepath) if f.endswith('.md')
    )

    # Collecting one formatted section per markdown file
    doc_sections = []

    # Iterating over each of the markdown files
    for md_file in md_files:

        # Creating a readable title from the filename
        doc_title = os.path.splitext(md_file)[0].replace('_', ' ')

        # Read the content of the markdown file
        md_path = os.path.join(current_tool_filepath, md_file)
        with open(md_path, 'r', encoding='utf-8') as file:
            doc_content = file.read()

        # Record the document title and content as one section
        doc_sections.append(
            f"Document Title: {doc_title}\n"
            f"Document Content:\n{doc_content}\n\n"
        )

    # Joining the sections once instead of growing a string inside the loop;
    # an empty directory still yields an empty string.
    combined_doc_content = "".join(doc_sections)

In [14]:
# Creating a prompt to generate a set of questions associated to each tool.
# The template carries four named placeholders: {tool_name},
# {tool_description}, {combined_doc_content} and {num_questions}.
# Presumably they are filled in with str.format() (or equivalent) before the
# prompt is sent -- the substitution is not done in this cell; confirm where.
# NOTE(review): the second paragraph of the prompt opens with a sentence
# fragment ("Given the following block of documentation ..."); it is left
# untouched because rewording the prompt changes what the model receives.
rag_qa_generation_prompt_template = """
You are a technical writer helping to prepare content for a Retrieval-Augmented Generation (RAG) system.

Given the following block of documentation about an internal technical tool. Below, I have provided the tool name, tool description, and a number of knowledge item documents associated to the tool.

---
Tool name: {tool_name}
Tool description: {tool_description}
Knowledge document content:
{combined_doc_content}
---

Generate {num_questions} question-answer pairs that:
- Reflect the key information and capabilities presented.
- Vary in complexity (some simple factual, some more applied).
- Stay grounded in the source text (avoid hallucinating or inventing facts).
- Represent questions that a developer or internal user might realistically ask.

Each pair should be structured like this:

Q: [Question here]
A: [Answer grounded in the document]

Return the result as a list of Q&A pairs.
"""