# Code Docstring / Comment Generator

Submitted By : Bharat Puri

Goal: Build a code tool that scans Python modules, finds functions/classes
without docstrings, and uses an LLM (Claude / GPT / Gemini / Qwen etc.)
to generate high-quality Google or NumPy style docstrings.

In [11]:
# imports

import os
import io
import sys
import re
from dotenv import load_dotenv
import sys
sys.path.append(os.path.abspath(os.path.join("..", ".."))) 
from openai import OpenAI
import gradio as gr
import subprocess
from IPython.display import Markdown, display


In [None]:
# Load environment settings through python-dotenv (override=True), then read
# one API key per provider from the environment.
load_dotenv(override=True)
openai_api_key = os.environ.get("OPENAI_API_KEY")
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
google_api_key = os.environ.get("GOOGLE_API_KEY")
grok_api_key = os.environ.get("GROK_API_KEY")
groq_api_key = os.environ.get("GROQ_API_KEY")
openrouter_api_key = os.environ.get("OPENROUTER_API_KEY")

# Report each key's status, showing only a short prefix of the value.
# Only the OpenAI key is reported without the "optional" note.
print(
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)
print(
    f"Anthropic API Key exists and begins {anthropic_api_key[:7]}"
    if anthropic_api_key
    else "Anthropic API Key not set (and this is optional)"
)
print(
    f"Google API Key exists and begins {google_api_key[:2]}"
    if google_api_key
    else "Google API Key not set (and this is optional)"
)
print(
    f"Grok API Key exists and begins {grok_api_key[:4]}"
    if grok_api_key
    else "Grok API Key not set (and this is optional)"
)
print(
    f"Groq API Key exists and begins {groq_api_key[:4]}"
    if groq_api_key
    else "Groq API Key not set (and this is optional)"
)
print(
    f"OpenRouter API Key exists and begins {openrouter_api_key[:6]}"
    if openrouter_api_key
    else "OpenRouter API Key not set (and this is optional)"
)



In [13]:
# Connect to client libraries

# Default client: constructed without an explicit key or base URL.
openai = OpenAI()

# Every other provider is reached through the same OpenAI client class,
# pointed at that provider's base URL with the matching key loaded earlier.
anthropic_url = "https://api.anthropic.com/v1/"
anthropic = OpenAI(base_url=anthropic_url, api_key=anthropic_api_key)

gemini_url = "https://generativelanguage.googleapis.com/v1beta/openai/"
gemini = OpenAI(base_url=gemini_url, api_key=google_api_key)

grok_url = "https://api.x.ai/v1"
grok = OpenAI(base_url=grok_url, api_key=grok_api_key)

groq_url = "https://api.groq.com/openai/v1"
groq = OpenAI(base_url=groq_url, api_key=groq_api_key)

# The local endpoint is given the fixed placeholder key "ollama".
ollama_url = "http://localhost:11434/v1"
ollama = OpenAI(base_url=ollama_url, api_key="ollama")

openrouter_url = "https://openrouter.ai/api/v1"
openrouter = OpenAI(base_url=openrouter_url, api_key=openrouter_api_key)

# Default model name; the DOCGEN_MODEL environment variable overrides it.
MODEL = os.environ.get("DOCGEN_MODEL", "gpt-4o-mini")


# Registry for multiple model providers.
# Maps the label shown to the user onto the provider family and the model
# identifier used for that provider.
MODEL_REGISTRY = {
    "gpt-4o-mini (OpenAI)": dict(provider="openai", model="gpt-4o-mini"),
    "gpt-4o (OpenAI)": dict(provider="openai", model="gpt-4o"),
    "claude-3.5-sonnet (Anthropic)": dict(provider="anthropic", model="claude-3.5-sonnet"),
    "gemini-1.5-pro (Google)": dict(provider="google", model="gemini-1.5-pro"),
    "codellama-7b (Open Source)": dict(provider="open_source", model="codellama-7b"),
    "starcoder2 (Open Source)": dict(provider="open_source", model="starcoder2"),
}



In [4]:
# Model identifiers, in display order.
models = [
    "gpt-5",
    "claude-sonnet-4-5-20250929",
    "grok-4",
    "gemini-2.5-pro",
    "qwen2.5-coder",
    "deepseek-coder-v2",
    "gpt-oss:20b",
    "qwen/qwen3-coder-30b-a3b-instruct",
    "openai/gpt-oss-120b",
]

# Client object to use for each model identifier above.
clients = {
    "gpt-5": openai,
    "claude-sonnet-4-5-20250929": anthropic,
    "grok-4": grok,
    "gemini-2.5-pro": gemini,
    "openai/gpt-oss-120b": groq,
    "qwen2.5-coder": ollama,
    "deepseek-coder-v2": ollama,
    "gpt-oss:20b": ollama,
    "qwen/qwen3-coder-30b-a3b-instruct": openrouter,
}

# Want to keep costs ultra-low? Replace this with models of your choice, using the examples from yesterday

In [5]:
# ================================================================
#  Prompt Templates and Utilities
# ================================================================

# Instruction text per docstring style; "google" is the only style defined.
DOCSTYLE_TEMPLATES = {
    "google": "\n".join(
        [
            "You will write a concise Google-style Python docstring for the given function or class.",
            "Rules:",
            "- One-line summary followed by short details.",
            "- Include Args:, Returns:, Raises: only if relevant.",
            "- Keep under 12 lines, no code fences or markdown formatting.",
            "Return ONLY the text between triple quotes.",
        ]
    ),
}

# System-role text describing the persona the model should adopt.
SYSTEM_PROMPT = " ".join(
    (
        "You are a senior Python engineer and technical writer.",
        "Write precise, helpful docstrings.",
    )
)


def make_user_prompt(style: str, module_name: str, signature: str, code_context: str) -> str:
    """Assemble the user-role message sent to the model.

    Args:
        style: Key into DOCSTYLE_TEMPLATES; an unknown key falls back to the
            "google" instructions.
        module_name: Name shown on the "Module:" line.
        signature: Signature text of the target being documented.
        code_context: Source excerpt shown under "Code context:".

    Returns:
        The style instructions, the target details, and a worked example of
        the expected triple-quoted reply, joined into one string.
    """
    template_key = style if style in DOCSTYLE_TEMPLATES else "google"
    pieces = [
        DOCSTYLE_TEMPLATES[template_key],
        "",
        f"Module: {module_name}",
        f"Signature:\n{signature}",
        "",
        f"Code context:\n{code_context}",
        "",
        "Return ONLY a triple-quoted docstring, for example:",
        '"""One-line summary.',
        "",
        "Args:",
        "    x: Description",
        "Returns:",
        "    y: Description",
        '"""',
    ]
    return "\n".join(pieces)



In [14]:
# ================================================================
# LLM Chat Helper — OpenAI GPT
# ================================================================
def llm_generate_docstring(signature: str, context: str, style: str = "google",
                           module_name: str = "module", model_choice: str = "gpt-4o-mini (OpenAI)") -> str:
    """Generate docstring text for one target using the selected model.

    Only the "openai" provider performs a real API call. The "anthropic" and
    "google" providers, and any other provider value, return a labelled
    simulated string instead.

    Args:
        signature: Signature text of the function/class (or a module label).
        context: Source excerpt given to the model as context.
        style: Docstring style key forwarded to make_user_prompt.
        module_name: Module name forwarded to make_user_prompt.
        model_choice: Key into MODEL_REGISTRY; an unknown key falls back to
            "gpt-4o-mini (OpenAI)".

    Returns:
        The stripped text between the first pair of triple double quotes in
        the model output, or the whole output when no such pair is present.

    Raises:
        RuntimeError: If the OpenAI response carries no text content.
    """
    user_prompt = make_user_prompt(style, module_name, signature, context)
    model_info = MODEL_REGISTRY.get(model_choice, MODEL_REGISTRY["gpt-4o-mini (OpenAI)"])

    provider = model_info["provider"]
    model_name = model_info["model"]

    if provider == "openai":
        response = openai.chat.completions.create(
            model=model_name,
            temperature=0.2,
            messages=[
                # Use the shared constant so the persona text lives in one place.
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
        )
        content = response.choices[0].message.content
        if content is None:
            # Fail with a clear message rather than AttributeError on .strip().
            raise RuntimeError(f"Model {model_name!r} returned no text content")
        text = content.strip()

    elif provider == "anthropic":
        # Future: integrate Anthropic SDK
        text = "Claude response simulation: " + user_prompt[:200]

    elif provider == "google":
        # Future: integrate Gemini API
        text = "Gemini response simulation: " + user_prompt[:200]

    else:
        # Simulated open-source fallback
        text = f"[Simulated output from {model_name}]\nAuto-generated docstring for {signature}"

    # Keep only the body when the reply is wrapped in triple double quotes.
    match = re.search(r'"""(.*?)"""', text, re.S)
    return match.group(1).strip() if match else text



In [15]:
# ================================================================
# 🧱 AST Parsing Utilities — find missing docstrings
# ================================================================
import ast

def node_signature(node: ast.AST) -> str:
    """Build a readable one-line signature from a function or class node.

    Parameter names are listed without annotations or default values.
    Positional-only parameters are followed by "/", keyword-only parameters
    are preceded by "*" (or by "*name" when a var-positional parameter
    exists) and rendered as "name=?". Coroutines are prefixed with "async".

    Example: def add(x, y) -> int:

    Args:
        node: A FunctionDef, AsyncFunctionDef or ClassDef node.

    Returns:
        The signature string, "class Name:" for classes, or "" for any other
        node type.
    """
    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
        spec = node.args

        params = [a.arg for a in spec.posonlyargs]
        if spec.posonlyargs:
            params.append("/")
        params.extend(a.arg for a in spec.args)

        if spec.vararg:
            params.append("*" + spec.vararg.arg)
        elif spec.kwonlyargs:
            # Without *args, a bare "*" is what makes the following
            # parameters keyword-only.
            params.append("*")
        params.extend(a.arg + "=?" for a in spec.kwonlyargs)

        if spec.kwarg:
            params.append("**" + spec.kwarg.arg)

        ret = ""
        if getattr(node, "returns", None):
            try:
                ret = f" -> {ast.unparse(node.returns)}"
            except Exception:
                # Best effort: the return annotation is omitted when it
                # cannot be rendered back to source.
                pass

        keyword = "async def" if isinstance(node, ast.AsyncFunctionDef) else "def"
        return f"{keyword} {node.name}({', '.join(params)}){ret}:"

    elif isinstance(node, ast.ClassDef):
        return f"class {node.name}:"

    return ""


def context_snippet(src: str, node: ast.AST, max_lines: int = 60) -> str:
    """Return the source lines spanned by ``node``, capped at ``max_lines``.

    The span runs from the node's ``lineno`` to its ``end_lineno``. A node
    without position attributes yields the first line of ``src``. When the
    span is longer than ``max_lines``, it is cut and a
    "# ... (truncated) ..." marker line is appended.

    Args:
        src: Full source text the node was parsed from.
        node: AST node whose source should be extracted.
        max_lines: Maximum number of source lines to keep.

    Returns:
        The selected lines joined with newlines.
    """
    first = getattr(node, "lineno", 1) - 1
    last = getattr(node, "end_lineno", first + 1)
    excerpt = src.splitlines()[first:last]
    if len(excerpt) <= max_lines:
        return "\n".join(excerpt)
    return "\n".join(excerpt[:max_lines] + ["# ... (truncated) ..."])


def find_missing_docstrings(src: str):
    """List every module, class and function in ``src`` lacking a docstring.

    Args:
        src: Python source text; it is parsed with ``ast.parse``.

    Returns:
        A list of ``(kind, node)`` tuples where kind is "module", "class" or
        "function" (async functions are reported as "function"). The module
        entry, when present, comes first; the remaining entries follow the
        order in which ``ast.walk`` visits the tree.
    """
    tree = ast.parse(src)
    documentable = (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)

    found = [("module", tree)] if ast.get_docstring(tree) is None else []
    found.extend(
        ("class" if isinstance(item, ast.ClassDef) else "function", item)
        for item in ast.walk(tree)
        if isinstance(item, documentable) and ast.get_docstring(item) is None
    )
    return found


In [None]:
## Quick Test ##

# Sample source with no docstring on the module, the function, the class or
# the method. It is also reused by the later pipeline quick test.
code = '''
def add(x, y):
    return x + y

class Counter:
    def inc(self):
        self.total += 1
'''

# Print one line per undocumented target. The module entry prints an empty
# signature because node_signature returns "" for nodes that are neither
# functions nor classes.
for kind, node in find_missing_docstrings(code):
    print(f"Missing docstring → {kind}: {node_signature(node)}")



In [17]:
# ================================================================
# Insert Generated Docstrings into Code
# ================================================================
import difflib
import textwrap

def insert_docstring(src: str, node: ast.AST, docstring: str) -> str:
    """
    Insert a generated docstring inside a function/class node.
    Keeps indentation consistent with the original code.
    """
    lines = src.splitlines()
    if not hasattr(node, "body") or not node.body:
        return src  # nothing to insert into

    start_idx = node.body[0].lineno - 1
    indent = re.match(r"\s*", lines[start_idx]).group(0)
    ds_lines = textwrap.indent(f'"""{docstring.strip()}"""', indent).splitlines()

    new_lines = lines[:start_idx] + ds_lines + [""] + lines[start_idx:]
    return "\n".join(new_lines)


def insert_module_docstring(src: str, docstring: str) -> str:
    """Insert a module-level docstring at the top of the file.

    A shebang on the first line and an encoding declaration on the first or
    second line are kept above the docstring, since they only take effect in
    those positions. The original trailing newline, if any, is preserved.

    Args:
        src: Module source text.
        docstring: Docstring text without the surrounding triple quotes.

    Returns:
        The source with the docstring inserted on a single new block.
    """
    lines = src.splitlines()

    # Count the leading lines (at most two) that must remain first.
    head = 0
    for idx, line in enumerate(lines[:2]):
        stripped = line.strip()
        if stripped and not stripped.startswith("#"):
            break  # real code: nothing from here on is a shebang/cookie
        is_shebang = idx == 0 and line.startswith("#!")
        is_cookie = re.match(r"^[ \t\f]*#.*?coding[:=][ \t]*[-\w.]+", line)
        if is_shebang or is_cookie:
            head = idx + 1

    prefix = "".join(line + "\n" for line in lines[:head])
    rest = "\n".join(lines[head:])
    # splitlines() discards the final line terminator; put it back.
    if rest and src.endswith(("\n", "\r")):
        rest += "\n"
    return f'{prefix}"""{docstring.strip()}"""\n{rest}'


def diff_text(a: str, b: str) -> str:
    """Return a unified diff between two source texts.

    Args:
        a: Original text, labelled "original.py" in the diff header.
        b: Updated text, labelled "updated.py" in the diff header.

    Returns:
        The diff as a single string; empty when the texts are identical.
    """
    before = a.splitlines(keepends=True)
    after = b.splitlines(keepends=True)
    hunks = difflib.unified_diff(
        before, after, fromfile="original.py", tofile="updated.py"
    )
    return "".join(hunks)


def generate_docstrings_for_source(src: str, style: str = "google", module_name: str = "module", model_choice: str = "gpt-4o-mini (OpenAI)"):
    """Generate and insert docstrings for every undocumented target in ``src``.

    Docstrings are generated first (module entry first, then the order given
    by find_missing_docstrings) and inserted afterwards from the bottom of
    the file upwards, with the module docstring last. Every insertion
    position comes from line numbers of the ORIGINAL source, so an insertion
    must never shift lines that a later insertion still refers to.

    Args:
        src: Python source text to document.
        style: Docstring style key passed to the generator.
        module_name: Module name used in prompts and in the module signature.
        model_choice: Model label passed to the generator.

    Returns:
        A ``(updated_source, report)`` tuple. ``report`` holds one dict per
        target with the keys "kind", "signature", "model" and "doc_preview"
        (the first 150 characters of the generated text).
    """
    targets = find_missing_docstrings(src)
    report = []
    module_doc = None
    insertions = []  # (node, doc) pairs for functions and classes

    for kind, node in sorted(targets, key=lambda t: 0 if t[0] == "module" else 1):
        sig = "module " + module_name if kind == "module" else node_signature(node)
        ctx = src if kind == "module" else context_snippet(src, node)
        doc = llm_generate_docstring(sig, ctx, style=style, module_name=module_name, model_choice=model_choice)

        if kind == "module":
            module_doc = doc
        else:
            insertions.append((node, doc))

        report.append({"kind": kind, "signature": sig, "model": model_choice, "doc_preview": doc[:150]})

    updated = src
    # Bottom-up: inserting at a later line leaves all earlier line numbers valid.
    insertions.sort(key=lambda item: item[0].body[0].lineno, reverse=True)
    for node, doc in insertions:
        updated = insert_docstring(updated, node, doc)

    # The module docstring goes at the top and shifts everything, so it is last.
    if module_doc is not None:
        updated = insert_module_docstring(updated, module_doc)

    return updated, report


In [None]:
## Quick Test ##
# Run the full pipeline on the sample `code` string from the earlier quick
# test, using the default model_choice of generate_docstrings_for_source.
new_code, report = generate_docstrings_for_source(code, style="google", module_name="demo")

# One entry per documented target, followed by the start of its generated text.
print("=== Generated Docstrings ===")
for r in report:
    print(f"- {r['kind']}: {r['signature']}")
    print("  ", r['doc_preview'])
print("\n=== Updated Source ===")
print(new_code)


In [20]:
# ================================================================
# 📂 File-Based Workflow — preview or apply docstrings
# ================================================================
from pathlib import Path
import pandas as pd

def process_file(path: str, style: str = "google", apply: bool = False,
                 model_choice: str = "gpt-4o-mini (OpenAI)") -> pd.DataFrame:
    """Generate docstrings for a .py file and preview or apply the result.

    Args:
        path: Path of the Python file to process (read as UTF-8).
        style: Docstring style key passed to the generator.
        apply: When True, overwrite the file with the updated source;
            otherwise print a unified diff and leave the file untouched.
        model_choice: Model label passed to the generator.

    Returns:
        A DataFrame built from the generator's report rows, one per
        documented target.
    """
    p = Path(path)
    src = p.read_text(encoding="utf-8")
    updated, rows = generate_docstrings_for_source(
        src, style=style, module_name=p.stem, model_choice=model_choice
    )

    if apply:
        p.write_text(updated, encoding="utf-8")
        print(f"✅ Updated file written → {p}")
    else:
        print("🔍 Diff preview:")
        print(diff_text(src, updated))

    return pd.DataFrame(rows)

# Example usage:
# df = process_file("my_script.py", style="google", apply=False)  # preview
# df = process_file("my_script.py", style="google", apply=True)   # overwrite with docstrings
# df



In [21]:
# ================================================================
# 📂 File-Based Workflow — preview or apply docstrings
# ================================================================
from pathlib import Path
import pandas as pd

def process_file(path: str, style: str = "google", apply: bool = False,
                 model_choice: str = "gpt-4o-mini (OpenAI)") -> pd.DataFrame:
    """Generate docstrings for a .py file and preview or apply the result.

    Args:
        path: Path of the Python file to process (read as UTF-8).
        style: Docstring style key passed to the generator.
        apply: When True, overwrite the file with the updated source;
            otherwise print a unified diff and leave the file untouched.
        model_choice: Model label passed to the generator.

    Returns:
        A DataFrame built from the generator's report rows, one per
        documented target.
    """
    p = Path(path)
    src = p.read_text(encoding="utf-8")
    updated, rows = generate_docstrings_for_source(
        src, style=style, module_name=p.stem, model_choice=model_choice
    )

    if apply:
        p.write_text(updated, encoding="utf-8")
        print(f"✅ Updated file written → {p}")
    else:
        print("🔍 Diff preview:")
        print(diff_text(src, updated))

    return pd.DataFrame(rows)

# Example usage:
# df = process_file("my_script.py", style="google", apply=False)  # preview
# df = process_file("my_script.py", style="google", apply=True)   # overwrite with docstrings
# df



In [None]:
# ================================================================
# 🎨 Enhanced Gradio Interface with Model Selector
# ================================================================
import gradio as gr

def gradio_generate(code_text: str, style: str, model_choice: str):
    """Gradio callback: return the pasted code with docstrings inserted.

    Args:
        code_text: Source pasted into the input box; may be None or blank.
        style: Docstring style key passed to the generator.
        model_choice: Model label passed to the generator.

    Returns:
        The updated source, a warning message when there is no input, or an
        error message when generation fails.
    """
    # An untouched input box can arrive as None rather than "".
    if not code_text or not code_text.strip():
        return "⚠️ Please paste some Python code first."
    try:
        updated, _ = generate_docstrings_for_source(
            code_text, style=style, module_name="gradio_snippet", model_choice=model_choice
        )
        return updated
    except Exception as e:
        # UI boundary: show the failure in the output box instead of raising.
        return f"❌ Error: {e}"

# Build the UI: a header, input/output code panes side by side, a row of
# options, and a button that runs gradio_generate.
with gr.Blocks(theme=gr.themes.Soft()) as doc_ui:
    gr.Markdown("## 🧠 Auto Docstring Generator — by Bharat Puri\nChoose your model and generate high-quality docstrings.")

    # Row 1: pasted source on the left, generated result on the right.
    with gr.Row():
        code_input = gr.Code(label="Paste your Python code", language="python", lines=18)
        code_output = gr.Code(label="Generated code with docstrings", language="python", lines=18)

    # Row 2: generation options. "google" is the only style offered; the
    # model list comes from the keys of MODEL_REGISTRY.
    with gr.Row():
        style_choice = gr.Radio(["google"], value="google", label="Docstring Style")
        model_choice = gr.Dropdown(
            list(MODEL_REGISTRY.keys()),
            value="gpt-4o-mini (OpenAI)",
            label="Select Model",
        )

    # The inputs are passed to gradio_generate in this order:
    # (code_text, style, model_choice).
    generate_btn = gr.Button("🚀 Generate Docstrings")
    generate_btn.click(
        fn=gradio_generate,
        inputs=[code_input, style_choice, model_choice],
        outputs=[code_output],
    )

# Start the app with share=False.
doc_ui.launch(share=False)
